11083 support NFS server in zone
Portions contributed by: Dan Kruchinin <dan.kruchinin@nexenta.com>
Portions contributed by: Stepan Zastupov <stepan.zastupov@gmail.com>
Portions contributed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Portions contributed by: Mike Zeller <mike@mikezeller.net>
Portions contributed by: Dan McDonald <danmcd@joyent.com>
Portions contributed by: Gordon Ross <gordon.w.ross@gmail.com>
Portions contributed by: Vitaliy Gusev <gusev.vitaliy@gmail.com>
Reviewed by: Rick McNeal <rick.mcneal@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Jason King <jbk@joyent.com>
Reviewed by: C Fraire <cfraire@me.com>
Change-Id: I22f289d357503f9b48a0bc2482cc4328a6d43d16
        
@@ -20,12 +20,12 @@
  */
 /*
  * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2017 Joyent Inc
+ * Copyright 2019 Nexenta by DDN, Inc.
  */
 
 /*
  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  *      All rights reserved.
@@ -81,11 +81,10 @@
 #include <nfs/export.h>
 #include <nfs/nfssys.h>
 #include <nfs/nfs_clnt.h>
 #include <nfs/nfs_acl.h>
 #include <nfs/nfs_log.h>
-#include <nfs/nfs_cmd.h>
 #include <nfs/lm.h>
 #include <nfs/nfs_dispatch.h>
 #include <nfs/nfs4_drc.h>
 
 #include <sys/modctl.h>
@@ -107,22 +106,23 @@
 
 static struct modlinkage modlinkage = {
         MODREV_1, (void *)&modlmisc, NULL
 };
 
+zone_key_t      nfssrv_zone_key;
+list_t          nfssrv_globals_list;
+krwlock_t       nfssrv_globals_rwl;
+
 kmem_cache_t *nfs_xuio_cache;
 int nfs_loaned_buffers = 0;
 
 int
 _init(void)
 {
         int status;
 
-        if ((status = nfs_srvinit()) != 0) {
-                cmn_err(CE_WARN, "_init: nfs_srvinit failed");
-                return (status);
-        }
+        nfs_srvinit();
 
         status = mod_install((struct modlinkage *)&modlinkage);
         if (status != 0) {
                 /*
                  * Could not load module, cleanup previous
@@ -175,31 +175,30 @@
  * modifying those routines to avoid the duplication. For now, we optimize
  * by calling exportmatch() only after checking that the dispatch routine
  * supports RPC_PUBLICFH_OK, and if the filesystem is explicitly exported
  * public (i.e., not the placeholder).
  */
-#define PUBLICFH_CHECK(disp, exi, fsid, xfid) \
+#define PUBLICFH_CHECK(ne, disp, exi, fsid, xfid) \
                 ((disp->dis_flags & RPC_PUBLICFH_OK) && \
                 ((exi->exi_export.ex_flags & EX_PUBLIC) || \
-                (exi == exi_public && exportmatch(exi_root, \
+                (exi == ne->exi_public && exportmatch(ne->exi_root, \
                 fsid, xfid))))
 
 static void     nfs_srv_shutdown_all(int);
-static void     rfs4_server_start(int);
+static void     rfs4_server_start(nfs_globals_t *, int);
 static void     nullfree(void);
 static void     rfs_dispatch(struct svc_req *, SVCXPRT *);
 static void     acl_dispatch(struct svc_req *, SVCXPRT *);
-static void     common_dispatch(struct svc_req *, SVCXPRT *,
-                rpcvers_t, rpcvers_t, char *,
-                struct rpc_disptable *);
-static void     hanfsv4_failover(void);
 static  int     checkauth(struct exportinfo *, struct svc_req *, cred_t *, int,
                 bool_t, bool_t *);
 static char     *client_name(struct svc_req *req);
 static char     *client_addr(struct svc_req *req, char *buf);
 extern  int     sec_svc_getcred(struct svc_req *, cred_t *cr, char **, int *);
 extern  bool_t  sec_svc_inrootlist(int, caddr_t, int, caddr_t *);
+static void     *nfs_server_zone_init(zoneid_t);
+static void     nfs_server_zone_fini(zoneid_t, void *);
+static void     nfs_server_zone_shutdown(zoneid_t, void *);
 
 #define NFSLOG_COPY_NETBUF(exi, xprt, nb)       {               \
         (nb)->maxlen = (xprt)->xp_rtaddr.maxlen;                \
         (nb)->len = (xprt)->xp_rtaddr.len;                      \
         (nb)->buf = kmem_alloc((nb)->len, KM_SLEEP);            \
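
The reworked PUBLICFH_CHECK macro above now takes the per-zone nfs_export_t so the placeholder comparison is against that zone's exi_public/exi_root. Written out as a function for readability (an editorial sketch, not part of the change; pointer types follow the later call site in common_dispatch()), the short-circuit order is visible: the cheap flag tests run first, and exportmatch() is reached only for a request that arrived via the placeholder public export:

static bool_t
publicfh_check(nfs_export_t *ne, struct rpcdisp *disp,
    struct exportinfo *exi, fsid_t *fsid, fid_t *xfid)
{
        /* Same logic as the PUBLICFH_CHECK macro in this diff. */
        return ((disp->dis_flags & RPC_PUBLICFH_OK) &&
            ((exi->exi_export.ex_flags & EX_PUBLIC) ||
            (exi == ne->exi_public &&
            exportmatch(ne->exi_root, fsid, xfid))));
}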
@@ -246,58 +245,58 @@
 };
 
 static SVC_CALLOUT_TABLE nfs_sct_rdma = {
         sizeof (__nfs_sc_rdma) / sizeof (__nfs_sc_rdma[0]), FALSE, __nfs_sc_rdma
 };
-rpcvers_t nfs_versmin = NFS_VERSMIN_DEFAULT;
-rpcvers_t nfs_versmax = NFS_VERSMAX_DEFAULT;
 
 /*
- * Used to track the state of the server so that initialization
- * can be done properly.
- */
-typedef enum {
-        NFS_SERVER_STOPPED,     /* server state destroyed */
-        NFS_SERVER_STOPPING,    /* server state being destroyed */
-        NFS_SERVER_RUNNING,
-        NFS_SERVER_QUIESCED,    /* server state preserved */
-        NFS_SERVER_OFFLINE      /* server pool offline */
-} nfs_server_running_t;
-
-static nfs_server_running_t nfs_server_upordown;
-static kmutex_t nfs_server_upordown_lock;
-static  kcondvar_t nfs_server_upordown_cv;
-
-/*
  * DSS: distributed stable storage
  * lists of all DSS paths: current, and before last warmstart
  */
 nvlist_t *rfs4_dss_paths, *rfs4_dss_oldpaths;
 
 int rfs4_dispatch(struct rpcdisp *, struct svc_req *, SVCXPRT *, char *);
 bool_t rfs4_minorvers_mismatch(struct svc_req *, SVCXPRT *, void *);
 
 /*
- * RDMA wait variables.
+ * Stash NFS zone globals in TSD to avoid some lock contention
+ * from frequent zone_getspecific calls.
  */
-static kcondvar_t rdma_wait_cv;
-static kmutex_t rdma_wait_mutex;
+static uint_t nfs_server_tsd_key;
 
+nfs_globals_t *
+nfs_srv_getzg(void)
+{
+        nfs_globals_t *ng;
+
+        ng = tsd_get(nfs_server_tsd_key);
+        if (ng == NULL) {
+                ng = zone_getspecific(nfssrv_zone_key, curzone);
+                (void) tsd_set(nfs_server_tsd_key, ng);
+        }
+
+        return (ng);
+}
+
 /*
  * Will be called at the point the server pool is being unregistered
  * from the pool list. From that point onwards, the pool is waiting
  * to be drained and as such the server state is stale and pertains
  * to the old instantiation of the NFS server pool.
  */
 void
 nfs_srv_offline(void)
 {
-        mutex_enter(&nfs_server_upordown_lock);
-        if (nfs_server_upordown == NFS_SERVER_RUNNING) {
-                nfs_server_upordown = NFS_SERVER_OFFLINE;
+        nfs_globals_t *ng;
+
+        ng = nfs_srv_getzg();
+
+        mutex_enter(&ng->nfs_server_upordown_lock);
+        if (ng->nfs_server_upordown == NFS_SERVER_RUNNING) {
+                ng->nfs_server_upordown = NFS_SERVER_OFFLINE;
         }
-        mutex_exit(&nfs_server_upordown_lock);
+        mutex_exit(&ng->nfs_server_upordown_lock);
 }
 
 /*
  * Will be called at the point the server pool is being destroyed so
  * all transports have been closed and no service threads are in
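
nfs_srv_getzg() above is how server code reaches its zone's globals. A hedged usage sketch (the function and its counter update are illustrative only; the svstat indexing matches what common_dispatch() does later in this diff):

static void
example_count_call(struct svc_req *req)
{
        nfs_globals_t *ng = nfs_srv_getzg();

        /* Bump this zone's per-version NFS call counter. */
        ng->svstat[req->rq_vers][NFS_CALLS].value.ui64++;
}

Only the first call on a given thread pays for zone_getspecific(); subsequent calls are a lock-free tsd_get(). Caching the pointer in TSD is safe because a service thread only ever works on behalf of a single zone.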
@@ -322,37 +321,45 @@
         int quiesce = 1;
         nfs_srv_shutdown_all(quiesce);
 }
 
 static void
-nfs_srv_shutdown_all(int quiesce) {
-        mutex_enter(&nfs_server_upordown_lock);
+nfs_srv_shutdown_all(int quiesce)
+{
+        nfs_globals_t *ng = nfs_srv_getzg();
+
+        mutex_enter(&ng->nfs_server_upordown_lock);
         if (quiesce) {
-                if (nfs_server_upordown == NFS_SERVER_RUNNING ||
-                        nfs_server_upordown == NFS_SERVER_OFFLINE) {
-                        nfs_server_upordown = NFS_SERVER_QUIESCED;
-                        cv_signal(&nfs_server_upordown_cv);
+                if (ng->nfs_server_upordown == NFS_SERVER_RUNNING ||
+                    ng->nfs_server_upordown == NFS_SERVER_OFFLINE) {
+                        ng->nfs_server_upordown = NFS_SERVER_QUIESCED;
+                        cv_signal(&ng->nfs_server_upordown_cv);
 
-                        /* reset DSS state, for subsequent warm restart */
+                        /* reset DSS state */
                         rfs4_dss_numnewpaths = 0;
                         rfs4_dss_newpaths = NULL;
 
                         cmn_err(CE_NOTE, "nfs_server: server is now quiesced; "
                             "NFSv4 state has been preserved");
                 }
         } else {
-                if (nfs_server_upordown == NFS_SERVER_OFFLINE) {
-                        nfs_server_upordown = NFS_SERVER_STOPPING;
-                        mutex_exit(&nfs_server_upordown_lock);
-                        rfs4_state_fini();
-                        rfs4_fini_drc(nfs4_drc);
-                        mutex_enter(&nfs_server_upordown_lock);
-                        nfs_server_upordown = NFS_SERVER_STOPPED;
-                        cv_signal(&nfs_server_upordown_cv);
+                if (ng->nfs_server_upordown == NFS_SERVER_OFFLINE) {
+                        ng->nfs_server_upordown = NFS_SERVER_STOPPING;
+                        mutex_exit(&ng->nfs_server_upordown_lock);
+                        rfs4_state_zone_fini();
+                        rfs4_fini_drc();
+                        mutex_enter(&ng->nfs_server_upordown_lock);
+                        ng->nfs_server_upordown = NFS_SERVER_STOPPED;
+
+                        /* reset DSS state */
+                        rfs4_dss_numnewpaths = 0;
+                        rfs4_dss_newpaths = NULL;
+
+                        cv_signal(&ng->nfs_server_upordown_cv);
                 }
         }
-        mutex_exit(&nfs_server_upordown_lock);
+        mutex_exit(&ng->nfs_server_upordown_lock);
 }
 
 static int
 nfs_srv_set_sc_versions(struct file *fp, SVC_CALLOUT_TABLE **sctpp,
                         rpcvers_t versmin, rpcvers_t versmax)
@@ -416,10 +423,11 @@
  * uap->fd is the fd of an open transport provider
  */
 int
 nfs_svc(struct nfs_svc_args *arg, model_t model)
 {
+        nfs_globals_t *ng;
         file_t *fp;
         SVCMASTERXPRT *xprt;
         int error;
         int readsize;
         char buf[KNC_STRSIZE];
@@ -430,17 +438,22 @@
 
 #ifdef lint
         model = model;          /* STRUCT macros don't always refer to it */
 #endif
 
+        ng = nfs_srv_getzg();
         STRUCT_SET_HANDLE(uap, model, arg);
 
         /* Check privileges in nfssys() */
 
         if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
                 return (EBADF);
 
+        /* Setup global file handle in nfs_export */
+        if ((error = nfs_export_get_rootfh(ng)) != 0)
+                return (error);
+
         /*
          * Set read buffer size to rsize
          * and add room for RPC headers.
          */
         readsize = nfs3tsize() + (RPC_MAXDATASIZE - NFS_MAXDATA);
@@ -463,31 +476,31 @@
                 releasef(STRUCT_FGET(uap, fd));
                 kmem_free(addrmask.buf, addrmask.maxlen);
                 return (error);
         }
 
-        nfs_versmin = STRUCT_FGET(uap, versmin);
-        nfs_versmax = STRUCT_FGET(uap, versmax);
+        ng->nfs_versmin = STRUCT_FGET(uap, versmin);
+        ng->nfs_versmax = STRUCT_FGET(uap, versmax);
 
         /* Double check the vers min/max ranges */
-        if ((nfs_versmin > nfs_versmax) ||
-            (nfs_versmin < NFS_VERSMIN) ||
-            (nfs_versmax > NFS_VERSMAX)) {
-                nfs_versmin = NFS_VERSMIN_DEFAULT;
-                nfs_versmax = NFS_VERSMAX_DEFAULT;
+        if ((ng->nfs_versmin > ng->nfs_versmax) ||
+            (ng->nfs_versmin < NFS_VERSMIN) ||
+            (ng->nfs_versmax > NFS_VERSMAX)) {
+                ng->nfs_versmin = NFS_VERSMIN_DEFAULT;
+                ng->nfs_versmax = NFS_VERSMAX_DEFAULT;
         }
 
-        if (error =
-            nfs_srv_set_sc_versions(fp, &sctp, nfs_versmin, nfs_versmax)) {
+        if (error = nfs_srv_set_sc_versions(fp, &sctp, ng->nfs_versmin,
+            ng->nfs_versmax)) {
                 releasef(STRUCT_FGET(uap, fd));
                 kmem_free(addrmask.buf, addrmask.maxlen);
                 return (error);
         }
 
         /* Initialize nfsv4 server */
-        if (nfs_versmax == (rpcvers_t)NFS_V4)
-                rfs4_server_start(STRUCT_FGET(uap, delegation));
+        if (ng->nfs_versmax == (rpcvers_t)NFS_V4)
+                rfs4_server_start(ng, STRUCT_FGET(uap, delegation));
 
         /* Create a transport handle. */
         error = svc_tli_kcreate(fp, readsize, buf, &addrmask, &xprt,
             sctp, NULL, NFS_SVCPOOL_ID, TRUE);
 
@@ -502,72 +515,50 @@
 
         return (error);
 }
 
 static void
-rfs4_server_start(int nfs4_srv_delegation)
+rfs4_server_start(nfs_globals_t *ng, int nfs4_srv_delegation)
 {
         /*
          * Determine if the server has previously been "started" and
          * if not, do the per instance initialization
          */
-        mutex_enter(&nfs_server_upordown_lock);
+        mutex_enter(&ng->nfs_server_upordown_lock);
 
-        if (nfs_server_upordown != NFS_SERVER_RUNNING) {
+        if (ng->nfs_server_upordown != NFS_SERVER_RUNNING) {
                 /* Do we need to stop and wait on the previous server? */
-                while (nfs_server_upordown == NFS_SERVER_STOPPING ||
-                    nfs_server_upordown == NFS_SERVER_OFFLINE)
-                        cv_wait(&nfs_server_upordown_cv,
-                            &nfs_server_upordown_lock);
+                while (ng->nfs_server_upordown == NFS_SERVER_STOPPING ||
+                    ng->nfs_server_upordown == NFS_SERVER_OFFLINE)
+                        cv_wait(&ng->nfs_server_upordown_cv,
+                            &ng->nfs_server_upordown_lock);
 
-                if (nfs_server_upordown != NFS_SERVER_RUNNING) {
+                if (ng->nfs_server_upordown != NFS_SERVER_RUNNING) {
                         (void) svc_pool_control(NFS_SVCPOOL_ID,
                             SVCPSET_UNREGISTER_PROC, (void *)&nfs_srv_offline);
                         (void) svc_pool_control(NFS_SVCPOOL_ID,
                             SVCPSET_SHUTDOWN_PROC, (void *)&nfs_srv_stop_all);
 
-                        /* is this an nfsd warm start? */
-                        if (nfs_server_upordown == NFS_SERVER_QUIESCED) {
-                                cmn_err(CE_NOTE, "nfs_server: "
-                                    "server was previously quiesced; "
-                                    "existing NFSv4 state will be re-used");
+                        rfs4_do_server_start(ng->nfs_server_upordown,
+                            nfs4_srv_delegation,
+                            cluster_bootflags & CLUSTER_BOOTED);
 
-                                /*
-                                 * HA-NFSv4: this is also the signal
-                                 * that a Resource Group failover has
-                                 * occurred.
-                                 */
-                                if (cluster_bootflags & CLUSTER_BOOTED)
-                                        hanfsv4_failover();
-                        } else {
-                                /* cold start */
-                                rfs4_state_init();
-                                nfs4_drc = rfs4_init_drc(nfs4_drc_max,
-                                    nfs4_drc_hash);
+                        ng->nfs_server_upordown = NFS_SERVER_RUNNING;
                         }
-
-                        /*
-                         * Check to see if delegation is to be
-                         * enabled at the server
-                         */
-                        if (nfs4_srv_delegation != FALSE)
-                                rfs4_set_deleg_policy(SRV_NORMAL_DELEGATE);
-
-                        nfs_server_upordown = NFS_SERVER_RUNNING;
+                cv_signal(&ng->nfs_server_upordown_cv);
                 }
-                cv_signal(&nfs_server_upordown_cv);
-        }
-        mutex_exit(&nfs_server_upordown_lock);
+        mutex_exit(&ng->nfs_server_upordown_lock);
 }
 
 /*
  * If RDMA device available,
  * start RDMA listener.
  */
 int
 rdma_start(struct rdma_svc_args *rsa)
 {
+        nfs_globals_t *ng;
         int error;
         rdma_xprt_group_t started_rdma_xprts;
         rdma_stat stat;
         int svc_state = 0;
 
@@ -576,13 +567,15 @@
             (rsa->nfs_versmin < NFS_VERSMIN) ||
             (rsa->nfs_versmax > NFS_VERSMAX)) {
                 rsa->nfs_versmin = NFS_VERSMIN_DEFAULT;
                 rsa->nfs_versmax = NFS_VERSMAX_DEFAULT;
         }
-        nfs_versmin = rsa->nfs_versmin;
-        nfs_versmax = rsa->nfs_versmax;
 
+        ng = nfs_srv_getzg();
+        ng->nfs_versmin = rsa->nfs_versmin;
+        ng->nfs_versmax = rsa->nfs_versmax;
+
         /* Set the versions in the callout table */
         __nfs_sc_rdma[0].sc_versmin = rsa->nfs_versmin;
         __nfs_sc_rdma[0].sc_versmax = rsa->nfs_versmax;
         /* For the NFS_ACL program, check the max version */
         __nfs_sc_rdma[1].sc_versmin = rsa->nfs_versmin;
@@ -591,11 +584,11 @@
         else
                 __nfs_sc_rdma[1].sc_versmax = rsa->nfs_versmax;
 
         /* Initialize nfsv4 server */
         if (rsa->nfs_versmax == (rpcvers_t)NFS_V4)
-                rfs4_server_start(rsa->delegation);
+                rfs4_server_start(ng, rsa->delegation);
 
         started_rdma_xprts.rtg_count = 0;
         started_rdma_xprts.rtg_listhead = NULL;
         started_rdma_xprts.rtg_poolid = rsa->poolid;
 
@@ -608,11 +601,11 @@
         while (!error) {
 
                 /*
                  * wait till either interrupted by a signal on
                  * nfs service stop/restart or signalled by a
-                 * rdma plugin attach/detatch.
+                 * rdma attach/detach.
                  */
 
                 stat = rdma_kwait();
 
                 /*
@@ -659,14 +652,14 @@
 /* ARGSUSED */
 void
 rpc_null_v3(caddr_t *argp, caddr_t *resp, struct exportinfo *exi,
     struct svc_req *req, cred_t *cr, bool_t ro)
 {
-        DTRACE_NFSV3_3(op__null__start, struct svc_req *, req,
-            cred_t *, cr, vnode_t *, NULL);
-        DTRACE_NFSV3_3(op__null__done, struct svc_req *, req,
-            cred_t *, cr, vnode_t *, NULL);
+        DTRACE_NFSV3_4(op__null__start, struct svc_req *, req,
+            cred_t *, cr, vnode_t *, NULL, struct exportinfo *, exi);
+        DTRACE_NFSV3_4(op__null__done, struct svc_req *, req,
+            cred_t *, cr, vnode_t *, NULL, struct exportinfo *, exi);
 }
 
 /* ARGSUSED */
 static void
 rfs_error(caddr_t *argp, caddr_t *resp, struct exportinfo *exi,
@@ -1340,17 +1333,17 @@
 };
 
 static struct rpc_disptable rfs_disptable[] = {
         {sizeof (rfsdisptab_v2) / sizeof (rfsdisptab_v2[0]),
             rfscallnames_v2,
-            &rfsproccnt_v2_ptr, rfsdisptab_v2},
+            rfsdisptab_v2},
         {sizeof (rfsdisptab_v3) / sizeof (rfsdisptab_v3[0]),
             rfscallnames_v3,
-            &rfsproccnt_v3_ptr, rfsdisptab_v3},
+            rfsdisptab_v3},
         {sizeof (rfsdisptab_v4) / sizeof (rfsdisptab_v4[0]),
             rfscallnames_v4,
-            &rfsproccnt_v4_ptr, rfsdisptab_v4},
+            rfsdisptab_v4},
 };
 
 /*
  * If nfs_portmon is set, then clients are required to use privileged
  * ports (ports < IPPORT_RESERVED) in order to get NFS services.
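
The comment above describes the nfs_portmon tunable; the test it implies looks roughly like the following editorial sketch (assuming the client address has already been extracted into a sockaddr_in; the helper name is hypothetical):

static bool_t
nfs_client_port_ok(const struct sockaddr_in *sin)
{
        /* With nfs_portmon set, require a privileged source port. */
        return (!nfs_portmon || ntohs(sin->sin_port) < IPPORT_RESERVED);
}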
@@ -1365,11 +1358,10 @@
 #ifdef DEBUG
 static int cred_hits = 0;
 static int cred_misses = 0;
 #endif
 
-
 #ifdef DEBUG
 /*
  * Debug code to allow disabling of rfs_dispatch() use of
  * fastxdrargs() and fastxdrres() calls for testing purposes.
  */
@@ -1469,15 +1461,13 @@
                         return (TRUE);
         }
         return (FALSE);
 }
 
-
 static void
 common_dispatch(struct svc_req *req, SVCXPRT *xprt, rpcvers_t min_vers,
-                rpcvers_t max_vers, char *pgmname,
-                struct rpc_disptable *disptable)
+    rpcvers_t max_vers, char *pgmname, struct rpc_disptable *disptable)
 {
         int which;
         rpcvers_t vers;
         char *args;
         union {
@@ -1506,13 +1496,22 @@
         bool_t logging_enabled = FALSE;
         struct exportinfo *nfslog_exi = NULL;
         char **procnames;
         char cbuf[INET6_ADDRSTRLEN];    /* to hold both IPv4 and IPv6 addr */
         bool_t ro = FALSE;
+        nfs_globals_t *ng = nfs_srv_getzg();
+        nfs_export_t *ne = ng->nfs_export;
+        kstat_named_t *svstat, *procstat;
 
+        ASSERT(req->rq_prog == NFS_PROGRAM || req->rq_prog == NFS_ACL_PROGRAM);
+
         vers = req->rq_vers;
 
+        svstat = ng->svstat[req->rq_vers];
+        procstat = (req->rq_prog == NFS_PROGRAM) ?
+            ng->rfsproccnt[vers] : ng->aclproccnt[vers];
+
         if (vers < min_vers || vers > max_vers) {
                 svcerr_progvers(req->rq_xprt, min_vers, max_vers);
                 error++;
                 cmn_err(CE_NOTE, "%s: bad version number %u", pgmname, vers);
                 goto done;
@@ -1524,11 +1523,11 @@
                 svcerr_noproc(req->rq_xprt);
                 error++;
                 goto done;
         }
 
-        (*(disptable[(int)vers].dis_proccntp))[which].value.ui64++;
+        procstat[which].value.ui64++;
 
         disp = &disptable[(int)vers].dis_table[which];
         procnames = disptable[(int)vers].dis_procnames;
 
         auth_flavor = req->rq_cred.oa_flavor;
@@ -1630,17 +1629,19 @@
                         anon_ok = 0;
 
                 cr = xprt->xp_cred;
                 ASSERT(cr != NULL);
 #ifdef DEBUG
+                {
                 if (crgetref(cr) != 1) {
                         crfree(cr);
                         cr = crget();
                         xprt->xp_cred = cr;
                         cred_misses++;
                 } else
                         cred_hits++;
+                }
 #else
                 if (crgetref(cr) != 1) {
                         crfree(cr);
                         cr = crget();
                         xprt->xp_cred = cr;
@@ -1648,11 +1649,11 @@
 #endif
 
                 exi = checkexport(fsid, xfid);
 
                 if (exi != NULL) {
-                        publicfh_ok = PUBLICFH_CHECK(disp, exi, fsid, xfid);
+                        publicfh_ok = PUBLICFH_CHECK(ne, disp, exi, fsid, xfid);
 
                         /*
                          * Don't allow non-V4 clients access
                          * to pseudo exports
                          */
@@ -1761,11 +1762,11 @@
          * the later writing of the log record.  This is done for
          * the case that a lookup is done across a non-logged public
          * file system.
          */
         if (nfslog_buffer_list != NULL) {
-                nfslog_exi = nfslog_get_exi(exi, req, res, &nfslog_rec_id);
+                nfslog_exi = nfslog_get_exi(ne, exi, req, res, &nfslog_rec_id);
                 /*
                  * Is logging enabled?
                  */
                 logging_enabled = (nfslog_exi != NULL);
 
@@ -1844,13 +1845,12 @@
         }
 
         if (exi != NULL)
                 exi_rele(exi);
 
-        global_svstat_ptr[req->rq_vers][NFS_BADCALLS].value.ui64 += error;
-
-        global_svstat_ptr[req->rq_vers][NFS_CALLS].value.ui64++;
+        svstat[NFS_BADCALLS].value.ui64 += error;
+        svstat[NFS_CALLS].value.ui64++;
 }
 
 static void
 rfs_dispatch(struct svc_req *req, SVCXPRT *xprt)
 {
@@ -1969,14 +1969,14 @@
 };
 
 static struct rpc_disptable acl_disptable[] = {
         {sizeof (acldisptab_v2) / sizeof (acldisptab_v2[0]),
                 aclcallnames_v2,
-                &aclproccnt_v2_ptr, acldisptab_v2},
+                acldisptab_v2},
         {sizeof (acldisptab_v3) / sizeof (acldisptab_v3[0]),
                 aclcallnames_v3,
-                &aclproccnt_v3_ptr, acldisptab_v3},
+                acldisptab_v3},
 };
 
 static void
 acl_dispatch(struct svc_req *req, SVCXPRT *xprt)
 {
@@ -2566,35 +2566,33 @@
  * once.  It performs the following tasks:
  *      - Call sub-initialization routines (localize access to variables)
  *      - Initialize all locks
  *      - initialize the version 3 write verifier
  */
-int
+void
 nfs_srvinit(void)
 {
-        int error;
 
-        error = nfs_exportinit();
-        if (error != 0)
-                return (error);
-        error = rfs4_srvrinit();
-        if (error != 0) {
-                nfs_exportfini();
-                return (error);
-        }
+        /* Truly global stuff in this module (not per zone) */
+        rw_init(&nfssrv_globals_rwl, NULL, RW_DEFAULT, NULL);
+        list_create(&nfssrv_globals_list, sizeof (nfs_globals_t),
+            offsetof(nfs_globals_t, nfs_g_link));
+        tsd_create(&nfs_server_tsd_key, NULL);
+
+        /* The order here is important */
+        nfs_exportinit();
         rfs_srvrinit();
         rfs3_srvrinit();
+        rfs4_srvrinit();
         nfsauth_init();
 
-        /* Init the stuff to control start/stop */
-        nfs_server_upordown = NFS_SERVER_STOPPED;
-        mutex_init(&nfs_server_upordown_lock, NULL, MUTEX_DEFAULT, NULL);
-        cv_init(&nfs_server_upordown_cv, NULL, CV_DEFAULT, NULL);
-        mutex_init(&rdma_wait_mutex, NULL, MUTEX_DEFAULT, NULL);
-        cv_init(&rdma_wait_cv, NULL, CV_DEFAULT, NULL);
-
-        return (0);
+        /*
+         * NFS server zone-specific global variables
+         * Note the zone_init is called for the GZ here.
+         */
+        zone_key_create(&nfssrv_zone_key, nfs_server_zone_init,
+            nfs_server_zone_shutdown, nfs_server_zone_fini);
 }
 
 /*
  * NFS Server finalization routine. This routine is called to cleanup the
  * initialization work previously performed if the NFS server module could
@@ -2601,25 +2599,130 @@
  * not be loaded correctly.
  */
 void
 nfs_srvfini(void)
 {
+
+        /*
+         * NFS server zone-specific global variables
+         * Note the zone_fini is called for the GZ here.
+         */
+        (void) zone_key_delete(nfssrv_zone_key);
+
+        /* The order here is important (reverse of init) */
         nfsauth_fini();
+        rfs4_srvrfini();
         rfs3_srvrfini();
         rfs_srvrfini();
         nfs_exportfini();
 
-        mutex_destroy(&nfs_server_upordown_lock);
-        cv_destroy(&nfs_server_upordown_cv);
-        mutex_destroy(&rdma_wait_mutex);
-        cv_destroy(&rdma_wait_cv);
+        /* Truly global stuff in this module (not per zone) */
+        tsd_destroy(&nfs_server_tsd_key);
+        list_destroy(&nfssrv_globals_list);
+        rw_destroy(&nfssrv_globals_rwl);
 }
 
 /*
- * Set up an iovec array of up to cnt pointers.
+ * Zone init, shutdown, fini functions for the NFS server
+ *
+ * This design is careful to create the entire hierarchy of
+ * NFS server "globals" (including those created by various
+ * per-module *_zone_init functions, etc.) so that all these
+ * objects have exactly the same lifetime.
+ *
+ * These objects are also kept on a list for two reasons:
+ * 1: It makes finding these in mdb _much_ easier.
+ * 2: It allows operating across all zone globals for
+ *    functions like nfs_auth.c:exi_cache_reclaim
  */
+static void *
+nfs_server_zone_init(zoneid_t zoneid)
+{
+        nfs_globals_t *ng;
 
+        ng = kmem_zalloc(sizeof (*ng), KM_SLEEP);
+
+        ng->nfs_versmin = NFS_VERSMIN_DEFAULT;
+        ng->nfs_versmax = NFS_VERSMAX_DEFAULT;
+
+        /* Init the stuff to control start/stop */
+        ng->nfs_server_upordown = NFS_SERVER_STOPPED;
+        mutex_init(&ng->nfs_server_upordown_lock, NULL, MUTEX_DEFAULT, NULL);
+        cv_init(&ng->nfs_server_upordown_cv, NULL, CV_DEFAULT, NULL);
+        mutex_init(&ng->rdma_wait_mutex, NULL, MUTEX_DEFAULT, NULL);
+        cv_init(&ng->rdma_wait_cv, NULL, CV_DEFAULT, NULL);
+
+        ng->nfs_zoneid = zoneid;
+
+        /*
+         * Order here is important.
+         * export init must precede srv init calls.
+         */
+        nfs_export_zone_init(ng);
+        rfs_stat_zone_init(ng);
+        rfs_srv_zone_init(ng);
+        rfs3_srv_zone_init(ng);
+        rfs4_srv_zone_init(ng);
+        nfsauth_zone_init(ng);
+
+        rw_enter(&nfssrv_globals_rwl, RW_WRITER);
+        list_insert_tail(&nfssrv_globals_list, ng);
+        rw_exit(&nfssrv_globals_rwl);
+
+        return (ng);
+}
+
+/* ARGSUSED */
+static void
+nfs_server_zone_shutdown(zoneid_t zoneid, void *data)
+{
+        nfs_globals_t *ng;
+
+        ng = (nfs_globals_t *)data;
+
+        /*
+         * Order is like _fini, but only
+         * some modules need this hook.
+         */
+        nfsauth_zone_shutdown(ng);
+        nfs_export_zone_shutdown(ng);
+}
+
+/* ARGSUSED */
+static void
+nfs_server_zone_fini(zoneid_t zoneid, void *data)
+{
+        nfs_globals_t *ng;
+
+        ng = (nfs_globals_t *)data;
+
+        rw_enter(&nfssrv_globals_rwl, RW_WRITER);
+        list_remove(&nfssrv_globals_list, ng);
+        rw_exit(&nfssrv_globals_rwl);
+
+        /*
+         * Order here is important.
+         * reverse order from init
+         */
+        nfsauth_zone_fini(ng);
+        rfs4_srv_zone_fini(ng);
+        rfs3_srv_zone_fini(ng);
+        rfs_srv_zone_fini(ng);
+        rfs_stat_zone_fini(ng);
+        nfs_export_zone_fini(ng);
+
+        mutex_destroy(&ng->nfs_server_upordown_lock);
+        cv_destroy(&ng->nfs_server_upordown_cv);
+        mutex_destroy(&ng->rdma_wait_mutex);
+        cv_destroy(&ng->rdma_wait_cv);
+
+        kmem_free(ng, sizeof (*ng));
+}
+
+/*
+ * Set up an iovec array of up to cnt pointers.
+ */
 void
 mblk_to_iov(mblk_t *m, int cnt, struct iovec *iovp)
 {
         while (m != NULL && cnt-- > 0) {
                 iovp->iov_base = (caddr_t)m->b_rptr;
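
The list maintained by nfs_server_zone_init()/nfs_server_zone_fini() above is what enables the cross-zone iteration mentioned in the block comment (and makes the globals easy to find from mdb). A minimal sketch of such a walker (the helper is hypothetical; exi_cache_reclaim-style code in nfs_auth.c is the intended kind of consumer):

static void
nfssrv_foreach_globals(void (*func)(nfs_globals_t *))
{
        nfs_globals_t *ng;

        /* Readers may walk the list; init/fini take it as writer. */
        rw_enter(&nfssrv_globals_rwl, RW_READER);
        for (ng = list_head(&nfssrv_globals_list); ng != NULL;
            ng = list_next(&nfssrv_globals_list, ng))
                func(ng);
        rw_exit(&nfssrv_globals_rwl);
}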
@@ -2853,15 +2956,17 @@
                          */
 
                         /* Release the reference on the old exi value */
                         ASSERT(*exi != NULL);
                         exi_rele(*exi);
+                        *exi = NULL;
 
                         if (error = nfs_check_vpexi(mc_dvp, *vpp, kcred, exi)) {
                                 VN_RELE(*vpp);
                                 goto publicfh_done;
                         }
+                        /* Have a new *exi */
                 }
         }
 
 publicfh_done:
         if (mc_dvp)
@@ -2884,18 +2989,20 @@
 {
         char namebuf[TYPICALMAXPATHLEN];
         struct pathname pn;
         int error;
 
+        ASSERT3U(crgetzoneid(cr), ==, curzone->zone_id);
+
         /*
          * If pathname starts with '/', then set startdvp to root.
          */
         if (*path == '/') {
                 while (*path == '/')
                         path++;
 
-                startdvp = rootdir;
+                startdvp = ZONE_ROOTVP();
         }
 
         error = pn_get_buf(path, UIO_SYSSPACE, &pn, namebuf, sizeof (namebuf));
         if (error == 0) {
                 /*
@@ -2914,11 +3021,11 @@
                         if ((pn.pn_pathlen = strlen(pn.pn_path)) == 0)
                                 return (ENOENT);
                 }
                 VN_HOLD(startdvp);
                 error = lookuppnvp(&pn, NULL, NO_FOLLOW, dirvpp, compvpp,
-                    rootdir, startdvp, cr);
+                    ZONE_ROOTVP(), startdvp, cr);
         }
         if (error == ENAMETOOLONG) {
                 /*
                  * This thread used a pathname > TYPICALMAXPATHLEN bytes long.
                  */
@@ -2931,11 +3038,11 @@
                                 return (ENOENT);
                         }
                 }
                 VN_HOLD(startdvp);
                 error = lookuppnvp(&pn, NULL, NO_FOLLOW, dirvpp, compvpp,
-                    rootdir, startdvp, cr);
+                    ZONE_ROOTVP(), startdvp, cr);
                 pn_free(&pn);
         }
 
         return (error);
 }
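
The two hunks above anchor pathname resolution at the calling zone's root instead of the system root. Passing ZONE_ROOTVP() as both the root and starting vnodes keeps the walk (including "..") inside the zone; in the global zone the zone root is rootdir, so behavior there is unchanged. A hedged standalone sketch of the same idiom (path and credentials are illustrative):

        struct pathname pn;
        vnode_t *vp = NULL;

        /* Leading '/' already stripped, as in the function above. */
        if (pn_get("export/home", UIO_SYSSPACE, &pn) == 0) {
                VN_HOLD(ZONE_ROOTVP());        /* lookuppnvp consumes it */
                (void) lookuppnvp(&pn, NULL, NO_FOLLOW, NULL, &vp,
                    ZONE_ROOTVP(), ZONE_ROOTVP(), kcred);
                pn_free(&pn);
        }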
@@ -3035,172 +3142,10 @@
         }
 
         return (error);
 }
 
-/*
- * Do the main work of handling HA-NFSv4 Resource Group failover on
- * Sun Cluster.
- * We need to detect whether any RG admin paths have been added or removed,
- * and adjust resources accordingly.
- * Currently we're using a very inefficient algorithm, ~ 2 * O(n**2). In
- * order to scale, the list and array of paths need to be held in more
- * suitable data structures.
- */
-static void
-hanfsv4_failover(void)
-{
-        int i, start_grace, numadded_paths = 0;
-        char **added_paths = NULL;
-        rfs4_dss_path_t *dss_path;
-
-        /*
-         * Note: currently, rfs4_dss_pathlist cannot be NULL, since
-         * it will always include an entry for NFS4_DSS_VAR_DIR. If we
-         * make the latter dynamically specified too, the following will
-         * need to be adjusted.
-         */
-
-        /*
-         * First, look for removed paths: RGs that have been failed-over
-         * away from this node.
-         * Walk the "currently-serving" rfs4_dss_pathlist and, for each
-         * path, check if it is on the "passed-in" rfs4_dss_newpaths array
-         * from nfsd. If not, that RG path has been removed.
-         *
-         * Note that nfsd has sorted rfs4_dss_newpaths for us, and removed
-         * any duplicates.
-         */
-        dss_path = rfs4_dss_pathlist;
-        do {
-                int found = 0;
-                char *path = dss_path->path;
-
-                /* used only for non-HA so may not be removed */
-                if (strcmp(path, NFS4_DSS_VAR_DIR) == 0) {
-                        dss_path = dss_path->next;
-                        continue;
-                }
-
-                for (i = 0; i < rfs4_dss_numnewpaths; i++) {
-                        int cmpret;
-                        char *newpath = rfs4_dss_newpaths[i];
-
-                        /*
-                         * Since nfsd has sorted rfs4_dss_newpaths for us,
-                         * once the return from strcmp is negative we know
-                         * we've passed the point where "path" should be,
-                         * and can stop searching: "path" has been removed.
-                         */
-                        cmpret = strcmp(path, newpath);
-                        if (cmpret < 0)
-                                break;
-                        if (cmpret == 0) {
-                                found = 1;
-                                break;
-                        }
-                }
-
-                if (found == 0) {
-                        unsigned index = dss_path->index;
-                        rfs4_servinst_t *sip = dss_path->sip;
-                        rfs4_dss_path_t *path_next = dss_path->next;
-
-                        /*
-                         * This path has been removed.
-                         * We must clear out the servinst reference to
-                         * it, since it's now owned by another
-                         * node: we should not attempt to touch it.
-                         */
-                        ASSERT(dss_path == sip->dss_paths[index]);
-                        sip->dss_paths[index] = NULL;
-
-                        /* remove from "currently-serving" list, and destroy */
-                        remque(dss_path);
-                        /* allow for NUL */
-                        kmem_free(dss_path->path, strlen(dss_path->path) + 1);
-                        kmem_free(dss_path, sizeof (rfs4_dss_path_t));
-
-                        dss_path = path_next;
-                } else {
-                        /* path was found; not removed */
-                        dss_path = dss_path->next;
-                }
-        } while (dss_path != rfs4_dss_pathlist);
-
-        /*
-         * Now, look for added paths: RGs that have been failed-over
-         * to this node.
-         * Walk the "passed-in" rfs4_dss_newpaths array from nfsd and,
-         * for each path, check if it is on the "currently-serving"
-         * rfs4_dss_pathlist. If not, that RG path has been added.
-         *
-         * Note: we don't do duplicate detection here; nfsd does that for us.
-         *
-         * Note: numadded_paths <= rfs4_dss_numnewpaths, which gives us
-         * an upper bound for the size needed for added_paths[numadded_paths].
-         */
-
-        /* probably more space than we need, but guaranteed to be enough */
-        if (rfs4_dss_numnewpaths > 0) {
-                size_t sz = rfs4_dss_numnewpaths * sizeof (char *);
-                added_paths = kmem_zalloc(sz, KM_SLEEP);
-        }
-
-        /* walk the "passed-in" rfs4_dss_newpaths array from nfsd */
-        for (i = 0; i < rfs4_dss_numnewpaths; i++) {
-                int found = 0;
-                char *newpath = rfs4_dss_newpaths[i];
-
-                dss_path = rfs4_dss_pathlist;
-                do {
-                        char *path = dss_path->path;
-
-                        /* used only for non-HA */
-                        if (strcmp(path, NFS4_DSS_VAR_DIR) == 0) {
-                                dss_path = dss_path->next;
-                                continue;
-                        }
-
-                        if (strncmp(path, newpath, strlen(path)) == 0) {
-                                found = 1;
-                                break;
-                        }
-
-                        dss_path = dss_path->next;
-                } while (dss_path != rfs4_dss_pathlist);
-
-                if (found == 0) {
-                        added_paths[numadded_paths] = newpath;
-                        numadded_paths++;
-                }
-        }
-
-        /* did we find any added paths? */
-        if (numadded_paths > 0) {
-                /* create a new server instance, and start its grace period */
-                start_grace = 1;
-                rfs4_servinst_create(start_grace, numadded_paths, added_paths);
-
-                /* read in the stable storage state from these paths */
-                rfs4_dss_readstate(numadded_paths, added_paths);
-
-                /*
-                 * Multiple failovers during a grace period will cause
-                 * clients of the same resource group to be partitioned
-                 * into different server instances, with different
-                 * grace periods.  Since clients of the same resource
-                 * group must be subject to the same grace period,
-                 * we need to reset all currently active grace periods.
-                 */
-                rfs4_grace_reset_all();
-        }
-
-        if (rfs4_dss_numnewpaths > 0)
-                kmem_free(added_paths, rfs4_dss_numnewpaths * sizeof (char *));
-}
-
 /*
  * Used by NFSv3 and NFSv4 server to query label of
  * a pathname component during lookup/access ops.
  */
 ts_label_t *