NEX-16917 Need to reduce the impact of NFS per-share kstats on failover
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
Reviewed by: Rick McNeal <rick.mcneal@nexenta.com>
NEX-16712 NFS dtrace providers do not support per-share filtering
Reviewed by: Evan Layton <evan.layton@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-15279 support NFS server in zone
NEX-15520 online NFS shares cause zoneadm halt to hang in nfs_export_zone_fini
Portions contributed by: Dan Kruchinin <dan.kruchinin@nexenta.com>
Portions contributed by: Stepan Zastupov <stepan.zastupov@gmail.com>
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
NEX-9275 Got "bad mutex" panic when run IO to nfs share from clients
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
NEX-14051 Be careful with RPC groups
Reviewed by: Rick McNeal <rick.mcneal@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
8085 Handle RPC groups better
Reviewed by: "Joshua M. Clulow" <josh@sysmgr.org>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Dan McDonald <danmcd@omniti.com>
NEX-7366 Getting panic in "module "nfssrv" due to a NULL pointer dereference" when updating NFS shares on a pool
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
Reviewed by: Steve Peng <steve.peng@nexenta.com>
NEX-6778 NFS kstats leak and cause system to hang
Revert "NEX-4261 Per-client NFS server IOPS, bandwidth, and latency kstats"
This reverts commit 586c3ab1927647487f01c337ddc011c642575a52.
Revert "NEX-5354 Aggregated IOPS, bandwidth, and latency kstats for NFS server"
This reverts commit c91d7614da8618ef48018102b077f60ecbbac8c2.
Revert "NEX-5667 nfssrv_stats_flags does not work for aggregated kstats"
This reverts commit 3dcf42618be7dd5f408c327f429c81e07ca08e74.
Revert "NEX-5750 Time values for aggregated NFS server kstats should be normalized"
This reverts commit 1f4d4f901153b0191027969fa4a8064f9d3b9ee1.
Revert "NEX-5942 Panic in rfs4_minorvers_mismatch() with NFSv4.1 client"
This reverts commit 40766417094a162f5e4cc8786c0fa0a7e5871cd9.
Revert "NEX-5752 NFS server: namespace collision in kstats"
This reverts commit ae81e668db86050da8e483264acb0cce0444a132.
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-4261 Per-client NFS server IOPS, bandwidth, and latency kstats
Reviewed by: Kevin Crowe <kevin.crowe@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
NEX-3097 IOPS, bandwidth, and latency kstats for NFS server
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
NEX-1974 Support for more than 16 groups with AUTH_SYS
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
NEX-1128 NFS server: Generic uid and gid remapping for AUTH_SYS
Reviewed by: Jan Kryl <jan.kryl@nexenta.com>
re #13613 rb4516 Tunables needs volatile keyword

@@ -16,24 +16,28 @@
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2017 Joyent Inc
  */
 
 /*
  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  *      All rights reserved.
  *      Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2018 Nexenta Systems, Inc.
+ * Copyright (c) 2017 Joyent Inc
+ */
+
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/cred.h>
 #include <sys/proc.h>

@@ -81,11 +85,10 @@
 #include <nfs/export.h>
 #include <nfs/nfssys.h>
 #include <nfs/nfs_clnt.h>
 #include <nfs/nfs_acl.h>
 #include <nfs/nfs_log.h>
-#include <nfs/nfs_cmd.h>
 #include <nfs/lm.h>
 #include <nfs/nfs_dispatch.h>
 #include <nfs/nfs4_drc.h>
 
 #include <sys/modctl.h>

@@ -107,22 +110,20 @@
 
 static struct modlinkage modlinkage = {
         MODREV_1, (void *)&modlmisc, NULL
 };
 
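+/* Key for looking up this zone's NFS server globals (nfs_globals_t). */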
+zone_key_t nfssrv_zone_key;
 kmem_cache_t *nfs_xuio_cache;
 int nfs_loaned_buffers = 0;
 
 int
 _init(void)
 {
         int status;
 
-        if ((status = nfs_srvinit()) != 0) {
-                cmn_err(CE_WARN, "_init: nfs_srvinit failed");
-                return (status);
-        }
+        nfs_srvinit();
 
         status = mod_install((struct modlinkage *)&modlinkage);
         if (status != 0) {
                 /*
                  * Could not load module, cleanup previous

@@ -175,31 +176,32 @@
  * modifying those routines to avoid the duplication. For now, we optimize
  * by calling exportmatch() only after checking that the dispatch routine
  * supports RPC_PUBLICFH_OK, and if the filesystem is explicitly exported
  * public (i.e., not the placeholder).
  */
-#define PUBLICFH_CHECK(disp, exi, fsid, xfid) \
+#define PUBLICFH_CHECK(ne, disp, exi, fsid, xfid) \
                 ((disp->dis_flags & RPC_PUBLICFH_OK) && \
                 ((exi->exi_export.ex_flags & EX_PUBLIC) || \
-                (exi == exi_public && exportmatch(exi_root, \
+                (exi == ne->exi_public && exportmatch(ne->exi_root, \
                 fsid, xfid))))
 
 static void     nfs_srv_shutdown_all(int);
-static void     rfs4_server_start(int);
+static void     rfs4_server_start(nfs_globals_t *, int);
 static void     nullfree(void);
 static void     rfs_dispatch(struct svc_req *, SVCXPRT *);
 static void     acl_dispatch(struct svc_req *, SVCXPRT *);
 static void     common_dispatch(struct svc_req *, SVCXPRT *,
                 rpcvers_t, rpcvers_t, char *,
                 struct rpc_disptable *);
-static void     hanfsv4_failover(void);
 static  int     checkauth(struct exportinfo *, struct svc_req *, cred_t *, int,
                 bool_t, bool_t *);
 static char     *client_name(struct svc_req *req);
 static char     *client_addr(struct svc_req *req, char *buf);
 extern  int     sec_svc_getcred(struct svc_req *, cred_t *cr, char **, int *);
 extern  bool_t  sec_svc_inrootlist(int, caddr_t, int, caddr_t *);
+static void     *nfs_srv_zone_init(zoneid_t);
+static void     nfs_srv_zone_fini(zoneid_t, void *);
 
 #define NFSLOG_COPY_NETBUF(exi, xprt, nb)       {               \
         (nb)->maxlen = (xprt)->xp_rtaddr.maxlen;                \
         (nb)->len = (xprt)->xp_rtaddr.len;                      \
         (nb)->buf = kmem_alloc((nb)->len, KM_SLEEP);            \

@@ -246,58 +248,39 @@
 };
 
 static SVC_CALLOUT_TABLE nfs_sct_rdma = {
         sizeof (__nfs_sc_rdma) / sizeof (__nfs_sc_rdma[0]), FALSE, __nfs_sc_rdma
 };
-rpcvers_t nfs_versmin = NFS_VERSMIN_DEFAULT;
-rpcvers_t nfs_versmax = NFS_VERSMAX_DEFAULT;
 
 /*
- * Used to track the state of the server so that initialization
- * can be done properly.
- */
-typedef enum {
-        NFS_SERVER_STOPPED,     /* server state destroyed */
-        NFS_SERVER_STOPPING,    /* server state being destroyed */
-        NFS_SERVER_RUNNING,
-        NFS_SERVER_QUIESCED,    /* server state preserved */
-        NFS_SERVER_OFFLINE      /* server pool offline */
-} nfs_server_running_t;
-
-static nfs_server_running_t nfs_server_upordown;
-static kmutex_t nfs_server_upordown_lock;
-static  kcondvar_t nfs_server_upordown_cv;
-
-/*
  * DSS: distributed stable storage
  * lists of all DSS paths: current, and before last warmstart
  */
 nvlist_t *rfs4_dss_paths, *rfs4_dss_oldpaths;
 
-int rfs4_dispatch(struct rpcdisp *, struct svc_req *, SVCXPRT *, char *);
+int rfs4_dispatch(struct rpcdisp *, struct svc_req *, SVCXPRT *, char *,
+    size_t *);
 bool_t rfs4_minorvers_mismatch(struct svc_req *, SVCXPRT *, void *);
 
 /*
- * RDMA wait variables.
- */
-static kcondvar_t rdma_wait_cv;
-static kmutex_t rdma_wait_mutex;
-
-/*
  * Will be called at the point the server pool is being unregistered
  * from the pool list. From that point onwards, the pool is waiting
  * to be drained and as such the server state is stale and pertains
  * to the old instantiation of the NFS server pool.
  */
 void
 nfs_srv_offline(void)
 {
-        mutex_enter(&nfs_server_upordown_lock);
-        if (nfs_server_upordown == NFS_SERVER_RUNNING) {
-                nfs_server_upordown = NFS_SERVER_OFFLINE;
+        nfs_globals_t *ng;
+
+        ng = zone_getspecific(nfssrv_zone_key, curzone);
+
+        mutex_enter(&ng->nfs_server_upordown_lock);
+        if (ng->nfs_server_upordown == NFS_SERVER_RUNNING) {
+                ng->nfs_server_upordown = NFS_SERVER_OFFLINE;
         }
-        mutex_exit(&nfs_server_upordown_lock);
+        mutex_exit(&ng->nfs_server_upordown_lock);
 }
 
 /*
  * Will be called at the point the server pool is being destroyed so
  * all transports have been closed and no service threads are in

@@ -322,37 +305,40 @@
         int quiesce = 1;
         nfs_srv_shutdown_all(quiesce);
 }
 
 static void
-nfs_srv_shutdown_all(int quiesce) {
-        mutex_enter(&nfs_server_upordown_lock);
+nfs_srv_shutdown_all(int quiesce)
+{
+        nfs_globals_t *ng = zone_getspecific(nfssrv_zone_key, curzone);
+
+        mutex_enter(&ng->nfs_server_upordown_lock);
         if (quiesce) {
-                if (nfs_server_upordown == NFS_SERVER_RUNNING ||
-                        nfs_server_upordown == NFS_SERVER_OFFLINE) {
-                        nfs_server_upordown = NFS_SERVER_QUIESCED;
-                        cv_signal(&nfs_server_upordown_cv);
+                if (ng->nfs_server_upordown == NFS_SERVER_RUNNING ||
+                    ng->nfs_server_upordown == NFS_SERVER_OFFLINE) {
+                        ng->nfs_server_upordown = NFS_SERVER_QUIESCED;
+                        cv_signal(&ng->nfs_server_upordown_cv);
 
                         /* reset DSS state, for subsequent warm restart */
                         rfs4_dss_numnewpaths = 0;
                         rfs4_dss_newpaths = NULL;
 
                         cmn_err(CE_NOTE, "nfs_server: server is now quiesced; "
                             "NFSv4 state has been preserved");
                 }
         } else {
-                if (nfs_server_upordown == NFS_SERVER_OFFLINE) {
-                        nfs_server_upordown = NFS_SERVER_STOPPING;
-                        mutex_exit(&nfs_server_upordown_lock);
-                        rfs4_state_fini();
-                        rfs4_fini_drc(nfs4_drc);
-                        mutex_enter(&nfs_server_upordown_lock);
-                        nfs_server_upordown = NFS_SERVER_STOPPED;
-                        cv_signal(&nfs_server_upordown_cv);
+                if (ng->nfs_server_upordown == NFS_SERVER_OFFLINE) {
+                        ng->nfs_server_upordown = NFS_SERVER_STOPPING;
+                        mutex_exit(&ng->nfs_server_upordown_lock);
+                        rfs4_state_zone_fini();
+                        rfs4_fini_drc();
+                        mutex_enter(&ng->nfs_server_upordown_lock);
+                        ng->nfs_server_upordown = NFS_SERVER_STOPPED;
+                        cv_signal(&ng->nfs_server_upordown_cv);
                 }
         }
-        mutex_exit(&nfs_server_upordown_lock);
+        mutex_exit(&ng->nfs_server_upordown_lock);
 }
 
 static int
 nfs_srv_set_sc_versions(struct file *fp, SVC_CALLOUT_TABLE **sctpp,
                         rpcvers_t versmin, rpcvers_t versmax)

@@ -416,10 +402,11 @@
  * uap->fd is the fd of an open transport provider
  */
 int
 nfs_svc(struct nfs_svc_args *arg, model_t model)
 {
+        nfs_globals_t *ng;
         file_t *fp;
         SVCMASTERXPRT *xprt;
         int error;
         int readsize;
         char buf[KNC_STRSIZE];

@@ -430,10 +417,11 @@
 
 #ifdef lint
         model = model;          /* STRUCT macros don't always refer to it */
 #endif
 
+        ng = zone_getspecific(nfssrv_zone_key, curzone);
         STRUCT_SET_HANDLE(uap, model, arg);
 
         /* Check privileges in nfssys() */
 
         if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)

@@ -463,31 +451,31 @@
                 releasef(STRUCT_FGET(uap, fd));
                 kmem_free(addrmask.buf, addrmask.maxlen);
                 return (error);
         }
 
-        nfs_versmin = STRUCT_FGET(uap, versmin);
-        nfs_versmax = STRUCT_FGET(uap, versmax);
+        ng->nfs_versmin = STRUCT_FGET(uap, versmin);
+        ng->nfs_versmax = STRUCT_FGET(uap, versmax);
 
         /* Double check the vers min/max ranges */
-        if ((nfs_versmin > nfs_versmax) ||
-            (nfs_versmin < NFS_VERSMIN) ||
-            (nfs_versmax > NFS_VERSMAX)) {
-                nfs_versmin = NFS_VERSMIN_DEFAULT;
-                nfs_versmax = NFS_VERSMAX_DEFAULT;
+        if ((ng->nfs_versmin > ng->nfs_versmax) ||
+            (ng->nfs_versmin < NFS_VERSMIN) ||
+            (ng->nfs_versmax > NFS_VERSMAX)) {
+                ng->nfs_versmin = NFS_VERSMIN_DEFAULT;
+                ng->nfs_versmax = NFS_VERSMAX_DEFAULT;
         }
 
-        if (error =
-            nfs_srv_set_sc_versions(fp, &sctp, nfs_versmin, nfs_versmax)) {
+        if (error = nfs_srv_set_sc_versions(fp, &sctp, ng->nfs_versmin,
+            ng->nfs_versmax)) {
                 releasef(STRUCT_FGET(uap, fd));
                 kmem_free(addrmask.buf, addrmask.maxlen);
                 return (error);
         }
 
         /* Initialize nfsv4 server */
-        if (nfs_versmax == (rpcvers_t)NFS_V4)
-                rfs4_server_start(STRUCT_FGET(uap, delegation));
+        if (ng->nfs_versmax == (rpcvers_t)NFS_V4)
+                rfs4_server_start(ng, STRUCT_FGET(uap, delegation));
 
         /* Create a transport handle. */
         error = svc_tli_kcreate(fp, readsize, buf, &addrmask, &xprt,
             sctp, NULL, NFS_SVCPOOL_ID, TRUE);
 

@@ -502,72 +490,50 @@
 
         return (error);
 }
 
 static void
-rfs4_server_start(int nfs4_srv_delegation)
+rfs4_server_start(nfs_globals_t *ng, int nfs4_srv_delegation)
 {
         /*
          * Determine if the server has previously been "started" and
          * if not, do the per instance initialization
          */
-        mutex_enter(&nfs_server_upordown_lock);
+        mutex_enter(&ng->nfs_server_upordown_lock);
 
-        if (nfs_server_upordown != NFS_SERVER_RUNNING) {
+        if (ng->nfs_server_upordown != NFS_SERVER_RUNNING) {
                 /* Do we need to stop and wait on the previous server? */
-                while (nfs_server_upordown == NFS_SERVER_STOPPING ||
-                    nfs_server_upordown == NFS_SERVER_OFFLINE)
-                        cv_wait(&nfs_server_upordown_cv,
-                            &nfs_server_upordown_lock);
+                while (ng->nfs_server_upordown == NFS_SERVER_STOPPING ||
+                    ng->nfs_server_upordown == NFS_SERVER_OFFLINE)
+                        cv_wait(&ng->nfs_server_upordown_cv,
+                            &ng->nfs_server_upordown_lock);
 
-                if (nfs_server_upordown != NFS_SERVER_RUNNING) {
+                if (ng->nfs_server_upordown != NFS_SERVER_RUNNING) {
                         (void) svc_pool_control(NFS_SVCPOOL_ID,
                             SVCPSET_UNREGISTER_PROC, (void *)&nfs_srv_offline);
                         (void) svc_pool_control(NFS_SVCPOOL_ID,
                             SVCPSET_SHUTDOWN_PROC, (void *)&nfs_srv_stop_all);
 
-                        /* is this an nfsd warm start? */
-                        if (nfs_server_upordown == NFS_SERVER_QUIESCED) {
-                                cmn_err(CE_NOTE, "nfs_server: "
-                                    "server was previously quiesced; "
-                                    "existing NFSv4 state will be re-used");
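+                        /*
+                         * Warm-start vs. cold-start handling (including
+                         * HA-NFSv4 failover on cluster boot) is now
+                         * centralized in rfs4_do_server_start().
+                         */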
+                        rfs4_do_server_start(ng->nfs_server_upordown,
+                            nfs4_srv_delegation,
+                            cluster_bootflags & CLUSTER_BOOTED);
 
-                                /*
-                                 * HA-NFSv4: this is also the signal
-                                 * that a Resource Group failover has
-                                 * occurred.
-                                 */
-                                if (cluster_bootflags & CLUSTER_BOOTED)
-                                        hanfsv4_failover();
-                        } else {
-                                /* cold start */
-                                rfs4_state_init();
-                                nfs4_drc = rfs4_init_drc(nfs4_drc_max,
-                                    nfs4_drc_hash);
+                        ng->nfs_server_upordown = NFS_SERVER_RUNNING;
                         }
-
-                        /*
-                         * Check to see if delegation is to be
-                         * enabled at the server
-                         */
-                        if (nfs4_srv_delegation != FALSE)
-                                rfs4_set_deleg_policy(SRV_NORMAL_DELEGATE);
-
-                        nfs_server_upordown = NFS_SERVER_RUNNING;
+                cv_signal(&ng->nfs_server_upordown_cv);
                 }
-                cv_signal(&nfs_server_upordown_cv);
-        }
-        mutex_exit(&nfs_server_upordown_lock);
+        mutex_exit(&ng->nfs_server_upordown_lock);
 }
 
 /*
  * If RDMA device available,
  * start RDMA listener.
  */
 int
 rdma_start(struct rdma_svc_args *rsa)
 {
+        nfs_globals_t *ng;
         int error;
         rdma_xprt_group_t started_rdma_xprts;
         rdma_stat stat;
         int svc_state = 0;
 

@@ -576,13 +542,15 @@
             (rsa->nfs_versmin < NFS_VERSMIN) ||
             (rsa->nfs_versmax > NFS_VERSMAX)) {
                 rsa->nfs_versmin = NFS_VERSMIN_DEFAULT;
                 rsa->nfs_versmax = NFS_VERSMAX_DEFAULT;
         }
-        nfs_versmin = rsa->nfs_versmin;
-        nfs_versmax = rsa->nfs_versmax;
 
+        ng = zone_getspecific(nfssrv_zone_key, curzone);
+        ng->nfs_versmin = rsa->nfs_versmin;
+        ng->nfs_versmax = rsa->nfs_versmax;
+
         /* Set the versions in the callout table */
         __nfs_sc_rdma[0].sc_versmin = rsa->nfs_versmin;
         __nfs_sc_rdma[0].sc_versmax = rsa->nfs_versmax;
         /* For the NFS_ACL program, check the max version */
         __nfs_sc_rdma[1].sc_versmin = rsa->nfs_versmin;

@@ -591,11 +559,11 @@
         else
                 __nfs_sc_rdma[1].sc_versmax = rsa->nfs_versmax;
 
         /* Initialize nfsv4 server */
         if (rsa->nfs_versmax == (rpcvers_t)NFS_V4)
-                rfs4_server_start(rsa->delegation);
+                rfs4_server_start(ng, rsa->delegation);
 
         started_rdma_xprts.rtg_count = 0;
         started_rdma_xprts.rtg_listhead = NULL;
         started_rdma_xprts.rtg_poolid = rsa->poolid;
 

@@ -608,11 +576,11 @@
         while (!error) {
 
                 /*
                  * wait till either interrupted by a signal on
                  * nfs service stop/restart or signalled by a
-                 * rdma plugin attach/detatch.
+                 * rdma attach/detach.
                  */
 
                 stat = rdma_kwait();
 
                 /*

@@ -659,14 +627,14 @@
 /* ARGSUSED */
 void
 rpc_null_v3(caddr_t *argp, caddr_t *resp, struct exportinfo *exi,
     struct svc_req *req, cred_t *cr, bool_t ro)
 {
-        DTRACE_NFSV3_3(op__null__start, struct svc_req *, req,
-            cred_t *, cr, vnode_t *, NULL);
-        DTRACE_NFSV3_3(op__null__done, struct svc_req *, req,
-            cred_t *, cr, vnode_t *, NULL);
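+        /* NEX-16712: pass the exportinfo so probes can filter per share. */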
+        DTRACE_NFSV3_4(op__null__start, struct svc_req *, req,
+            cred_t *, cr, vnode_t *, NULL, struct exportinfo *, exi);
+        DTRACE_NFSV3_4(op__null__done, struct svc_req *, req,
+            cred_t *, cr, vnode_t *, NULL, struct exportinfo *, exi);
 }
 
 /* ARGSUSED */
 static void
 rfs_error(caddr_t *argp, caddr_t *resp, struct exportinfo *exi,

@@ -1340,17 +1308,17 @@
 };
 
 static struct rpc_disptable rfs_disptable[] = {
         {sizeof (rfsdisptab_v2) / sizeof (rfsdisptab_v2[0]),
             rfscallnames_v2,
-            &rfsproccnt_v2_ptr, rfsdisptab_v2},
+            &rfsproccnt_v2_ptr, &rfsprocio_v2_ptr, rfsdisptab_v2},
         {sizeof (rfsdisptab_v3) / sizeof (rfsdisptab_v3[0]),
             rfscallnames_v3,
-            &rfsproccnt_v3_ptr, rfsdisptab_v3},
+            &rfsproccnt_v3_ptr, &rfsprocio_v3_ptr, rfsdisptab_v3},
         {sizeof (rfsdisptab_v4) / sizeof (rfsdisptab_v4[0]),
             rfscallnames_v4,
-            &rfsproccnt_v4_ptr, rfsdisptab_v4},
+            &rfsproccnt_v4_ptr, &rfsprocio_v4_ptr, rfsdisptab_v4},
 };
 
 /*
  * If nfs_portmon is set, then clients are required to use privileged
  * ports (ports < IPPORT_RESERVED) in order to get NFS services.

@@ -1358,18 +1326,17 @@
  * N.B.: this attempt to carry forward the already ill-conceived notion
  * of privileged ports for TCP/UDP is really quite ineffectual.  Not only
  * is it transport-dependent, it's laughably easy to spoof.  If you're
  * really interested in security, you must start with secure RPC instead.
  */
-static int nfs_portmon = 0;
+volatile int nfs_portmon = 0;
 
 #ifdef DEBUG
 static int cred_hits = 0;
 static int cred_misses = 0;
 #endif
 
-
 #ifdef DEBUG
 /*
  * Debug code to allow disabling of rfs_dispatch() use of
  * fastxdrargs() and fastxdrres() calls for testing purposes.
  */

@@ -1472,12 +1439,11 @@
 }
 
 
 static void
 common_dispatch(struct svc_req *req, SVCXPRT *xprt, rpcvers_t min_vers,
-                rpcvers_t max_vers, char *pgmname,
-                struct rpc_disptable *disptable)
+    rpcvers_t max_vers, char *pgmname, struct rpc_disptable *disptable)
 {
         int which;
         rpcvers_t vers;
         char *args;
         union {

@@ -1506,10 +1472,16 @@
         bool_t logging_enabled = FALSE;
         struct exportinfo *nfslog_exi = NULL;
         char **procnames;
         char cbuf[INET6_ADDRSTRLEN];    /* to hold both IPv4 and IPv6 addr */
         bool_t ro = FALSE;
+        kstat_t *ksp = NULL;
+        kstat_t *exi_ksp = NULL;
+        size_t pos;                     /* request size */
+        size_t rlen;                    /* reply size */
+        bool_t rsent = FALSE;           /* reply was sent successfully */
+        nfs_export_t *ne = nfs_get_export();
 
         vers = req->rq_vers;
 
         if (vers < min_vers || vers > max_vers) {
                 svcerr_progvers(req->rq_xprt, min_vers, max_vers);

@@ -1526,10 +1498,18 @@
                 goto done;
         }
 
         (*(disptable[(int)vers].dis_proccntp))[which].value.ui64++;
 
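+        /*
+         * Per-procedure I/O kstat: mark this request as in-flight and
+         * record the XDR input position so the request size can be
+         * computed once dispatch completes (at the "done" label).
+         */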
+        ksp = (*(disptable[(int)vers].dis_prociop))[which];
+        if (ksp != NULL) {
+                mutex_enter(ksp->ks_lock);
+                kstat_runq_enter(KSTAT_IO_PTR(ksp));
+                mutex_exit(ksp->ks_lock);
+        }
+        pos = XDR_GETPOS(&xprt->xp_xdrin);
+
         disp = &disptable[(int)vers].dis_table[which];
         procnames = disptable[(int)vers].dis_procnames;
 
         auth_flavor = req->rq_cred.oa_flavor;
 

@@ -1569,11 +1549,13 @@
 
         /*
          * If Version 4 use that specific dispatch function.
          */
         if (req->rq_vers == 4) {
-                error += rfs4_dispatch(disp, req, xprt, args);
+                error += rfs4_dispatch(disp, req, xprt, args, &rlen);
+                if (error == 0)
+                        rsent = TRUE;
                 goto done;
         }
 
         dis_flags = disp->dis_flags;
 

@@ -1630,17 +1612,19 @@
                         anon_ok = 0;
 
                 cr = xprt->xp_cred;
                 ASSERT(cr != NULL);
 #ifdef DEBUG
+                {
                 if (crgetref(cr) != 1) {
                         crfree(cr);
                         cr = crget();
                         xprt->xp_cred = cr;
                         cred_misses++;
                 } else
                         cred_hits++;
+                }
 #else
                 if (crgetref(cr) != 1) {
                         crfree(cr);
                         cr = crget();
                         xprt->xp_cred = cr;

@@ -1648,12 +1632,38 @@
 #endif
 
                 exi = checkexport(fsid, xfid);
 
                 if (exi != NULL) {
-                        publicfh_ok = PUBLICFH_CHECK(disp, exi, fsid, xfid);
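+                        /*
+                         * Per-share I/O kstats: hold the export table
+                         * lock as reader so the share and its kstats
+                         * stay valid while this request updates them.
+                         */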
+                        rw_enter(&ne->exported_lock, RW_READER);
+                        exi_ksp = NULL;
 
+                        if (exi->exi_kstats != NULL) {
+                                switch (req->rq_vers) {
+                                case NFS_VERSION:
+                                        exi_ksp = exp_kstats_v2(exi->exi_kstats,
+                                            which);
+                                        break;
+                                case NFS_V3:
+                                        exi_ksp = exp_kstats_v3(exi->exi_kstats,
+                                            which);
+                                        break;
+                                default:
+                                        ASSERT(0);
+                                        break;
+                                }
+                        }
+
+                        if (exi_ksp != NULL) {
+                                mutex_enter(exi_ksp->ks_lock);
+                                kstat_runq_enter(KSTAT_IO_PTR(exi_ksp));
+                                mutex_exit(exi_ksp->ks_lock);
+                        } else {
+                                rw_exit(&ne->exported_lock);
+                        }
+
+                        publicfh_ok = PUBLICFH_CHECK(ne, disp, exi, fsid, xfid);
                         /*
                          * Don't allow non-V4 clients access
                          * to pseudo exports
                          */
                         if (PSEUDO(exi)) {

@@ -1761,11 +1771,11 @@
          * the later writing of the log record.  This is done for
          * the case that a lookup is done across a non-logged public
          * file system.
          */
         if (nfslog_buffer_list != NULL) {
-                nfslog_exi = nfslog_get_exi(exi, req, res, &nfslog_rec_id);
+                nfslog_exi = nfslog_get_exi(ne, exi, req, res, &nfslog_rec_id);
                 /*
                  * Is logging enabled?
                  */
                 logging_enabled = (nfslog_exi != NULL);
 

@@ -1798,26 +1808,32 @@
         {
                 if (!svc_sendreply(xprt, disp->dis_fastxdrres, res)) {
                         cmn_err(CE_NOTE, "%s: bad sendreply", pgmname);
                         svcerr_systemerr(xprt);
                         error++;
+                } else {
+                        rlen = xdr_sizeof(disp->dis_fastxdrres, res);
+                        rsent = TRUE;
                 }
         } else {
                 if (!svc_sendreply(xprt, disp->dis_xdrres, res)) {
                         cmn_err(CE_NOTE, "%s: bad sendreply", pgmname);
                         svcerr_systemerr(xprt);
                         error++;
+                } else {
+                        rlen = xdr_sizeof(disp->dis_xdrres, res);
+                        rsent = TRUE;
                 }
         }
 
         /*
          * Log if needed
          */
         if (logging_enabled) {
                 nfslog_write_record(nfslog_exi, req, args, (char *)&res_buf,
                     cr, &nb, nfslog_rec_id, NFSLOG_ONE_BUFFER);
-                exi_rele(nfslog_exi);
+                exi_rele(&nfslog_exi);
                 kmem_free((&nb)->buf, (&nb)->len);
         }
 
         /*
          * Free results struct. With the addition of NFS V4 we can

@@ -1826,10 +1842,14 @@
         if (disp->dis_resfree != nullfree && dupcached == FALSE) {
                 (*disp->dis_resfree)(res);
         }
 
 done:
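+        /* Convert the saved XDR offset into the request size in bytes. */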
+        if (ksp != NULL || exi_ksp != NULL) {
+                pos = XDR_GETPOS(&xprt->xp_xdrin) - pos;
+        }
+
         /*
          * Free arguments struct
          */
         if (disp) {
                 if (!SVC_FREEARGS(xprt, disp->dis_xdrargs, args)) {

@@ -1841,13 +1861,39 @@
                         cmn_err(CE_NOTE, "%s: bad freeargs", pgmname);
                         error++;
                 }
         }
 
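+        /*
+         * Update the per-share I/O kstat: request bytes are recorded
+         * as writes, reply bytes (when a reply was sent) as reads.
+         */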
+        if (exi_ksp != NULL) {
+                mutex_enter(exi_ksp->ks_lock);
+                KSTAT_IO_PTR(exi_ksp)->nwritten += pos;
+                KSTAT_IO_PTR(exi_ksp)->writes++;
+                if (rsent) {
+                        KSTAT_IO_PTR(exi_ksp)->nread += rlen;
+                        KSTAT_IO_PTR(exi_ksp)->reads++;
+                }
+                kstat_runq_exit(KSTAT_IO_PTR(exi_ksp));
+                mutex_exit(exi_ksp->ks_lock);
+
+                rw_exit(&ne->exported_lock);
+        }
+
         if (exi != NULL)
-                exi_rele(exi);
+                exi_rele(&exi);
 
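+        /* Likewise for the per-procedure (global) I/O kstat. */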
+        if (ksp != NULL) {
+                mutex_enter(ksp->ks_lock);
+                KSTAT_IO_PTR(ksp)->nwritten += pos;
+                KSTAT_IO_PTR(ksp)->writes++;
+                if (rsent) {
+                        KSTAT_IO_PTR(ksp)->nread += rlen;
+                        KSTAT_IO_PTR(ksp)->reads++;
+                }
+                kstat_runq_exit(KSTAT_IO_PTR(ksp));
+                mutex_exit(ksp->ks_lock);
+        }
+
         global_svstat_ptr[req->rq_vers][NFS_BADCALLS].value.ui64 += error;
 
         global_svstat_ptr[req->rq_vers][NFS_CALLS].value.ui64++;
 }
 

@@ -1969,14 +2015,14 @@
 };
 
 static struct rpc_disptable acl_disptable[] = {
         {sizeof (acldisptab_v2) / sizeof (acldisptab_v2[0]),
                 aclcallnames_v2,
-                &aclproccnt_v2_ptr, acldisptab_v2},
+                &aclproccnt_v2_ptr, &aclprocio_v2_ptr, acldisptab_v2},
         {sizeof (acldisptab_v3) / sizeof (acldisptab_v3[0]),
                 aclcallnames_v3,
-                &aclproccnt_v3_ptr, acldisptab_v3},
+                &aclproccnt_v3_ptr, &aclprocio_v3_ptr, acldisptab_v3},
 };
 
 static void
 acl_dispatch(struct svc_req *req, SVCXPRT *xprt)
 {

@@ -2566,35 +2612,22 @@
  * once.  It performs the following tasks:
  *      - Call sub-initialization routines (localize access to variables)
  *      - Initialize all locks
  *      - initialize the version 3 write verifier
  */
-int
+void
 nfs_srvinit(void)
 {
-        int error;
+        /* NFS server zone-specific global variables */
+        zone_key_create(&nfssrv_zone_key, nfs_srv_zone_init,
+            NULL, nfs_srv_zone_fini);
 
-        error = nfs_exportinit();
-        if (error != 0)
-                return (error);
-        error = rfs4_srvrinit();
-        if (error != 0) {
-                nfs_exportfini();
-                return (error);
-        }
+        nfs_exportinit();
         rfs_srvrinit();
         rfs3_srvrinit();
+        rfs4_srvrinit();
         nfsauth_init();
-
-        /* Init the stuff to control start/stop */
-        nfs_server_upordown = NFS_SERVER_STOPPED;
-        mutex_init(&nfs_server_upordown_lock, NULL, MUTEX_DEFAULT, NULL);
-        cv_init(&nfs_server_upordown_cv, NULL, CV_DEFAULT, NULL);
-        mutex_init(&rdma_wait_mutex, NULL, MUTEX_DEFAULT, NULL);
-        cv_init(&rdma_wait_cv, NULL, CV_DEFAULT, NULL);
-
-        return (0);
 }
 
 /*
  * NFS Server finalization routine. This routine is called to cleanup the
  * initialization work previously performed if the NFS server module could

@@ -2602,24 +2635,57 @@
  */
 void
 nfs_srvfini(void)
 {
         nfsauth_fini();
+        rfs4_srvrfini();
         rfs3_srvrfini();
         rfs_srvrfini();
         nfs_exportfini();
 
-        mutex_destroy(&nfs_server_upordown_lock);
-        cv_destroy(&nfs_server_upordown_cv);
-        mutex_destroy(&rdma_wait_mutex);
-        cv_destroy(&rdma_wait_cv);
+        (void) zone_key_delete(nfssrv_zone_key);
 }
 
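+/*
+ * Zone-create callback (registered via zone_key_create()): allocate
+ * and initialize the per-zone NFS server globals.
+ */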
+/* ARGSUSED */
+static void *
+nfs_srv_zone_init(zoneid_t zoneid)
+{
+        nfs_globals_t *ng;
+
+        ng = kmem_zalloc(sizeof (*ng), KM_SLEEP);
+
+        ng->nfs_versmin = NFS_VERSMIN_DEFAULT;
+        ng->nfs_versmax = NFS_VERSMAX_DEFAULT;
+
+        /* Init the stuff to control start/stop */
+        ng->nfs_server_upordown = NFS_SERVER_STOPPED;
+        mutex_init(&ng->nfs_server_upordown_lock, NULL, MUTEX_DEFAULT, NULL);
+        cv_init(&ng->nfs_server_upordown_cv, NULL, CV_DEFAULT, NULL);
+        mutex_init(&ng->rdma_wait_mutex, NULL, MUTEX_DEFAULT, NULL);
+        cv_init(&ng->rdma_wait_cv, NULL, CV_DEFAULT, NULL);
+
+        return (ng);
+}
+
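+/*
+ * Zone-destroy callback: tear down the per-zone synchronization
+ * primitives and free the globals.
+ */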
+/* ARGSUSED */
+static void
+nfs_srv_zone_fini(zoneid_t zoneid, void *data)
+{
+        nfs_globals_t *ng;
+
+        ng = (nfs_globals_t *)data;
+        mutex_destroy(&ng->nfs_server_upordown_lock);
+        cv_destroy(&ng->nfs_server_upordown_cv);
+        mutex_destroy(&ng->rdma_wait_mutex);
+        cv_destroy(&ng->rdma_wait_cv);
+
+        kmem_free(ng, sizeof (*ng));
+}
+
 /*
  * Set up an iovec array of up to cnt pointers.
  */
-
 void
 mblk_to_iov(mblk_t *m, int cnt, struct iovec *iovp)
 {
         while (m != NULL && cnt-- > 0) {
                 iovp->iov_base = (caddr_t)m->b_rptr;

@@ -2852,11 +2918,11 @@
                          * option argument and leads us to another filesystem
                          */
 
                         /* Release the reference on the old exi value */
                         ASSERT(*exi != NULL);
-                        exi_rele(*exi);
+                        exi_rele(exi);
 
                         if (error = nfs_check_vpexi(mc_dvp, *vpp, kcred, exi)) {
                                 VN_RELE(*vpp);
                                 goto publicfh_done;
                         }

@@ -2891,11 +2957,11 @@
          */
         if (*path == '/') {
                 while (*path == '/')
                         path++;
 
-                startdvp = rootdir;
+                startdvp = ZONE_ROOTVP();
         }
 
         error = pn_get_buf(path, UIO_SYSSPACE, &pn, namebuf, sizeof (namebuf));
         if (error == 0) {
                 /*

@@ -2914,11 +2980,11 @@
                         if ((pn.pn_pathlen = strlen(pn.pn_path)) == 0)
                                 return (ENOENT);
                 }
                 VN_HOLD(startdvp);
                 error = lookuppnvp(&pn, NULL, NO_FOLLOW, dirvpp, compvpp,
-                    rootdir, startdvp, cr);
+                    ZONE_ROOTVP(), startdvp, cr);
         }
         if (error == ENAMETOOLONG) {
                 /*
                  * This thread used a pathname > TYPICALMAXPATHLEN bytes long.
                  */

@@ -2931,11 +2997,11 @@
                                 return (ENOENT);
                         }
                 }
                 VN_HOLD(startdvp);
                 error = lookuppnvp(&pn, NULL, NO_FOLLOW, dirvpp, compvpp,
-                    rootdir, startdvp, cr);
+                    ZONE_ROOTVP(), startdvp, cr);
                 pn_free(&pn);
         }
 
         return (error);
 }

@@ -3035,172 +3101,10 @@
         }
 
         return (error);
 }
 
-/*
- * Do the main work of handling HA-NFSv4 Resource Group failover on
- * Sun Cluster.
- * We need to detect whether any RG admin paths have been added or removed,
- * and adjust resources accordingly.
- * Currently we're using a very inefficient algorithm, ~ 2 * O(n**2). In
- * order to scale, the list and array of paths need to be held in more
- * suitable data structures.
- */
-static void
-hanfsv4_failover(void)
-{
-        int i, start_grace, numadded_paths = 0;
-        char **added_paths = NULL;
-        rfs4_dss_path_t *dss_path;
-
-        /*
-         * Note: currently, rfs4_dss_pathlist cannot be NULL, since
-         * it will always include an entry for NFS4_DSS_VAR_DIR. If we
-         * make the latter dynamically specified too, the following will
-         * need to be adjusted.
-         */
-
-        /*
-         * First, look for removed paths: RGs that have been failed-over
-         * away from this node.
-         * Walk the "currently-serving" rfs4_dss_pathlist and, for each
-         * path, check if it is on the "passed-in" rfs4_dss_newpaths array
-         * from nfsd. If not, that RG path has been removed.
-         *
-         * Note that nfsd has sorted rfs4_dss_newpaths for us, and removed
-         * any duplicates.
-         */
-        dss_path = rfs4_dss_pathlist;
-        do {
-                int found = 0;
-                char *path = dss_path->path;
-
-                /* used only for non-HA so may not be removed */
-                if (strcmp(path, NFS4_DSS_VAR_DIR) == 0) {
-                        dss_path = dss_path->next;
-                        continue;
-                }
-
-                for (i = 0; i < rfs4_dss_numnewpaths; i++) {
-                        int cmpret;
-                        char *newpath = rfs4_dss_newpaths[i];
-
-                        /*
-                         * Since nfsd has sorted rfs4_dss_newpaths for us,
-                         * once the return from strcmp is negative we know
-                         * we've passed the point where "path" should be,
-                         * and can stop searching: "path" has been removed.
-                         */
-                        cmpret = strcmp(path, newpath);
-                        if (cmpret < 0)
-                                break;
-                        if (cmpret == 0) {
-                                found = 1;
-                                break;
-                        }
-                }
-
-                if (found == 0) {
-                        unsigned index = dss_path->index;
-                        rfs4_servinst_t *sip = dss_path->sip;
-                        rfs4_dss_path_t *path_next = dss_path->next;
-
-                        /*
-                         * This path has been removed.
-                         * We must clear out the servinst reference to
-                         * it, since it's now owned by another
-                         * node: we should not attempt to touch it.
-                         */
-                        ASSERT(dss_path == sip->dss_paths[index]);
-                        sip->dss_paths[index] = NULL;
-
-                        /* remove from "currently-serving" list, and destroy */
-                        remque(dss_path);
-                        /* allow for NUL */
-                        kmem_free(dss_path->path, strlen(dss_path->path) + 1);
-                        kmem_free(dss_path, sizeof (rfs4_dss_path_t));
-
-                        dss_path = path_next;
-                } else {
-                        /* path was found; not removed */
-                        dss_path = dss_path->next;
-                }
-        } while (dss_path != rfs4_dss_pathlist);
-
-        /*
-         * Now, look for added paths: RGs that have been failed-over
-         * to this node.
-         * Walk the "passed-in" rfs4_dss_newpaths array from nfsd and,
-         * for each path, check if it is on the "currently-serving"
-         * rfs4_dss_pathlist. If not, that RG path has been added.
-         *
-         * Note: we don't do duplicate detection here; nfsd does that for us.
-         *
-         * Note: numadded_paths <= rfs4_dss_numnewpaths, which gives us
-         * an upper bound for the size needed for added_paths[numadded_paths].
-         */
-
-        /* probably more space than we need, but guaranteed to be enough */
-        if (rfs4_dss_numnewpaths > 0) {
-                size_t sz = rfs4_dss_numnewpaths * sizeof (char *);
-                added_paths = kmem_zalloc(sz, KM_SLEEP);
-        }
-
-        /* walk the "passed-in" rfs4_dss_newpaths array from nfsd */
-        for (i = 0; i < rfs4_dss_numnewpaths; i++) {
-                int found = 0;
-                char *newpath = rfs4_dss_newpaths[i];
-
-                dss_path = rfs4_dss_pathlist;
-                do {
-                        char *path = dss_path->path;
-
-                        /* used only for non-HA */
-                        if (strcmp(path, NFS4_DSS_VAR_DIR) == 0) {
-                                dss_path = dss_path->next;
-                                continue;
-                        }
-
-                        if (strncmp(path, newpath, strlen(path)) == 0) {
-                                found = 1;
-                                break;
-                        }
-
-                        dss_path = dss_path->next;
-                } while (dss_path != rfs4_dss_pathlist);
-
-                if (found == 0) {
-                        added_paths[numadded_paths] = newpath;
-                        numadded_paths++;
-                }
-        }
-
-        /* did we find any added paths? */
-        if (numadded_paths > 0) {
-                /* create a new server instance, and start its grace period */
-                start_grace = 1;
-                rfs4_servinst_create(start_grace, numadded_paths, added_paths);
-
-                /* read in the stable storage state from these paths */
-                rfs4_dss_readstate(numadded_paths, added_paths);
-
-                /*
-                 * Multiple failovers during a grace period will cause
-                 * clients of the same resource group to be partitioned
-                 * into different server instances, with different
-                 * grace periods.  Since clients of the same resource
-                 * group must be subject to the same grace period,
-                 * we need to reset all currently active grace periods.
-                 */
-                rfs4_grace_reset_all();
-        }
-
-        if (rfs4_dss_numnewpaths > 0)
-                kmem_free(added_paths, rfs4_dss_numnewpaths * sizeof (char *));
-}
-
 /*
  * Used by NFSv3 and NFSv4 server to query label of
  * a pathname component during lookup/access ops.
  */
 ts_label_t *