Print this page
11083 support NFS server in zone
Portions contributed by: Dan Kruchinin <dan.kruchinin@nexenta.com>
Portions contributed by: Stepan Zastupov <stepan.zastupov@gmail.com>
Portions contributed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Portions contributed by: Mike Zeller <mike@mikezeller.net>
Portions contributed by: Dan McDonald <danmcd@joyent.com>
Portions contributed by: Gordon Ross <gordon.w.ross@gmail.com>
Portions contributed by: Vitaliy Gusev <gusev.vitaliy@gmail.com>
Reviewed by: Rick McNeal <rick.mcneal@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Jason King <jbk@joyent.com>
Reviewed by: C Fraire <cfraire@me.com>
Change-Id: I22f289d357503f9b48a0bc2482cc4328a6d43d16

@@ -18,20 +18,24 @@
  *
  * CDDL HEADER END
  */
 
 /*
- * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  */
 
 /*
  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  *      All Rights Reserved
  */
 
+/*
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright 2019 Nexenta Systems, Inc.
+ * Copyright 2019 Nexenta by DDN, Inc.
+ */
+
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/cred.h>
 #include <sys/buf.h>

@@ -64,14 +68,16 @@
 #include <rpc/auth.h>
 #include <rpc/rpcsec_gss.h>
 #include <rpc/svc.h>
 
 #include <nfs/nfs.h>
+#include <nfs/nfssys.h>
 #include <nfs/export.h>
 #include <nfs/nfs_cmd.h>
 #include <nfs/lm.h>
 #include <nfs/nfs4.h>
+#include <nfs/nfs4_drc.h>
 
 #include <sys/strsubr.h>
 #include <sys/strsun.h>
 
 #include <inet/common.h>

@@ -145,20 +151,16 @@
  *
  */
 #define DIRENT64_TO_DIRCOUNT(dp) \
         (3 * BYTES_PER_XDR_UNIT + DIRENT64_NAMELEN((dp)->d_reclen))
 
-time_t rfs4_start_time;                 /* Initialized in rfs4_srvrinit */
 
 static sysid_t lockt_sysid;             /* dummy sysid for all LOCKT calls */
 
 u_longlong_t    nfs4_srv_caller_id;
 uint_t          nfs4_srv_vkey = 0;
 
-verifier4       Write4verf;
-verifier4       Readdir4verf;
-
 void    rfs4_init_compound_state(struct compound_state *);
 
 static void     nullfree(caddr_t);
 static void     rfs4_op_inval(nfs_argop4 *, nfs_resop4 *, struct svc_req *,
                         struct compound_state *);

@@ -243,15 +245,16 @@
                         struct svc_req *req, struct compound_state *);
 static void     rfs4_op_secinfo(nfs_argop4 *, nfs_resop4 *, struct svc_req *,
                         struct compound_state *);
 static void     rfs4_op_secinfo_free(nfs_resop4 *);
 
-static nfsstat4 check_open_access(uint32_t,
-                                struct compound_state *, struct svc_req *);
+static nfsstat4 check_open_access(uint32_t, struct compound_state *,
+                    struct svc_req *);
 nfsstat4 rfs4_client_sysid(rfs4_client_t *, sysid_t *);
-void rfs4_ss_clid(rfs4_client_t *);
+void            rfs4_ss_clid(nfs4_srv_t *, rfs4_client_t *);
 
+
 /*
  * translation table for attrs
  */
 struct nfs4_ntov_table {
         union nfs4_attr_u *na;

@@ -266,17 +269,15 @@
 
 static nfsstat4 do_rfs4_set_attrs(bitmap4 *resp, fattr4 *fattrp,
                     struct compound_state *cs, struct nfs4_svgetit_arg *sargp,
                     struct nfs4_ntov_table *ntovp, nfs4_attr_cmd_t cmd);
 
+static void     hanfsv4_failover(nfs4_srv_t *);
+
 fem_t           *deleg_rdops;
 fem_t           *deleg_wrops;
 
-rfs4_servinst_t *rfs4_cur_servinst = NULL;      /* current server instance */
-kmutex_t        rfs4_servinst_lock;     /* protects linked list */
-int             rfs4_seen_first_compound;       /* set first time we see one */
-
 /*
  * NFS4 op dispatch table
  */
 
 struct rfsv4disp {

@@ -464,11 +465,11 @@
         "rfs4_op_release_lockowner",
         "rfs4_op_illegal"
 };
 #endif
 
-void    rfs4_ss_chkclid(rfs4_client_t *);
+void    rfs4_ss_chkclid(nfs4_srv_t *, rfs4_client_t *);
 
 extern size_t   strlcpy(char *dst, const char *src, size_t dstsize);
 
 extern void     rfs4_free_fs_locations4(fs_locations4 *);
 

@@ -497,18 +498,27 @@
         VOPNAME_SETSECATTR,     { .femop_setsecattr = deleg_wr_setsecattr },
         VOPNAME_VNEVENT,        { .femop_vnevent = deleg_wr_vnevent },
         NULL,                   NULL
 };
 
-int
-rfs4_srvrinit(void)
+nfs4_srv_t *
+nfs4_get_srv(void)
 {
+        nfs_globals_t *ng = nfs_srv_getzg();
+        nfs4_srv_t *srv = ng->nfs4_srv;
+        ASSERT(srv != NULL);
+        return (srv);
+}
+
+void
+rfs4_srv_zone_init(nfs_globals_t *ng)
+{
+        nfs4_srv_t *nsrv4;
         timespec32_t verf;
-        int error;
-        extern void rfs4_attr_init();
-        extern krwlock_t rfs4_deleg_policy_lock;
 
+        nsrv4 = kmem_zalloc(sizeof (*nsrv4), KM_SLEEP);
+
         /*
          * The following algorithm attempts to find a unique verifier
          * to be used as the write verifier returned from the server
          * to the client.  It is important that this verifier change
          * whenever the server reboots.  Of secondary importance, it

@@ -533,65 +543,117 @@
 
                 gethrestime(&tverf);
                 verf.tv_sec = (time_t)tverf.tv_sec;
                 verf.tv_nsec = tverf.tv_nsec;
         }
+        nsrv4->write4verf = *(uint64_t *)&verf;
 
-        Write4verf = *(uint64_t *)&verf;
+        /* Used to manage create/destroy of server state */
+        nsrv4->nfs4_server_state = NULL;
+        nsrv4->nfs4_cur_servinst = NULL;
+        nsrv4->nfs4_deleg_policy = SRV_NEVER_DELEGATE;
+        mutex_init(&nsrv4->deleg_lock, NULL, MUTEX_DEFAULT, NULL);
+        mutex_init(&nsrv4->state_lock, NULL, MUTEX_DEFAULT, NULL);
+        mutex_init(&nsrv4->servinst_lock, NULL, MUTEX_DEFAULT, NULL);
+        rw_init(&nsrv4->deleg_policy_lock, NULL, RW_DEFAULT, NULL);
 
-        rfs4_attr_init();
-        mutex_init(&rfs4_deleg_lock, NULL, MUTEX_DEFAULT, NULL);
+        ng->nfs4_srv = nsrv4;
+}
 
-        /* Used to manage create/destroy of server state */
-        mutex_init(&rfs4_state_lock, NULL, MUTEX_DEFAULT, NULL);
+void
+rfs4_srv_zone_fini(nfs_globals_t *ng)
+{
+        nfs4_srv_t *nsrv4 = ng->nfs4_srv;
 
-        /* Used to manage access to server instance linked list */
-        mutex_init(&rfs4_servinst_lock, NULL, MUTEX_DEFAULT, NULL);
+        ng->nfs4_srv = NULL;
 
-        /* Used to manage access to rfs4_deleg_policy */
-        rw_init(&rfs4_deleg_policy_lock, NULL, RW_DEFAULT, NULL);
+        mutex_destroy(&nsrv4->deleg_lock);
+        mutex_destroy(&nsrv4->state_lock);
+        mutex_destroy(&nsrv4->servinst_lock);
+        rw_destroy(&nsrv4->deleg_policy_lock);
 
-        error = fem_create("deleg_rdops", nfs4_rd_deleg_tmpl, &deleg_rdops);
-        if (error != 0) {
+        kmem_free(nsrv4, sizeof (*nsrv4));
+}
+
+void
+rfs4_srvrinit(void)
+{
+        extern void rfs4_attr_init();
+
+        rfs4_attr_init();
+
+        if (fem_create("deleg_rdops", nfs4_rd_deleg_tmpl, &deleg_rdops) != 0) {
                 rfs4_disable_delegation();
-        } else {
-                error = fem_create("deleg_wrops", nfs4_wr_deleg_tmpl,
-                    &deleg_wrops);
-                if (error != 0) {
+        } else if (fem_create("deleg_wrops", nfs4_wr_deleg_tmpl,
+            &deleg_wrops) != 0) {
                         rfs4_disable_delegation();
                         fem_free(deleg_rdops);
                 }
-        }
 
         nfs4_srv_caller_id = fs_new_caller_id();
-
         lockt_sysid = lm_alloc_sysidt();
-
         vsd_create(&nfs4_srv_vkey, NULL);
-
-        return (0);
+        rfs4_state_g_init();
 }
 
 void
 rfs4_srvrfini(void)
 {
-        extern krwlock_t rfs4_deleg_policy_lock;
-
         if (lockt_sysid != LM_NOSYSID) {
                 lm_free_sysidt(lockt_sysid);
                 lockt_sysid = LM_NOSYSID;
         }
 
-        mutex_destroy(&rfs4_deleg_lock);
-        mutex_destroy(&rfs4_state_lock);
-        rw_destroy(&rfs4_deleg_policy_lock);
+        rfs4_state_g_fini();
 
         fem_free(deleg_rdops);
         fem_free(deleg_wrops);
 }
 
 void
+rfs4_do_server_start(int server_upordown,
+    int srv_delegation, int cluster_booted)
+{
+        nfs4_srv_t *nsrv4 = nfs4_get_srv();
+
+        /* Is this a warm start? */
+        if (server_upordown == NFS_SERVER_QUIESCED) {
+                cmn_err(CE_NOTE, "nfs4_srv: "
+                    "server was previously quiesced; "
+                    "existing NFSv4 state will be re-used");
+
+                /*
+                 * HA-NFSv4: this is also the signal
+                 * that a Resource Group failover has
+                 * occurred.
+                 */
+                if (cluster_booted)
+                        hanfsv4_failover(nsrv4);
+        } else {
+                /* Cold start */
+                nsrv4->rfs4_start_time = 0;
+                rfs4_state_zone_init(nsrv4);
+                nsrv4->nfs4_drc = rfs4_init_drc(nfs4_drc_max,
+                    nfs4_drc_hash);
+
+                /*
+                 * If the nfsd service was started with the -s option,
+                 * we need to pull in any state from the paths indicated.
+                 */
+                if (curzone == global_zone && rfs4_dss_numnewpaths > 0) {
+                        /* read in the stable storage state from these paths */
+                        rfs4_dss_readstate(nsrv4, rfs4_dss_numnewpaths,
+                            rfs4_dss_newpaths);
+                }
+        }
+
+        /* Check if delegation is to be enabled */
+        if (srv_delegation != FALSE)
+                rfs4_set_deleg_policy(nsrv4, SRV_NORMAL_DELEGATE);
+}
+
+void
 rfs4_init_compound_state(struct compound_state *cs)
 {
         bzero(cs, sizeof (*cs));
         cs->cont = TRUE;
         cs->access = CS_ACCESS_DENIED;

@@ -650,38 +712,39 @@
 
 /*
  * reset all currently active grace periods
  */
 void
-rfs4_grace_reset_all(void)
+rfs4_grace_reset_all(nfs4_srv_t *nsrv4)
 {
         rfs4_servinst_t *sip;
 
-        mutex_enter(&rfs4_servinst_lock);
-        for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev)
+        mutex_enter(&nsrv4->servinst_lock);
+        for (sip = nsrv4->nfs4_cur_servinst; sip != NULL; sip = sip->prev)
                 if (rfs4_servinst_in_grace(sip))
                         rfs4_grace_start(sip);
-        mutex_exit(&rfs4_servinst_lock);
+        mutex_exit(&nsrv4->servinst_lock);
 }
 
 /*
  * start any new instances' grace periods
  */
 void
-rfs4_grace_start_new(void)
+rfs4_grace_start_new(nfs4_srv_t *nsrv4)
 {
         rfs4_servinst_t *sip;
 
-        mutex_enter(&rfs4_servinst_lock);
-        for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev)
+        mutex_enter(&nsrv4->servinst_lock);
+        for (sip = nsrv4->nfs4_cur_servinst; sip != NULL; sip = sip->prev)
                 if (rfs4_servinst_grace_new(sip))
                         rfs4_grace_start(sip);
-        mutex_exit(&rfs4_servinst_lock);
+        mutex_exit(&nsrv4->servinst_lock);
 }
 
 static rfs4_dss_path_t *
-rfs4_dss_newpath(rfs4_servinst_t *sip, char *path, unsigned index)
+rfs4_dss_newpath(nfs4_srv_t *nsrv4, rfs4_servinst_t *sip,
+    char *path, unsigned index)
 {
         size_t len;
         rfs4_dss_path_t *dss_path;
 
         dss_path = kmem_alloc(sizeof (rfs4_dss_path_t), KM_SLEEP);

@@ -701,19 +764,19 @@
 
         /*
          * Add to list of served paths.
          * No locking required, as we're only ever called at startup.
          */
-        if (rfs4_dss_pathlist == NULL) {
+        if (nsrv4->dss_pathlist == NULL) {
                 /* this is the first dss_path_t */
 
                 /* needed for insque/remque */
                 dss_path->next = dss_path->prev = dss_path;
 
-                rfs4_dss_pathlist = dss_path;
+                nsrv4->dss_pathlist = dss_path;
         } else {
-                insque(dss_path, rfs4_dss_pathlist);
+                insque(dss_path, nsrv4->dss_pathlist);
         }
 
         return (dss_path);
 }
 

@@ -721,11 +784,12 @@
  * Create a new server instance, and make it the currently active instance.
  * Note that starting the grace period too early will reduce the clients'
  * recovery window.
  */
 void
-rfs4_servinst_create(int start_grace, int dss_npaths, char **dss_paths)
+rfs4_servinst_create(nfs4_srv_t *nsrv4, int start_grace,
+    int dss_npaths, char **dss_paths)
 {
         unsigned i;
         rfs4_servinst_t *sip;
         rfs4_oldstate_t *oldstate;
 

@@ -752,75 +816,93 @@
         sip->dss_npaths = dss_npaths;
         sip->dss_paths = kmem_alloc(dss_npaths *
             sizeof (rfs4_dss_path_t *), KM_SLEEP);
 
         for (i = 0; i < dss_npaths; i++) {
-                sip->dss_paths[i] = rfs4_dss_newpath(sip, dss_paths[i], i);
+                sip->dss_paths[i] =
+                    rfs4_dss_newpath(nsrv4, sip, dss_paths[i], i);
         }
 
-        mutex_enter(&rfs4_servinst_lock);
-        if (rfs4_cur_servinst != NULL) {
+        mutex_enter(&nsrv4->servinst_lock);
+        if (nsrv4->nfs4_cur_servinst != NULL) {
                 /* add to linked list */
-                sip->prev = rfs4_cur_servinst;
-                rfs4_cur_servinst->next = sip;
+                sip->prev = nsrv4->nfs4_cur_servinst;
+                nsrv4->nfs4_cur_servinst->next = sip;
         }
         if (start_grace)
                 rfs4_grace_start(sip);
         /* make the new instance "current" */
-        rfs4_cur_servinst = sip;
+        nsrv4->nfs4_cur_servinst = sip;
 
-        mutex_exit(&rfs4_servinst_lock);
+        mutex_exit(&nsrv4->servinst_lock);
 }
 
 /*
  * In future, we might add a rfs4_servinst_destroy(sip) but, for now, destroy
  * all instances directly.
  */
 void
-rfs4_servinst_destroy_all(void)
+rfs4_servinst_destroy_all(nfs4_srv_t *nsrv4)
 {
         rfs4_servinst_t *sip, *prev, *current;
 #ifdef DEBUG
         int n = 0;
 #endif
 
-        mutex_enter(&rfs4_servinst_lock);
-        ASSERT(rfs4_cur_servinst != NULL);
-        current = rfs4_cur_servinst;
-        rfs4_cur_servinst = NULL;
+        mutex_enter(&nsrv4->servinst_lock);
+        ASSERT(nsrv4->nfs4_cur_servinst != NULL);
+        current = nsrv4->nfs4_cur_servinst;
+        nsrv4->nfs4_cur_servinst = NULL;
         for (sip = current; sip != NULL; sip = prev) {
                 prev = sip->prev;
                 rw_destroy(&sip->rwlock);
                 if (sip->oldstate)
                         kmem_free(sip->oldstate, sizeof (rfs4_oldstate_t));
-                if (sip->dss_paths)
+                if (sip->dss_paths) {
+                        int i = sip->dss_npaths;
+
+                        while (i > 0) {
+                                i--;
+                                if (sip->dss_paths[i] != NULL) {
+                                        char *path = sip->dss_paths[i]->path;
+
+                                        if (path != NULL) {
+                                                kmem_free(path,
+                                                    strlen(path) + 1);
+                                        }
+                                        kmem_free(sip->dss_paths[i],
+                                            sizeof (rfs4_dss_path_t));
+                                }
+                        }
                         kmem_free(sip->dss_paths,
                             sip->dss_npaths * sizeof (rfs4_dss_path_t *));
+                }
                 kmem_free(sip, sizeof (rfs4_servinst_t));
 #ifdef DEBUG
                 n++;
 #endif
         }
-        mutex_exit(&rfs4_servinst_lock);
+        mutex_exit(&nsrv4->servinst_lock);
 }
 
 /*
  * Assign the current server instance to a client_t.
  * Should be called with cp->rc_dbe held.
  */
 void
-rfs4_servinst_assign(rfs4_client_t *cp, rfs4_servinst_t *sip)
+rfs4_servinst_assign(nfs4_srv_t *nsrv4, rfs4_client_t *cp,
+    rfs4_servinst_t *sip)
 {
         ASSERT(rfs4_dbe_refcnt(cp->rc_dbe) > 0);
 
         /*
          * The lock ensures that if the current instance is in the process
          * of changing, we will see the new one.
          */
-        mutex_enter(&rfs4_servinst_lock);
+        mutex_enter(&nsrv4->servinst_lock);
         cp->rc_server_instance = sip;
-        mutex_exit(&rfs4_servinst_lock);
+        mutex_exit(&nsrv4->servinst_lock);
 }
 
 rfs4_servinst_t *
 rfs4_servinst(rfs4_client_t *cp)
 {

@@ -869,46 +951,51 @@
 static nfsstat4
 do_rfs4_op_secinfo(struct compound_state *cs, char *nm, SECINFO4res *resp)
 {
         int error, different_export = 0;
         vnode_t *dvp, *vp;
-        struct exportinfo *exi = NULL;
+        struct exportinfo *exi;
         fid_t fid;
         uint_t count, i;
         secinfo4 *resok_val;
         struct secinfo *secp;
         seconfig_t *si;
         bool_t did_traverse = FALSE;
         int dotdot, walk;
+        nfs_export_t *ne = nfs_get_export();
 
         dvp = cs->vp;
+        exi = cs->exi;
+        ASSERT(exi != NULL);
         dotdot = (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0');
 
         /*
          * If dotdotting, then need to check whether it's above the
          * root of a filesystem, or above an export point.
          */
         if (dotdot) {
+                vnode_t *zone_rootvp = ne->exi_root->exi_vp;
 
+                ASSERT3U(exi->exi_zoneid, ==, ne->exi_root->exi_zoneid);
                 /*
                  * If dotdotting at the root of a filesystem, then
                  * need to traverse back to the mounted-on filesystem
                  * and do the dotdot lookup there.
                  */
-                if (cs->vp->v_flag & VROOT) {
+                if ((dvp->v_flag & VROOT) || VN_CMP(dvp, zone_rootvp)) {
 
                         /*
                          * If at the system root, then can
                          * go up no further.
                          */
-                        if (VN_CMP(dvp, rootdir))
+                        if (VN_CMP(dvp, zone_rootvp))
                                 return (puterrno4(ENOENT));
 
                         /*
                          * Traverse back to the mounted-on filesystem
                          */
-                        dvp = untraverse(cs->vp);
+                        dvp = untraverse(dvp, zone_rootvp);
 
                         /*
                          * Set the different_export flag so we remember
                          * to pick up a new exportinfo entry for
                          * this new filesystem.

@@ -918,11 +1005,11 @@
 
                         /*
                          * If dotdotting above an export point then set
                          * the different_export to get new export info.
                          */
-                        different_export = nfs_exported(cs->exi, cs->vp);
+                        different_export = nfs_exported(exi, dvp);
                 }
         }
 
         /*
          * Get the vnode for the component "nm".

@@ -937,13 +1024,13 @@
          * used in the request is valid but not an explicitly shared flavor,
          * or the access bit indicates that this is a limited access,
          * check whether this vnode is visible.
          */
         if (!different_export &&
-            (PSEUDO(cs->exi) || ! is_exported_sec(cs->nfsflavor, cs->exi) ||
+            (PSEUDO(exi) || !is_exported_sec(cs->nfsflavor, exi) ||
             cs->access & CS_ACCESS_LIMITED)) {
-                if (! nfs_visible(cs->exi, vp, &different_export)) {
+                if (! nfs_visible(exi, vp, &different_export)) {
                         VN_RELE(vp);
                         return (puterrno4(ENOENT));
                 }
         }
 

@@ -981,10 +1068,11 @@
                 if (error) {
                         VN_RELE(vp);
                         return (puterrno4(error));
                 }
 
+                /* We'll need to reassign "exi". */
                 if (dotdot)
                         exi = nfs_vptoexi(NULL, vp, cs->cr, &walk, NULL, TRUE);
                 else
                         exi = checkexport4(&vp->v_vfsp->vfs_fsid, &fid, vp);
 

@@ -1001,12 +1089,10 @@
                         } else {
                                 VN_RELE(vp);
                                 return (puterrno4(EACCES));
                         }
                 }
-        } else {
-                exi = cs->exi;
         }
         ASSERT(exi != NULL);
 
 
         /*

@@ -1015,11 +1101,11 @@
          *
          * Return all flavors for a pseudo node.
          * For a real export node, return the flavor that the client
          * has access with.
          */
-        ASSERT(RW_LOCK_HELD(&exported_lock));
+        ASSERT(RW_LOCK_HELD(&ne->exported_lock));
         if (PSEUDO(exi)) {
                 count = exi->exi_export.ex_seccnt; /* total sec count */
                 resok_val = kmem_alloc(count * sizeof (secinfo4), KM_SLEEP);
                 secp = exi->exi_export.ex_secinfo;
 

@@ -1378,10 +1464,11 @@
         COMMIT4res *resp = &resop->nfs_resop4_u.opcommit;
         int error;
         vnode_t *vp = cs->vp;
         cred_t *cr = cs->cr;
         vattr_t va;
+        nfs4_srv_t *nsrv4;
 
         DTRACE_NFSV4_2(op__commit__start, struct compound_state *, cs,
             COMMIT4args *, args);
 
         if (vp == NULL) {

@@ -1434,12 +1521,13 @@
         if (error) {
                 *cs->statusp = resp->status = puterrno4(error);
                 goto out;
         }
 
+        nsrv4 = nfs4_get_srv();
         *cs->statusp = resp->status = NFS4_OK;
-        resp->writeverf = Write4verf;
+        resp->writeverf = nsrv4->write4verf;
 out:
         DTRACE_NFSV4_2(op__commit__done, struct compound_state *, cs,
             COMMIT4res *, resp);
 }
 

@@ -2631,29 +2719,32 @@
          * If dotdotting, then need to check whether it's
          * above the root of a filesystem, or above an
          * export point.
          */
         if (dotdot) {
+                vnode_t *zone_rootvp;
 
+                ASSERT(cs->exi != NULL);
+                zone_rootvp = cs->exi->exi_ne->exi_root->exi_vp;
                 /*
                  * If dotdotting at the root of a filesystem, then
                  * need to traverse back to the mounted-on filesystem
                  * and do the dotdot lookup there.
                  */
-                if (cs->vp->v_flag & VROOT) {
+                if ((cs->vp->v_flag & VROOT) || VN_CMP(cs->vp, zone_rootvp)) {
 
                         /*
                          * If at the system root, then can
                          * go up no further.
                          */
-                        if (VN_CMP(cs->vp, rootdir))
+                        if (VN_CMP(cs->vp, zone_rootvp))
                                 return (puterrno4(ENOENT));
 
                         /*
                          * Traverse back to the mounted-on filesystem
                          */
-                        cs->vp = untraverse(cs->vp);
+                        cs->vp = untraverse(cs->vp, zone_rootvp);
 
                         /*
                          * Set the different_export flag so we remember
                          * to pick up a new exportinfo entry for
                          * this new filesystem.

@@ -3407,10 +3498,11 @@
         PUTPUBFH4res    *resp = &resop->nfs_resop4_u.opputpubfh;
         int             error;
         vnode_t         *vp;
         struct exportinfo *exi, *sav_exi;
         nfs_fh4_fmt_t   *fh_fmtp;
+        nfs_export_t *ne = nfs_get_export();
 
         DTRACE_NFSV4_1(op__putpubfh__start, struct compound_state *, cs);
 
         if (cs->vp) {
                 VN_RELE(cs->vp);

@@ -3420,23 +3512,23 @@
         if (cs->cr)
                 crfree(cs->cr);
 
         cs->cr = crdup(cs->basecr);
 
-        vp = exi_public->exi_vp;
+        vp = ne->exi_public->exi_vp;
         if (vp == NULL) {
                 *cs->statusp = resp->status = NFS4ERR_SERVERFAULT;
                 goto out;
         }
 
-        error = makefh4(&cs->fh, vp, exi_public);
+        error = makefh4(&cs->fh, vp, ne->exi_public);
         if (error != 0) {
                 *cs->statusp = resp->status = puterrno4(error);
                 goto out;
         }
         sav_exi = cs->exi;
-        if (exi_public == exi_root) {
+        if (ne->exi_public == ne->exi_root) {
                 /*
                  * No filesystem is actually shared public, so we default
                  * to exi_root. In this case, we must check whether root
                  * is exported.
                  */

@@ -3447,16 +3539,16 @@
                  * should use is what checkexport4 returns, because root_exi is
                  * actually a mostly empty struct.
                  */
                 exi = checkexport4(&fh_fmtp->fh4_fsid,
                     (fid_t *)&fh_fmtp->fh4_xlen, NULL);
-                cs->exi = ((exi != NULL) ? exi : exi_public);
+                cs->exi = ((exi != NULL) ? exi : ne->exi_public);
         } else {
                 /*
                  * it's a properly shared filesystem
                  */
-                cs->exi = exi_public;
+                cs->exi = ne->exi_public;
         }
 
         if (is_system_labeled()) {
                 bslabel_t *clabel;
 

@@ -3594,11 +3686,11 @@
          * Using rootdir, the system root vnode,
          * get its fid.
          */
         bzero(&fid, sizeof (fid));
         fid.fid_len = MAXFIDSZ;
-        error = vop_fid_pseudo(rootdir, &fid);
+        error = vop_fid_pseudo(ZONE_ROOTVP(), &fid);
         if (error != 0) {
                 *cs->statusp = resp->status = puterrno4(error);
                 goto out;
         }
 

@@ -3608,11 +3700,11 @@
          * If the server root isn't exported directly, then
          * it should at least be a pseudo export based on
          * one or more exports further down in the server's
          * file tree.
          */
-        exi = checkexport4(&rootdir->v_vfsp->vfs_fsid, &fid, NULL);
+        exi = checkexport4(&ZONE_ROOTVP()->v_vfsp->vfs_fsid, &fid, NULL);
         if (exi == NULL || exi->exi_export.ex_flags & EX_PUBLIC) {
                 NFS4_DEBUG(rfs4_debug,
                     (CE_WARN, "rfs4_op_putrootfh: export check failure"));
                 *cs->statusp = resp->status = NFS4ERR_SERVERFAULT;
                 goto out;

@@ -3620,24 +3712,24 @@
 
         /*
          * Now make a filehandle based on the root
          * export and root vnode.
          */
-        error = makefh4(&cs->fh, rootdir, exi);
+        error = makefh4(&cs->fh, ZONE_ROOTVP(), exi);
         if (error != 0) {
                 *cs->statusp = resp->status = puterrno4(error);
                 goto out;
         }
 
         sav_exi = cs->exi;
         cs->exi = exi;
 
-        VN_HOLD(rootdir);
-        cs->vp = rootdir;
+        VN_HOLD(ZONE_ROOTVP());
+        cs->vp = ZONE_ROOTVP();
 
         if ((resp->status = call_checkauth4(cs, req)) != NFS4_OK) {
-                VN_RELE(rootdir);
+                VN_RELE(cs->vp);
                 cs->vp = NULL;
                 cs->exi = sav_exi;
                 goto out;
         }
 

@@ -3719,14 +3811,16 @@
         data = kmem_alloc(MAXPATHLEN + 1, KM_SLEEP);
 
         if (is_referral) {
                 char *s;
                 size_t strsz;
+                kstat_named_t *stat =
+                    cs->exi->exi_ne->ne_globals->svstat[NFS_V4];
 
                 /* Get an artificial symlink based on a referral */
                 s = build_symlink(vp, cs->cr, &strsz);
-                global_svstat_ptr[4][NFS_REFERLINKS].value.ui64++;
+                stat[NFS_REFERLINKS].value.ui64++;
                 DTRACE_PROBE2(nfs4serv__func__referral__reflink,
                     vnode_t *, vp, char *, s);
                 if (s == NULL)
                         error = EINVAL;
                 else {

@@ -4169,11 +4263,11 @@
                          * not ENOTEMPTY, if the directory is not
                          * empty.  A System V NFS server needs to map
                          * NFS4ERR_EXIST to NFS4ERR_NOTEMPTY to
                          * transmit over the wire.
                          */
-                        if ((error = VOP_RMDIR(dvp, name, rootdir, cs->cr,
+                        if ((error = VOP_RMDIR(dvp, name, ZONE_ROOTVP(), cs->cr,
                             NULL, 0)) == EEXIST)
                                 error = ENOTEMPTY;
                 }
         } else {
                 if ((error = VOP_REMOVE(dvp, name, cs->cr, NULL, 0)) == 0 &&

@@ -4281,18 +4375,19 @@
         RENAME4args *args = &argop->nfs_argop4_u.oprename;
         RENAME4res *resp = &resop->nfs_resop4_u.oprename;
         int error;
         vnode_t *odvp;
         vnode_t *ndvp;
-        vnode_t *srcvp, *targvp;
+        vnode_t *srcvp, *targvp, *tvp;
         struct vattr obdva, oidva, oadva;
         struct vattr nbdva, nidva, nadva;
         char *onm, *nnm;
         uint_t olen, nlen;
         rfs4_file_t *fp, *sfp;
         int in_crit_src, in_crit_targ;
         int fp_rele_grant_hold, sfp_rele_grant_hold;
+        int unlinked;
         bslabel_t *clabel;
         struct sockaddr *ca;
         char *converted_onm = NULL;
         char *converted_nnm = NULL;
         nfsstat4 status;

@@ -4299,13 +4394,14 @@
 
         DTRACE_NFSV4_2(op__rename__start, struct compound_state *, cs,
             RENAME4args *, args);
 
         fp = sfp = NULL;
-        srcvp = targvp = NULL;
+        srcvp = targvp = tvp = NULL;
         in_crit_src = in_crit_targ = 0;
         fp_rele_grant_hold = sfp_rele_grant_hold = 0;
+        unlinked = 0;
 
         /* CURRENT_FH: target directory */
         ndvp = cs->vp;
         if (ndvp == NULL) {
                 *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE;

@@ -4474,11 +4570,10 @@
                         goto err_out;
                 }
         }
         fp_rele_grant_hold = 1;
 
-
         /* Check for NBMAND lock on both source and target */
         if (nbl_need_check(srcvp)) {
                 nbl_start_crit(srcvp, RW_READER);
                 in_crit_src = 1;
                 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {

@@ -4509,35 +4604,45 @@
         }
 
         NFS4_SET_FATTR4_CHANGE(resp->source_cinfo.before, obdva.va_ctime)
         NFS4_SET_FATTR4_CHANGE(resp->target_cinfo.before, nbdva.va_ctime)
 
-        if ((error = VOP_RENAME(odvp, converted_onm, ndvp, converted_nnm,
-            cs->cr, NULL, 0)) == 0 && fp != NULL) {
-                struct vattr va;
-                vnode_t *tvp;
+        error = VOP_RENAME(odvp, converted_onm, ndvp, converted_nnm, cs->cr,
+            NULL, 0);
 
+        /*
+         * If target existed and was unlinked by VOP_RENAME, state will need
+         * to be closed. To avoid deadlock, rfs4_close_all_state will be done
+         * after any necessary nbl_end_crit on srcvp and targvp.
+         */
+        if (error == 0 && fp != NULL) {
                 rfs4_dbe_lock(fp->rf_dbe);
                 tvp = fp->rf_vp;
                 if (tvp)
                         VN_HOLD(tvp);
                 rfs4_dbe_unlock(fp->rf_dbe);
 
                 if (tvp) {
+                        struct vattr va;
                         va.va_mask = AT_NLINK;
+
                         if (!VOP_GETATTR(tvp, &va, 0, cs->cr, NULL) &&
                             va.va_nlink == 0) {
-                                /* The file is gone and so should the state */
-                                if (in_crit_targ) {
-                                        nbl_end_crit(targvp);
-                                        in_crit_targ = 0;
+                                unlinked = 1;
+
+                                /* DEBUG data */
+                                if ((srcvp == targvp) || (tvp != targvp)) {
+                                        cmn_err(CE_WARN, "rfs4_op_rename: "
+                                            "srcvp %p, targvp: %p, tvp: %p",
+                                            (void *)srcvp, (void *)targvp,
+                                            (void *)tvp);
                                 }
-                                rfs4_close_all_state(fp);
-                        }
+                        } else {
                         VN_RELE(tvp);
                 }
         }
+        }
         if (error == 0)
                 vn_renamepath(ndvp, srcvp, nnm, nlen - 1);
 
         if (in_crit_src)
                 nbl_end_crit(srcvp);

@@ -4546,10 +4651,25 @@
         if (in_crit_targ)
                 nbl_end_crit(targvp);
         if (targvp)
                 VN_RELE(targvp);
 
+        if (unlinked) {
+                ASSERT(fp != NULL);
+                ASSERT(tvp != NULL);
+
+                /* DEBUG data */
+                if (RW_READ_HELD(&tvp->v_nbllock)) {
+                        cmn_err(CE_WARN, "rfs4_op_rename: "
+                            "RW_READ_HELD(%p)", (void *)tvp);
+                }
+
+                /* The file is gone and so should the state */
+                rfs4_close_all_state(fp);
+                VN_RELE(tvp);
+        }
+
         if (sfp) {
                 rfs4_clear_dont_grant(sfp);
                 rfs4_file_rele(sfp);
         }
         if (fp) {

@@ -5482,10 +5602,11 @@
         cred_t *savecred, *cr;
         bool_t *deleg = &cs->deleg;
         nfsstat4 stat;
         int in_crit = 0;
         caller_context_t ct;
+        nfs4_srv_t *nsrv4;
 
         DTRACE_NFSV4_2(op__write__start, struct compound_state *, cs,
             WRITE4args *, args);
 
         vp = cs->vp;

@@ -5552,15 +5673,16 @@
         if (MANDLOCK(vp, bva.va_mode)) {
                 *cs->statusp = resp->status = NFS4ERR_ACCESS;
                 goto out;
         }
 
+        nsrv4 = nfs4_get_srv();
         if (args->data_len == 0) {
                 *cs->statusp = resp->status = NFS4_OK;
                 resp->count = 0;
                 resp->committed = args->stable;
-                resp->writeverf = Write4verf;
+                resp->writeverf = nsrv4->write4verf;
                 goto out;
         }
 
         if (args->mblk != NULL) {
                 mblk_t *m;

@@ -5652,11 +5774,11 @@
         if (ioflag == 0)
                 resp->committed = UNSTABLE4;
         else
                 resp->committed = FILE_SYNC4;
 
-        resp->writeverf = Write4verf;
+        resp->writeverf = nsrv4->write4verf;
 
 out:
         if (in_crit)
                 nbl_end_crit(vp);
 

@@ -5672,22 +5794,28 @@
 rfs4_compound(COMPOUND4args *args, COMPOUND4res *resp, struct exportinfo *exi,
     struct svc_req *req, cred_t *cr, int *rv)
 {
         uint_t i;
         struct compound_state cs;
+        nfs4_srv_t *nsrv4;
+        nfs_export_t *ne = nfs_get_export();
 
         if (rv != NULL)
                 *rv = 0;
         rfs4_init_compound_state(&cs);
         /*
-         * Form a reply tag by copying over the reqeuest tag.
+         * Form a reply tag by copying over the request tag.
          */
+        resp->tag.utf8string_len = args->tag.utf8string_len;
+        if (args->tag.utf8string_len != 0) {
         resp->tag.utf8string_val =
             kmem_alloc(args->tag.utf8string_len, KM_SLEEP);
-        resp->tag.utf8string_len = args->tag.utf8string_len;
         bcopy(args->tag.utf8string_val, resp->tag.utf8string_val,
             resp->tag.utf8string_len);
+        } else {
+                resp->tag.utf8string_val = NULL;
+        }
 
         cs.statusp = &resp->status;
         cs.req = req;
         resp->array = NULL;
         resp->array_len = 0;

@@ -5729,10 +5857,11 @@
         resp->array_len = args->array_len;
         resp->array = kmem_zalloc(args->array_len * sizeof (nfs_resop4),
             KM_SLEEP);
 
         cs.basecr = cr;
+        nsrv4 = nfs4_get_srv();
 
         DTRACE_NFSV4_2(compound__start, struct compound_state *, &cs,
             COMPOUND4args *, args);
 
         /*

@@ -5743,30 +5872,31 @@
          * per proc (excluding public exinfo), and exi_count design
          * is sufficient to protect concurrent execution of NFS2/3
          * ops along with unexport.  This lock will be removed as
          * part of the NFSv4 phase 2 namespace redesign work.
          */
-        rw_enter(&exported_lock, RW_READER);
+        rw_enter(&ne->exported_lock, RW_READER);
 
         /*
          * If this is the first compound we've seen, we need to start all
          * new instances' grace periods.
          */
-        if (rfs4_seen_first_compound == 0) {
-                rfs4_grace_start_new();
+        if (nsrv4->seen_first_compound == 0) {
+                rfs4_grace_start_new(nsrv4);
                 /*
                  * This must be set after rfs4_grace_start_new(), otherwise
                  * another thread could proceed past here before the former
                  * is finished.
                  */
-                rfs4_seen_first_compound = 1;
+                nsrv4->seen_first_compound = 1;
         }
 
         for (i = 0; i < args->array_len && cs.cont; i++) {
                 nfs_argop4 *argop;
                 nfs_resop4 *resop;
                 uint_t op;
+                kstat_named_t *stat = ne->ne_globals->rfsproccnt[NFS_V4];
 
                 argop = &args->array[i];
                 resop = &resp->array[i];
                 resop->resop = argop->argop;
                 op = (uint_t)resop->resop;

@@ -5774,11 +5904,11 @@
                 if (op < rfsv4disp_cnt) {
                         /*
                          * Count the individual ops here; NULL and COMPOUND
                          * are counted in common_dispatch()
                          */
-                        rfsproccnt_v4_ptr[op].value.ui64++;
+                        stat[op].value.ui64++;
 
                         NFS4_DEBUG(rfs4_debug > 1,
                             (CE_NOTE, "Executing %s", rfs4_op_string[op]));
                         (*rfsv4disptab[op].dis_proc)(argop, resop, req, &cs);
                         NFS4_DEBUG(rfs4_debug > 1, (CE_NOTE, "%s returned %d",

@@ -5791,11 +5921,11 @@
                          * will have already returned BADXDR if op doesn't
                          * decode to legal value.  This only done for a
                          * day when XDR code doesn't verify v4 opcodes.
                          */
                         op = OP_ILLEGAL;
-                        rfsproccnt_v4_ptr[OP_ILLEGAL_IDX].value.ui64++;
+                        stat[OP_ILLEGAL_IDX].value.ui64++;
 
                         rfs4_op_illegal(argop, resop, req, &cs);
                         cs.cont = FALSE;
                 }
 

@@ -5814,19 +5944,26 @@
                         resp->array_len =  i + 1;
                         resp->array = new_res;
                 }
         }
 
-        rw_exit(&exported_lock);
+        rw_exit(&ne->exported_lock);
 
-        DTRACE_NFSV4_2(compound__done, struct compound_state *, &cs,
-            COMPOUND4res *, resp);
-
+        /*
+         * clear exportinfo and vnode fields from compound_state before dtrace
+         * probe, to avoid tracing residual values for path and share path.
+         */
         if (cs.vp)
                 VN_RELE(cs.vp);
         if (cs.saved_vp)
                 VN_RELE(cs.saved_vp);
+        cs.exi = cs.saved_exi = NULL;
+        cs.vp = cs.saved_vp = NULL;
+
+        DTRACE_NFSV4_2(compound__done, struct compound_state *, &cs,
+            COMPOUND4res *, resp);
+
         if (cs.saved_fh.nfs_fh4_val)
                 kmem_free(cs.saved_fh.nfs_fh4_val, NFS4_FHSIZE);
 
         if (cs.basecr)
                 crfree(cs.basecr);

@@ -6526,29 +6663,31 @@
                  */
 
                 if (trunc) {
                         int in_crit = 0;
                         rfs4_file_t *fp;
+                        nfs4_srv_t *nsrv4;
                         bool_t create = FALSE;
 
                         /*
                          * We are writing over an existing file.
                          * Check to see if we need to recall a delegation.
                          */
-                        rfs4_hold_deleg_policy();
+                        nsrv4 = nfs4_get_srv();
+                        rfs4_hold_deleg_policy(nsrv4);
                         if ((fp = rfs4_findfile(vp, NULL, &create)) != NULL) {
                                 if (rfs4_check_delegated_byfp(FWRITE, fp,
                                     (reqsize == 0), FALSE, FALSE, &clientid)) {
                                         rfs4_file_rele(fp);
-                                        rfs4_rele_deleg_policy();
+                                        rfs4_rele_deleg_policy(nsrv4);
                                         VN_RELE(vp);
                                         *attrset = 0;
                                         return (NFS4ERR_DELAY);
                                 }
                                 rfs4_file_rele(fp);
                         }
-                        rfs4_rele_deleg_policy();
+                        rfs4_rele_deleg_policy(nsrv4);
 
                         if (nbl_need_check(vp)) {
                                 in_crit = 1;
 
                                 ASSERT(reqsize == 0);

@@ -8102,15 +8241,17 @@
         SETCLIENTID_CONFIRM4args *args =
             &argop->nfs_argop4_u.opsetclientid_confirm;
         SETCLIENTID_CONFIRM4res *res =
             &resop->nfs_resop4_u.opsetclientid_confirm;
         rfs4_client_t *cp, *cptoclose = NULL;
+        nfs4_srv_t *nsrv4;
 
         DTRACE_NFSV4_2(op__setclientid__confirm__start,
             struct compound_state *, cs,
             SETCLIENTID_CONFIRM4args *, args);
 
+        nsrv4 = nfs4_get_srv();
         *cs->statusp = res->status = NFS4_OK;
 
         cp = rfs4_findclient_by_id(args->clientid, TRUE);
 
         if (cp == NULL) {

@@ -8142,18 +8283,18 @@
 
         /*
          * Update the client's associated server instance, if it's changed
          * since the client was created.
          */
-        if (rfs4_servinst(cp) != rfs4_cur_servinst)
-                rfs4_servinst_assign(cp, rfs4_cur_servinst);
+        if (rfs4_servinst(cp) != nsrv4->nfs4_cur_servinst)
+                rfs4_servinst_assign(nsrv4, cp, nsrv4->nfs4_cur_servinst);
 
         /*
          * Record clientid in stable storage.
          * Must be done after server instance has been assigned.
          */
-        rfs4_ss_clid(cp);
+        rfs4_ss_clid(nsrv4, cp);
 
         rfs4_dbe_unlock(cp->rc_dbe);
 
         if (cptoclose)
                 /* don't need to rele, client_close does it */

@@ -8164,11 +8305,11 @@
         rfs4_update_lease(cp);
 
         /*
          * Check to see if client can perform reclaims
          */
-        rfs4_ss_chkclid(cp);
+        rfs4_ss_chkclid(nsrv4, cp);
 
         rfs4_client_rele(cp);
 
 out:
         DTRACE_NFSV4_2(op__setclientid__confirm__done,

@@ -9808,6 +9949,170 @@
         if (ci == NULL)
                 return (0);
         is_downrev = ci->ri_no_referrals;
         rfs4_dbe_rele(ci->ri_dbe);
         return (is_downrev);
+}
+
+/*
+ * Do the main work of handling HA-NFSv4 Resource Group failover on
+ * Sun Cluster.
+ *
+ * We need to detect whether any RG admin paths have been added or removed,
+ * and adjust resources accordingly.  This is done in two passes over the
+ * "currently-serving" list (nsrv4->dss_pathlist) and the "passed-in"
+ * rfs4_dss_newpaths array from nfsd:
+ *
+ *   1. Removed paths: every served path (other than NFS4_DSS_VAR_DIR) that
+ *      is not in rfs4_dss_newpaths is detached from its server instance,
+ *      unlinked from the list and freed.
+ *   2. Added paths: every entry of rfs4_dss_newpaths that is not currently
+ *      served is collected; if there are any, a new server instance is
+ *      created for them with its grace period started, their stable
+ *      storage state is read in, and all active grace periods are reset.
+ *
+ * Both passes compare paths for exact equality.
+ *
+ * Currently we're using a very inefficient algorithm, ~ 2 * O(n**2). In
+ * order to scale, the list and array of paths need to be held in more
+ * suitable data structures.
+ */
+static void
+hanfsv4_failover(nfs4_srv_t *nsrv4)
+{
+        int i, start_grace, numadded_paths = 0;
+        char **added_paths = NULL;
+        rfs4_dss_path_t *dss_path;
+
+        /*
+         * Note: currently, dss_pathlist cannot be NULL, since
+         * it will always include an entry for NFS4_DSS_VAR_DIR. If we
+         * make the latter dynamically specified too, the following will
+         * need to be adjusted.
+         */
+
+        /*
+         * First, look for removed paths: RGs that have been failed-over
+         * away from this node.
+         * Walk the "currently-serving" dss_pathlist and, for each
+         * path, check if it is on the "passed-in" rfs4_dss_newpaths array
+         * from nfsd. If not, that RG path has been removed.
+         *
+         * Note that nfsd has sorted rfs4_dss_newpaths for us, and removed
+         * any duplicates.
+         *
+         * NOTE(review): the walk terminates by comparing against
+         * nsrv4->dss_pathlist, so it presumably relies on the list head
+         * never being removed here (e.g. the head being the
+         * NFS4_DSS_VAR_DIR entry) -- confirm against the list setup code.
+         */
+        dss_path = nsrv4->dss_pathlist;
+        do {
+                int found = 0;
+                char *path = dss_path->path;
+
+                /* used only for non-HA so may not be removed */
+                if (strcmp(path, NFS4_DSS_VAR_DIR) == 0) {
+                        dss_path = dss_path->next;
+                        continue;
+                }
+
+                for (i = 0; i < rfs4_dss_numnewpaths; i++) {
+                        int cmpret;
+                        char *newpath = rfs4_dss_newpaths[i];
+
+                        /*
+                         * Since nfsd has sorted rfs4_dss_newpaths for us,
+                         * once the return from strcmp is negative we know
+                         * we've passed the point where "path" should be,
+                         * and can stop searching: "path" has been removed.
+                         */
+                        cmpret = strcmp(path, newpath);
+                        if (cmpret < 0)
+                                break;
+                        if (cmpret == 0) {
+                                found = 1;
+                                break;
+                        }
+                }
+
+                if (found == 0) {
+                        unsigned index = dss_path->index;
+                        rfs4_servinst_t *sip = dss_path->sip;
+                        /* saved now: dss_path is freed below */
+                        rfs4_dss_path_t *path_next = dss_path->next;
+
+                        /*
+                         * This path has been removed.
+                         * We must clear out the servinst reference to
+                         * it, since it's now owned by another
+                         * node: we should not attempt to touch it.
+                         */
+                        ASSERT(dss_path == sip->dss_paths[index]);
+                        sip->dss_paths[index] = NULL;
+
+                        /* remove from "currently-serving" list, and destroy */
+                        remque(dss_path);
+                        /* allow for NUL */
+                        kmem_free(dss_path->path, strlen(dss_path->path) + 1);
+                        kmem_free(dss_path, sizeof (rfs4_dss_path_t));
+
+                        dss_path = path_next;
+                } else {
+                        /* path was found; not removed */
+                        dss_path = dss_path->next;
+                }
+        } while (dss_path != nsrv4->dss_pathlist);
+
+        /*
+         * Now, look for added paths: RGs that have been failed-over
+         * to this node.
+         * Walk the "passed-in" rfs4_dss_newpaths array from nfsd and,
+         * for each path, check if it is on the "currently-serving"
+         * dss_pathlist. If not, that RG path has been added.
+         *
+         * Note: we don't do duplicate detection here; nfsd does that for us.
+         *
+         * Note: numadded_paths <= rfs4_dss_numnewpaths, which gives us
+         * an upper bound for the size needed for added_paths[numadded_paths].
+         */
+
+        /* probably more space than we need, but guaranteed to be enough */
+        if (rfs4_dss_numnewpaths > 0) {
+                size_t sz = rfs4_dss_numnewpaths * sizeof (char *);
+                added_paths = kmem_zalloc(sz, KM_SLEEP);
+        }
+
+        /* walk the "passed-in" rfs4_dss_newpaths array from nfsd */
+        for (i = 0; i < rfs4_dss_numnewpaths; i++) {
+                int found = 0;
+                char *newpath = rfs4_dss_newpaths[i];
+
+                dss_path = nsrv4->dss_pathlist;
+                do {
+                        char *path = dss_path->path;
+
+                        /* used only for non-HA */
+                        if (strcmp(path, NFS4_DSS_VAR_DIR) == 0) {
+                                dss_path = dss_path->next;
+                                continue;
+                        }
+
+                        /*
+                         * Exact match, as in the removal pass above.  A
+                         * prefix comparison would wrongly treat a new path
+                         * such as "/rg10" as already served whenever "/rg1"
+                         * is on the list, and it would never be added.
+                         */
+                        if (strcmp(path, newpath) == 0) {
+                                found = 1;
+                                break;
+                        }
+
+                        dss_path = dss_path->next;
+                } while (dss_path != nsrv4->dss_pathlist);
+
+                if (found == 0) {
+                        /* borrowed pointer: the string stays owned by nfsd's array */
+                        added_paths[numadded_paths] = newpath;
+                        numadded_paths++;
+                }
+        }
+
+        /* did we find any added paths? */
+        if (numadded_paths > 0) {
+
+                /* create a new server instance, and start its grace period */
+                start_grace = 1;
+                rfs4_servinst_create(nsrv4, start_grace, numadded_paths,
+                    added_paths);
+
+                /* read in the stable storage state from these paths */
+                rfs4_dss_readstate(nsrv4, numadded_paths, added_paths);
+
+                /*
+                 * Multiple failovers during a grace period will cause
+                 * clients of the same resource group to be partitioned
+                 * into different server instances, with different
+                 * grace periods.  Since clients of the same resource
+                 * group must be subject to the same grace period,
+                 * we need to reset all currently active grace periods.
+                 */
+                rfs4_grace_reset_all(nsrv4);
+        }
+
+        /* free with the same size used for the allocation above */
+        if (rfs4_dss_numnewpaths > 0)
+                kmem_free(added_paths, rfs4_dss_numnewpaths * sizeof (char *));
 }