big-one Udiff usr/src/uts/common/fs/nfs/nfs

Print this page

NEX-17125 NFS: nbmand lock entered but not exited on error path
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Rick McNeal <rick.mcneal@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
NEX-15279 support NFS server in zone
NEX-15520 online NFS shares cause zoneadm halt to hang in nfs_export_zone_fini
Portions contributed by: Dan Kruchinin dan.kruchinin@nexenta.com
Portions contributed by: Stepan Zastupov stepan.zastupov@gmail.com
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
NEX-9275 Got "bad mutex" panic when run IO to nfs share from clients
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
NEX-3524 CLONE - Port NEX-3505 "wrong authentication" messages with root=@0.0.0.0/0 set, result in loss of client access
Reviewed by: Marcel Telka <marcel.telka@nexenta.com>
NEX-3533 CLONE - Port NEX-3019 NFSv3 writes underneath mounted filesystem to directory
Reviewed by: Dan Fields <dan.fields@nexenta.com>
NEX-3095 Issues related to NFS nohide
Reviewed by: Dan Fields <dan.fields@nexenta.com>
NEX-1128 NFS server: Generic uid and gid remapping for AUTH_SYS
Reviewed by: Jan Kryl <jan.kryl@nexenta.com>
OS-20 share_nfs(1m) charset handling is unreliable
OS-22 Page fault at nfscmd_dropped_entrysize+0x1e()
OS-23 NFSv2/3/4: READDIR responses are inconsistent when charset conversion fails
OS-24 rfs3_readdir(): Issues related to nfscmd_convdirent()
Reviewed by: Jan Kryl <jan.kryl@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
re #13613 rb4516 Tunables needs volatile keyword
closes #12112 rb3823 - nfs-nohide: lookup("..") for submount should be correct
re #3541 rb11254 - nfs nohide - "nfssrv: need ability to go to submounts for v3 and v2 protocols"

@@ -16,21 +16,25 @@
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2016 by Delphix. All rights reserved.
  */
 
 /*
  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  *      All rights reserved.
  */
 
+/*
+ * Copyright 2018 Nexenta Systems, Inc.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/cred.h>
 #include <sys/buf.h>

@@ -67,20 +71,38 @@
 #include <vm/seg_map.h>
 #include <vm/seg_kmem.h>
 
 #include <sys/strsubr.h>
 
+struct rfs_async_write_list;
+
 /*
+ * Zone globals of NFSv2 server
+ */
+typedef struct nfs_srv {
+        kmutex_t                        async_write_lock;       /* held while walking or editing the async_write_head list */
+        struct rfs_async_write_list     *async_write_head;      /* first pending write cluster; clusters chain via ->next */
+
+        /*
+         * enables write clustering if == 1; rfs_zone_init() sets it to 1
+         */
+        int             write_async;    /* when 0 the write path calls rfs_write_sync() instead */
+} nfs_srv_t;
+
+/*
  * These are the interface routines for the server side of the
  * Network File System.  See the NFS version 2 protocol specification
  * for a description of this interface.
  */
 
 static int      sattr_to_vattr(struct nfssattr *, struct vattr *);
 static void     acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
                         cred_t *);
+static void     *rfs_zone_init(zoneid_t zoneid);
+static void     rfs_zone_fini(zoneid_t zoneid, void *data);
 
+
 /*
  * Some "over the wire" UNIX file types.  These are encoded
  * into the mode.  This needs to be fixed in the next rev.
  */
 #define IFMT            0170000         /* type of file */

@@ -87,10 +109,11 @@
 #define IFCHR           0020000         /* character special */
 #define IFBLK           0060000         /* block special */
 #define IFSOCK          0140000         /* socket */
 
 u_longlong_t nfs2_srv_caller_id;       /* assigned from fs_new_caller_id() in rfs_srvrinit() */
+static zone_key_t rfs_zone_key;        /* created in rfs_srvrinit(); zone_getspecific() maps it to the zone's nfs_srv_t */
 
 /*
  * Get file attributes.
  * Returns the current attributes of the file with the given fhandle.
  */

@@ -327,11 +350,85 @@
 rfs_setattr_getfh(struct nfssaargs *args)
 {
         return (&args->saa_fh);
 }
 
+/* Cross into a "nohide" submount: *vpp and *exip are replaced (old ones released) only on that path; returns 0 or an error */
+int
+rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
+{
+        struct exportinfo *exi;
+        vnode_t *vp = *vpp;
+        fid_t fid;
+        int error;
+
+        VN_HOLD(vp);    /* extra hold on the local copy; released below unless vp is handed back via *vpp */
+
+        if ((error = traverse(&vp)) != 0) {
+                VN_RELE(vp);
+                return (error);
+        }
+
+        bzero(&fid, sizeof (fid));
+        fid.fid_len = MAXFIDSZ;
+        error = VOP_FID(vp, &fid, NULL);
+        if (error) {
+                VN_RELE(vp);
+                return (error);
+        }
+
+        exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);        /* export lookup keyed by fsid + fid of the traversed vnode */
+        if (exi == NULL ||
+            (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
+                /*
+                 * Not an error: the traversed vnode is either not exported
+                 * or its export lacks "nohide"; *vpp and *exip stay as is.
+                 */
+                if (exi != NULL)
+                        exi_rele(&exi);
+                VN_RELE(vp);
+        } else {
+                /* go to submount: drop the old export and vnode, hand back the new pair */
+                exi_rele(exip);
+                *exip = exi;
+
+                VN_RELE(*vpp);
+                *vpp = vp;
+        }
+
+        return (0);
+}
+
 /*
+ * Given a mounted root "dvp" and its "exi", go up to the upper mount point
+ * via untraverse(); on success the old vnode/export are released and replaced.
+ * Returns 0 on success, -1 (with *dvpp and *exip unchanged) if no export found.
+ */
+int
+rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
+{
+        struct exportinfo *exi;
+        vnode_t *dvp = *dvpp;
+
+        ASSERT(dvp->v_flag & VROOT);    /* the caller is expected to pass a VROOT vnode */
+
+        VN_HOLD(dvp);   /* extra hold; the caller's reference on *dvpp is dropped only on success */
+        dvp = untraverse(dvp);
+        exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);    /* export for the vnode untraverse() returned */
+        if (exi == NULL) {
+                VN_RELE(dvp);
+                return (-1);
+        }
+
+        exi_rele(exip);
+        *exip = exi;
+        VN_RELE(*dvpp);
+        *dvpp = dvp;
+
+        return (0);
+}
+/*
  * Directory lookup.
  * Returns an fhandle and file attributes for file name in a directory.
  */
 /* ARGSUSED */
 void

@@ -369,39 +466,51 @@
         /*
          * Allow lookups from the root - the default
          * location of the public filehandle.
          */
         if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
-                dvp = rootdir;
+                dvp = ZONE_ROOTVP();
                 VN_HOLD(dvp);
         } else {
                 dvp = nfs_fhtovp(fhp, exi);
                 if (dvp == NULL) {
                         dr->dr_status = NFSERR_STALE;
                         return;
                 }
         }
 
+        exi_hold(exi);
+
         /*
          * Not allow lookup beyond root.
          * If the filehandle matches a filehandle of the exi,
          * then the ".." refers beyond the root of an exported filesystem.
          */
         if (strcmp(da->da_name, "..") == 0 &&
             EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
-                VN_RELE(dvp);
-                dr->dr_status = NFSERR_NOENT;
-                return;
+                if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
+                    (dvp->v_flag & VROOT)) {
+                        /*
+                         * special case for ".." and 'nohide'exported root
+                         */
+                        if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
+                                error = NFSERR_ACCES;
+                                goto out;
         }
+                } else  {
+                        error = NFSERR_NOENT;
+                        goto out;
+                }
+        }
 
         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
         name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
             MAXPATHLEN);
 
         if (name == NULL) {
-                dr->dr_status = NFSERR_ACCES;
-                return;
+                error = NFSERR_ACCES;
+                goto out;
         }
 
         /*
          * If the public filehandle is used then allow
          * a multi-component lookup, i.e. evaluate

@@ -411,10 +520,13 @@
          * This may result in a vnode in another filesystem
          * which is OK as long as the filesystem is exported.
          */
         if (PUBLIC_FH2(fhp)) {
                 publicfh_flag = TRUE;
+
+                exi_rele(&exi);
+
                 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
                     &sec);
         } else {
                 /*
                  * Do a normal single component lookup.

@@ -424,10 +536,15 @@
         }
 
         if (name != da->da_name)
                 kmem_free(name, MAXPATHLEN);
 
+        if (error == 0 && vn_ismntpt(vp)) {
+                error = rfs_cross_mnt(&vp, &exi);
+                if (error)
+                        VN_RELE(vp);
+        }
 
         if (!error) {
                 va.va_mask = AT_ALL;    /* we want everything */
 
                 error = rfs4_delegated_getattr(vp, &va, 0, cr);

@@ -450,20 +567,15 @@
                         }
                 }
                 VN_RELE(vp);
         }
 
+out:
         VN_RELE(dvp);
 
-        /*
-         * If publicfh_flag is true then we have called rfs_publicfh_mclookup
-         * and have obtained a new exportinfo in exi which needs to be
-         * released. Note the the original exportinfo pointed to by exi
-         * will be released by the caller, comon_dispatch.
-         */
-        if (publicfh_flag && exi != NULL)
-                exi_rele(exi);
+        if (exi != NULL)
+                exi_rele(&exi);
 
         /*
          * If it's public fh, no 0x81, and client's flavor is
          * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
          * Then set RPC status to AUTH_TOOWEAK in common_dispatch.

@@ -683,10 +795,12 @@
 
         error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
 
         /* check if a monitor detected a delegation conflict */
         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
+                if (in_crit)
+                        nbl_end_crit(vp);
                 VN_RELE(vp);
                 /* mark as wouldblock so response is dropped */
                 curthread->t_flag |= T_WOULDBLOCK;
 
                 rr->rr_data = NULL;

@@ -1008,14 +1122,11 @@
 
         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
 
         /* check if a monitor detected a delegation conflict */
         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
-                VN_RELE(vp);
-                /* mark as wouldblock so response is dropped */
-                curthread->t_flag |= T_WOULDBLOCK;
-                return;
+                goto out;
         }
 
         if (wa->wa_data || wa->wa_rlist) {
                 /* Do the RDMA thing if necessary */
                 if (wa->wa_rlist) {

@@ -1051,10 +1162,11 @@
                 savecred = curthread->t_cred;
                 curthread->t_cred = cr;
                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
                 curthread->t_cred = savecred;
         } else {
+
                 iovcnt = 0;
                 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
                         iovcnt++;
                 if (iovcnt <= MAX_IOVECS) {
 #ifdef DEBUG

@@ -1149,11 +1261,11 @@
         struct rfs_async_write_list *next;
 };
 
 static struct rfs_async_write_list *rfs_async_write_head = NULL;
 static kmutex_t rfs_async_write_lock;
-static int rfs_write_async = 1; /* enables write clustering if == 1 */
+volatile int rfs_write_async = 1;       /* enables write clustering if == 1 */
 
 #define MAXCLIOVECS     42
 #define RFSWRITE_INITVAL (enum nfsstat) -1
 
 #ifdef DEBUG

@@ -1194,12 +1306,14 @@
         struct rfs_async_write_list nlpsp;
         ushort_t t_flag;
         cred_t *savecred;
         int in_crit = 0;
         caller_context_t ct;
+        nfs_srv_t *nsrv;
 
-        if (!rfs_write_async) {
+        nsrv = zone_getspecific(rfs_zone_key, curzone);
+        if (!nsrv->write_async) {
                 rfs_write_sync(wa, ns, exi, req, cr, ro);
                 return;
         }
 
         /*

@@ -1220,12 +1334,12 @@
 
         /*
          * Look to see if there is already a cluster started
          * for this file.
          */
-        mutex_enter(&rfs_async_write_lock);
-        for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
+        mutex_enter(&nsrv->async_write_lock);
+        for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
                 if (bcmp(&wa->wa_fhandle, lp->fhp,
                     sizeof (fhandle_t)) == 0)
                         break;
         }

@@ -1247,12 +1361,12 @@
                 if (trp == NULL)
                         lp->list = nrp;
                 else
                         trp->list = nrp;
                 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
-                        cv_wait(&lp->cv, &rfs_async_write_lock);
-                mutex_exit(&rfs_async_write_lock);
+                        cv_wait(&lp->cv, &nsrv->async_write_lock);
+                mutex_exit(&nsrv->async_write_lock);
 
                 return;
         }
 
         /*

@@ -1265,31 +1379,31 @@
         nlp->fhp = &wa->wa_fhandle;
         cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
         nlp->list = nrp;
         nlp->next = NULL;
 
-        if (rfs_async_write_head == NULL) {
-                rfs_async_write_head = nlp;
+        if (nsrv->async_write_head == NULL) {
+                nsrv->async_write_head = nlp;
         } else {
-                lp = rfs_async_write_head;
+                lp = nsrv->async_write_head;
                 while (lp->next != NULL)
                         lp = lp->next;
                 lp->next = nlp;
         }
-        mutex_exit(&rfs_async_write_lock);
+        mutex_exit(&nsrv->async_write_lock);
 
         /*
          * Convert the file handle common to all of the requests
          * in this cluster to a vnode.
          */
         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
         if (vp == NULL) {
-                mutex_enter(&rfs_async_write_lock);
-                if (rfs_async_write_head == nlp)
-                        rfs_async_write_head = nlp->next;
+                mutex_enter(&nsrv->async_write_lock);
+                if (nsrv->async_write_head == nlp)
+                        nsrv->async_write_head = nlp->next;
                 else {
-                        lp = rfs_async_write_head;
+                        lp = nsrv->async_write_head;
                         while (lp->next != nlp)
                                 lp = lp->next;
                         lp->next = nlp->next;
                 }
                 t_flag = curthread->t_flag & T_WOULDBLOCK;

@@ -1296,11 +1410,11 @@
                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
                         rp->ns->ns_status = NFSERR_STALE;
                         rp->thread->t_flag |= t_flag;
                 }
                 cv_broadcast(&nlp->cv);
-                mutex_exit(&rfs_async_write_lock);
+                mutex_exit(&nsrv->async_write_lock);
 
                 return;
         }
 
         /*

@@ -1307,15 +1421,15 @@
          * Can only write regular files.  Attempts to write any
          * other file types fail with EISDIR.
          */
         if (vp->v_type != VREG) {
                 VN_RELE(vp);
-                mutex_enter(&rfs_async_write_lock);
-                if (rfs_async_write_head == nlp)
-                        rfs_async_write_head = nlp->next;
+                mutex_enter(&nsrv->async_write_lock);
+                if (nsrv->async_write_head == nlp)
+                        nsrv->async_write_head = nlp->next;
                 else {
-                        lp = rfs_async_write_head;
+                        lp = nsrv->async_write_head;
                         while (lp->next != nlp)
                                 lp = lp->next;
                         lp->next = nlp->next;
                 }
                 t_flag = curthread->t_flag & T_WOULDBLOCK;

@@ -1322,11 +1436,11 @@
                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
                         rp->ns->ns_status = NFSERR_ISDIR;
                         rp->thread->t_flag |= t_flag;
                 }
                 cv_broadcast(&nlp->cv);
-                mutex_exit(&rfs_async_write_lock);
+                mutex_exit(&nsrv->async_write_lock);
 
                 return;
         }
 
         /*

@@ -1354,15 +1468,15 @@
                 if (in_crit)
                         nbl_end_crit(vp);
                 VN_RELE(vp);
                 /* mark as wouldblock so response is dropped */
                 curthread->t_flag |= T_WOULDBLOCK;
-                mutex_enter(&rfs_async_write_lock);
-                if (rfs_async_write_head == nlp)
-                        rfs_async_write_head = nlp->next;
+                mutex_enter(&nsrv->async_write_lock);
+                if (nsrv->async_write_head == nlp)
+                        nsrv->async_write_head = nlp->next;
                 else {
-                        lp = rfs_async_write_head;
+                        lp = nsrv->async_write_head;
                         while (lp->next != nlp)
                                 lp = lp->next;
                         lp->next = nlp->next;
                 }
                 for (rp = nlp->list; rp != NULL; rp = rp->list) {

@@ -1370,11 +1484,11 @@
                                 rp->ns->ns_status = puterrno(error);
                                 rp->thread->t_flag |= T_WOULDBLOCK;
                         }
                 }
                 cv_broadcast(&nlp->cv);
-                mutex_exit(&rfs_async_write_lock);
+                mutex_exit(&nsrv->async_write_lock);
 
                 return;
         }
 
         /*

@@ -1392,20 +1506,20 @@
          * a new cluster and be blocked in VOP_RWLOCK while
          * the first request is being processed.  This delay
          * will allow more requests to be clustered in this
          * second cluster.
          */
-        mutex_enter(&rfs_async_write_lock);
-        if (rfs_async_write_head == nlp)
-                rfs_async_write_head = nlp->next;
+        mutex_enter(&nsrv->async_write_lock);
+        if (nsrv->async_write_head == nlp)
+                nsrv->async_write_head = nlp->next;
         else {
-                lp = rfs_async_write_head;
+                lp = nsrv->async_write_head;
                 while (lp->next != nlp)
                         lp = lp->next;
                 lp->next = nlp->next;
         }
-        mutex_exit(&rfs_async_write_lock);
+        mutex_exit(&nsrv->async_write_lock);
 
         /*
          * Step through the list of requests in this cluster.
          * We need to check permissions to make sure that all
          * of the requests have sufficient permission to write

@@ -1646,19 +1760,19 @@
         if (in_crit)
                 nbl_end_crit(vp);
         VN_RELE(vp);
 
         t_flag = curthread->t_flag & T_WOULDBLOCK;
-        mutex_enter(&rfs_async_write_lock);
+        mutex_enter(&nsrv->async_write_lock);
         for (rp = nlp->list; rp != NULL; rp = rp->list) {
                 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
                         rp->ns->ns_status = puterrno(error);
                         rp->thread->t_flag |= t_flag;
                 }
         }
         cv_broadcast(&nlp->cv);
-        mutex_exit(&rfs_async_write_lock);
+        mutex_exit(&nsrv->async_write_lock);
 
 }
 
 void *
 rfs_write_getfh(struct nfswriteargs *wa)

@@ -1716,10 +1830,16 @@
                 VN_RELE(dvp);
                 dr->dr_status = NFSERR_INVAL;
                 return;
         }
 
+        if (protect_zfs_mntpt(dvp) != 0) {
+                VN_RELE(dvp);
+                dr->dr_status = NFSERR_ACCES;
+                return;
+        }
+
         /*
          * This is a completely gross hack to make mknod
          * work over the wire until we can wack the protocol
          */
         if ((va.va_mode & IFMT) == IFCHR) {

@@ -2055,11 +2175,11 @@
         if (to_exi == NULL) {
                 VN_RELE(fromvp);
                 *status = NFSERR_ACCES;
                 return;
         }
-        exi_rele(to_exi);
+        exi_rele(&to_exi);
 
         if (to_exi != exi) {
                 VN_RELE(fromvp);
                 *status = NFSERR_XDEV;
                 return;

@@ -2095,10 +2215,17 @@
                 VN_RELE(fromvp);
                 *status = NFSERR_ROFS;
                 return;
         }
 
+        if (protect_zfs_mntpt(tovp) != 0) {
+                VN_RELE(tovp);
+                VN_RELE(fromvp);
+                *status = NFSERR_ACCES;
+                return;
+        }
+
         /*
          * Check for a conflict with a non-blocking mandatory share reservation.
          */
         error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
             NULL, cr, NULL, NULL, NULL);

@@ -2119,11 +2246,11 @@
                 return;
         }
 
         /* Check for delegation on the file being renamed over, if it exists */
 
-        if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
+        if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
             VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
             NULL, NULL, NULL) == 0) {
 
                 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
                         VN_RELE(tovp);

@@ -2201,11 +2328,11 @@
         if (to_exi == NULL) {
                 VN_RELE(fromvp);
                 *status = NFSERR_ACCES;
                 return;
         }
-        exi_rele(to_exi);
+        exi_rele(&to_exi);
 
         if (to_exi != exi) {
                 VN_RELE(fromvp);
                 *status = NFSERR_XDEV;
                 return;

@@ -2239,10 +2366,17 @@
                 VN_RELE(fromvp);
                 *status = NFSERR_ROFS;
                 return;
         }
 
+        if (protect_zfs_mntpt(tovp) != 0) {
+                VN_RELE(tovp);
+                VN_RELE(fromvp);
+                *status = NFSERR_ACCES;
+                return;
+        }
+
         error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
 
         /*
          * Force modified data and metadata out to stable storage.
          */

@@ -2261,11 +2395,11 @@
         return (args->la_from);
 }
 
 /*
 * Symbolically link to a file.
- * Create a file (to) with the given attributes which is a symbolic link
+ * Create a file (from) with the given attributes which is a symbolic link
  * to the given path name (to).
  */
 void
 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)

@@ -2309,10 +2443,16 @@
                 VN_RELE(vp);
                 *status = NFSERR_INVAL;
                 return;
         }
 
+        if (protect_zfs_mntpt(vp) != 0) {
+                VN_RELE(vp);
+                *status = NFSERR_ACCES;
+                return;
+        }
+
         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
         name = nfscmd_convname(ca, exi, args->sla_tnm,
             NFSCMD_CONV_INBOUND, MAXPATHLEN);
 
         if (name == NULL) {

@@ -2401,10 +2541,16 @@
                 VN_RELE(vp);
                 dr->dr_status = NFSERR_INVAL;
                 return;
         }
 
+        if (protect_zfs_mntpt(vp) != 0) {
+                VN_RELE(vp);
+                dr->dr_status = NFSERR_ACCES;
+                return;
+        }
+
         va.va_type = VDIR;
         va.va_mask |= AT_TYPE;
 
         error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);

@@ -2486,11 +2632,11 @@
          * Of course, NFS servers have no idea what their
          * clients' current directories are.  We fake it by
          * supplying a vnode known to exist and illegal to
          * remove.
          */
-        error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
+        error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
 
         /*
          * Force modified data and metadata out to stable storage.
          */
         (void) VOP_FSYNC(vp, 0, cr, NULL);

@@ -2513,123 +2659,204 @@
 rfs_rmdir_getfh(struct nfsdiropargs *da)
 {
         return (da->da_fhandle);
 }
 
+#ifdef nextdp  /* discard any earlier definition so the one below is used */
+#undef nextdp
+#endif
+#define nextdp(dp)      ((struct dirent64 *)((char *)(dp) + (dp)->d_reclen))   /* record that starts d_reclen bytes after dp */
+
 /* ARGSUSED */
 void
 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 {
         int error;
-        int iseof;
+        vnode_t *vp;
         struct iovec iov;
         struct uio uio;
-        vnode_t *vp;
-        char *ndata = NULL;
+        int iseof;
+
+        uint32_t count = rda->rda_count;
+        uint32_t size;          /* size of the readdirres structure */
+        int overflow = 0;
+
+        size_t datasz;
+        char *data = NULL;
+        dirent64_t *dp;
+
         struct sockaddr *ca;
-        size_t nents;
-        int ret;
+        struct nfsentry **eptr;
+        struct nfsentry *entry;
 
         vp = nfs_fhtovp(&rda->rda_fh, exi);
         if (vp == NULL) {
-                rd->rd_entries = NULL;
                 rd->rd_status = NFSERR_STALE;
                 return;
         }
 
         if (vp->v_type != VDIR) {
                 VN_RELE(vp);
-                rd->rd_entries = NULL;
                 rd->rd_status = NFSERR_NOTDIR;
                 return;
         }
 
         (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
 
         error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
-
-        if (error) {
-                rd->rd_entries = NULL;
+        if (error)
                 goto bad;
-        }
 
-        if (rda->rda_count == 0) {
-                rd->rd_entries = NULL;
-                rd->rd_size = 0;
-                rd->rd_eof = FALSE;
-                goto bad;
-        }
+        /*
+         * Don't allow arbitrary counts for allocation
+         */
+        if (count > NFS_MAXDATA)
+                count = NFS_MAXDATA;
 
-        rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
+        /*
+         * struct readdirres:
+         *   status:            1
+         *   entries (bool):    1
+         *   eof:               1
+         */
+        size = (1 + 1 + 1) * BYTES_PER_XDR_UNIT;
 
+        if (size > count) {
+                eptr = &rd->rd_entries;
+                iseof = 0;
+                size = 0;
+
+                goto done;
+        }
+
         /*
-         * Allocate data for entries.  This will be freed by rfs_rddirfree.
+         * This is simplification.  The dirent64_t size is not the same as the
+         * size of XDR representation of entry, but the sizes are similar so
+         * we'll assume they are same.  This assumption should not cause any
+         * harm.  In worst case we will need to issue VOP_READDIR() once more.
          */
-        rd->rd_bufsize = (uint_t)rda->rda_count;
-        rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
+        datasz = count;
 
         /*
-         * Set up io vector to read directory data
+         * Make sure that there is room to read at least one entry
+         * if any are available.
          */
-        iov.iov_base = (caddr_t)rd->rd_entries;
-        iov.iov_len = rda->rda_count;
+        if (datasz < DIRENT64_RECLEN(MAXNAMELEN))
+                datasz = DIRENT64_RECLEN(MAXNAMELEN);
+
+        data = kmem_alloc(datasz, KM_NOSLEEP);
+        if (data == NULL) {
+                /* The allocation failed; downsize and wait for it this time */
+                if (datasz > MAXBSIZE)
+                        datasz = MAXBSIZE;
+                data = kmem_alloc(datasz, KM_SLEEP);
+        }
+
         uio.uio_iov = &iov;
         uio.uio_iovcnt = 1;
         uio.uio_segflg = UIO_SYSSPACE;
         uio.uio_extflg = UIO_COPY_CACHED;
         uio.uio_loffset = (offset_t)rda->rda_offset;
-        uio.uio_resid = rda->rda_count;
+        uio.uio_resid = datasz;
 
-        /*
-         * read directory
-         */
+        ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
+        eptr = &rd->rd_entries;
+        entry = NULL;
+
+getmoredents:
+        iov.iov_base = data;
+        iov.iov_len = datasz;
+
         error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
+        if (error) {
+                iseof = 0;
+                goto done;
+        }
 
-        /*
-         * Clean up
-         */
-        if (!error) {
-                /*
-                 * set size and eof
-                 */
-                if (uio.uio_resid == rda->rda_count) {
-                        rd->rd_size = 0;
-                        rd->rd_eof = TRUE;
-                } else {
-                        rd->rd_size = (uint32_t)(rda->rda_count -
-                            uio.uio_resid);
-                        rd->rd_eof = iseof ? TRUE : FALSE;
+        if (iov.iov_len == datasz)
+                goto done;
+
+        for (dp = (dirent64_t *)data;
+            (char *)dp - data < datasz - iov.iov_len && !overflow;
+            dp = nextdp(dp)) {
+                char *name;
+                uint32_t esize;
+                uint32_t cookie;
+
+                overflow = (uint64_t)dp->d_off > UINT32_MAX;
+                if (overflow) {
+                        cookie = 0;
+                        iseof = 1;
+                } else
+                        cookie = (uint32_t)dp->d_off;
+
+                if (dp->d_ino == 0 || (uint64_t)dp->d_ino > UINT32_MAX) {
+                        if (entry != NULL)
+                                entry->cookie = cookie;
+                        continue;
                 }
+
+                name = nfscmd_convname(ca, exi, dp->d_name,
+                    NFSCMD_CONV_OUTBOUND, NFS_MAXPATHLEN + 1);
+                if (name == NULL) {
+                        if (entry != NULL)
+                                entry->cookie = cookie;
+                        continue;
         }
 
-        ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
-        nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
-        ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
-            rda->rda_count, &ndata);
-
-        if (ret != 0) {
-                size_t dropbytes;
                 /*
-                 * We had to drop one or more entries in order to fit
-                 * during the character conversion.  We need to patch
-                 * up the size and eof info.
+                 * struct entry:
+                 *   fileid:            1
+                 *   name (length):     1
+                 *   name (data):       length (rounded up)
+                 *   cookie:            1
+                 *   nextentry (bool):  1
                  */
-                if (rd->rd_eof)
-                        rd->rd_eof = FALSE;
-                dropbytes = nfscmd_dropped_entrysize(
-                    (struct dirent64 *)rd->rd_entries, nents, ret);
-                rd->rd_size -= dropbytes;
+                esize = (1 + 1 + 1 + 1) * BYTES_PER_XDR_UNIT +
+                    RNDUP(strlen(name));
+
+                /* If the new entry does not fit, discard it */
+                if (esize > count - size) {
+                        if (name != dp->d_name)
+                                kmem_free(name, NFS_MAXPATHLEN + 1);
+                        iseof = 0;
+                        goto done;
         }
-        if (ndata == NULL) {
-                ndata = (char *)rd->rd_entries;
-        } else if (ndata != (char *)rd->rd_entries) {
-                kmem_free(rd->rd_entries, rd->rd_bufsize);
-                rd->rd_entries = (void *)ndata;
-                rd->rd_bufsize = rda->rda_count;
+
+                entry = kmem_alloc(sizeof (struct nfsentry), KM_SLEEP);
+
+                entry->fileid = (uint32_t)dp->d_ino;
+                entry->name = strdup(name);
+                if (name != dp->d_name)
+                        kmem_free(name, NFS_MAXPATHLEN + 1);
+                entry->cookie = cookie;
+
+                size += esize;
+
+                /* Add the entry to the linked list */
+                *eptr = entry;
+                eptr = &entry->nextentry;
         }
 
+        if (!iseof && size < count) {
+                uio.uio_resid = MIN(datasz, MAXBSIZE);
+                goto getmoredents;
+        }
+
+done:
+        *eptr = NULL;
+
+        if (iseof || rd->rd_entries != NULL || !error) {
+                error = 0;
+                rd->rd_eof = iseof ? TRUE : FALSE;
+
+                /* This is for nfslog only */
+                rd->rd_offset = rda->rda_offset;
+                rd->rd_size = size;
+        }
+
 bad:
         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
 
 #if 0 /* notyet */
         /*

@@ -2645,21 +2872,30 @@
 
         VN_RELE(vp);
 
         rd->rd_status = puterrno(error);
 
+        if (data != NULL)
+                kmem_free(data, datasz);
 }
 void *
 rfs_readdir_getfh(struct nfsrddirargs *rda)    /* file handle accessor for readdir arguments */
 {
         return (&rda->rda_fh); /* address of the handle embedded in the args, not a copy */
 }
 void
 rfs_rddirfree(struct nfsrddirres *rd)  /* releases the entry list hanging off a readdir result */
 {
-        if (rd->rd_entries != NULL)
-                kmem_free(rd->rd_entries, rd->rd_bufsize);
+        if (rd->rd_status == NFS_OK) { /* the list is walked only for successful replies */
+                struct nfsentry *entry, *nentry;
+
+                for (entry = rd->rd_entries; entry != NULL; entry = nentry) {
+                        nentry = entry->nextentry;     /* save the link before entry is freed */
+                        strfree(entry->name);   /* name was strdup()ed in rfs_readdir() */
+                        kmem_free(entry, sizeof (struct nfsentry));
+                }
+        }
 }
 
 /* ARGSUSED */
 void
 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,

@@ -2761,11 +2997,11 @@
                 vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
         }
         return (0);
 }
 
-static enum nfsftype vt_to_nf[] = {
+static const enum nfsftype vt_to_nf[] = {
         0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0        /* presumably indexed by vnode type, 0 = no NFSv2 type; confirm at use sites */
 };
 
 /*
  * check the following fields for overflow: nodeid, size, and time.

@@ -2980,20 +3216,44 @@
 }
 
 void
 rfs_srvrinit(void)     /* one-time setup for the NFSv2 server code in this file */
 {
-        mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
         nfs2_srv_caller_id = fs_new_caller_id();
+        zone_key_create(&rfs_zone_key, rfs_zone_init, NULL, rfs_zone_fini);    /* rfs_zone_init builds, rfs_zone_fini frees the per-zone nfs_srv_t */
 }
 
 void
 rfs_srvrfini(void)
 {
-        mutex_destroy(&rfs_async_write_lock);
 }
 
+/* ARGSUSED */
+static void *          /* zone key callback: allocates and returns the nfs_srv_t for a zone */
+rfs_zone_init(zoneid_t zoneid)
+{
+        nfs_srv_t *ns;
+
+        ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);       /* zero-filled, so async_write_head starts out NULL */
+
+        mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
+        ns->write_async = 1;    /* write clustering is on by default */
+
+        return (ns);    /* later fetched with zone_getspecific(rfs_zone_key, ...) */
+}
+
+/* ARGSUSED */
+static void            /* zone key callback: destroys the nfs_srv_t built by rfs_zone_init() */
+rfs_zone_fini(zoneid_t zoneid, void *data)
+{
+        nfs_srv_t *ns;
+
+        ns = (nfs_srv_t *)data; /* presumably the pointer rfs_zone_init() returned; confirm against zone_key_create() */
+        mutex_destroy(&ns->async_write_lock);
+        kmem_free(ns, sizeof (*ns));    /* same size as the kmem_zalloc() in rfs_zone_init() */
+}
+
 static int
 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
 {
         struct clist    *wcl;
         int             wlist_len;