big-one Cdiff usr/src/uts/common/fs/nfs/nfs

Print this page

NEX-17125 NFS: nbmand lock entered but not exited on error path
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Rick McNeal <rick.mcneal@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
NEX-15279 support NFS server in zone
NEX-15520 online NFS shares cause zoneadm halt to hang in nfs_export_zone_fini
Portions contributed by: Dan Kruchinin dan.kruchinin@nexenta.com
Portions contributed by: Stepan Zastupov stepan.zastupov@gmail.com
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
NEX-9275 Got "bad mutex" panic when run IO to nfs share from clients
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
NEX-3524 CLONE - Port NEX-3505 "wrong authentication" messages with root=@0.0.0.0/0 set, result in loss of client access
Reviewed by: Marcel Telka <marcel.telka@nexenta.com>
NEX-3533 CLONE - Port NEX-3019 NFSv3 writes underneath mounted filesystem to directory
Reviewed by: Dan Fields <dan.fields@nexenta.com>
NEX-3095 Issues related to NFS nohide
Reviewed by: Dan Fields <dan.fields@nexenta.com>
NEX-1128 NFS server: Generic uid and gid remapping for AUTH_SYS
Reviewed by: Jan Kryl <jan.kryl@nexenta.com>
OS-20 share_nfs(1m) charset handling is unreliable
OS-22 Page fault at nfscmd_dropped_entrysize+0x1e()
OS-23 NFSv2/3/4: READDIR responses are inconsistent when charset conversion fails
OS-24 rfs3_readdir(): Issues related to nfscmd_convdirent()
Reviewed by: Jan Kryl <jan.kryl@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
re #13613 rb4516 Tunables needs volatile keyword
closes #12112 rb3823 - nfs-nohide: lookup("..") for submount should be correct
re #3541 rb11254 - nfs nohide - "nfssrv: need ability to go to submounts for v3 and v2 protocols"


*** 16,36 ****
   * fields enclosed by brackets "[]" replaced with your own identifying
   * information: Portions Copyright [yyyy] [name of copyright owner]
   *
   * CDDL HEADER END
   */
  /*
   * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
-  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
-  * Copyright (c) 2016 by Delphix. All rights reserved.
   */
  
  /*
   *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
   *      All rights reserved.
   */
  
  #include <sys/param.h>
  #include <sys/types.h>
  #include <sys/systm.h>
  #include <sys/cred.h>
  #include <sys/buf.h>
--- 16,40 ----
   * fields enclosed by brackets "[]" replaced with your own identifying
   * information: Portions Copyright [yyyy] [name of copyright owner]
   *
   * CDDL HEADER END
   */
+ 
  /*
   * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
   */
  
  /*
   *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
   *      All rights reserved.
   */
  
+ /*
+  * Copyright 2018 Nexenta Systems, Inc.
+  * Copyright (c) 2016 by Delphix. All rights reserved.
+  */
+ 
  #include <sys/param.h>
  #include <sys/types.h>
  #include <sys/systm.h>
  #include <sys/cred.h>
  #include <sys/buf.h>
*** 67,86 ****
--- 71,108 ----
  #include <vm/seg_map.h>
  #include <vm/seg_kmem.h>
  
  #include <sys/strsubr.h>
  
+ struct rfs_async_write_list;
+ 
  /*
+  * Zone globals of NFSv2 server
+  *
+  * The write path obtains this structure for the current zone with
+  * zone_getspecific(rfs_zone_key, curzone).
+  *
+  * async_write_lock is held while the list starting at async_write_head
+  * is walked or changed, and it is the mutex passed to cv_wait() by
+  * requests waiting for their write cluster to be processed.
+  *
+  * async_write_head is the first element of the list (linked through
+  * ->next) of write clusters that have been started.
+  */
+ typedef struct nfs_srv {
+         kmutex_t                        async_write_lock;
+         struct rfs_async_write_list     *async_write_head;
+ 
+         /*
+          * enables write clustering if == 1
+          * (when it is zero the request is passed to rfs_write_sync())
+          */
+         int             write_async;
+ } nfs_srv_t;
+ 
+ /*
   * These are the interface routines for the server side of the
   * Network File System.  See the NFS version 2 protocol specification
   * for a description of this interface.
   */
  
  static int      sattr_to_vattr(struct nfssattr *, struct vattr *);
  static void     acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
                          cred_t *);
+ static void     *rfs_zone_init(zoneid_t zoneid);
+ static void     rfs_zone_fini(zoneid_t zoneid, void *data);
  
+ 
  /*
   * Some "over the wire" UNIX file types.  These are encoded
   * into the mode.  This needs to be fixed in the next rev.
   */
  #define IFMT            0170000         /* type of file */
*** 87,96 ****
--- 109,119 ----
  #define IFCHR           0020000         /* character special */
  #define IFBLK           0060000         /* block special */
  #define IFSOCK          0140000         /* socket */
  
  u_longlong_t nfs2_srv_caller_id;
+ static zone_key_t rfs_zone_key;
  
  /*
   * Get file attributes.
   * Returns the current attributes of the file with the given fhandle.
   */
*** 327,337 ****
--- 350,434 ----
  rfs_setattr_getfh(struct nfssaargs *args)
  {
          return (&args->saa_fh);
  }
  
+ /*
+  * Try to step from *vpp into an exported "nohide" submount.
+  *
+  * A separately held copy of *vpp is passed through traverse(), its fid is
+  * obtained with VOP_FID() and the matching export is looked up with
+  * checkexport().
+  * NOTE(review): traverse() presumably follows the filesystem mounted on
+  * the vnode; its definition is not visible here - confirm.
+  *
+  * Change and release @exip and @vpp only on success: when an export with
+  * EX_NOHIDE is found, the old *exip and *vpp are released and replaced by
+  * the new export and vnode.  In every other case *vpp and *exip are left
+  * as they were and the references taken here are dropped again.
+  *
+  * Returns 0 both when the crossing happened and when the submount is not
+  * exported or "nohide" is not set; returns the error from traverse() or
+  * VOP_FID() otherwise.
+  */
+ int
+ rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
+ {
+         struct exportinfo *exi;
+         vnode_t *vp = *vpp;
+         fid_t fid;
+         int error;
+ 
+         VN_HOLD(vp);    /* working hold; the caller's hold stays on *vpp */
+ 
+         if ((error = traverse(&vp)) != 0) {
+                 VN_RELE(vp);
+                 return (error);
+         }
+ 
+         bzero(&fid, sizeof (fid));
+         fid.fid_len = MAXFIDSZ;
+         error = VOP_FID(vp, &fid, NULL);
+         if (error) {
+                 VN_RELE(vp);
+                 return (error);
+         }
+ 
+         exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
+         if (exi == NULL ||
+             (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
+                 /*
+                  * This is not an error: the subdirectory is simply not
+                  * exported, or "nohide" is not set on its export.
+                  * The caller keeps its original vnode and export.
+                  */
+                 if (exi != NULL)
+                         exi_rele(&exi); /* export found, but not nohide */
+                 VN_RELE(vp);
+         } else {
+                 /* go to submount: swap in the new export and vnode */
+                 exi_rele(exip);
+                 *exip = exi;
+ 
+                 VN_RELE(*vpp);
+                 *vpp = vp;
+         }
+ 
+         return (0);
+ }
+ 
  /*
+  * Given mounted "dvp" and "exi", go to the upper mountpoint,
+  * correcting dvp/exi accordingly.
+  *
+  * *dvpp must be the root vnode of its filesystem (VROOT is asserted).
+  * The vnode returned by untraverse() is looked up with nfs_vptoexi();
+  * when an export is found, the old *exip and *dvpp are released and
+  * replaced by the new export and vnode.
+  *
+  * Returns 0 on success.  Returns -1 when nfs_vptoexi() finds no export,
+  * in which case *dvpp and *exip are left unchanged.
+  */
+ int
+ rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
+ {
+         struct exportinfo *exi;
+         vnode_t *dvp = *dvpp;
+ 
+         ASSERT(dvp->v_flag & VROOT);
+ 
+         VN_HOLD(dvp);
+         /*
+          * NOTE(review): the single VN_RELE() on the failure path below
+          * assumes untraverse() consumes the hold taken above and hands
+          * back a held vnode - confirm against its definition.
+          */
+         dvp = untraverse(dvp);
+         exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
+         if (exi == NULL) {
+                 VN_RELE(dvp);
+                 return (-1);
+         }
+ 
+         exi_rele(exip);
+         *exip = exi;
+         VN_RELE(*dvpp);
+         *dvpp = dvp;
+ 
+         return (0);
+ }
+ /*
   * Directory lookup.
   * Returns an fhandle and file attributes for file name in a directory.
   */
  /* ARGSUSED */
  void
*** 369,407 ****
          /*
           * Allow lookups from the root - the default
           * location of the public filehandle.
           */
          if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
!                 dvp = rootdir;
                  VN_HOLD(dvp);
          } else {
                  dvp = nfs_fhtovp(fhp, exi);
                  if (dvp == NULL) {
                          dr->dr_status = NFSERR_STALE;
                          return;
                  }
          }
  
          /*
           * Not allow lookup beyond root.
           * If the filehandle matches a filehandle of the exi,
           * then the ".." refers beyond the root of an exported filesystem.
           */
          if (strcmp(da->da_name, "..") == 0 &&
              EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
!                 VN_RELE(dvp);
!                 dr->dr_status = NFSERR_NOENT;
!                 return;
          }
  
          ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
          name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
              MAXPATHLEN);
  
          if (name == NULL) {
!                 dr->dr_status = NFSERR_ACCES;
!                 return;
          }
  
          /*
           * If the public filehandle is used then allow
           * a multi-component lookup, i.e. evaluate
--- 466,516 ----
          /*
           * Allow lookups from the root - the default
           * location of the public filehandle.
           */
          if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
!                 dvp = ZONE_ROOTVP();
                  VN_HOLD(dvp);
          } else {
                  dvp = nfs_fhtovp(fhp, exi);
                  if (dvp == NULL) {
                          dr->dr_status = NFSERR_STALE;
                          return;
                  }
          }
  
+         exi_hold(exi);
+ 
          /*
           * Not allow lookup beyond root.
           * If the filehandle matches a filehandle of the exi,
           * then the ".." refers beyond the root of an exported filesystem.
           */
          if (strcmp(da->da_name, "..") == 0 &&
              EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
!                 if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
!                     (dvp->v_flag & VROOT)) {
!                         /*
!                          * special case for ".." and 'nohide' exported root
!                          */
!                         if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
!                                 error = NFSERR_ACCES;
!                                 goto out;
                          }
+                 } else  {
+                         error = NFSERR_NOENT;
+                         goto out;
+                 }
+         }
  
          ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
          name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
              MAXPATHLEN);
  
          if (name == NULL) {
!                 error = NFSERR_ACCES;
!                 goto out;
          }
  
          /*
           * If the public filehandle is used then allow
           * a multi-component lookup, i.e. evaluate
*** 411,420 ****
--- 520,532 ----
           * This may result in a vnode in another filesystem
           * which is OK as long as the filesystem is exported.
           */
          if (PUBLIC_FH2(fhp)) {
                  publicfh_flag = TRUE;
+ 
+                 exi_rele(&exi);
+ 
                  error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
                      &sec);
          } else {
                  /*
                   * Do a normal single component lookup.
*** 424,433 ****
--- 536,550 ----
          }
  
          if (name != da->da_name)
                  kmem_free(name, MAXPATHLEN);
  
+         if (error == 0 && vn_ismntpt(vp)) {
+                 error = rfs_cross_mnt(&vp, &exi);
+                 if (error)
+                         VN_RELE(vp);
+         }
  
          if (!error) {
                  va.va_mask = AT_ALL;    /* we want everything */
  
                  error = rfs4_delegated_getattr(vp, &va, 0, cr);
*** 450,469 ****
                          }
                  }
                  VN_RELE(vp);
          }
  
          VN_RELE(dvp);
  
!         /*
!          * If publicfh_flag is true then we have called rfs_publicfh_mclookup
!          * and have obtained a new exportinfo in exi which needs to be
!          * released. Note the the original exportinfo pointed to by exi
!          * will be released by the caller, comon_dispatch.
!          */
!         if (publicfh_flag && exi != NULL)
!                 exi_rele(exi);
  
          /*
           * If it's public fh, no 0x81, and client's flavor is
           * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
           * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
--- 567,581 ----
                          }
                  }
                  VN_RELE(vp);
          }
  
+ out:
          VN_RELE(dvp);
  
!         if (exi != NULL)
!                 exi_rele(&exi);
  
          /*
           * If it's public fh, no 0x81, and client's flavor is
           * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
           * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
*** 683,692 ****
--- 795,806 ----
  
          error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
  
          /* check if a monitor detected a delegation conflict */
          if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
+                 if (in_crit)
+                         nbl_end_crit(vp);
                  VN_RELE(vp);
                  /* mark as wouldblock so response is dropped */
                  curthread->t_flag |= T_WOULDBLOCK;
  
                  rr->rr_data = NULL;
*** 1008,1021 ****
  
          error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
  
          /* check if a monitor detected a delegation conflict */
          if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
!                 VN_RELE(vp);
!                 /* mark as wouldblock so response is dropped */
!                 curthread->t_flag |= T_WOULDBLOCK;
!                 return;
          }
  
          if (wa->wa_data || wa->wa_rlist) {
                  /* Do the RDMA thing if necessary */
                  if (wa->wa_rlist) {
--- 1122,1132 ----
  
          error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
  
          /* check if a monitor detected a delegation conflict */
          if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
!                 goto out;
          }
  
          if (wa->wa_data || wa->wa_rlist) {
                  /* Do the RDMA thing if necessary */
                  if (wa->wa_rlist) {
*** 1051,1060 ****
--- 1162,1172 ----
                  savecred = curthread->t_cred;
                  curthread->t_cred = cr;
                  error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
                  curthread->t_cred = savecred;
          } else {
+ 
                  iovcnt = 0;
                  for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
                          iovcnt++;
                  if (iovcnt <= MAX_IOVECS) {
  #ifdef DEBUG
*** 1149,1159 ****
          struct rfs_async_write_list *next;
  };
  
  static struct rfs_async_write_list *rfs_async_write_head = NULL;
  static kmutex_t rfs_async_write_lock;
! static int rfs_write_async = 1; /* enables write clustering if == 1 */
  
  #define MAXCLIOVECS     42
  #define RFSWRITE_INITVAL (enum nfsstat) -1
  
  #ifdef DEBUG
--- 1261,1271 ----
          struct rfs_async_write_list *next;
  };
  
  static struct rfs_async_write_list *rfs_async_write_head = NULL;
  static kmutex_t rfs_async_write_lock;
! volatile int rfs_write_async = 1;       /* enables write clustering if == 1 */
  
  #define MAXCLIOVECS     42
  #define RFSWRITE_INITVAL (enum nfsstat) -1
  
  #ifdef DEBUG
*** 1194,1205 ****
          struct rfs_async_write_list nlpsp;
          ushort_t t_flag;
          cred_t *savecred;
          int in_crit = 0;
          caller_context_t ct;
  
!         if (!rfs_write_async) {
                  rfs_write_sync(wa, ns, exi, req, cr, ro);
                  return;
          }
  
          /*
--- 1306,1319 ----
          struct rfs_async_write_list nlpsp;
          ushort_t t_flag;
          cred_t *savecred;
          int in_crit = 0;
          caller_context_t ct;
+         nfs_srv_t *nsrv;
  
!         nsrv = zone_getspecific(rfs_zone_key, curzone);
!         if (!nsrv->write_async) {
                  rfs_write_sync(wa, ns, exi, req, cr, ro);
                  return;
          }
  
          /*
*** 1220,1231 ****
  
          /*
           * Look to see if there is already a cluster started
           * for this file.
           */
!         mutex_enter(&rfs_async_write_lock);
!         for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
                  if (bcmp(&wa->wa_fhandle, lp->fhp,
                      sizeof (fhandle_t)) == 0)
                          break;
          }
  
--- 1334,1345 ----
  
          /*
           * Look to see if there is already a cluster started
           * for this file.
           */
!         mutex_enter(&nsrv->async_write_lock);
!         for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
                  if (bcmp(&wa->wa_fhandle, lp->fhp,
                      sizeof (fhandle_t)) == 0)
                          break;
          }
  
*** 1247,1258 ****
                  if (trp == NULL)
                          lp->list = nrp;
                  else
                          trp->list = nrp;
                  while (nrp->ns->ns_status == RFSWRITE_INITVAL)
!                         cv_wait(&lp->cv, &rfs_async_write_lock);
!                 mutex_exit(&rfs_async_write_lock);
  
                  return;
          }
  
          /*
--- 1361,1372 ----
                  if (trp == NULL)
                          lp->list = nrp;
                  else
                          trp->list = nrp;
                  while (nrp->ns->ns_status == RFSWRITE_INITVAL)
!                         cv_wait(&lp->cv, &nsrv->async_write_lock);
!                 mutex_exit(&nsrv->async_write_lock);
  
                  return;
          }
  
          /*
*** 1265,1295 ****
          nlp->fhp = &wa->wa_fhandle;
          cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
          nlp->list = nrp;
          nlp->next = NULL;
  
!         if (rfs_async_write_head == NULL) {
!                 rfs_async_write_head = nlp;
          } else {
!                 lp = rfs_async_write_head;
                  while (lp->next != NULL)
                          lp = lp->next;
                  lp->next = nlp;
          }
!         mutex_exit(&rfs_async_write_lock);
  
          /*
           * Convert the file handle common to all of the requests
           * in this cluster to a vnode.
           */
          vp = nfs_fhtovp(&wa->wa_fhandle, exi);
          if (vp == NULL) {
!                 mutex_enter(&rfs_async_write_lock);
!                 if (rfs_async_write_head == nlp)
!                         rfs_async_write_head = nlp->next;
                  else {
!                         lp = rfs_async_write_head;
                          while (lp->next != nlp)
                                  lp = lp->next;
                          lp->next = nlp->next;
                  }
                  t_flag = curthread->t_flag & T_WOULDBLOCK;
--- 1379,1409 ----
          nlp->fhp = &wa->wa_fhandle;
          cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
          nlp->list = nrp;
          nlp->next = NULL;
  
!         if (nsrv->async_write_head == NULL) {
!                 nsrv->async_write_head = nlp;
          } else {
!                 lp = nsrv->async_write_head;
                  while (lp->next != NULL)
                          lp = lp->next;
                  lp->next = nlp;
          }
!         mutex_exit(&nsrv->async_write_lock);
  
          /*
           * Convert the file handle common to all of the requests
           * in this cluster to a vnode.
           */
          vp = nfs_fhtovp(&wa->wa_fhandle, exi);
          if (vp == NULL) {
!                 mutex_enter(&nsrv->async_write_lock);
!                 if (nsrv->async_write_head == nlp)
!                         nsrv->async_write_head = nlp->next;
                  else {
!                         lp = nsrv->async_write_head;
                          while (lp->next != nlp)
                                  lp = lp->next;
                          lp->next = nlp->next;
                  }
                  t_flag = curthread->t_flag & T_WOULDBLOCK;
*** 1296,1306 ****
                  for (rp = nlp->list; rp != NULL; rp = rp->list) {
                          rp->ns->ns_status = NFSERR_STALE;
                          rp->thread->t_flag |= t_flag;
                  }
                  cv_broadcast(&nlp->cv);
!                 mutex_exit(&rfs_async_write_lock);
  
                  return;
          }
  
          /*
--- 1410,1420 ----
                  for (rp = nlp->list; rp != NULL; rp = rp->list) {
                          rp->ns->ns_status = NFSERR_STALE;
                          rp->thread->t_flag |= t_flag;
                  }
                  cv_broadcast(&nlp->cv);
!                 mutex_exit(&nsrv->async_write_lock);
  
                  return;
          }
  
          /*
*** 1307,1321 ****
           * Can only write regular files.  Attempts to write any
           * other file types fail with EISDIR.
           */
          if (vp->v_type != VREG) {
                  VN_RELE(vp);
!                 mutex_enter(&rfs_async_write_lock);
!                 if (rfs_async_write_head == nlp)
!                         rfs_async_write_head = nlp->next;
                  else {
!                         lp = rfs_async_write_head;
                          while (lp->next != nlp)
                                  lp = lp->next;
                          lp->next = nlp->next;
                  }
                  t_flag = curthread->t_flag & T_WOULDBLOCK;
--- 1421,1435 ----
           * Can only write regular files.  Attempts to write any
           * other file types fail with EISDIR.
           */
          if (vp->v_type != VREG) {
                  VN_RELE(vp);
!                 mutex_enter(&nsrv->async_write_lock);
!                 if (nsrv->async_write_head == nlp)
!                         nsrv->async_write_head = nlp->next;
                  else {
!                         lp = nsrv->async_write_head;
                          while (lp->next != nlp)
                                  lp = lp->next;
                          lp->next = nlp->next;
                  }
                  t_flag = curthread->t_flag & T_WOULDBLOCK;
*** 1322,1332 ****
                  for (rp = nlp->list; rp != NULL; rp = rp->list) {
                          rp->ns->ns_status = NFSERR_ISDIR;
                          rp->thread->t_flag |= t_flag;
                  }
                  cv_broadcast(&nlp->cv);
!                 mutex_exit(&rfs_async_write_lock);
  
                  return;
          }
  
          /*
--- 1436,1446 ----
                  for (rp = nlp->list; rp != NULL; rp = rp->list) {
                          rp->ns->ns_status = NFSERR_ISDIR;
                          rp->thread->t_flag |= t_flag;
                  }
                  cv_broadcast(&nlp->cv);
!                 mutex_exit(&nsrv->async_write_lock);
  
                  return;
          }
  
          /*
*** 1354,1368 ****
                  if (in_crit)
                          nbl_end_crit(vp);
                  VN_RELE(vp);
                  /* mark as wouldblock so response is dropped */
                  curthread->t_flag |= T_WOULDBLOCK;
!                 mutex_enter(&rfs_async_write_lock);
!                 if (rfs_async_write_head == nlp)
!                         rfs_async_write_head = nlp->next;
                  else {
!                         lp = rfs_async_write_head;
                          while (lp->next != nlp)
                                  lp = lp->next;
                          lp->next = nlp->next;
                  }
                  for (rp = nlp->list; rp != NULL; rp = rp->list) {
--- 1468,1482 ----
                  if (in_crit)
                          nbl_end_crit(vp);
                  VN_RELE(vp);
                  /* mark as wouldblock so response is dropped */
                  curthread->t_flag |= T_WOULDBLOCK;
!                 mutex_enter(&nsrv->async_write_lock);
!                 if (nsrv->async_write_head == nlp)
!                         nsrv->async_write_head = nlp->next;
                  else {
!                         lp = nsrv->async_write_head;
                          while (lp->next != nlp)
                                  lp = lp->next;
                          lp->next = nlp->next;
                  }
                  for (rp = nlp->list; rp != NULL; rp = rp->list) {
*** 1370,1380 ****
                                  rp->ns->ns_status = puterrno(error);
                                  rp->thread->t_flag |= T_WOULDBLOCK;
                          }
                  }
                  cv_broadcast(&nlp->cv);
!                 mutex_exit(&rfs_async_write_lock);
  
                  return;
          }
  
          /*
--- 1484,1494 ----
                                  rp->ns->ns_status = puterrno(error);
                                  rp->thread->t_flag |= T_WOULDBLOCK;
                          }
                  }
                  cv_broadcast(&nlp->cv);
!                 mutex_exit(&nsrv->async_write_lock);
  
                  return;
          }
  
          /*
*** 1392,1411 ****
           * a new cluster and be blocked in VOP_RWLOCK while
           * the first request is being processed.  This delay
           * will allow more requests to be clustered in this
           * second cluster.
           */
!         mutex_enter(&rfs_async_write_lock);
!         if (rfs_async_write_head == nlp)
!                 rfs_async_write_head = nlp->next;
          else {
!                 lp = rfs_async_write_head;
                  while (lp->next != nlp)
                          lp = lp->next;
                  lp->next = nlp->next;
          }
!         mutex_exit(&rfs_async_write_lock);
  
          /*
           * Step through the list of requests in this cluster.
           * We need to check permissions to make sure that all
           * of the requests have sufficient permission to write
--- 1506,1525 ----
           * a new cluster and be blocked in VOP_RWLOCK while
           * the first request is being processed.  This delay
           * will allow more requests to be clustered in this
           * second cluster.
           */
!         mutex_enter(&nsrv->async_write_lock);
!         if (nsrv->async_write_head == nlp)
!                 nsrv->async_write_head = nlp->next;
          else {
!                 lp = nsrv->async_write_head;
                  while (lp->next != nlp)
                          lp = lp->next;
                  lp->next = nlp->next;
          }
!         mutex_exit(&nsrv->async_write_lock);
  
          /*
           * Step through the list of requests in this cluster.
           * We need to check permissions to make sure that all
           * of the requests have sufficient permission to write
*** 1646,1664 ****
          if (in_crit)
                  nbl_end_crit(vp);
          VN_RELE(vp);
  
          t_flag = curthread->t_flag & T_WOULDBLOCK;
!         mutex_enter(&rfs_async_write_lock);
          for (rp = nlp->list; rp != NULL; rp = rp->list) {
                  if (rp->ns->ns_status == RFSWRITE_INITVAL) {
                          rp->ns->ns_status = puterrno(error);
                          rp->thread->t_flag |= t_flag;
                  }
          }
          cv_broadcast(&nlp->cv);
!         mutex_exit(&rfs_async_write_lock);
  
  }
  
  void *
  rfs_write_getfh(struct nfswriteargs *wa)
--- 1760,1778 ----
          if (in_crit)
                  nbl_end_crit(vp);
          VN_RELE(vp);
  
          t_flag = curthread->t_flag & T_WOULDBLOCK;
!         mutex_enter(&nsrv->async_write_lock);
          for (rp = nlp->list; rp != NULL; rp = rp->list) {
                  if (rp->ns->ns_status == RFSWRITE_INITVAL) {
                          rp->ns->ns_status = puterrno(error);
                          rp->thread->t_flag |= t_flag;
                  }
          }
          cv_broadcast(&nlp->cv);
!         mutex_exit(&nsrv->async_write_lock);
  
  }
  
  void *
  rfs_write_getfh(struct nfswriteargs *wa)
*** 1716,1725 ****
--- 1830,1845 ----
                  VN_RELE(dvp);
                  dr->dr_status = NFSERR_INVAL;
                  return;
          }
  
+         if (protect_zfs_mntpt(dvp) != 0) {
+                 VN_RELE(dvp);
+                 dr->dr_status = NFSERR_ACCES;
+                 return;
+         }
+ 
          /*
           * This is a completely gross hack to make mknod
           * work over the wire until we can wack the protocol
           */
          if ((va.va_mode & IFMT) == IFCHR) {
*** 2055,2065 ****
          if (to_exi == NULL) {
                  VN_RELE(fromvp);
                  *status = NFSERR_ACCES;
                  return;
          }
!         exi_rele(to_exi);
  
          if (to_exi != exi) {
                  VN_RELE(fromvp);
                  *status = NFSERR_XDEV;
                  return;
--- 2175,2185 ----
          if (to_exi == NULL) {
                  VN_RELE(fromvp);
                  *status = NFSERR_ACCES;
                  return;
          }
!         exi_rele(&to_exi);
  
          if (to_exi != exi) {
                  VN_RELE(fromvp);
                  *status = NFSERR_XDEV;
                  return;
*** 2095,2104 ****
--- 2215,2231 ----
                  VN_RELE(fromvp);
                  *status = NFSERR_ROFS;
                  return;
          }
  
+         if (protect_zfs_mntpt(tovp) != 0) {
+                 VN_RELE(tovp);
+                 VN_RELE(fromvp);
+                 *status = NFSERR_ACCES;
+                 return;
+         }
+ 
          /*
           * Check for a conflict with a non-blocking mandatory share reservation.
           */
          error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
              NULL, cr, NULL, NULL, NULL);
*** 2119,2129 ****
                  return;
          }
  
          /* Check for delegation on the file being renamed over, if it exists */
  
!         if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
              VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
              NULL, NULL, NULL) == 0) {
  
                  if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
                          VN_RELE(tovp);
--- 2246,2256 ----
                  return;
          }
  
          /* Check for delegation on the file being renamed over, if it exists */
  
!         if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
              VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
              NULL, NULL, NULL) == 0) {
  
                  if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
                          VN_RELE(tovp);
*** 2201,2211 ****
          if (to_exi == NULL) {
                  VN_RELE(fromvp);
                  *status = NFSERR_ACCES;
                  return;
          }
!         exi_rele(to_exi);
  
          if (to_exi != exi) {
                  VN_RELE(fromvp);
                  *status = NFSERR_XDEV;
                  return;
--- 2328,2338 ----
          if (to_exi == NULL) {
                  VN_RELE(fromvp);
                  *status = NFSERR_ACCES;
                  return;
          }
!         exi_rele(&to_exi);
  
          if (to_exi != exi) {
                  VN_RELE(fromvp);
                  *status = NFSERR_XDEV;
                  return;
*** 2239,2248 ****
--- 2366,2382 ----
                  VN_RELE(fromvp);
                  *status = NFSERR_ROFS;
                  return;
          }
  
+         if (protect_zfs_mntpt(tovp) != 0) {
+                 VN_RELE(tovp);
+                 VN_RELE(fromvp);
+                 *status = NFSERR_ACCES;
+                 return;
+         }
+ 
          error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
  
          /*
           * Force modified data and metadata out to stable storage.
           */
*** 2261,2271 ****
          return (args->la_from);
  }
  
  /*
   * Symbolicly link to a file.
!  * Create a file (to) with the given attributes which is a symbolic link
   * to the given path name (to).
   */
  void
  rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
--- 2395,2405 ----
          return (args->la_from);
  }
  
  /*
   * Symbolically link to a file.
!  * Create a file (from) with the given attributes which is a symbolic link
   * to the given path name (to).
   */
  void
  rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
*** 2309,2318 ****
--- 2443,2458 ----
                  VN_RELE(vp);
                  *status = NFSERR_INVAL;
                  return;
          }
  
+         if (protect_zfs_mntpt(vp) != 0) {
+                 VN_RELE(vp);
+                 *status = NFSERR_ACCES;
+                 return;
+         }
+ 
          ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
          name = nfscmd_convname(ca, exi, args->sla_tnm,
              NFSCMD_CONV_INBOUND, MAXPATHLEN);
  
          if (name == NULL) {
*** 2401,2410 ****
--- 2541,2556 ----
                  VN_RELE(vp);
                  dr->dr_status = NFSERR_INVAL;
                  return;
          }
  
+         if (protect_zfs_mntpt(vp) != 0) {
+                 VN_RELE(vp);
+                 dr->dr_status = NFSERR_ACCES;
+                 return;
+         }
+ 
          va.va_type = VDIR;
          va.va_mask |= AT_TYPE;
  
          error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
  
*** 2486,2496 ****
           * Of course, NFS servers have no idea what their
           * clients' current directories are.  We fake it by
           * supplying a vnode known to exist and illegal to
           * remove.
           */
!         error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
  
          /*
           * Force modified data and metadata out to stable storage.
           */
          (void) VOP_FSYNC(vp, 0, cr, NULL);
--- 2632,2642 ----
           * Of course, NFS servers have no idea what their
           * clients' current directories are.  We fake it by
           * supplying a vnode known to exist and illegal to
           * remove.
           */
!         error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
  
          /*
           * Force modified data and metadata out to stable storage.
           */
          (void) VOP_FSYNC(vp, 0, cr, NULL);
*** 2513,2635 ****
  rfs_rmdir_getfh(struct nfsdiropargs *da)
  {
          return (da->da_fhandle);
  }
  
  /* ARGSUSED */
  void
  rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
  {
          int error;
!         int iseof;
          struct iovec iov;
          struct uio uio;
!         vnode_t *vp;
!         char *ndata = NULL;
          struct sockaddr *ca;
!         size_t nents;
!         int ret;
  
          vp = nfs_fhtovp(&rda->rda_fh, exi);
          if (vp == NULL) {
-                 rd->rd_entries = NULL;
                  rd->rd_status = NFSERR_STALE;
                  return;
          }
  
          if (vp->v_type != VDIR) {
                  VN_RELE(vp);
-                 rd->rd_entries = NULL;
                  rd->rd_status = NFSERR_NOTDIR;
                  return;
          }
  
          (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
  
          error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
! 
!         if (error) {
!                 rd->rd_entries = NULL;
                  goto bad;
-         }
  
!         if (rda->rda_count == 0) {
!                 rd->rd_entries = NULL;
!                 rd->rd_size = 0;
!                 rd->rd_eof = FALSE;
!                 goto bad;
!         }
  
!         rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
  
          /*
!          * Allocate data for entries.  This will be freed by rfs_rddirfree.
           */
!         rd->rd_bufsize = (uint_t)rda->rda_count;
!         rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
  
          /*
!          * Set up io vector to read directory data
           */
!         iov.iov_base = (caddr_t)rd->rd_entries;
!         iov.iov_len = rda->rda_count;
          uio.uio_iov = &iov;
          uio.uio_iovcnt = 1;
          uio.uio_segflg = UIO_SYSSPACE;
          uio.uio_extflg = UIO_COPY_CACHED;
          uio.uio_loffset = (offset_t)rda->rda_offset;
!         uio.uio_resid = rda->rda_count;
  
!         /*
!          * read directory
!          */
          error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
  
!         /*
!          * Clean up
!          */
!         if (!error) {
!                 /*
!                  * set size and eof
!                  */
!                 if (uio.uio_resid == rda->rda_count) {
!                         rd->rd_size = 0;
!                         rd->rd_eof = TRUE;
!                 } else {
!                         rd->rd_size = (uint32_t)(rda->rda_count -
!                             uio.uio_resid);
!                         rd->rd_eof = iseof ? TRUE : FALSE;
                  }
          }
  
-         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
-         nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
-         ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
-             rda->rda_count, &ndata);
- 
-         if (ret != 0) {
-                 size_t dropbytes;
                  /*
!                  * We had to drop one or more entries in order to fit
!                  * during the character conversion.  We need to patch
!                  * up the size and eof info.
                   */
!                 if (rd->rd_eof)
!                         rd->rd_eof = FALSE;
!                 dropbytes = nfscmd_dropped_entrysize(
!                     (struct dirent64 *)rd->rd_entries, nents, ret);
!                 rd->rd_size -= dropbytes;
          }
!         if (ndata == NULL) {
!                 ndata = (char *)rd->rd_entries;
!         } else if (ndata != (char *)rd->rd_entries) {
!                 kmem_free(rd->rd_entries, rd->rd_bufsize);
!                 rd->rd_entries = (void *)ndata;
!                 rd->rd_bufsize = rda->rda_count;
          }
  
  bad:
          VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
  
  #if 0 /* notyet */
          /*
--- 2659,2862 ----
  rfs_rmdir_getfh(struct nfsdiropargs *da)
  {
          return (da->da_fhandle);
  }
  
+ #ifdef nextdp   /* discard any earlier definition before redefining it below */
+ #undef nextdp   /* nextdp(dp): the dirent64 located d_reclen bytes past dp */
+ #endif          /* nextdp */
+ #define nextdp(dp)      ((struct dirent64 *)((char *)(dp) + (dp)->d_reclen))
+ 
  /* ARGSUSED */
  void
  rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
  {
          int error;
!         vnode_t *vp;
          struct iovec iov;
          struct uio uio;
!         int iseof;
! 
!         uint32_t count = rda->rda_count;
!         uint32_t size;          /* size of the readdirres structure */
!         int overflow = 0;
! 
!         size_t datasz;
!         char *data = NULL;
!         dirent64_t *dp;
! 
          struct sockaddr *ca;
!         struct nfsentry **eptr;
!         struct nfsentry *entry;
  
          vp = nfs_fhtovp(&rda->rda_fh, exi);
          if (vp == NULL) {
                  rd->rd_status = NFSERR_STALE;
                  return;
          }
  
          if (vp->v_type != VDIR) {
                  VN_RELE(vp);
                  rd->rd_status = NFSERR_NOTDIR;
                  return;
          }
  
          (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
  
          error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
!         if (error)
                  goto bad;
  
!         /*
!          * Don't allow arbitrary counts for allocation
!          */
!         if (count > NFS_MAXDATA)
!                 count = NFS_MAXDATA;
  
!         /*
!          * struct readdirres:
!          *   status:            1
!          *   entries (bool):    1
!          *   eof:               1
!          */
!         size = (1 + 1 + 1) * BYTES_PER_XDR_UNIT;
  
+         if (size > count) {
+                 eptr = &rd->rd_entries;
+                 iseof = 0;
+                 size = 0;
+ 
+                 goto done;
+         }
+ 
          /*
!          * This is a simplification.  The dirent64_t size is not the same as
!          * the size of the XDR representation of an entry, but the sizes are
!          * similar, so we'll assume they are the same.  This assumption should
!          * not cause any harm.  At worst, we issue VOP_READDIR() once more.
           */
!         datasz = count;
  
          /*
!          * Make sure that there is room to read at least one entry
!          * if any are available.
           */
!         if (datasz < DIRENT64_RECLEN(MAXNAMELEN))
!                 datasz = DIRENT64_RECLEN(MAXNAMELEN);
! 
!         data = kmem_alloc(datasz, KM_NOSLEEP);
!         if (data == NULL) {
!                 /* The allocation failed; downsize and wait for it this time */
!                 if (datasz > MAXBSIZE)
!                         datasz = MAXBSIZE;
!                 data = kmem_alloc(datasz, KM_SLEEP);
!         }
! 
          uio.uio_iov = &iov;
          uio.uio_iovcnt = 1;
          uio.uio_segflg = UIO_SYSSPACE;
          uio.uio_extflg = UIO_COPY_CACHED;
          uio.uio_loffset = (offset_t)rda->rda_offset;
!         uio.uio_resid = datasz;
  
!         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
!         eptr = &rd->rd_entries;
!         entry = NULL;
! 
! getmoredents:
!         iov.iov_base = data;
!         iov.iov_len = datasz;
! 
          error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
+         if (error) {
+                 iseof = 0;
+                 goto done;
+         }
  
!         if (iov.iov_len == datasz)
!                 goto done;
! 
!         for (dp = (dirent64_t *)data;
!             (char *)dp - data < datasz - iov.iov_len && !overflow;
!             dp = nextdp(dp)) {
!                 char *name;
!                 uint32_t esize;
!                 uint32_t cookie;
! 
!                 overflow = (uint64_t)dp->d_off > UINT32_MAX;
!                 if (overflow) {
!                         cookie = 0;
!                         iseof = 1;
!                 } else
!                         cookie = (uint32_t)dp->d_off;
! 
!                 if (dp->d_ino == 0 || (uint64_t)dp->d_ino > UINT32_MAX) {
!                         if (entry != NULL)
!                                 entry->cookie = cookie;
!                         continue;
                  }
+ 
+                 name = nfscmd_convname(ca, exi, dp->d_name,
+                     NFSCMD_CONV_OUTBOUND, NFS_MAXPATHLEN + 1);
+                 if (name == NULL) {
+                         if (entry != NULL)
+                                 entry->cookie = cookie;
+                         continue;
                  }
  
                  /*
!                  * struct entry:
!                  *   fileid:            1
!                  *   name (length):     1
!                  *   name (data):       length (rounded up)
!                  *   cookie:            1
!                  *   nextentry (bool):  1
                   */
!                 esize = (1 + 1 + 1 + 1) * BYTES_PER_XDR_UNIT +
!                     RNDUP(strlen(name));
! 
!                 /* If the new entry does not fit, discard it */
!                 if (esize > count - size) {
!                         if (name != dp->d_name)
!                                 kmem_free(name, NFS_MAXPATHLEN + 1);
!                         iseof = 0;
!                         goto done;
                  }
! 
!                 entry = kmem_alloc(sizeof (struct nfsentry), KM_SLEEP);
! 
!                 entry->fileid = (uint32_t)dp->d_ino;
!                 entry->name = strdup(name);
!                 if (name != dp->d_name)
!                         kmem_free(name, NFS_MAXPATHLEN + 1);
!                 entry->cookie = cookie;
! 
!                 size += esize;
! 
!                 /* Add the entry to the linked list */
!                 *eptr = entry;
!                 eptr = &entry->nextentry;
          }
  
+         if (!iseof && size < count) {
+                 uio.uio_resid = MIN(datasz, MAXBSIZE);
+                 goto getmoredents;
+         }
+ 
+ done:
+         *eptr = NULL;
+ 
+         if (iseof || rd->rd_entries != NULL || !error) {
+                 error = 0;
+                 rd->rd_eof = iseof ? TRUE : FALSE;
+ 
+                 /* This is for nfslog only */
+                 rd->rd_offset = rda->rda_offset;
+                 rd->rd_size = size;
+         }
+ 
  bad:
          VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
  
  #if 0 /* notyet */
          /*
*** 2645,2665 ****
  
          VN_RELE(vp);
  
          rd->rd_status = puterrno(error);
  
  }
  void *
  rfs_readdir_getfh(struct nfsrddirargs *rda)
  {
          return (&rda->rda_fh);
  }
  void
  rfs_rddirfree(struct nfsrddirres *rd)
  {
!         if (rd->rd_entries != NULL)
!                 kmem_free(rd->rd_entries, rd->rd_bufsize);
  }
  
  /* ARGSUSED */
  void
  rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
--- 2872,2901 ----
  
          VN_RELE(vp);
  
          rd->rd_status = puterrno(error);
  
+         if (data != NULL)
+                 kmem_free(data, datasz);
  }
  void *
  rfs_readdir_getfh(struct nfsrddirargs *rda)
  {
          return (&rda->rda_fh);
  }
  void
  rfs_rddirfree(struct nfsrddirres *rd)
  {
!         if (rd->rd_status == NFS_OK) {
!                 struct nfsentry *entry, *nentry;
! 
!                 for (entry = rd->rd_entries; entry != NULL; entry = nentry) {
!                         nentry = entry->nextentry;
!                         strfree(entry->name);
!                         kmem_free(entry, sizeof (struct nfsentry));
!                 }
!         }
  }
  
  /* ARGSUSED */
  void
  rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
*** 2761,2771 ****
                  vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
          }
          return (0);
  }
  
! static enum nfsftype vt_to_nf[] = {
          0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
  };
  
  /*
   * check the following fields for overflow: nodeid, size, and time.
--- 2997,3007 ----
                  vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
          }
          return (0);
  }
  
! static const enum nfsftype vt_to_nf[] = {
          0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
  };
  
  /*
   * check the following fields for overflow: nodeid, size, and time.
*** 2980,2999 ****
  }
  
  void
  rfs_srvrinit(void)
  {
-         mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
          nfs2_srv_caller_id = fs_new_caller_id();
  }
  
  void
  rfs_srvrfini(void)
  {
-         mutex_destroy(&rfs_async_write_lock);
  }
  
  static int
  rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
  {
          struct clist    *wcl;
          int             wlist_len;
--- 3216,3259 ----
  }
  
  void
  rfs_srvrinit(void)
  {
          nfs2_srv_caller_id = fs_new_caller_id();
+         zone_key_create(&rfs_zone_key, rfs_zone_init, NULL, rfs_zone_fini);
  }
  
  void
  rfs_srvrfini(void)
  {
  }
  
+ /* ARGSUSED */
+ static void *
+ rfs_zone_init(zoneid_t zoneid)  /* builds one nfs_srv_t; zoneid is not read */
+ {
+         nfs_srv_t *ns;
+ 
+         ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);       /* zero-filled */
+ 
+         mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
+         ns->write_async = 1;    /* only field set to a non-zero value here */
+ 
+         return (ns);    /* caller gets the new nfs_srv_t as a void pointer */
+ }
+ 
+ /* ARGSUSED */
+ static void
+ rfs_zone_fini(zoneid_t zoneid, void *data)  /* tears down one nfs_srv_t */
+ {
+         nfs_srv_t *ns;
+ 
+         ns = (nfs_srv_t *)data; /* presumably what rfs_zone_init() returned */
+         mutex_destroy(&ns->async_write_lock);
+         kmem_free(ns, sizeof (*ns));    /* same size as the kmem_zalloc() */
+ }
+ 
  static int
  rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
  {
          struct clist    *wcl;
          int             wlist_len;