Print this page
NEX-17125 NFS: nbmand lock entered but not exited on error path
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Rick McNeal <rick.mcneal@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
NEX-15279 support NFS server in zone
NEX-15520 online NFS shares cause zoneadm halt to hang in nfs_export_zone_fini
Portions contributed by: Dan Kruchinin dan.kruchinin@nexenta.com
Portions contributed by: Stepan Zastupov stepan.zastupov@gmail.com
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
NEX-9275 Got "bad mutex" panic when run IO to nfs share from clients
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
NEX-3524 CLONE - Port NEX-3505 "wrong authentication" messages with root=@0.0.0.0/0 set, result in loss of client access
Reviewed by: Marcel Telka <marcel.telka@nexenta.com>
NEX-3533 CLONE - Port NEX-3019 NFSv3 writes underneath mounted filesystem to directory
Reviewed by: Dan Fields <dan.fields@nexenta.com>
NEX-3095 Issues related to NFS nohide
Reviewed by: Dan Fields <dan.fields@nexenta.com>
NEX-1128 NFS server: Generic uid and gid remapping for AUTH_SYS
Reviewed by: Jan Kryl <jan.kryl@nexenta.com>
OS-20 share_nfs(1m) charset handling is unreliable
OS-22 Page fault at nfscmd_dropped_entrysize+0x1e()
OS-23 NFSv2/3/4: READDIR responses are inconsistent when charset conversion fails
OS-24 rfs3_readdir(): Issues related to nfscmd_convdirent()
Reviewed by: Jan Kryl <jan.kryl@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
re #13613 rb4516 Tunables needs volatile keyword
closes #12112 rb3823 - nfs-nohide: lookup("..") for submount should be correct
re #3541 rb11254 - nfs nohide - "nfssrv: need ability to go to submounts for v3 and v2 protocols"
@@ -16,21 +16,25 @@
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
+
/*
* Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2016 by Delphix. All rights reserved.
*/
/*
* Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
* All rights reserved.
*/
+/*
+ * Copyright 2018 Nexenta Systems, Inc.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/buf.h>
@@ -67,20 +71,38 @@
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <sys/strsubr.h>
+struct rfs_async_write_list;
+
/*
+ * Zone globals of NFSv2 server.
+ * One instance is allocated by rfs_zone_init() and fetched with
+ * zone_getspecific(rfs_zone_key, ...).
+ */
+typedef struct nfs_srv {
+ /* held while walking or modifying the async_write_head list */
+ kmutex_t async_write_lock;
+ /* head of the list of write clusters, linked through ->next */
+ struct rfs_async_write_list *async_write_head;
+
+ /*
+ * enables write clustering if == 1
+ */
+ int write_async;
+} nfs_srv_t;
+
+/*
* These are the interface routines for the server side of the
* Network File System. See the NFS version 2 protocol specification
* for a description of this interface.
*/
static int sattr_to_vattr(struct nfssattr *, struct vattr *);
static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
cred_t *);
+static void *rfs_zone_init(zoneid_t zoneid);
+static void rfs_zone_fini(zoneid_t zoneid, void *data);
+
/*
* Some "over the wire" UNIX file types. These are encoded
* into the mode. This needs to be fixed in the next rev.
*/
#define IFMT 0170000 /* type of file */
@@ -87,10 +109,11 @@
#define IFCHR 0020000 /* character special */
#define IFBLK 0060000 /* block special */
#define IFSOCK 0140000 /* socket */
u_longlong_t nfs2_srv_caller_id;
+static zone_key_t rfs_zone_key;
/*
* Get file attributes.
* Returns the current attributes of the file with the given fhandle.
*/
@@ -327,11 +350,85 @@
rfs_setattr_getfh(struct nfssaargs *args)
{
return (&args->saa_fh);
}
+/*
+ * Try to cross from the mountpoint vnode *vpp into what is mounted on it.
+ *
+ * traverse() and VOP_FID() provide the vnode and fid that are looked up
+ * with checkexport(). Only when an export is found and it has EX_NOHIDE
+ * set are the caller's references swapped: the old *exip and *vpp are
+ * released and replaced by the new exportinfo and vnode. In every other
+ * case *vpp and *exip are left untouched.
+ *
+ * Returns 0 both when the crossing happened and when it was skipped.
+ * A non-zero error from traverse() or VOP_FID() is returned as is.
+ */
+int
+rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
+{
+ struct exportinfo *exi;
+ vnode_t *vp = *vpp;
+ fid_t fid;
+ int error;
+
+ /* take our own hold so the caller's hold on *vpp stays intact */
+ VN_HOLD(vp);
+
+ if ((error = traverse(&vp)) != 0) {
+ VN_RELE(vp);
+ return (error);
+ }
+
+ bzero(&fid, sizeof (fid));
+ fid.fid_len = MAXFIDSZ;
+ error = VOP_FID(vp, &fid, NULL);
+ if (error) {
+ VN_RELE(vp);
+ return (error);
+ }
+
+ exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
+ if (exi == NULL ||
+ (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
+ /*
+ * This is not an error: the submount is either not
+ * exported or "nohide" is not set on it.
+ */
+ if (exi != NULL)
+ exi_rele(&exi);
+ VN_RELE(vp);
+ } else {
+ /* go to submount: hand the new exi and vnode to the caller */
+ exi_rele(exip);
+ *exip = exi;
+
+ VN_RELE(*vpp);
+ *vpp = vp;
+ }
+
+ return (0);
+}
+
/*
+ * Given "dvp", the root vnode (VROOT) of a mounted filesystem, and its
+ * "exi", climb to the upper mountpoint via untraverse() and update
+ * *dvpp and *exip to match. The old *exip and *dvpp are released only
+ * on success.
+ * Returns 0 on success, -1 if nfs_vptoexi() returns no exportinfo.
+ */
+int
+rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
+{
+ struct exportinfo *exi;
+ vnode_t *dvp = *dvpp;
+
+ ASSERT(dvp->v_flag & VROOT);
+
+ VN_HOLD(dvp);
+ dvp = untraverse(dvp);
+ exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
+ if (exi == NULL) {
+ /* leave the caller's dvp and exi as they were */
+ VN_RELE(dvp);
+ return (-1);
+ }
+
+ exi_rele(exip);
+ *exip = exi;
+ VN_RELE(*dvpp);
+ *dvpp = dvp;
+
+ return (0);
+}
+/*
* Directory lookup.
* Returns an fhandle and file attributes for file name in a directory.
*/
/* ARGSUSED */
void
@@ -369,39 +466,51 @@
/*
* Allow lookups from the root - the default
* location of the public filehandle.
*/
if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
- dvp = rootdir;
+ dvp = ZONE_ROOTVP();
VN_HOLD(dvp);
} else {
dvp = nfs_fhtovp(fhp, exi);
if (dvp == NULL) {
dr->dr_status = NFSERR_STALE;
return;
}
}
+ exi_hold(exi);
+
/*
* Not allow lookup beyond root.
* If the filehandle matches a filehandle of the exi,
* then the ".." refers beyond the root of an exported filesystem.
*/
if (strcmp(da->da_name, "..") == 0 &&
EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
- VN_RELE(dvp);
- dr->dr_status = NFSERR_NOENT;
- return;
+ if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
+ (dvp->v_flag & VROOT)) {
+ /*
+ * special case for ".." and 'nohide' exported root
+ */
+ if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
+ error = NFSERR_ACCES;
+ goto out;
}
+ } else {
+ error = NFSERR_NOENT;
+ goto out;
+ }
+ }
ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
MAXPATHLEN);
if (name == NULL) {
- dr->dr_status = NFSERR_ACCES;
- return;
+ error = NFSERR_ACCES;
+ goto out;
}
/*
* If the public filehandle is used then allow
* a multi-component lookup, i.e. evaluate
@@ -411,10 +520,13 @@
* This may result in a vnode in another filesystem
* which is OK as long as the filesystem is exported.
*/
if (PUBLIC_FH2(fhp)) {
publicfh_flag = TRUE;
+
+ exi_rele(&exi);
+
error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
&sec);
} else {
/*
* Do a normal single component lookup.
@@ -424,10 +536,15 @@
}
if (name != da->da_name)
kmem_free(name, MAXPATHLEN);
+ if (error == 0 && vn_ismntpt(vp)) {
+ error = rfs_cross_mnt(&vp, &exi);
+ if (error)
+ VN_RELE(vp);
+ }
if (!error) {
va.va_mask = AT_ALL; /* we want everything */
error = rfs4_delegated_getattr(vp, &va, 0, cr);
@@ -450,20 +567,15 @@
}
}
VN_RELE(vp);
}
+out:
VN_RELE(dvp);
- /*
- * If publicfh_flag is true then we have called rfs_publicfh_mclookup
- * and have obtained a new exportinfo in exi which needs to be
- * released. Note the the original exportinfo pointed to by exi
- * will be released by the caller, comon_dispatch.
- */
- if (publicfh_flag && exi != NULL)
- exi_rele(exi);
+ if (exi != NULL)
+ exi_rele(&exi);
/*
* If it's public fh, no 0x81, and client's flavor is
* invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
* Then set RPC status to AUTH_TOOWEAK in common_dispatch.
@@ -683,10 +795,12 @@
error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
/* check if a monitor detected a delegation conflict */
if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
+ if (in_crit)
+ nbl_end_crit(vp);
VN_RELE(vp);
/* mark as wouldblock so response is dropped */
curthread->t_flag |= T_WOULDBLOCK;
rr->rr_data = NULL;
@@ -1008,14 +1122,11 @@
error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
/* check if a monitor detected a delegation conflict */
if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
- VN_RELE(vp);
- /* mark as wouldblock so response is dropped */
- curthread->t_flag |= T_WOULDBLOCK;
- return;
+ goto out;
}
if (wa->wa_data || wa->wa_rlist) {
/* Do the RDMA thing if necessary */
if (wa->wa_rlist) {
@@ -1051,10 +1162,11 @@
savecred = curthread->t_cred;
curthread->t_cred = cr;
error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
curthread->t_cred = savecred;
} else {
+
iovcnt = 0;
for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
iovcnt++;
if (iovcnt <= MAX_IOVECS) {
#ifdef DEBUG
@@ -1149,11 +1261,11 @@
struct rfs_async_write_list *next;
};
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
-static int rfs_write_async = 1; /* enables write clustering if == 1 */
+volatile int rfs_write_async = 1; /* enables write clustering if == 1 */
#define MAXCLIOVECS 42
#define RFSWRITE_INITVAL (enum nfsstat) -1
#ifdef DEBUG
@@ -1194,12 +1306,14 @@
struct rfs_async_write_list nlpsp;
ushort_t t_flag;
cred_t *savecred;
int in_crit = 0;
caller_context_t ct;
+ nfs_srv_t *nsrv;
- if (!rfs_write_async) {
+ nsrv = zone_getspecific(rfs_zone_key, curzone);
+ if (!nsrv->write_async) {
rfs_write_sync(wa, ns, exi, req, cr, ro);
return;
}
/*
@@ -1220,12 +1334,12 @@
/*
* Look to see if there is already a cluster started
* for this file.
*/
- mutex_enter(&rfs_async_write_lock);
- for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
+ mutex_enter(&nsrv->async_write_lock);
+ for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
if (bcmp(&wa->wa_fhandle, lp->fhp,
sizeof (fhandle_t)) == 0)
break;
}
@@ -1247,12 +1361,12 @@
if (trp == NULL)
lp->list = nrp;
else
trp->list = nrp;
while (nrp->ns->ns_status == RFSWRITE_INITVAL)
- cv_wait(&lp->cv, &rfs_async_write_lock);
- mutex_exit(&rfs_async_write_lock);
+ cv_wait(&lp->cv, &nsrv->async_write_lock);
+ mutex_exit(&nsrv->async_write_lock);
return;
}
/*
@@ -1265,31 +1379,31 @@
nlp->fhp = &wa->wa_fhandle;
cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
nlp->list = nrp;
nlp->next = NULL;
- if (rfs_async_write_head == NULL) {
- rfs_async_write_head = nlp;
+ if (nsrv->async_write_head == NULL) {
+ nsrv->async_write_head = nlp;
} else {
- lp = rfs_async_write_head;
+ lp = nsrv->async_write_head;
while (lp->next != NULL)
lp = lp->next;
lp->next = nlp;
}
- mutex_exit(&rfs_async_write_lock);
+ mutex_exit(&nsrv->async_write_lock);
/*
* Convert the file handle common to all of the requests
* in this cluster to a vnode.
*/
vp = nfs_fhtovp(&wa->wa_fhandle, exi);
if (vp == NULL) {
- mutex_enter(&rfs_async_write_lock);
- if (rfs_async_write_head == nlp)
- rfs_async_write_head = nlp->next;
+ mutex_enter(&nsrv->async_write_lock);
+ if (nsrv->async_write_head == nlp)
+ nsrv->async_write_head = nlp->next;
else {
- lp = rfs_async_write_head;
+ lp = nsrv->async_write_head;
while (lp->next != nlp)
lp = lp->next;
lp->next = nlp->next;
}
t_flag = curthread->t_flag & T_WOULDBLOCK;
@@ -1296,11 +1410,11 @@
for (rp = nlp->list; rp != NULL; rp = rp->list) {
rp->ns->ns_status = NFSERR_STALE;
rp->thread->t_flag |= t_flag;
}
cv_broadcast(&nlp->cv);
- mutex_exit(&rfs_async_write_lock);
+ mutex_exit(&nsrv->async_write_lock);
return;
}
/*
@@ -1307,15 +1421,15 @@
* Can only write regular files. Attempts to write any
* other file types fail with EISDIR.
*/
if (vp->v_type != VREG) {
VN_RELE(vp);
- mutex_enter(&rfs_async_write_lock);
- if (rfs_async_write_head == nlp)
- rfs_async_write_head = nlp->next;
+ mutex_enter(&nsrv->async_write_lock);
+ if (nsrv->async_write_head == nlp)
+ nsrv->async_write_head = nlp->next;
else {
- lp = rfs_async_write_head;
+ lp = nsrv->async_write_head;
while (lp->next != nlp)
lp = lp->next;
lp->next = nlp->next;
}
t_flag = curthread->t_flag & T_WOULDBLOCK;
@@ -1322,11 +1436,11 @@
for (rp = nlp->list; rp != NULL; rp = rp->list) {
rp->ns->ns_status = NFSERR_ISDIR;
rp->thread->t_flag |= t_flag;
}
cv_broadcast(&nlp->cv);
- mutex_exit(&rfs_async_write_lock);
+ mutex_exit(&nsrv->async_write_lock);
return;
}
/*
@@ -1354,15 +1468,15 @@
if (in_crit)
nbl_end_crit(vp);
VN_RELE(vp);
/* mark as wouldblock so response is dropped */
curthread->t_flag |= T_WOULDBLOCK;
- mutex_enter(&rfs_async_write_lock);
- if (rfs_async_write_head == nlp)
- rfs_async_write_head = nlp->next;
+ mutex_enter(&nsrv->async_write_lock);
+ if (nsrv->async_write_head == nlp)
+ nsrv->async_write_head = nlp->next;
else {
- lp = rfs_async_write_head;
+ lp = nsrv->async_write_head;
while (lp->next != nlp)
lp = lp->next;
lp->next = nlp->next;
}
for (rp = nlp->list; rp != NULL; rp = rp->list) {
@@ -1370,11 +1484,11 @@
rp->ns->ns_status = puterrno(error);
rp->thread->t_flag |= T_WOULDBLOCK;
}
}
cv_broadcast(&nlp->cv);
- mutex_exit(&rfs_async_write_lock);
+ mutex_exit(&nsrv->async_write_lock);
return;
}
/*
@@ -1392,20 +1506,20 @@
* a new cluster and be blocked in VOP_RWLOCK while
* the first request is being processed. This delay
* will allow more requests to be clustered in this
* second cluster.
*/
- mutex_enter(&rfs_async_write_lock);
- if (rfs_async_write_head == nlp)
- rfs_async_write_head = nlp->next;
+ mutex_enter(&nsrv->async_write_lock);
+ if (nsrv->async_write_head == nlp)
+ nsrv->async_write_head = nlp->next;
else {
- lp = rfs_async_write_head;
+ lp = nsrv->async_write_head;
while (lp->next != nlp)
lp = lp->next;
lp->next = nlp->next;
}
- mutex_exit(&rfs_async_write_lock);
+ mutex_exit(&nsrv->async_write_lock);
/*
* Step through the list of requests in this cluster.
* We need to check permissions to make sure that all
* of the requests have sufficient permission to write
@@ -1646,19 +1760,19 @@
if (in_crit)
nbl_end_crit(vp);
VN_RELE(vp);
t_flag = curthread->t_flag & T_WOULDBLOCK;
- mutex_enter(&rfs_async_write_lock);
+ mutex_enter(&nsrv->async_write_lock);
for (rp = nlp->list; rp != NULL; rp = rp->list) {
if (rp->ns->ns_status == RFSWRITE_INITVAL) {
rp->ns->ns_status = puterrno(error);
rp->thread->t_flag |= t_flag;
}
}
cv_broadcast(&nlp->cv);
- mutex_exit(&rfs_async_write_lock);
+ mutex_exit(&nsrv->async_write_lock);
}
void *
rfs_write_getfh(struct nfswriteargs *wa)
@@ -1716,10 +1830,16 @@
VN_RELE(dvp);
dr->dr_status = NFSERR_INVAL;
return;
}
+ if (protect_zfs_mntpt(dvp) != 0) {
+ VN_RELE(dvp);
+ dr->dr_status = NFSERR_ACCES;
+ return;
+ }
+
/*
* This is a completely gross hack to make mknod
* work over the wire until we can wack the protocol
*/
if ((va.va_mode & IFMT) == IFCHR) {
@@ -2055,11 +2175,11 @@
if (to_exi == NULL) {
VN_RELE(fromvp);
*status = NFSERR_ACCES;
return;
}
- exi_rele(to_exi);
+ exi_rele(&to_exi);
if (to_exi != exi) {
VN_RELE(fromvp);
*status = NFSERR_XDEV;
return;
@@ -2095,10 +2215,17 @@
VN_RELE(fromvp);
*status = NFSERR_ROFS;
return;
}
+ if (protect_zfs_mntpt(tovp) != 0) {
+ VN_RELE(tovp);
+ VN_RELE(fromvp);
+ *status = NFSERR_ACCES;
+ return;
+ }
+
/*
* Check for a conflict with a non-blocking mandatory share reservation.
*/
error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
NULL, cr, NULL, NULL, NULL);
@@ -2119,11 +2246,11 @@
return;
}
/* Check for delegation on the file being renamed over, if it exists */
- if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
+ if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
NULL, NULL, NULL) == 0) {
if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
VN_RELE(tovp);
@@ -2201,11 +2328,11 @@
if (to_exi == NULL) {
VN_RELE(fromvp);
*status = NFSERR_ACCES;
return;
}
- exi_rele(to_exi);
+ exi_rele(&to_exi);
if (to_exi != exi) {
VN_RELE(fromvp);
*status = NFSERR_XDEV;
return;
@@ -2239,10 +2366,17 @@
VN_RELE(fromvp);
*status = NFSERR_ROFS;
return;
}
+ if (protect_zfs_mntpt(tovp) != 0) {
+ VN_RELE(tovp);
+ VN_RELE(fromvp);
+ *status = NFSERR_ACCES;
+ return;
+ }
+
error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
/*
* Force modified data and metadata out to stable storage.
*/
@@ -2261,11 +2395,11 @@
return (args->la_from);
}
/*
* Symbolicly link to a file.
- * Create a file (to) with the given attributes which is a symbolic link
+ * Create a file (from) with the given attributes which is a symbolic link
* to the given path name (to).
*/
void
rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
@@ -2309,10 +2443,16 @@
VN_RELE(vp);
*status = NFSERR_INVAL;
return;
}
+ if (protect_zfs_mntpt(vp) != 0) {
+ VN_RELE(vp);
+ *status = NFSERR_ACCES;
+ return;
+ }
+
ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
name = nfscmd_convname(ca, exi, args->sla_tnm,
NFSCMD_CONV_INBOUND, MAXPATHLEN);
if (name == NULL) {
@@ -2401,10 +2541,16 @@
VN_RELE(vp);
dr->dr_status = NFSERR_INVAL;
return;
}
+ if (protect_zfs_mntpt(vp) != 0) {
+ VN_RELE(vp);
+ dr->dr_status = NFSERR_ACCES;
+ return;
+ }
+
va.va_type = VDIR;
va.va_mask |= AT_TYPE;
error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
@@ -2486,11 +2632,11 @@
* Of course, NFS servers have no idea what their
* clients' current directories are. We fake it by
* supplying a vnode known to exist and illegal to
* remove.
*/
- error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
+ error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
/*
* Force modified data and metadata out to stable storage.
*/
(void) VOP_FSYNC(vp, 0, cr, NULL);
@@ -2513,123 +2659,204 @@
rfs_rmdir_getfh(struct nfsdiropargs *da)
{
return (da->da_fhandle);
}
+#ifdef nextdp
+#undef nextdp
+#endif
+#define nextdp(dp) ((struct dirent64 *)((char *)(dp) + (dp)->d_reclen))
+
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
int error;
- int iseof;
+ vnode_t *vp;
struct iovec iov;
struct uio uio;
- vnode_t *vp;
- char *ndata = NULL;
+ int iseof;
+
+ uint32_t count = rda->rda_count;
+ uint32_t size; /* size of the readdirres structure */
+ int overflow = 0;
+
+ size_t datasz;
+ char *data = NULL;
+ dirent64_t *dp;
+
struct sockaddr *ca;
- size_t nents;
- int ret;
+ struct nfsentry **eptr;
+ struct nfsentry *entry;
vp = nfs_fhtovp(&rda->rda_fh, exi);
if (vp == NULL) {
- rd->rd_entries = NULL;
rd->rd_status = NFSERR_STALE;
return;
}
if (vp->v_type != VDIR) {
VN_RELE(vp);
- rd->rd_entries = NULL;
rd->rd_status = NFSERR_NOTDIR;
return;
}
(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
-
- if (error) {
- rd->rd_entries = NULL;
+ if (error)
goto bad;
- }
- if (rda->rda_count == 0) {
- rd->rd_entries = NULL;
- rd->rd_size = 0;
- rd->rd_eof = FALSE;
- goto bad;
- }
+ /*
+ * Don't allow arbitrary counts for allocation
+ */
+ if (count > NFS_MAXDATA)
+ count = NFS_MAXDATA;
- rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
+ /*
+ * struct readdirres:
+ * status: 1
+ * entries (bool): 1
+ * eof: 1
+ */
+ size = (1 + 1 + 1) * BYTES_PER_XDR_UNIT;
+ if (size > count) {
+ eptr = &rd->rd_entries;
+ iseof = 0;
+ size = 0;
+
+ goto done;
+ }
+
/*
- * Allocate data for entries. This will be freed by rfs_rddirfree.
+ * This is a simplification. The dirent64_t size is not the same as
+ * the size of the XDR representation of an entry, but the sizes are
+ * similar, so we'll assume they are the same. This assumption should
+ * not cause any harm. In the worst case we will need to issue
+ * VOP_READDIR() once more.
*/
- rd->rd_bufsize = (uint_t)rda->rda_count;
- rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
+ datasz = count;
/*
- * Set up io vector to read directory data
+ * Make sure that there is room to read at least one entry
+ * if any are available.
*/
- iov.iov_base = (caddr_t)rd->rd_entries;
- iov.iov_len = rda->rda_count;
+ if (datasz < DIRENT64_RECLEN(MAXNAMELEN))
+ datasz = DIRENT64_RECLEN(MAXNAMELEN);
+
+ data = kmem_alloc(datasz, KM_NOSLEEP);
+ if (data == NULL) {
+ /* The allocation failed; downsize and wait for it this time */
+ if (datasz > MAXBSIZE)
+ datasz = MAXBSIZE;
+ data = kmem_alloc(datasz, KM_SLEEP);
+ }
+
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_extflg = UIO_COPY_CACHED;
uio.uio_loffset = (offset_t)rda->rda_offset;
- uio.uio_resid = rda->rda_count;
+ uio.uio_resid = datasz;
- /*
- * read directory
- */
+ ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
+ eptr = &rd->rd_entries;
+ entry = NULL;
+
+getmoredents:
+ iov.iov_base = data;
+ iov.iov_len = datasz;
+
error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
+ if (error) {
+ iseof = 0;
+ goto done;
+ }
- /*
- * Clean up
- */
- if (!error) {
- /*
- * set size and eof
- */
- if (uio.uio_resid == rda->rda_count) {
- rd->rd_size = 0;
- rd->rd_eof = TRUE;
- } else {
- rd->rd_size = (uint32_t)(rda->rda_count -
- uio.uio_resid);
- rd->rd_eof = iseof ? TRUE : FALSE;
+ if (iov.iov_len == datasz)
+ goto done;
+
+ for (dp = (dirent64_t *)data;
+ (char *)dp - data < datasz - iov.iov_len && !overflow;
+ dp = nextdp(dp)) {
+ char *name;
+ uint32_t esize;
+ uint32_t cookie;
+
+ overflow = (uint64_t)dp->d_off > UINT32_MAX;
+ if (overflow) {
+ cookie = 0;
+ iseof = 1;
+ } else
+ cookie = (uint32_t)dp->d_off;
+
+ if (dp->d_ino == 0 || (uint64_t)dp->d_ino > UINT32_MAX) {
+ if (entry != NULL)
+ entry->cookie = cookie;
+ continue;
}
+
+ name = nfscmd_convname(ca, exi, dp->d_name,
+ NFSCMD_CONV_OUTBOUND, NFS_MAXPATHLEN + 1);
+ if (name == NULL) {
+ if (entry != NULL)
+ entry->cookie = cookie;
+ continue;
}
- ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
- nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
- ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
- rda->rda_count, &ndata);
-
- if (ret != 0) {
- size_t dropbytes;
/*
- * We had to drop one or more entries in order to fit
- * during the character conversion. We need to patch
- * up the size and eof info.
+ * struct entry:
+ * fileid: 1
+ * name (length): 1
+ * name (data): length (rounded up)
+ * cookie: 1
+ * nextentry (bool): 1
*/
- if (rd->rd_eof)
- rd->rd_eof = FALSE;
- dropbytes = nfscmd_dropped_entrysize(
- (struct dirent64 *)rd->rd_entries, nents, ret);
- rd->rd_size -= dropbytes;
+ esize = (1 + 1 + 1 + 1) * BYTES_PER_XDR_UNIT +
+ RNDUP(strlen(name));
+
+ /* If the new entry does not fit, discard it */
+ if (esize > count - size) {
+ if (name != dp->d_name)
+ kmem_free(name, NFS_MAXPATHLEN + 1);
+ iseof = 0;
+ goto done;
}
- if (ndata == NULL) {
- ndata = (char *)rd->rd_entries;
- } else if (ndata != (char *)rd->rd_entries) {
- kmem_free(rd->rd_entries, rd->rd_bufsize);
- rd->rd_entries = (void *)ndata;
- rd->rd_bufsize = rda->rda_count;
+
+ entry = kmem_alloc(sizeof (struct nfsentry), KM_SLEEP);
+
+ entry->fileid = (uint32_t)dp->d_ino;
+ entry->name = strdup(name);
+ if (name != dp->d_name)
+ kmem_free(name, NFS_MAXPATHLEN + 1);
+ entry->cookie = cookie;
+
+ size += esize;
+
+ /* Add the entry to the linked list */
+ *eptr = entry;
+ eptr = &entry->nextentry;
}
+ if (!iseof && size < count) {
+ uio.uio_resid = MIN(datasz, MAXBSIZE);
+ goto getmoredents;
+ }
+
+done:
+ *eptr = NULL;
+
+ if (iseof || rd->rd_entries != NULL || !error) {
+ error = 0;
+ rd->rd_eof = iseof ? TRUE : FALSE;
+
+ /* This is for nfslog only */
+ rd->rd_offset = rda->rda_offset;
+ rd->rd_size = size;
+ }
+
bad:
VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
#if 0 /* notyet */
/*
@@ -2645,21 +2872,30 @@
VN_RELE(vp);
rd->rd_status = puterrno(error);
+ if (data != NULL)
+ kmem_free(data, datasz);
}
void *
rfs_readdir_getfh(struct nfsrddirargs *rda)
{
return (&rda->rda_fh);
}
void
rfs_rddirfree(struct nfsrddirres *rd)
{
- if (rd->rd_entries != NULL)
- kmem_free(rd->rd_entries, rd->rd_bufsize);
+ if (rd->rd_status == NFS_OK) {
+ struct nfsentry *entry, *nentry;
+
+ for (entry = rd->rd_entries; entry != NULL; entry = nentry) {
+ nentry = entry->nextentry;
+ strfree(entry->name);
+ kmem_free(entry, sizeof (struct nfsentry));
+ }
+ }
}
/* ARGSUSED */
void
rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
@@ -2761,11 +2997,11 @@
vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
}
return (0);
}
-static enum nfsftype vt_to_nf[] = {
+static const enum nfsftype vt_to_nf[] = {
0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
/*
* check the following fields for overflow: nodeid, size, and time.
@@ -2980,20 +3216,44 @@
}
void
rfs_srvrinit(void)
{
- mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
nfs2_srv_caller_id = fs_new_caller_id();
+ /*
+ * Register the zone key for the NFSv2 server state: rfs_zone_init()
+ * is the create callback, rfs_zone_fini() the destroy callback, and
+ * no shutdown callback is used.
+ */
+ zone_key_create(&rfs_zone_key, rfs_zone_init, NULL, rfs_zone_fini);
}
void
rfs_srvrfini(void)
{
- mutex_destroy(&rfs_async_write_lock);
+ /*
+ * Undo the zone_key_create() done in rfs_srvrinit(). Without this
+ * the zone key and its rfs_zone_init()/rfs_zone_fini() callbacks
+ * stay registered after the server has been torn down.
+ */
+ (void) zone_key_delete(rfs_zone_key);
}
+/*
+ * Zone key create callback (registered in rfs_srvrinit()).
+ * Allocates a zeroed nfs_srv_t, initializes its async write lock and
+ * enables write clustering. The returned pointer is the value stored
+ * under rfs_zone_key.
+ */
+/* ARGSUSED */
+static void *
+rfs_zone_init(zoneid_t zoneid)
+{
+ nfs_srv_t *ns;
+
+ /* kmem_zalloc() leaves async_write_head NULL, i.e. an empty list */
+ ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
+
+ mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
+ ns->write_async = 1;
+
+ return (ns);
+}
+
+/*
+ * Zone key destroy callback (registered in rfs_srvrinit()).
+ * "data" is the nfs_srv_t returned by rfs_zone_init(); destroy its
+ * lock and free it.
+ */
+/* ARGSUSED */
+static void
+rfs_zone_fini(zoneid_t zoneid, void *data)
+{
+ nfs_srv_t *ns;
+
+ ns = (nfs_srv_t *)data;
+ mutex_destroy(&ns->async_write_lock);
+ kmem_free(ns, sizeof (*ns));
+}
+
static int
rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
{
struct clist *wcl;
int wlist_len;