Print this page
11083 support NFS server in zone
Portions contributed by: Dan Kruchinin <dan.kruchinin@nexenta.com>
Portions contributed by: Stepan Zastupov <stepan.zastupov@gmail.com>
Portions contributed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Portions contributed by: Mike Zeller <mike@mikezeller.net>
Portions contributed by: Dan McDonald <danmcd@joyent.com>
Portions contributed by: Gordon Ross <gordon.w.ross@gmail.com>
Portions contributed by: Vitaliy Gusev <gusev.vitaliy@gmail.com>
Reviewed by: Rick McNeal <rick.mcneal@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Jason King <jbk@joyent.com>
Reviewed by: C Fraire <cfraire@me.com>
Change-Id: I22f289d357503f9b48a0bc2482cc4328a6d43d16
*** 28,37 ****
--- 28,42 ----
/*
* Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
* All rights reserved.
*/
+ /*
+ * Copyright 2018 Nexenta Systems, Inc.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/buf.h>
*** 68,87 ****
--- 73,108 ----
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <sys/strsubr.h>
+ struct rfs_async_write_list;
+
/*
+ * Per-zone global state of the NFSv2 server (one per nfs_globals_t).
+ */
+ typedef struct nfs_srv {
+ kmutex_t async_write_lock; /* held while walking or editing async_write_head */
+ struct rfs_async_write_list *async_write_head; /* list of write clusters in progress */
+
+ /*
+ * Write clustering is enabled when nonzero; rfs_srv_zone_init() sets it to 1.
+ */
+ int write_async;
+ } nfs_srv_t;
+
+ /*
* These are the interface routines for the server side of the
* Network File System. See the NFS version 2 protocol specification
* for a description of this interface.
*/
static int sattr_to_vattr(struct nfssattr *, struct vattr *);
static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
cred_t *);
+
/*
* Some "over the wire" UNIX file types. These are encoded
* into the mode. This needs to be fixed in the next rev.
*/
#define IFMT 0170000 /* type of file */
*** 89,98 ****
--- 110,128 ----
#define IFBLK 0060000 /* block special */
#define IFSOCK 0140000 /* socket */
u_longlong_t nfs2_srv_caller_id;
+ static nfs_srv_t *
+ nfs_get_srv(void) /* returns the nfs_srv_t attached to the NFS server zone globals */
+ {
+ nfs_globals_t *ng = nfs_srv_getzg(); /* presumably the calling zone's globals -- TODO confirm */
+ nfs_srv_t *srv = ng->nfs_srv; /* installed by rfs_srv_zone_init() */
+ ASSERT(srv != NULL); /* pointer is reset to NULL by rfs_srv_zone_fini() */
+ return (srv);
+ }
+
/*
* Get file attributes.
* Returns the current attributes of the file with the given fhandle.
*/
/* ARGSUSED */
*** 384,404 ****
int
rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
{
struct exportinfo *exi;
vnode_t *dvp = *dvpp;
! ASSERT(dvp->v_flag & VROOT);
VN_HOLD(dvp);
! dvp = untraverse(dvp);
exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
if (exi == NULL) {
VN_RELE(dvp);
return (-1);
}
exi_rele(*exip);
*exip = exi;
VN_RELE(*dvpp);
*dvpp = dvp;
--- 414,437 ----
int
rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
{
struct exportinfo *exi;
vnode_t *dvp = *dvpp;
+ vnode_t *zone_rootvp;
! zone_rootvp = (*exip)->exi_ne->exi_root->exi_vp;
! ASSERT((dvp->v_flag & VROOT) || VN_CMP(zone_rootvp, dvp));
VN_HOLD(dvp);
! dvp = untraverse(dvp, zone_rootvp);
exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
if (exi == NULL) {
VN_RELE(dvp);
return (-1);
}
+ ASSERT3U(exi->exi_zoneid, ==, (*exip)->exi_zoneid);
exi_rele(*exip);
*exip = exi;
VN_RELE(*dvpp);
*dvpp = dvp;
*** 444,454 ****
/*
* Allow lookups from the root - the default
* location of the public filehandle.
*/
if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
! dvp = rootdir;
VN_HOLD(dvp);
} else {
dvp = nfs_fhtovp(fhp, exi);
if (dvp == NULL) {
dr->dr_status = NFSERR_STALE;
--- 477,487 ----
/*
* Allow lookups from the root - the default
* location of the public filehandle.
*/
if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
! dvp = ZONE_ROOTVP();
VN_HOLD(dvp);
} else {
dvp = nfs_fhtovp(fhp, exi);
if (dvp == NULL) {
dr->dr_status = NFSERR_STALE;
*** 455,474 ****
return;
}
}
exi_hold(exi);
/*
* Not allow lookup beyond root.
* If the filehandle matches a filehandle of the exi,
* then the ".." refers beyond the root of an exported filesystem.
*/
if (strcmp(da->da_name, "..") == 0 &&
EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
! (dvp->v_flag & VROOT)) {
/*
* special case for ".." and 'nohide'exported root
*/
if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
error = NFSERR_ACCES;
--- 488,508 ----
return;
}
}
exi_hold(exi);
+ ASSERT3U(exi->exi_zoneid, ==, curzone->zone_id);
/*
* Not allow lookup beyond root.
* If the filehandle matches a filehandle of the exi,
* then the ".." refers beyond the root of an exported filesystem.
*/
if (strcmp(da->da_name, "..") == 0 &&
EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
! ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
/*
* special case for ".." and 'nohide'exported root
*/
if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
error = NFSERR_ACCES;
*** 500,509 ****
--- 534,544 ----
*/
if (PUBLIC_FH2(fhp)) {
publicfh_flag = TRUE;
exi_rele(exi);
+ exi = NULL;
error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
&sec);
} else {
/*
*** 633,646 ****
rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
if (is_referral) {
char *s;
size_t strsz;
/* Get an artificial symlink based on a referral */
s = build_symlink(vp, cr, &strsz);
! global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
DTRACE_PROBE2(nfs2serv__func__referral__reflink,
vnode_t *, vp, char *, s);
if (s == NULL)
error = EINVAL;
else {
--- 668,683 ----
rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
if (is_referral) {
char *s;
size_t strsz;
+ kstat_named_t *stat =
+ exi->exi_ne->ne_globals->svstat[NFS_VERSION];
/* Get an artificial symlink based on a referral */
s = build_symlink(vp, cr, &strsz);
! stat[NFS_REFERLINKS].value.ui64++;
DTRACE_PROBE2(nfs2serv__func__referral__reflink,
vnode_t *, vp, char *, s);
if (s == NULL)
error = EINVAL;
else {
*** 773,782 ****
--- 810,821 ----
error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
/* check if a monitor detected a delegation conflict */
if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
+ if (in_crit)
+ nbl_end_crit(vp);
VN_RELE(vp);
/* mark as wouldblock so response is dropped */
curthread->t_flag |= T_WOULDBLOCK;
rr->rr_data = NULL;
*** 1098,1111 ****
error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
/* check if a monitor detected a delegation conflict */
if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
! VN_RELE(vp);
! /* mark as wouldblock so response is dropped */
! curthread->t_flag |= T_WOULDBLOCK;
! return;
}
if (wa->wa_data || wa->wa_rlist) {
/* Do the RDMA thing if necessary */
if (wa->wa_rlist) {
--- 1137,1147 ----
error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
/* check if a monitor detected a delegation conflict */
if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
! goto out;
}
if (wa->wa_data || wa->wa_rlist) {
/* Do the RDMA thing if necessary */
if (wa->wa_rlist) {
*** 1141,1150 ****
--- 1177,1187 ----
savecred = curthread->t_cred;
curthread->t_cred = cr;
error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
curthread->t_cred = savecred;
} else {
+
iovcnt = 0;
for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
iovcnt++;
if (iovcnt <= MAX_IOVECS) {
#ifdef DEBUG
*** 1284,1295 ****
struct rfs_async_write_list nlpsp;
ushort_t t_flag;
cred_t *savecred;
int in_crit = 0;
caller_context_t ct;
! if (!rfs_write_async) {
rfs_write_sync(wa, ns, exi, req, cr, ro);
return;
}
/*
--- 1321,1335 ----
struct rfs_async_write_list nlpsp;
ushort_t t_flag;
cred_t *savecred;
int in_crit = 0;
caller_context_t ct;
+ nfs_srv_t *nsrv;
! ASSERT(exi == NULL || exi->exi_zoneid == curzone->zone_id);
! nsrv = nfs_get_srv();
! if (!nsrv->write_async) {
rfs_write_sync(wa, ns, exi, req, cr, ro);
return;
}
/*
*** 1310,1321 ****
/*
* Look to see if there is already a cluster started
* for this file.
*/
! mutex_enter(&rfs_async_write_lock);
! for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
if (bcmp(&wa->wa_fhandle, lp->fhp,
sizeof (fhandle_t)) == 0)
break;
}
--- 1350,1361 ----
/*
* Look to see if there is already a cluster started
* for this file.
*/
! mutex_enter(&nsrv->async_write_lock);
! for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
if (bcmp(&wa->wa_fhandle, lp->fhp,
sizeof (fhandle_t)) == 0)
break;
}
*** 1337,1348 ****
if (trp == NULL)
lp->list = nrp;
else
trp->list = nrp;
while (nrp->ns->ns_status == RFSWRITE_INITVAL)
! cv_wait(&lp->cv, &rfs_async_write_lock);
! mutex_exit(&rfs_async_write_lock);
return;
}
/*
--- 1377,1388 ----
if (trp == NULL)
lp->list = nrp;
else
trp->list = nrp;
while (nrp->ns->ns_status == RFSWRITE_INITVAL)
! cv_wait(&lp->cv, &nsrv->async_write_lock);
! mutex_exit(&nsrv->async_write_lock);
return;
}
/*
*** 1355,1385 ****
nlp->fhp = &wa->wa_fhandle;
cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
nlp->list = nrp;
nlp->next = NULL;
! if (rfs_async_write_head == NULL) {
! rfs_async_write_head = nlp;
} else {
! lp = rfs_async_write_head;
while (lp->next != NULL)
lp = lp->next;
lp->next = nlp;
}
! mutex_exit(&rfs_async_write_lock);
/*
* Convert the file handle common to all of the requests
* in this cluster to a vnode.
*/
vp = nfs_fhtovp(&wa->wa_fhandle, exi);
if (vp == NULL) {
! mutex_enter(&rfs_async_write_lock);
! if (rfs_async_write_head == nlp)
! rfs_async_write_head = nlp->next;
else {
! lp = rfs_async_write_head;
while (lp->next != nlp)
lp = lp->next;
lp->next = nlp->next;
}
t_flag = curthread->t_flag & T_WOULDBLOCK;
--- 1395,1425 ----
nlp->fhp = &wa->wa_fhandle;
cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
nlp->list = nrp;
nlp->next = NULL;
! if (nsrv->async_write_head == NULL) {
! nsrv->async_write_head = nlp;
} else {
! lp = nsrv->async_write_head;
while (lp->next != NULL)
lp = lp->next;
lp->next = nlp;
}
! mutex_exit(&nsrv->async_write_lock);
/*
* Convert the file handle common to all of the requests
* in this cluster to a vnode.
*/
vp = nfs_fhtovp(&wa->wa_fhandle, exi);
if (vp == NULL) {
! mutex_enter(&nsrv->async_write_lock);
! if (nsrv->async_write_head == nlp)
! nsrv->async_write_head = nlp->next;
else {
! lp = nsrv->async_write_head;
while (lp->next != nlp)
lp = lp->next;
lp->next = nlp->next;
}
t_flag = curthread->t_flag & T_WOULDBLOCK;
*** 1386,1396 ****
for (rp = nlp->list; rp != NULL; rp = rp->list) {
rp->ns->ns_status = NFSERR_STALE;
rp->thread->t_flag |= t_flag;
}
cv_broadcast(&nlp->cv);
! mutex_exit(&rfs_async_write_lock);
return;
}
/*
--- 1426,1436 ----
for (rp = nlp->list; rp != NULL; rp = rp->list) {
rp->ns->ns_status = NFSERR_STALE;
rp->thread->t_flag |= t_flag;
}
cv_broadcast(&nlp->cv);
! mutex_exit(&nsrv->async_write_lock);
return;
}
/*
*** 1397,1411 ****
* Can only write regular files. Attempts to write any
* other file types fail with EISDIR.
*/
if (vp->v_type != VREG) {
VN_RELE(vp);
! mutex_enter(&rfs_async_write_lock);
! if (rfs_async_write_head == nlp)
! rfs_async_write_head = nlp->next;
else {
! lp = rfs_async_write_head;
while (lp->next != nlp)
lp = lp->next;
lp->next = nlp->next;
}
t_flag = curthread->t_flag & T_WOULDBLOCK;
--- 1437,1451 ----
* Can only write regular files. Attempts to write any
* other file types fail with EISDIR.
*/
if (vp->v_type != VREG) {
VN_RELE(vp);
! mutex_enter(&nsrv->async_write_lock);
! if (nsrv->async_write_head == nlp)
! nsrv->async_write_head = nlp->next;
else {
! lp = nsrv->async_write_head;
while (lp->next != nlp)
lp = lp->next;
lp->next = nlp->next;
}
t_flag = curthread->t_flag & T_WOULDBLOCK;
*** 1412,1422 ****
for (rp = nlp->list; rp != NULL; rp = rp->list) {
rp->ns->ns_status = NFSERR_ISDIR;
rp->thread->t_flag |= t_flag;
}
cv_broadcast(&nlp->cv);
! mutex_exit(&rfs_async_write_lock);
return;
}
/*
--- 1452,1462 ----
for (rp = nlp->list; rp != NULL; rp = rp->list) {
rp->ns->ns_status = NFSERR_ISDIR;
rp->thread->t_flag |= t_flag;
}
cv_broadcast(&nlp->cv);
! mutex_exit(&nsrv->async_write_lock);
return;
}
/*
*** 1444,1458 ****
if (in_crit)
nbl_end_crit(vp);
VN_RELE(vp);
/* mark as wouldblock so response is dropped */
curthread->t_flag |= T_WOULDBLOCK;
! mutex_enter(&rfs_async_write_lock);
! if (rfs_async_write_head == nlp)
! rfs_async_write_head = nlp->next;
else {
! lp = rfs_async_write_head;
while (lp->next != nlp)
lp = lp->next;
lp->next = nlp->next;
}
for (rp = nlp->list; rp != NULL; rp = rp->list) {
--- 1484,1498 ----
if (in_crit)
nbl_end_crit(vp);
VN_RELE(vp);
/* mark as wouldblock so response is dropped */
curthread->t_flag |= T_WOULDBLOCK;
! mutex_enter(&nsrv->async_write_lock);
! if (nsrv->async_write_head == nlp)
! nsrv->async_write_head = nlp->next;
else {
! lp = nsrv->async_write_head;
while (lp->next != nlp)
lp = lp->next;
lp->next = nlp->next;
}
for (rp = nlp->list; rp != NULL; rp = rp->list) {
*** 1460,1470 ****
rp->ns->ns_status = puterrno(error);
rp->thread->t_flag |= T_WOULDBLOCK;
}
}
cv_broadcast(&nlp->cv);
! mutex_exit(&rfs_async_write_lock);
return;
}
/*
--- 1500,1510 ----
rp->ns->ns_status = puterrno(error);
rp->thread->t_flag |= T_WOULDBLOCK;
}
}
cv_broadcast(&nlp->cv);
! mutex_exit(&nsrv->async_write_lock);
return;
}
/*
*** 1482,1501 ****
* a new cluster and be blocked in VOP_RWLOCK while
* the first request is being processed. This delay
* will allow more requests to be clustered in this
* second cluster.
*/
! mutex_enter(&rfs_async_write_lock);
! if (rfs_async_write_head == nlp)
! rfs_async_write_head = nlp->next;
else {
! lp = rfs_async_write_head;
while (lp->next != nlp)
lp = lp->next;
lp->next = nlp->next;
}
! mutex_exit(&rfs_async_write_lock);
/*
* Step through the list of requests in this cluster.
* We need to check permissions to make sure that all
* of the requests have sufficient permission to write
--- 1522,1541 ----
* a new cluster and be blocked in VOP_RWLOCK while
* the first request is being processed. This delay
* will allow more requests to be clustered in this
* second cluster.
*/
! mutex_enter(&nsrv->async_write_lock);
! if (nsrv->async_write_head == nlp)
! nsrv->async_write_head = nlp->next;
else {
! lp = nsrv->async_write_head;
while (lp->next != nlp)
lp = lp->next;
lp->next = nlp->next;
}
! mutex_exit(&nsrv->async_write_lock);
/*
* Step through the list of requests in this cluster.
* We need to check permissions to make sure that all
* of the requests have sufficient permission to write
*** 1736,1754 ****
if (in_crit)
nbl_end_crit(vp);
VN_RELE(vp);
t_flag = curthread->t_flag & T_WOULDBLOCK;
! mutex_enter(&rfs_async_write_lock);
for (rp = nlp->list; rp != NULL; rp = rp->list) {
if (rp->ns->ns_status == RFSWRITE_INITVAL) {
rp->ns->ns_status = puterrno(error);
rp->thread->t_flag |= t_flag;
}
}
cv_broadcast(&nlp->cv);
! mutex_exit(&rfs_async_write_lock);
}
void *
rfs_write_getfh(struct nfswriteargs *wa)
--- 1776,1794 ----
if (in_crit)
nbl_end_crit(vp);
VN_RELE(vp);
t_flag = curthread->t_flag & T_WOULDBLOCK;
! mutex_enter(&nsrv->async_write_lock);
for (rp = nlp->list; rp != NULL; rp = rp->list) {
if (rp->ns->ns_status == RFSWRITE_INITVAL) {
rp->ns->ns_status = puterrno(error);
rp->thread->t_flag |= t_flag;
}
}
cv_broadcast(&nlp->cv);
! mutex_exit(&nsrv->async_write_lock);
}
void *
rfs_write_getfh(struct nfswriteargs *wa)
*** 2209,2219 ****
return;
}
/* Check for delegation on the file being renamed over, if it exists */
! if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
NULL, NULL, NULL) == 0) {
if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
VN_RELE(tovp);
--- 2249,2259 ----
return;
}
/* Check for delegation on the file being renamed over, if it exists */
! if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
NULL, NULL, NULL) == 0) {
if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
VN_RELE(tovp);
*** 2576,2586 ****
* Of course, NFS servers have no idea what their
* clients' current directories are. We fake it by
* supplying a vnode known to exist and illegal to
* remove.
*/
! error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
/*
* Force modified data and metadata out to stable storage.
*/
(void) VOP_FSYNC(vp, 0, cr, NULL);
--- 2616,2626 ----
* Of course, NFS servers have no idea what their
* clients' current directories are. We fake it by
* supplying a vnode known to exist and illegal to
* remove.
*/
! error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
/*
* Force modified data and metadata out to stable storage.
*/
(void) VOP_FSYNC(vp, 0, cr, NULL);
*** 2851,2861 ****
vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
}
return (0);
}
! static enum nfsftype vt_to_nf[] = {
0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
/*
* check the following fields for overflow: nodeid, size, and time.
--- 2891,2901 ----
vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
}
return (0);
}
! static const enum nfsftype vt_to_nf[] = {
0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
/*
* check the following fields for overflow: nodeid, size, and time.
*** 3070,3089 ****
}
void
rfs_srvrinit(void)
{
- mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
nfs2_srv_caller_id = fs_new_caller_id();
}
void
rfs_srvrfini(void)
{
- mutex_destroy(&rfs_async_write_lock);
}
static int
rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
{
struct clist *wcl;
int wlist_len;
--- 3110,3153 ----
}
void
rfs_srvrinit(void)
{
nfs2_srv_caller_id = fs_new_caller_id();
}
void
rfs_srvrfini(void)
{
}
+ /* ARGSUSED */
+ void
+ rfs_srv_zone_init(nfs_globals_t *ng) /* allocate the NFSv2 server state and attach it to ng */
+ {
+ nfs_srv_t *ns;
+
+ ns = kmem_zalloc(sizeof (*ns), KM_SLEEP); /* zero-filled, so async_write_head starts out NULL */
+
+ mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
+ ns->write_async = 1; /* write clustering is on by default */
+
+ ng->nfs_srv = ns; /* this is the pointer nfs_get_srv() hands back */
+ }
+
+ /* ARGSUSED */
+ void
+ rfs_srv_zone_fini(nfs_globals_t *ng) /* detach and free the state set up by rfs_srv_zone_init() */
+ {
+ nfs_srv_t *ns = ng->nfs_srv;
+
+ ng->nfs_srv = NULL; /* detach first; nfs_get_srv() asserts on a NULL pointer from here on */
+
+ mutex_destroy(&ns->async_write_lock);
+ kmem_free(ns, sizeof (*ns)); /* same size expression as the kmem_zalloc() in rfs_srv_zone_init() */
+ }
+
static int
rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
{
struct clist *wcl;
int wlist_len;