Print this page
11083 support NFS server in zone
Portions contributed by: Dan Kruchinin <dan.kruchinin@nexenta.com>
Portions contributed by: Stepan Zastupov <stepan.zastupov@gmail.com>
Portions contributed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Portions contributed by: Mike Zeller <mike@mikezeller.net>
Portions contributed by: Dan McDonald <danmcd@joyent.com>
Portions contributed by: Gordon Ross <gordon.w.ross@gmail.com>
Portions contributed by: Vitaliy Gusev <gusev.vitaliy@gmail.com>
Reviewed by: Rick McNeal <rick.mcneal@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Jason King <jbk@joyent.com>
Reviewed by: C Fraire <cfraire@me.com>
Change-Id: I22f289d357503f9b48a0bc2482cc4328a6d43d16

*** 28,37 **** --- 28,42 ---- /* * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. * All rights reserved. */ + /* + * Copyright 2018 Nexenta Systems, Inc. + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + #include <sys/param.h> #include <sys/types.h> #include <sys/systm.h> #include <sys/cred.h> #include <sys/buf.h>
*** 68,87 **** --- 73,108 ---- #include <vm/seg_map.h> #include <vm/seg_kmem.h> #include <sys/strsubr.h> + struct rfs_async_write_list; + /* + * Zone globals of NFSv2 server + */ + typedef struct nfs_srv { + kmutex_t async_write_lock; + struct rfs_async_write_list *async_write_head; + + /* + * enables write clustering if == 1 + */ + int write_async; + } nfs_srv_t; + + /* * These are the interface routines for the server side of the * Network File System. See the NFS version 2 protocol specification * for a description of this interface. */ static int sattr_to_vattr(struct nfssattr *, struct vattr *); static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *, cred_t *); + /* * Some "over the wire" UNIX file types. These are encoded * into the mode. This needs to be fixed in the next rev. */ #define IFMT 0170000 /* type of file */
*** 89,98 **** --- 110,128 ---- #define IFBLK 0060000 /* block special */ #define IFSOCK 0140000 /* socket */ u_longlong_t nfs2_srv_caller_id; + static nfs_srv_t * + nfs_get_srv(void) + { + nfs_globals_t *ng = nfs_srv_getzg(); + nfs_srv_t *srv = ng->nfs_srv; + ASSERT(srv != NULL); + return (srv); + } + /* * Get file attributes. * Returns the current attributes of the file with the given fhandle. */ /* ARGSUSED */
*** 384,404 **** int rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr) { struct exportinfo *exi; vnode_t *dvp = *dvpp; ! ASSERT(dvp->v_flag & VROOT); VN_HOLD(dvp); ! dvp = untraverse(dvp); exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE); if (exi == NULL) { VN_RELE(dvp); return (-1); } exi_rele(*exip); *exip = exi; VN_RELE(*dvpp); *dvpp = dvp; --- 414,437 ---- int rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr) { struct exportinfo *exi; vnode_t *dvp = *dvpp; + vnode_t *zone_rootvp; ! zone_rootvp = (*exip)->exi_ne->exi_root->exi_vp; ! ASSERT((dvp->v_flag & VROOT) || VN_CMP(zone_rootvp, dvp)); VN_HOLD(dvp); ! dvp = untraverse(dvp, zone_rootvp); exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE); if (exi == NULL) { VN_RELE(dvp); return (-1); } + ASSERT3U(exi->exi_zoneid, ==, (*exip)->exi_zoneid); exi_rele(*exip); *exip = exi; VN_RELE(*dvpp); *dvpp = dvp;
*** 444,454 **** /* * Allow lookups from the root - the default * location of the public filehandle. */ if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) { ! dvp = rootdir; VN_HOLD(dvp); } else { dvp = nfs_fhtovp(fhp, exi); if (dvp == NULL) { dr->dr_status = NFSERR_STALE; --- 477,487 ---- /* * Allow lookups from the root - the default * location of the public filehandle. */ if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) { ! dvp = ZONE_ROOTVP(); VN_HOLD(dvp); } else { dvp = nfs_fhtovp(fhp, exi); if (dvp == NULL) { dr->dr_status = NFSERR_STALE;
*** 455,474 **** return; } } exi_hold(exi); /* * Not allow lookup beyond root. * If the filehandle matches a filehandle of the exi, * then the ".." refers beyond the root of an exported filesystem. */ if (strcmp(da->da_name, "..") == 0 && EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) { if ((exi->exi_export.ex_flags & EX_NOHIDE) && ! (dvp->v_flag & VROOT)) { /* * special case for ".." and 'nohide'exported root */ if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) { error = NFSERR_ACCES; --- 488,508 ---- return; } } exi_hold(exi); + ASSERT3U(exi->exi_zoneid, ==, curzone->zone_id); /* * Not allow lookup beyond root. * If the filehandle matches a filehandle of the exi, * then the ".." refers beyond the root of an exported filesystem. */ if (strcmp(da->da_name, "..") == 0 && EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) { if ((exi->exi_export.ex_flags & EX_NOHIDE) && ! ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) { /* * special case for ".." and 'nohide'exported root */ if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) { error = NFSERR_ACCES;
*** 500,509 **** --- 534,544 ---- */ if (PUBLIC_FH2(fhp)) { publicfh_flag = TRUE; exi_rele(exi); + exi = NULL; error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi, &sec); } else { /*
*** 633,646 **** rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP); if (is_referral) { char *s; size_t strsz; /* Get an artificial symlink based on a referral */ s = build_symlink(vp, cr, &strsz); ! global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++; DTRACE_PROBE2(nfs2serv__func__referral__reflink, vnode_t *, vp, char *, s); if (s == NULL) error = EINVAL; else { --- 668,683 ---- rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP); if (is_referral) { char *s; size_t strsz; + kstat_named_t *stat = + exi->exi_ne->ne_globals->svstat[NFS_VERSION]; /* Get an artificial symlink based on a referral */ s = build_symlink(vp, cr, &strsz); ! stat[NFS_REFERLINKS].value.ui64++; DTRACE_PROBE2(nfs2serv__func__referral__reflink, vnode_t *, vp, char *, s); if (s == NULL) error = EINVAL; else {
*** 773,782 **** --- 810,821 ---- error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct); /* check if a monitor detected a delegation conflict */ if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) { + if (in_crit) + nbl_end_crit(vp); VN_RELE(vp); /* mark as wouldblock so response is dropped */ curthread->t_flag |= T_WOULDBLOCK; rr->rr_data = NULL;
*** 1098,1111 **** error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct); /* check if a monitor detected a delegation conflict */ if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) { ! VN_RELE(vp); ! /* mark as wouldblock so response is dropped */ ! curthread->t_flag |= T_WOULDBLOCK; ! return; } if (wa->wa_data || wa->wa_rlist) { /* Do the RDMA thing if necessary */ if (wa->wa_rlist) { --- 1137,1147 ---- error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct); /* check if a monitor detected a delegation conflict */ if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) { ! goto out; } if (wa->wa_data || wa->wa_rlist) { /* Do the RDMA thing if necessary */ if (wa->wa_rlist) {
*** 1141,1150 **** --- 1177,1187 ---- savecred = curthread->t_cred; curthread->t_cred = cr; error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct); curthread->t_cred = savecred; } else { + iovcnt = 0; for (m = wa->wa_mblk; m != NULL; m = m->b_cont) iovcnt++; if (iovcnt <= MAX_IOVECS) { #ifdef DEBUG
*** 1284,1295 **** struct rfs_async_write_list nlpsp; ushort_t t_flag; cred_t *savecred; int in_crit = 0; caller_context_t ct; ! if (!rfs_write_async) { rfs_write_sync(wa, ns, exi, req, cr, ro); return; } /* --- 1321,1335 ---- struct rfs_async_write_list nlpsp; ushort_t t_flag; cred_t *savecred; int in_crit = 0; caller_context_t ct; + nfs_srv_t *nsrv; ! ASSERT(exi == NULL || exi->exi_zoneid == curzone->zone_id); ! nsrv = nfs_get_srv(); ! if (!nsrv->write_async) { rfs_write_sync(wa, ns, exi, req, cr, ro); return; } /*
*** 1310,1321 **** /* * Look to see if there is already a cluster started * for this file. */ ! mutex_enter(&rfs_async_write_lock); ! for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) { if (bcmp(&wa->wa_fhandle, lp->fhp, sizeof (fhandle_t)) == 0) break; } --- 1350,1361 ---- /* * Look to see if there is already a cluster started * for this file. */ ! mutex_enter(&nsrv->async_write_lock); ! for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) { if (bcmp(&wa->wa_fhandle, lp->fhp, sizeof (fhandle_t)) == 0) break; }
*** 1337,1348 **** if (trp == NULL) lp->list = nrp; else trp->list = nrp; while (nrp->ns->ns_status == RFSWRITE_INITVAL) ! cv_wait(&lp->cv, &rfs_async_write_lock); ! mutex_exit(&rfs_async_write_lock); return; } /* --- 1377,1388 ---- if (trp == NULL) lp->list = nrp; else trp->list = nrp; while (nrp->ns->ns_status == RFSWRITE_INITVAL) ! cv_wait(&lp->cv, &nsrv->async_write_lock); ! mutex_exit(&nsrv->async_write_lock); return; } /*
*** 1355,1385 **** nlp->fhp = &wa->wa_fhandle; cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL); nlp->list = nrp; nlp->next = NULL; ! if (rfs_async_write_head == NULL) { ! rfs_async_write_head = nlp; } else { ! lp = rfs_async_write_head; while (lp->next != NULL) lp = lp->next; lp->next = nlp; } ! mutex_exit(&rfs_async_write_lock); /* * Convert the file handle common to all of the requests * in this cluster to a vnode. */ vp = nfs_fhtovp(&wa->wa_fhandle, exi); if (vp == NULL) { ! mutex_enter(&rfs_async_write_lock); ! if (rfs_async_write_head == nlp) ! rfs_async_write_head = nlp->next; else { ! lp = rfs_async_write_head; while (lp->next != nlp) lp = lp->next; lp->next = nlp->next; } t_flag = curthread->t_flag & T_WOULDBLOCK; --- 1395,1425 ---- nlp->fhp = &wa->wa_fhandle; cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL); nlp->list = nrp; nlp->next = NULL; ! if (nsrv->async_write_head == NULL) { ! nsrv->async_write_head = nlp; } else { ! lp = nsrv->async_write_head; while (lp->next != NULL) lp = lp->next; lp->next = nlp; } ! mutex_exit(&nsrv->async_write_lock); /* * Convert the file handle common to all of the requests * in this cluster to a vnode. */ vp = nfs_fhtovp(&wa->wa_fhandle, exi); if (vp == NULL) { ! mutex_enter(&nsrv->async_write_lock); ! if (nsrv->async_write_head == nlp) ! nsrv->async_write_head = nlp->next; else { ! lp = nsrv->async_write_head; while (lp->next != nlp) lp = lp->next; lp->next = nlp->next; } t_flag = curthread->t_flag & T_WOULDBLOCK;
*** 1386,1396 **** for (rp = nlp->list; rp != NULL; rp = rp->list) { rp->ns->ns_status = NFSERR_STALE; rp->thread->t_flag |= t_flag; } cv_broadcast(&nlp->cv); ! mutex_exit(&rfs_async_write_lock); return; } /* --- 1426,1436 ---- for (rp = nlp->list; rp != NULL; rp = rp->list) { rp->ns->ns_status = NFSERR_STALE; rp->thread->t_flag |= t_flag; } cv_broadcast(&nlp->cv); ! mutex_exit(&nsrv->async_write_lock); return; } /*
*** 1397,1411 **** * Can only write regular files. Attempts to write any * other file types fail with EISDIR. */ if (vp->v_type != VREG) { VN_RELE(vp); ! mutex_enter(&rfs_async_write_lock); ! if (rfs_async_write_head == nlp) ! rfs_async_write_head = nlp->next; else { ! lp = rfs_async_write_head; while (lp->next != nlp) lp = lp->next; lp->next = nlp->next; } t_flag = curthread->t_flag & T_WOULDBLOCK; --- 1437,1451 ---- * Can only write regular files. Attempts to write any * other file types fail with EISDIR. */ if (vp->v_type != VREG) { VN_RELE(vp); ! mutex_enter(&nsrv->async_write_lock); ! if (nsrv->async_write_head == nlp) ! nsrv->async_write_head = nlp->next; else { ! lp = nsrv->async_write_head; while (lp->next != nlp) lp = lp->next; lp->next = nlp->next; } t_flag = curthread->t_flag & T_WOULDBLOCK;
*** 1412,1422 **** for (rp = nlp->list; rp != NULL; rp = rp->list) { rp->ns->ns_status = NFSERR_ISDIR; rp->thread->t_flag |= t_flag; } cv_broadcast(&nlp->cv); ! mutex_exit(&rfs_async_write_lock); return; } /* --- 1452,1462 ---- for (rp = nlp->list; rp != NULL; rp = rp->list) { rp->ns->ns_status = NFSERR_ISDIR; rp->thread->t_flag |= t_flag; } cv_broadcast(&nlp->cv); ! mutex_exit(&nsrv->async_write_lock); return; } /*
*** 1444,1458 **** if (in_crit) nbl_end_crit(vp); VN_RELE(vp); /* mark as wouldblock so response is dropped */ curthread->t_flag |= T_WOULDBLOCK; ! mutex_enter(&rfs_async_write_lock); ! if (rfs_async_write_head == nlp) ! rfs_async_write_head = nlp->next; else { ! lp = rfs_async_write_head; while (lp->next != nlp) lp = lp->next; lp->next = nlp->next; } for (rp = nlp->list; rp != NULL; rp = rp->list) { --- 1484,1498 ---- if (in_crit) nbl_end_crit(vp); VN_RELE(vp); /* mark as wouldblock so response is dropped */ curthread->t_flag |= T_WOULDBLOCK; ! mutex_enter(&nsrv->async_write_lock); ! if (nsrv->async_write_head == nlp) ! nsrv->async_write_head = nlp->next; else { ! lp = nsrv->async_write_head; while (lp->next != nlp) lp = lp->next; lp->next = nlp->next; } for (rp = nlp->list; rp != NULL; rp = rp->list) {
*** 1460,1470 **** rp->ns->ns_status = puterrno(error); rp->thread->t_flag |= T_WOULDBLOCK; } } cv_broadcast(&nlp->cv); ! mutex_exit(&rfs_async_write_lock); return; } /* --- 1500,1510 ---- rp->ns->ns_status = puterrno(error); rp->thread->t_flag |= T_WOULDBLOCK; } } cv_broadcast(&nlp->cv); ! mutex_exit(&nsrv->async_write_lock); return; } /*
*** 1482,1501 **** * a new cluster and be blocked in VOP_RWLOCK while * the first request is being processed. This delay * will allow more requests to be clustered in this * second cluster. */ ! mutex_enter(&rfs_async_write_lock); ! if (rfs_async_write_head == nlp) ! rfs_async_write_head = nlp->next; else { ! lp = rfs_async_write_head; while (lp->next != nlp) lp = lp->next; lp->next = nlp->next; } ! mutex_exit(&rfs_async_write_lock); /* * Step through the list of requests in this cluster. * We need to check permissions to make sure that all * of the requests have sufficient permission to write --- 1522,1541 ---- * a new cluster and be blocked in VOP_RWLOCK while * the first request is being processed. This delay * will allow more requests to be clustered in this * second cluster. */ ! mutex_enter(&nsrv->async_write_lock); ! if (nsrv->async_write_head == nlp) ! nsrv->async_write_head = nlp->next; else { ! lp = nsrv->async_write_head; while (lp->next != nlp) lp = lp->next; lp->next = nlp->next; } ! mutex_exit(&nsrv->async_write_lock); /* * Step through the list of requests in this cluster. * We need to check permissions to make sure that all * of the requests have sufficient permission to write
*** 1736,1754 **** if (in_crit) nbl_end_crit(vp); VN_RELE(vp); t_flag = curthread->t_flag & T_WOULDBLOCK; ! mutex_enter(&rfs_async_write_lock); for (rp = nlp->list; rp != NULL; rp = rp->list) { if (rp->ns->ns_status == RFSWRITE_INITVAL) { rp->ns->ns_status = puterrno(error); rp->thread->t_flag |= t_flag; } } cv_broadcast(&nlp->cv); ! mutex_exit(&rfs_async_write_lock); } void * rfs_write_getfh(struct nfswriteargs *wa) --- 1776,1794 ---- if (in_crit) nbl_end_crit(vp); VN_RELE(vp); t_flag = curthread->t_flag & T_WOULDBLOCK; ! mutex_enter(&nsrv->async_write_lock); for (rp = nlp->list; rp != NULL; rp = rp->list) { if (rp->ns->ns_status == RFSWRITE_INITVAL) { rp->ns->ns_status = puterrno(error); rp->thread->t_flag |= t_flag; } } cv_broadcast(&nlp->cv); ! mutex_exit(&nsrv->async_write_lock); } void * rfs_write_getfh(struct nfswriteargs *wa)
*** 2209,2219 **** return; } /* Check for delegation on the file being renamed over, if it exists */ ! if (rfs4_deleg_policy != SRV_NEVER_DELEGATE && VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr, NULL, NULL, NULL) == 0) { if (rfs4_check_delegated(FWRITE, targvp, TRUE)) { VN_RELE(tovp); --- 2249,2259 ---- return; } /* Check for delegation on the file being renamed over, if it exists */ ! if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE && VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr, NULL, NULL, NULL) == 0) { if (rfs4_check_delegated(FWRITE, targvp, TRUE)) { VN_RELE(tovp);
*** 2576,2586 **** * Of course, NFS servers have no idea what their * clients' current directories are. We fake it by * supplying a vnode known to exist and illegal to * remove. */ ! error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0); /* * Force modified data and metadata out to stable storage. */ (void) VOP_FSYNC(vp, 0, cr, NULL); --- 2616,2626 ---- * Of course, NFS servers have no idea what their * clients' current directories are. We fake it by * supplying a vnode known to exist and illegal to * remove. */ ! error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0); /* * Force modified data and metadata out to stable storage. */ (void) VOP_FSYNC(vp, 0, cr, NULL);
*** 2851,2861 **** vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000); } return (0); } ! static enum nfsftype vt_to_nf[] = { 0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0 }; /* * check the following fields for overflow: nodeid, size, and time. --- 2891,2901 ---- vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000); } return (0); } ! static const enum nfsftype vt_to_nf[] = { 0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0 }; /* * check the following fields for overflow: nodeid, size, and time.
*** 3070,3089 **** } void rfs_srvrinit(void) { - mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL); nfs2_srv_caller_id = fs_new_caller_id(); } void rfs_srvrfini(void) { - mutex_destroy(&rfs_async_write_lock); } static int rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr) { struct clist *wcl; int wlist_len; --- 3110,3153 ---- } void rfs_srvrinit(void) { nfs2_srv_caller_id = fs_new_caller_id(); } void rfs_srvrfini(void) { } + /* ARGSUSED */ + void + rfs_srv_zone_init(nfs_globals_t *ng) + { + nfs_srv_t *ns; + + ns = kmem_zalloc(sizeof (*ns), KM_SLEEP); + + mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL); + ns->write_async = 1; + + ng->nfs_srv = ns; + } + + /* ARGSUSED */ + void + rfs_srv_zone_fini(nfs_globals_t *ng) + { + nfs_srv_t *ns = ng->nfs_srv; + + ng->nfs_srv = NULL; + + mutex_destroy(&ns->async_write_lock); + kmem_free(ns, sizeof (*ns)); + } + static int rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr) { struct clist *wcl; int wlist_len;