Print this page
11083 support NFS server in zone
Portions contributed by: Dan Kruchinin <dan.kruchinin@nexenta.com>
Portions contributed by: Stepan Zastupov <stepan.zastupov@gmail.com>
Portions contributed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Portions contributed by: Mike Zeller <mike@mikezeller.net>
Portions contributed by: Dan McDonald <danmcd@joyent.com>
Portions contributed by: Gordon Ross <gordon.w.ross@gmail.com>
Portions contributed by: Vitaliy Gusev <gusev.vitaliy@gmail.com>
Reviewed by: Rick McNeal <rick.mcneal@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Jason King <jbk@joyent.com>
Reviewed by: C Fraire <cfraire@me.com>
Change-Id: I22f289d357503f9b48a0bc2482cc4328a6d43d16

@@ -28,10 +28,15 @@
 /*
  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  *      All rights reserved.
  */
 
+/*
+ * Copyright 2018 Nexenta Systems, Inc.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/cred.h>
 #include <sys/buf.h>

@@ -68,20 +73,36 @@
 #include <vm/seg_map.h>
 #include <vm/seg_kmem.h>
 
 #include <sys/strsubr.h>
 
+struct rfs_async_write_list;
+
 /*
+ * Per-zone NFSv2 server globals; async_write_lock guards async_write_head.
+ */
+typedef struct nfs_srv {
+        kmutex_t                        async_write_lock;
+        struct rfs_async_write_list     *async_write_head;

+        /*
+         * Nonzero enables write clustering; zero forces rfs_write_sync().
+         */
+        int             write_async;
+} nfs_srv_t;
+
+/*
  * These are the interface routines for the server side of the
  * Network File System.  See the NFS version 2 protocol specification
  * for a description of this interface.
  */
 
 static int      sattr_to_vattr(struct nfssattr *, struct vattr *);
 static void     acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
                         cred_t *);
 
+
 /*
  * Some "over the wire" UNIX file types.  These are encoded
  * into the mode.  This needs to be fixed in the next rev.
  */
 #define IFMT            0170000         /* type of file */

@@ -89,10 +110,19 @@
 #define IFBLK           0060000         /* block special */
 #define IFSOCK          0140000         /* socket */
 
 u_longlong_t nfs2_srv_caller_id;
 
+/* Return the NFSv2 server state hung off the NFS zone globals. */
+static nfs_srv_t *
+nfs_get_srv(void)
+{
+        nfs_srv_t *nsrv = nfs_srv_getzg()->nfs_srv;
+        ASSERT(nsrv != NULL);   /* installed by rfs_srv_zone_init() */
+        return (nsrv);
+}
+
 /*
  * Get file attributes.
  * Returns the current attributes of the file with the given fhandle.
  */
 /* ARGSUSED */

@@ -384,21 +414,24 @@
 int
 rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
 {
         struct exportinfo *exi;
         vnode_t *dvp = *dvpp;
+        vnode_t *zone_rootvp;
 
-        ASSERT(dvp->v_flag & VROOT);
+        zone_rootvp = (*exip)->exi_ne->exi_root->exi_vp;
+        ASSERT((dvp->v_flag & VROOT) || VN_CMP(zone_rootvp, dvp));
 
         VN_HOLD(dvp);
-        dvp = untraverse(dvp);
+        dvp = untraverse(dvp, zone_rootvp);
         exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
         if (exi == NULL) {
                 VN_RELE(dvp);
                 return (-1);
         }
 
+        ASSERT3U(exi->exi_zoneid, ==, (*exip)->exi_zoneid);
         exi_rele(*exip);
         *exip = exi;
         VN_RELE(*dvpp);
         *dvpp = dvp;
 

@@ -444,11 +477,11 @@
         /*
          * Allow lookups from the root - the default
          * location of the public filehandle.
          */
         if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
-                dvp = rootdir;
+                dvp = ZONE_ROOTVP();
                 VN_HOLD(dvp);
         } else {
                 dvp = nfs_fhtovp(fhp, exi);
                 if (dvp == NULL) {
                         dr->dr_status = NFSERR_STALE;

@@ -455,20 +488,21 @@
                         return;
                 }
         }
 
         exi_hold(exi);
+        ASSERT3U(exi->exi_zoneid, ==, curzone->zone_id);
 
         /*
          * Not allow lookup beyond root.
          * If the filehandle matches a filehandle of the exi,
          * then the ".." refers beyond the root of an exported filesystem.
          */
         if (strcmp(da->da_name, "..") == 0 &&
             EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
                 if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
-                    (dvp->v_flag & VROOT)) {
+                    ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
                         /*
                          * special case for ".." and 'nohide'exported root
                          */
                         if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
                                 error = NFSERR_ACCES;

@@ -500,10 +534,11 @@
          */
         if (PUBLIC_FH2(fhp)) {
                 publicfh_flag = TRUE;
 
                 exi_rele(exi);
+                exi = NULL;
 
                 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
                     &sec);
         } else {
                 /*

@@ -633,14 +668,16 @@
         rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
 
         if (is_referral) {
                 char *s;
                 size_t strsz;
+                kstat_named_t *stat =
+                    exi->exi_ne->ne_globals->svstat[NFS_VERSION];
 
                 /* Get an artificial symlink based on a referral */
                 s = build_symlink(vp, cr, &strsz);
-                global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
+                stat[NFS_REFERLINKS].value.ui64++;
                 DTRACE_PROBE2(nfs2serv__func__referral__reflink,
                     vnode_t *, vp, char *, s);
                 if (s == NULL)
                         error = EINVAL;
                 else {

@@ -773,10 +810,12 @@
 
         error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
 
         /* check if a monitor detected a delegation conflict */
         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
+                if (in_crit)
+                        nbl_end_crit(vp);
                 VN_RELE(vp);
                 /* mark as wouldblock so response is dropped */
                 curthread->t_flag |= T_WOULDBLOCK;
 
                 rr->rr_data = NULL;

@@ -1098,14 +1137,11 @@
 
         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
 
         /* check if a monitor detected a delegation conflict */
         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
-                VN_RELE(vp);
-                /* mark as wouldblock so response is dropped */
-                curthread->t_flag |= T_WOULDBLOCK;
-                return;
+                goto out;
         }
 
         if (wa->wa_data || wa->wa_rlist) {
                 /* Do the RDMA thing if necessary */
                 if (wa->wa_rlist) {

@@ -1141,10 +1177,11 @@
                 savecred = curthread->t_cred;
                 curthread->t_cred = cr;
                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
                 curthread->t_cred = savecred;
         } else {
+
                 iovcnt = 0;
                 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
                         iovcnt++;
                 if (iovcnt <= MAX_IOVECS) {
 #ifdef DEBUG

@@ -1284,12 +1321,15 @@
         struct rfs_async_write_list nlpsp;
         ushort_t t_flag;
         cred_t *savecred;
         int in_crit = 0;
         caller_context_t ct;
+        nfs_srv_t *nsrv;
 
-        if (!rfs_write_async) {
+        ASSERT(exi == NULL || exi->exi_zoneid == curzone->zone_id);
+        nsrv = nfs_get_srv();
+        if (!nsrv->write_async) {
                 rfs_write_sync(wa, ns, exi, req, cr, ro);
                 return;
         }
 
         /*

@@ -1310,12 +1350,12 @@
 
         /*
          * Look to see if there is already a cluster started
          * for this file.
          */
-        mutex_enter(&rfs_async_write_lock);
-        for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
+        mutex_enter(&nsrv->async_write_lock);
+        for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
                 if (bcmp(&wa->wa_fhandle, lp->fhp,
                     sizeof (fhandle_t)) == 0)
                         break;
         }
 

@@ -1337,12 +1377,12 @@
                 if (trp == NULL)
                         lp->list = nrp;
                 else
                         trp->list = nrp;
                 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
-                        cv_wait(&lp->cv, &rfs_async_write_lock);
-                mutex_exit(&rfs_async_write_lock);
+                        cv_wait(&lp->cv, &nsrv->async_write_lock);
+                mutex_exit(&nsrv->async_write_lock);
 
                 return;
         }
 
         /*

@@ -1355,31 +1395,31 @@
         nlp->fhp = &wa->wa_fhandle;
         cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
         nlp->list = nrp;
         nlp->next = NULL;
 
-        if (rfs_async_write_head == NULL) {
-                rfs_async_write_head = nlp;
+        if (nsrv->async_write_head == NULL) {
+                nsrv->async_write_head = nlp;
         } else {
-                lp = rfs_async_write_head;
+                lp = nsrv->async_write_head;
                 while (lp->next != NULL)
                         lp = lp->next;
                 lp->next = nlp;
         }
-        mutex_exit(&rfs_async_write_lock);
+        mutex_exit(&nsrv->async_write_lock);
 
         /*
          * Convert the file handle common to all of the requests
          * in this cluster to a vnode.
          */
         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
         if (vp == NULL) {
-                mutex_enter(&rfs_async_write_lock);
-                if (rfs_async_write_head == nlp)
-                        rfs_async_write_head = nlp->next;
+                mutex_enter(&nsrv->async_write_lock);
+                if (nsrv->async_write_head == nlp)
+                        nsrv->async_write_head = nlp->next;
                 else {
-                        lp = rfs_async_write_head;
+                        lp = nsrv->async_write_head;
                         while (lp->next != nlp)
                                 lp = lp->next;
                         lp->next = nlp->next;
                 }
                 t_flag = curthread->t_flag & T_WOULDBLOCK;

@@ -1386,11 +1426,11 @@
                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
                         rp->ns->ns_status = NFSERR_STALE;
                         rp->thread->t_flag |= t_flag;
                 }
                 cv_broadcast(&nlp->cv);
-                mutex_exit(&rfs_async_write_lock);
+                mutex_exit(&nsrv->async_write_lock);
 
                 return;
         }
 
         /*

@@ -1397,15 +1437,15 @@
          * Can only write regular files.  Attempts to write any
          * other file types fail with EISDIR.
          */
         if (vp->v_type != VREG) {
                 VN_RELE(vp);
-                mutex_enter(&rfs_async_write_lock);
-                if (rfs_async_write_head == nlp)
-                        rfs_async_write_head = nlp->next;
+                mutex_enter(&nsrv->async_write_lock);
+                if (nsrv->async_write_head == nlp)
+                        nsrv->async_write_head = nlp->next;
                 else {
-                        lp = rfs_async_write_head;
+                        lp = nsrv->async_write_head;
                         while (lp->next != nlp)
                                 lp = lp->next;
                         lp->next = nlp->next;
                 }
                 t_flag = curthread->t_flag & T_WOULDBLOCK;

@@ -1412,11 +1452,11 @@
                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
                         rp->ns->ns_status = NFSERR_ISDIR;
                         rp->thread->t_flag |= t_flag;
                 }
                 cv_broadcast(&nlp->cv);
-                mutex_exit(&rfs_async_write_lock);
+                mutex_exit(&nsrv->async_write_lock);
 
                 return;
         }
 
         /*

@@ -1444,15 +1484,15 @@
                 if (in_crit)
                         nbl_end_crit(vp);
                 VN_RELE(vp);
                 /* mark as wouldblock so response is dropped */
                 curthread->t_flag |= T_WOULDBLOCK;
-                mutex_enter(&rfs_async_write_lock);
-                if (rfs_async_write_head == nlp)
-                        rfs_async_write_head = nlp->next;
+                mutex_enter(&nsrv->async_write_lock);
+                if (nsrv->async_write_head == nlp)
+                        nsrv->async_write_head = nlp->next;
                 else {
-                        lp = rfs_async_write_head;
+                        lp = nsrv->async_write_head;
                         while (lp->next != nlp)
                                 lp = lp->next;
                         lp->next = nlp->next;
                 }
                 for (rp = nlp->list; rp != NULL; rp = rp->list) {

@@ -1460,11 +1500,11 @@
                                 rp->ns->ns_status = puterrno(error);
                                 rp->thread->t_flag |= T_WOULDBLOCK;
                         }
                 }
                 cv_broadcast(&nlp->cv);
-                mutex_exit(&rfs_async_write_lock);
+                mutex_exit(&nsrv->async_write_lock);
 
                 return;
         }
 
         /*

@@ -1482,20 +1522,20 @@
          * a new cluster and be blocked in VOP_RWLOCK while
          * the first request is being processed.  This delay
          * will allow more requests to be clustered in this
          * second cluster.
          */
-        mutex_enter(&rfs_async_write_lock);
-        if (rfs_async_write_head == nlp)
-                rfs_async_write_head = nlp->next;
+        mutex_enter(&nsrv->async_write_lock);
+        if (nsrv->async_write_head == nlp)
+                nsrv->async_write_head = nlp->next;
         else {
-                lp = rfs_async_write_head;
+                lp = nsrv->async_write_head;
                 while (lp->next != nlp)
                         lp = lp->next;
                 lp->next = nlp->next;
         }
-        mutex_exit(&rfs_async_write_lock);
+        mutex_exit(&nsrv->async_write_lock);
 
         /*
          * Step through the list of requests in this cluster.
          * We need to check permissions to make sure that all
          * of the requests have sufficient permission to write

@@ -1736,19 +1776,19 @@
         if (in_crit)
                 nbl_end_crit(vp);
         VN_RELE(vp);
 
         t_flag = curthread->t_flag & T_WOULDBLOCK;
-        mutex_enter(&rfs_async_write_lock);
+        mutex_enter(&nsrv->async_write_lock);
         for (rp = nlp->list; rp != NULL; rp = rp->list) {
                 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
                         rp->ns->ns_status = puterrno(error);
                         rp->thread->t_flag |= t_flag;
                 }
         }
         cv_broadcast(&nlp->cv);
-        mutex_exit(&rfs_async_write_lock);
+        mutex_exit(&nsrv->async_write_lock);
 
 }
 
 void *
 rfs_write_getfh(struct nfswriteargs *wa)

@@ -2209,11 +2249,11 @@
                 return;
         }
 
         /* Check for delegation on the file being renamed over, if it exists */
 
-        if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
+        if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
             VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
             NULL, NULL, NULL) == 0) {
 
                 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
                         VN_RELE(tovp);

@@ -2576,11 +2616,11 @@
          * Of course, NFS servers have no idea what their
          * clients' current directories are.  We fake it by
          * supplying a vnode known to exist and illegal to
          * remove.
          */
-        error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
+        error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
 
         /*
          * Force modified data and metadata out to stable storage.
          */
         (void) VOP_FSYNC(vp, 0, cr, NULL);

@@ -2851,11 +2891,11 @@
                 vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
         }
         return (0);
 }
 
-static enum nfsftype vt_to_nf[] = {
+static const enum nfsftype vt_to_nf[] = {
         0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
 };
 
 /*
  * check the following fields for overflow: nodeid, size, and time.

@@ -3070,20 +3110,44 @@
 }
 
 void
 rfs_srvrinit(void)
 {
-        mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
         nfs2_srv_caller_id = fs_new_caller_id();
 }
 
 void
 rfs_srvrfini(void)
 {
-        mutex_destroy(&rfs_async_write_lock);
 }
 
+/* Allocate the NFSv2 server state for a zone and hang it off its globals. */
+void
+rfs_srv_zone_init(nfs_globals_t *ng)
+{
+        nfs_srv_t *nsrv = kmem_zalloc(sizeof (nfs_srv_t), KM_SLEEP);

+        /* kmem_zalloc() already left async_write_head NULL (no clusters). */
+        mutex_init(&nsrv->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
+        /* Write clustering starts out enabled. */
+        nsrv->write_async = 1;

+        ng->nfs_srv = nsrv;
+}
+
+/* Detach the zone's NFSv2 server state from its globals and free it. */
+void
+rfs_srv_zone_fini(nfs_globals_t *ng)
+{
+        nfs_srv_t *nsrv = ng->nfs_srv;

+        /* Unhook first so ng no longer points at memory about to be freed. */
+        ng->nfs_srv = NULL;
+        mutex_destroy(&nsrv->async_write_lock);
+        kmem_free(nsrv, sizeof (nfs_srv_t));
+}
+
 static int
 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
 {
         struct clist    *wcl;
         int             wlist_len;