NEX-16917 Need to reduce the impact of NFS per-share kstats on failover
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
Reviewed by: Rick McNeal <rick.mcneal@nexenta.com>
NEX-16712 NFS dtrace providers do not support per-share filtering
Reviewed by: Evan Layton <evan.layton@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-15279 support NFS server in zone
NEX-15520 online NFS shares cause zoneadm halt to hang in nfs_export_zone_fini
Portions contributed by: Dan Kruchinin <dan.kruchinin@nexenta.com>
Portions contributed by: Stepan Zastupov <stepan.zastupov@gmail.com>
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
NEX-9275 Got "bad mutex" panic when running I/O to NFS share from clients
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
NEX-14051 Be careful with RPC groups
Reviewed by: Rick McNeal <rick.mcneal@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
8085 Handle RPC groups better
Reviewed by: "Joshua M. Clulow" <josh@sysmgr.org>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Dan McDonald <danmcd@omniti.com>
NEX-7366 Panic in module "nfssrv" due to a NULL pointer dereference when updating NFS shares on a pool
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
Reviewed by: Steve Peng <steve.peng@nexenta.com>
NEX-6778 NFS kstats leak and cause system to hang
Revert "NEX-4261 Per-client NFS server IOPS, bandwidth, and latency kstats"
This reverts commit 586c3ab1927647487f01c337ddc011c642575a52.
Revert "NEX-5354 Aggregated IOPS, bandwidth, and latency kstats for NFS server"
This reverts commit c91d7614da8618ef48018102b077f60ecbbac8c2.
Revert "NEX-5667 nfssrv_stats_flags does not work for aggregated kstats"
This reverts commit 3dcf42618be7dd5f408c327f429c81e07ca08e74.
Revert "NEX-5750 Time values for aggregated NFS server kstats should be normalized"
This reverts commit 1f4d4f901153b0191027969fa4a8064f9d3b9ee1.
Revert "NEX-5942 Panic in rfs4_minorvers_mismatch() with NFSv4.1 client"
This reverts commit 40766417094a162f5e4cc8786c0fa0a7e5871cd9.
Revert "NEX-5752 NFS server: namespace collision in kstats"
This reverts commit ae81e668db86050da8e483264acb0cce0444a132.
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-4261 Per-client NFS server IOPS, bandwidth, and latency kstats
Reviewed by: Kevin Crowe <kevin.crowe@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
NEX-3097 IOPS, bandwidth, and latency kstats for NFS server
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
NEX-1974 Support for more than 16 groups with AUTH_SYS
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
NEX-1128 NFS server: Generic uid and gid remapping for AUTH_SYS
Reviewed by: Jan Kryl <jan.kryl@nexenta.com>
re #13613 rb4516 Tunables need volatile keyword
@@ -16,24 +16,28 @@
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
+
/*
* Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2017 Joyent Inc
*/
/*
* Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
* All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2018 Nexenta Systems, Inc.
+ * Copyright (c) 2017 Joyent Inc
+ */
+
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/proc.h>
@@ -81,11 +85,10 @@
#include <nfs/export.h>
#include <nfs/nfssys.h>
#include <nfs/nfs_clnt.h>
#include <nfs/nfs_acl.h>
#include <nfs/nfs_log.h>
-#include <nfs/nfs_cmd.h>
#include <nfs/lm.h>
#include <nfs/nfs_dispatch.h>
#include <nfs/nfs4_drc.h>
#include <sys/modctl.h>
@@ -107,22 +110,20 @@
static struct modlinkage modlinkage = {
MODREV_1, (void *)&modlmisc, NULL
};
+zone_key_t nfssrv_zone_key;
kmem_cache_t *nfs_xuio_cache;
int nfs_loaned_buffers = 0;
int
_init(void)
{
int status;
- if ((status = nfs_srvinit()) != 0) {
- cmn_err(CE_WARN, "_init: nfs_srvinit failed");
- return (status);
- }
+ nfs_srvinit();
status = mod_install((struct modlinkage *)&modlinkage);
if (status != 0) {
/*
* Could not load module, cleanup previous
@@ -175,31 +176,32 @@
* modifying those routines to avoid the duplication. For now, we optimize
* by calling exportmatch() only after checking that the dispatch routine
* supports RPC_PUBLICFH_OK, and if the filesystem is explicitly exported
* public (i.e., not the placeholder).
*/
-#define PUBLICFH_CHECK(disp, exi, fsid, xfid) \
+#define PUBLICFH_CHECK(ne, disp, exi, fsid, xfid) \
((disp->dis_flags & RPC_PUBLICFH_OK) && \
((exi->exi_export.ex_flags & EX_PUBLIC) || \
- (exi == exi_public && exportmatch(exi_root, \
+ (exi == ne->exi_public && exportmatch(ne->exi_root, \
fsid, xfid))))
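
The rewritten macro threads the per-zone nfs_export_t through so exi_public and exi_root are resolved per zone, and it still relies on C's left-to-right short-circuit evaluation: the comparatively expensive exportmatch() call only runs once the cheap flag tests pass. Below is a minimal user-space sketch of that evaluation-order property; the types and the match_export() helper are hypothetical stand-ins, not the kernel code.

#include <stdio.h>

/* Hypothetical stand-ins for the kernel structures and flags. */
#define	RPC_PUBLICFH_OK	0x1
#define	EX_PUBLIC	0x2

struct dispatch_entry { int dis_flags; };
struct export_entry { int ex_flags; };

/* The expensive check; only reached when the cheap flag tests pass. */
static int
match_export(const struct export_entry *root, int fsid)
{
	(void) root;
	printf("match_export() called\n");
	return (fsid == 42);
}

/* Same shape as PUBLICFH_CHECK: && and || evaluate left to right. */
static int
publicfh_check(const struct dispatch_entry *disp,
    const struct export_entry *exi, const struct export_entry *exi_public,
    const struct export_entry *exi_root, int fsid)
{
	return ((disp->dis_flags & RPC_PUBLICFH_OK) &&
	    ((exi->ex_flags & EX_PUBLIC) ||
	    (exi == exi_public && match_export(exi_root, fsid))));
}

int
main(void)
{
	struct dispatch_entry d = { 0 };	/* RPC_PUBLICFH_OK not set */
	struct export_entry e = { 0 }, pub = { 0 }, root = { 0 };

	/* First operand fails, so match_export() never runs. */
	printf("result=%d\n", publicfh_check(&d, &e, &pub, &root, 42));

	d.dis_flags = RPC_PUBLICFH_OK;	/* expensive path now reachable */
	printf("result=%d\n", publicfh_check(&d, &pub, &pub, &root, 42));
	return (0);
}
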
static void nfs_srv_shutdown_all(int);
-static void rfs4_server_start(int);
+static void rfs4_server_start(nfs_globals_t *, int);
static void nullfree(void);
static void rfs_dispatch(struct svc_req *, SVCXPRT *);
static void acl_dispatch(struct svc_req *, SVCXPRT *);
static void common_dispatch(struct svc_req *, SVCXPRT *,
rpcvers_t, rpcvers_t, char *,
struct rpc_disptable *);
-static void hanfsv4_failover(void);
static int checkauth(struct exportinfo *, struct svc_req *, cred_t *, int,
bool_t, bool_t *);
static char *client_name(struct svc_req *req);
static char *client_addr(struct svc_req *req, char *buf);
extern int sec_svc_getcred(struct svc_req *, cred_t *cr, char **, int *);
extern bool_t sec_svc_inrootlist(int, caddr_t, int, caddr_t *);
+static void *nfs_srv_zone_init(zoneid_t);
+static void nfs_srv_zone_fini(zoneid_t, void *);
#define NFSLOG_COPY_NETBUF(exi, xprt, nb) { \
(nb)->maxlen = (xprt)->xp_rtaddr.maxlen; \
(nb)->len = (xprt)->xp_rtaddr.len; \
(nb)->buf = kmem_alloc((nb)->len, KM_SLEEP); \
@@ -246,58 +248,39 @@
};
static SVC_CALLOUT_TABLE nfs_sct_rdma = {
sizeof (__nfs_sc_rdma) / sizeof (__nfs_sc_rdma[0]), FALSE, __nfs_sc_rdma
};
-rpcvers_t nfs_versmin = NFS_VERSMIN_DEFAULT;
-rpcvers_t nfs_versmax = NFS_VERSMAX_DEFAULT;
/*
- * Used to track the state of the server so that initialization
- * can be done properly.
- */
-typedef enum {
- NFS_SERVER_STOPPED, /* server state destroyed */
- NFS_SERVER_STOPPING, /* server state being destroyed */
- NFS_SERVER_RUNNING,
- NFS_SERVER_QUIESCED, /* server state preserved */
- NFS_SERVER_OFFLINE /* server pool offline */
-} nfs_server_running_t;
-
-static nfs_server_running_t nfs_server_upordown;
-static kmutex_t nfs_server_upordown_lock;
-static kcondvar_t nfs_server_upordown_cv;
-
-/*
* DSS: distributed stable storage
* lists of all DSS paths: current, and before last warmstart
*/
nvlist_t *rfs4_dss_paths, *rfs4_dss_oldpaths;
-int rfs4_dispatch(struct rpcdisp *, struct svc_req *, SVCXPRT *, char *);
+int rfs4_dispatch(struct rpcdisp *, struct svc_req *, SVCXPRT *, char *,
+ size_t *);
bool_t rfs4_minorvers_mismatch(struct svc_req *, SVCXPRT *, void *);
/*
- * RDMA wait variables.
- */
-static kcondvar_t rdma_wait_cv;
-static kmutex_t rdma_wait_mutex;
-
-/*
* Will be called at the point the server pool is being unregistered
* from the pool list. From that point onwards, the pool is waiting
* to be drained and as such the server state is stale and pertains
* to the old instantiation of the NFS server pool.
*/
void
nfs_srv_offline(void)
{
- mutex_enter(&nfs_server_upordown_lock);
- if (nfs_server_upordown == NFS_SERVER_RUNNING) {
- nfs_server_upordown = NFS_SERVER_OFFLINE;
+ nfs_globals_t *ng;
+
+ ng = zone_getspecific(nfssrv_zone_key, curzone);
+
+ mutex_enter(&ng->nfs_server_upordown_lock);
+ if (ng->nfs_server_upordown == NFS_SERVER_RUNNING) {
+ ng->nfs_server_upordown = NFS_SERVER_OFFLINE;
}
- mutex_exit(&nfs_server_upordown_lock);
+ mutex_exit(&ng->nfs_server_upordown_lock);
}
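
nfs_srv_offline() now takes the per-zone lock and performs only the RUNNING to OFFLINE transition; the shutdown and start paths below drive the remaining states. The following is a small user-space model of that up/down state machine under pthreads, with illustrative names and none of the kernel's DSS or delegation work.

#include <pthread.h>
#include <stdio.h>

typedef enum {
	NFS_SERVER_STOPPED,	/* server state destroyed */
	NFS_SERVER_STOPPING,	/* server state being destroyed */
	NFS_SERVER_RUNNING,
	NFS_SERVER_QUIESCED,	/* server state preserved */
	NFS_SERVER_OFFLINE	/* server pool offline */
} nfs_server_running_t;

/* One instance models the per-zone globals. */
struct nfs_globals {
	nfs_server_running_t upordown;
	pthread_mutex_t lock;
	pthread_cond_t cv;
};

/* RUNNING -> OFFLINE, as in nfs_srv_offline(). */
static void
srv_offline(struct nfs_globals *ng)
{
	pthread_mutex_lock(&ng->lock);
	if (ng->upordown == NFS_SERVER_RUNNING)
		ng->upordown = NFS_SERVER_OFFLINE;
	pthread_mutex_unlock(&ng->lock);
}

/* Wait out a stopping/offline server, as in rfs4_server_start(). */
static void
srv_start(struct nfs_globals *ng)
{
	pthread_mutex_lock(&ng->lock);
	while (ng->upordown == NFS_SERVER_STOPPING ||
	    ng->upordown == NFS_SERVER_OFFLINE)
		pthread_cond_wait(&ng->cv, &ng->lock);
	if (ng->upordown != NFS_SERVER_RUNNING) {
		ng->upordown = NFS_SERVER_RUNNING;
		pthread_cond_signal(&ng->cv);
	}
	pthread_mutex_unlock(&ng->lock);
}

int
main(void)
{
	struct nfs_globals ng = { NFS_SERVER_STOPPED,
	    PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER };

	srv_start(&ng);		/* STOPPED -> RUNNING */
	srv_offline(&ng);	/* RUNNING -> OFFLINE */
	printf("state=%d\n", (int)ng.upordown);
	return (0);
}
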
/*
* Will be called at the point the server pool is being destroyed so
* all transports have been closed and no service threads are in
@@ -322,37 +305,40 @@
int quiesce = 1;
nfs_srv_shutdown_all(quiesce);
}
static void
-nfs_srv_shutdown_all(int quiesce) {
- mutex_enter(&nfs_server_upordown_lock);
+nfs_srv_shutdown_all(int quiesce)
+{
+ nfs_globals_t *ng = zone_getspecific(nfssrv_zone_key, curzone);
+
+ mutex_enter(&ng->nfs_server_upordown_lock);
if (quiesce) {
- if (nfs_server_upordown == NFS_SERVER_RUNNING ||
- nfs_server_upordown == NFS_SERVER_OFFLINE) {
- nfs_server_upordown = NFS_SERVER_QUIESCED;
- cv_signal(&nfs_server_upordown_cv);
+ if (ng->nfs_server_upordown == NFS_SERVER_RUNNING ||
+ ng->nfs_server_upordown == NFS_SERVER_OFFLINE) {
+ ng->nfs_server_upordown = NFS_SERVER_QUIESCED;
+ cv_signal(&ng->nfs_server_upordown_cv);
/* reset DSS state, for subsequent warm restart */
rfs4_dss_numnewpaths = 0;
rfs4_dss_newpaths = NULL;
cmn_err(CE_NOTE, "nfs_server: server is now quiesced; "
"NFSv4 state has been preserved");
}
} else {
- if (nfs_server_upordown == NFS_SERVER_OFFLINE) {
- nfs_server_upordown = NFS_SERVER_STOPPING;
- mutex_exit(&nfs_server_upordown_lock);
- rfs4_state_fini();
- rfs4_fini_drc(nfs4_drc);
- mutex_enter(&nfs_server_upordown_lock);
- nfs_server_upordown = NFS_SERVER_STOPPED;
- cv_signal(&nfs_server_upordown_cv);
+ if (ng->nfs_server_upordown == NFS_SERVER_OFFLINE) {
+ ng->nfs_server_upordown = NFS_SERVER_STOPPING;
+ mutex_exit(&ng->nfs_server_upordown_lock);
+ rfs4_state_zone_fini();
+ rfs4_fini_drc();
+ mutex_enter(&ng->nfs_server_upordown_lock);
+ ng->nfs_server_upordown = NFS_SERVER_STOPPED;
+ cv_signal(&ng->nfs_server_upordown_cv);
}
}
- mutex_exit(&nfs_server_upordown_lock);
+ mutex_exit(&ng->nfs_server_upordown_lock);
}
static int
nfs_srv_set_sc_versions(struct file *fp, SVC_CALLOUT_TABLE **sctpp,
rpcvers_t versmin, rpcvers_t versmax)
@@ -416,10 +402,11 @@
* uap->fd is the fd of an open transport provider
*/
int
nfs_svc(struct nfs_svc_args *arg, model_t model)
{
+ nfs_globals_t *ng;
file_t *fp;
SVCMASTERXPRT *xprt;
int error;
int readsize;
char buf[KNC_STRSIZE];
@@ -430,10 +417,11 @@
#ifdef lint
model = model; /* STRUCT macros don't always refer to it */
#endif
+ ng = zone_getspecific(nfssrv_zone_key, curzone);
STRUCT_SET_HANDLE(uap, model, arg);
/* Check privileges in nfssys() */
if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
@@ -463,31 +451,31 @@
releasef(STRUCT_FGET(uap, fd));
kmem_free(addrmask.buf, addrmask.maxlen);
return (error);
}
- nfs_versmin = STRUCT_FGET(uap, versmin);
- nfs_versmax = STRUCT_FGET(uap, versmax);
+ ng->nfs_versmin = STRUCT_FGET(uap, versmin);
+ ng->nfs_versmax = STRUCT_FGET(uap, versmax);
/* Double check the vers min/max ranges */
- if ((nfs_versmin > nfs_versmax) ||
- (nfs_versmin < NFS_VERSMIN) ||
- (nfs_versmax > NFS_VERSMAX)) {
- nfs_versmin = NFS_VERSMIN_DEFAULT;
- nfs_versmax = NFS_VERSMAX_DEFAULT;
+ if ((ng->nfs_versmin > ng->nfs_versmax) ||
+ (ng->nfs_versmin < NFS_VERSMIN) ||
+ (ng->nfs_versmax > NFS_VERSMAX)) {
+ ng->nfs_versmin = NFS_VERSMIN_DEFAULT;
+ ng->nfs_versmax = NFS_VERSMAX_DEFAULT;
}
- if (error =
- nfs_srv_set_sc_versions(fp, &sctp, nfs_versmin, nfs_versmax)) {
+ if (error = nfs_srv_set_sc_versions(fp, &sctp, ng->nfs_versmin,
+ ng->nfs_versmax)) {
releasef(STRUCT_FGET(uap, fd));
kmem_free(addrmask.buf, addrmask.maxlen);
return (error);
}
/* Initialize nfsv4 server */
- if (nfs_versmax == (rpcvers_t)NFS_V4)
- rfs4_server_start(STRUCT_FGET(uap, delegation));
+ if (ng->nfs_versmax == (rpcvers_t)NFS_V4)
+ rfs4_server_start(ng, STRUCT_FGET(uap, delegation));
/* Create a transport handle. */
error = svc_tli_kcreate(fp, readsize, buf, &addrmask, &xprt,
sctp, NULL, NFS_SVCPOOL_ID, TRUE);
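
The version bounds arrive from userland nfsd, so the code above sanity-checks them, now per zone, and falls back to the defaults when the pair is inverted or out of range. A tiny sketch of that clamp, with the constant values assumed (the real definitions live in the NFS headers):

#include <stdio.h>

/* Assumed values; the real constants live in the NFS headers. */
#define	NFS_VERSMIN		2
#define	NFS_VERSMAX		4
#define	NFS_VERSMIN_DEFAULT	2
#define	NFS_VERSMAX_DEFAULT	4

/* Fall back to the defaults if the pair is inverted or out of range. */
static void
clamp_vers(unsigned *vmin, unsigned *vmax)
{
	if (*vmin > *vmax || *vmin < NFS_VERSMIN || *vmax > NFS_VERSMAX) {
		*vmin = NFS_VERSMIN_DEFAULT;
		*vmax = NFS_VERSMAX_DEFAULT;
	}
}

int
main(void)
{
	unsigned vmin = 3, vmax = 2;	/* inverted range from userland */

	clamp_vers(&vmin, &vmax);
	printf("versmin=%u versmax=%u\n", vmin, vmax);
	return (0);
}
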
@@ -502,72 +490,50 @@
return (error);
}
static void
-rfs4_server_start(int nfs4_srv_delegation)
+rfs4_server_start(nfs_globals_t *ng, int nfs4_srv_delegation)
{
/*
* Determine if the server has previously been "started" and
* if not, do the per instance initialization
*/
- mutex_enter(&nfs_server_upordown_lock);
+ mutex_enter(&ng->nfs_server_upordown_lock);
- if (nfs_server_upordown != NFS_SERVER_RUNNING) {
+ if (ng->nfs_server_upordown != NFS_SERVER_RUNNING) {
/* Do we need to stop and wait on the previous server? */
- while (nfs_server_upordown == NFS_SERVER_STOPPING ||
- nfs_server_upordown == NFS_SERVER_OFFLINE)
- cv_wait(&nfs_server_upordown_cv,
- &nfs_server_upordown_lock);
+ while (ng->nfs_server_upordown == NFS_SERVER_STOPPING ||
+ ng->nfs_server_upordown == NFS_SERVER_OFFLINE)
+ cv_wait(&ng->nfs_server_upordown_cv,
+ &ng->nfs_server_upordown_lock);
- if (nfs_server_upordown != NFS_SERVER_RUNNING) {
+ if (ng->nfs_server_upordown != NFS_SERVER_RUNNING) {
(void) svc_pool_control(NFS_SVCPOOL_ID,
SVCPSET_UNREGISTER_PROC, (void *)&nfs_srv_offline);
(void) svc_pool_control(NFS_SVCPOOL_ID,
SVCPSET_SHUTDOWN_PROC, (void *)&nfs_srv_stop_all);
- /* is this an nfsd warm start? */
- if (nfs_server_upordown == NFS_SERVER_QUIESCED) {
- cmn_err(CE_NOTE, "nfs_server: "
- "server was previously quiesced; "
- "existing NFSv4 state will be re-used");
+ rfs4_do_server_start(ng->nfs_server_upordown,
+ nfs4_srv_delegation,
+ cluster_bootflags & CLUSTER_BOOTED);
- /*
- * HA-NFSv4: this is also the signal
- * that a Resource Group failover has
- * occurred.
- */
- if (cluster_bootflags & CLUSTER_BOOTED)
- hanfsv4_failover();
- } else {
- /* cold start */
- rfs4_state_init();
- nfs4_drc = rfs4_init_drc(nfs4_drc_max,
- nfs4_drc_hash);
+ ng->nfs_server_upordown = NFS_SERVER_RUNNING;
}
-
- /*
- * Check to see if delegation is to be
- * enabled at the server
- */
- if (nfs4_srv_delegation != FALSE)
- rfs4_set_deleg_policy(SRV_NORMAL_DELEGATE);
-
- nfs_server_upordown = NFS_SERVER_RUNNING;
+ cv_signal(&ng->nfs_server_upordown_cv);
}
- cv_signal(&nfs_server_upordown_cv);
- }
- mutex_exit(&nfs_server_upordown_lock);
+ mutex_exit(&ng->nfs_server_upordown_lock);
}
/*
* If RDMA device available,
* start RDMA listener.
*/
int
rdma_start(struct rdma_svc_args *rsa)
{
+ nfs_globals_t *ng;
int error;
rdma_xprt_group_t started_rdma_xprts;
rdma_stat stat;
int svc_state = 0;
@@ -576,13 +542,15 @@
(rsa->nfs_versmin < NFS_VERSMIN) ||
(rsa->nfs_versmax > NFS_VERSMAX)) {
rsa->nfs_versmin = NFS_VERSMIN_DEFAULT;
rsa->nfs_versmax = NFS_VERSMAX_DEFAULT;
}
- nfs_versmin = rsa->nfs_versmin;
- nfs_versmax = rsa->nfs_versmax;
+ ng = zone_getspecific(nfssrv_zone_key, curzone);
+ ng->nfs_versmin = rsa->nfs_versmin;
+ ng->nfs_versmax = rsa->nfs_versmax;
+
/* Set the versions in the callout table */
__nfs_sc_rdma[0].sc_versmin = rsa->nfs_versmin;
__nfs_sc_rdma[0].sc_versmax = rsa->nfs_versmax;
/* For the NFS_ACL program, check the max version */
__nfs_sc_rdma[1].sc_versmin = rsa->nfs_versmin;
@@ -591,11 +559,11 @@
else
__nfs_sc_rdma[1].sc_versmax = rsa->nfs_versmax;
/* Initialize nfsv4 server */
if (rsa->nfs_versmax == (rpcvers_t)NFS_V4)
- rfs4_server_start(rsa->delegation);
+ rfs4_server_start(ng, rsa->delegation);
started_rdma_xprts.rtg_count = 0;
started_rdma_xprts.rtg_listhead = NULL;
started_rdma_xprts.rtg_poolid = rsa->poolid;
@@ -608,11 +576,11 @@
while (!error) {
/*
* wait till either interrupted by a signal on
* nfs service stop/restart or signalled by a
- * rdma plugin attach/detatch.
+ * rdma attach/detach.

*/
stat = rdma_kwait();
/*
@@ -659,14 +627,14 @@
/* ARGSUSED */
void
rpc_null_v3(caddr_t *argp, caddr_t *resp, struct exportinfo *exi,
struct svc_req *req, cred_t *cr, bool_t ro)
{
- DTRACE_NFSV3_3(op__null__start, struct svc_req *, req,
- cred_t *, cr, vnode_t *, NULL);
- DTRACE_NFSV3_3(op__null__done, struct svc_req *, req,
- cred_t *, cr, vnode_t *, NULL);
+ DTRACE_NFSV3_4(op__null__start, struct svc_req *, req,
+ cred_t *, cr, vnode_t *, NULL, struct exportinfo *, exi);
+ DTRACE_NFSV3_4(op__null__done, struct svc_req *, req,
+ cred_t *, cr, vnode_t *, NULL, struct exportinfo *, exi);
}
/* ARGSUSED */
static void
rfs_error(caddr_t *argp, caddr_t *resp, struct exportinfo *exi,
@@ -1340,17 +1308,17 @@
};
static struct rpc_disptable rfs_disptable[] = {
{sizeof (rfsdisptab_v2) / sizeof (rfsdisptab_v2[0]),
rfscallnames_v2,
- &rfsproccnt_v2_ptr, rfsdisptab_v2},
+ &rfsproccnt_v2_ptr, &rfsprocio_v2_ptr, rfsdisptab_v2},
{sizeof (rfsdisptab_v3) / sizeof (rfsdisptab_v3[0]),
rfscallnames_v3,
- &rfsproccnt_v3_ptr, rfsdisptab_v3},
+ &rfsproccnt_v3_ptr, &rfsprocio_v3_ptr, rfsdisptab_v3},
{sizeof (rfsdisptab_v4) / sizeof (rfsdisptab_v4[0]),
rfscallnames_v4,
- &rfsproccnt_v4_ptr, rfsdisptab_v4},
+ &rfsproccnt_v4_ptr, &rfsprocio_v4_ptr, rfsdisptab_v4},
};
/*
* If nfs_portmon is set, then clients are required to use privileged
* ports (ports < IPPORT_RESERVED) in order to get NFS services.
@@ -1358,18 +1326,17 @@
* N.B.: this attempt to carry forward the already ill-conceived notion
* of privileged ports for TCP/UDP is really quite ineffectual. Not only
* is it transport-dependent, it's laughably easy to spoof. If you're
* really interested in security, you must start with secure RPC instead.
*/
-static int nfs_portmon = 0;
+volatile int nfs_portmon = 0;
#ifdef DEBUG
static int cred_hits = 0;
static int cred_misses = 0;
#endif
-
#ifdef DEBUG
/*
* Debug code to allow disabling of rfs_dispatch() use of
* fastxdrargs() and fastxdrres() calls for testing purposes.
*/
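
As the comment above the nfs_portmon declaration says, port monitoring is weak security, but mechanically it only asks whether the client's source port is below IPPORT_RESERVED. A user-space sketch of such a check over a sockaddr_in; port_is_privileged() is a hypothetical helper, not the kernel function:

#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

#ifndef IPPORT_RESERVED
#define	IPPORT_RESERVED	1024
#endif

/* Hypothetical helper; mirrors the idea behind the nfs_portmon check. */
static int
port_is_privileged(const struct sockaddr_in *sin)
{
	return (ntohs(sin->sin_port) < IPPORT_RESERVED);
}

int
main(void)
{
	struct sockaddr_in sin;

	memset(&sin, 0, sizeof (sin));
	sin.sin_family = AF_INET;

	sin.sin_port = htons(1023);	/* inside the reserved range */
	printf("privileged=%d\n", port_is_privileged(&sin));

	sin.sin_port = htons(33333);	/* arbitrary high port, spoofable */
	printf("privileged=%d\n", port_is_privileged(&sin));
	return (0);
}
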
@@ -1472,12 +1439,11 @@
}
static void
common_dispatch(struct svc_req *req, SVCXPRT *xprt, rpcvers_t min_vers,
- rpcvers_t max_vers, char *pgmname,
- struct rpc_disptable *disptable)
+ rpcvers_t max_vers, char *pgmname, struct rpc_disptable *disptable)
{
int which;
rpcvers_t vers;
char *args;
union {
@@ -1506,10 +1472,16 @@
bool_t logging_enabled = FALSE;
struct exportinfo *nfslog_exi = NULL;
char **procnames;
char cbuf[INET6_ADDRSTRLEN]; /* to hold both IPv4 and IPv6 addr */
bool_t ro = FALSE;
+ kstat_t *ksp = NULL;
+ kstat_t *exi_ksp = NULL;
+ size_t pos; /* request size */
+ size_t rlen; /* reply size */
+ bool_t rsent = FALSE; /* reply was sent successfully */
+ nfs_export_t *ne = nfs_get_export();
vers = req->rq_vers;
if (vers < min_vers || vers > max_vers) {
svcerr_progvers(req->rq_xprt, min_vers, max_vers);
@@ -1526,10 +1498,18 @@
goto done;
}
(*(disptable[(int)vers].dis_proccntp))[which].value.ui64++;
+ ksp = (*(disptable[(int)vers].dis_prociop))[which];
+ if (ksp != NULL) {
+ mutex_enter(ksp->ks_lock);
+ kstat_runq_enter(KSTAT_IO_PTR(ksp));
+ mutex_exit(ksp->ks_lock);
+ }
+ pos = XDR_GETPOS(&xprt->xp_xdrin);
+
disp = &disptable[(int)vers].dis_table[which];
procnames = disptable[(int)vers].dis_procnames;
auth_flavor = req->rq_cred.oa_flavor;
@@ -1569,11 +1549,13 @@
/*
* If Version 4 use that specific dispatch function.
*/
if (req->rq_vers == 4) {
- error += rfs4_dispatch(disp, req, xprt, args);
+ error += rfs4_dispatch(disp, req, xprt, args, &rlen);
+ if (error == 0)
+ rsent = TRUE;
goto done;
}
dis_flags = disp->dis_flags;
@@ -1630,17 +1612,19 @@
anon_ok = 0;
cr = xprt->xp_cred;
ASSERT(cr != NULL);
#ifdef DEBUG
+ {
if (crgetref(cr) != 1) {
crfree(cr);
cr = crget();
xprt->xp_cred = cr;
cred_misses++;
} else
cred_hits++;
+ }
#else
if (crgetref(cr) != 1) {
crfree(cr);
cr = crget();
xprt->xp_cred = cr;
@@ -1648,12 +1632,38 @@
#endif
exi = checkexport(fsid, xfid);
if (exi != NULL) {
- publicfh_ok = PUBLICFH_CHECK(disp, exi, fsid, xfid);
+ rw_enter(&ne->exported_lock, RW_READER);
+ exi_ksp = NULL;
+ if (exi->exi_kstats != NULL) {
+ switch (req->rq_vers) {
+ case NFS_VERSION:
+ exi_ksp = exp_kstats_v2(exi->exi_kstats,
+ which);
+ break;
+ case NFS_V3:
+ exi_ksp = exp_kstats_v3(exi->exi_kstats,
+ which);
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+ }
+
+ if (exi_ksp != NULL) {
+ mutex_enter(exi_ksp->ks_lock);
+ kstat_runq_enter(KSTAT_IO_PTR(exi_ksp));
+ mutex_exit(exi_ksp->ks_lock);
+ } else {
+ rw_exit(&ne->exported_lock);
+ }
+
+ publicfh_ok = PUBLICFH_CHECK(ne, disp, exi, fsid, xfid);
/*
* Don't allow non-V4 clients access
* to pseudo exports
*/
if (PSEUDO(exi)) {
@@ -1761,11 +1771,11 @@
* the later writing of the log record. This is done for
* the case that a lookup is done across a non-logged public
* file system.
*/
if (nfslog_buffer_list != NULL) {
- nfslog_exi = nfslog_get_exi(exi, req, res, &nfslog_rec_id);
+ nfslog_exi = nfslog_get_exi(ne, exi, req, res, &nfslog_rec_id);
/*
* Is logging enabled?
*/
logging_enabled = (nfslog_exi != NULL);
@@ -1798,26 +1808,32 @@
{
if (!svc_sendreply(xprt, disp->dis_fastxdrres, res)) {
cmn_err(CE_NOTE, "%s: bad sendreply", pgmname);
svcerr_systemerr(xprt);
error++;
+ } else {
+ rlen = xdr_sizeof(disp->dis_fastxdrres, res);
+ rsent = TRUE;
}
} else {
if (!svc_sendreply(xprt, disp->dis_xdrres, res)) {
cmn_err(CE_NOTE, "%s: bad sendreply", pgmname);
svcerr_systemerr(xprt);
error++;
+ } else {
+ rlen = xdr_sizeof(disp->dis_xdrres, res);
+ rsent = TRUE;
}
}
/*
* Log if needed
*/
if (logging_enabled) {
nfslog_write_record(nfslog_exi, req, args, (char *)&res_buf,
cr, &nb, nfslog_rec_id, NFSLOG_ONE_BUFFER);
- exi_rele(nfslog_exi);
+ exi_rele(&nfslog_exi);
kmem_free((&nb)->buf, (&nb)->len);
}
/*
* Free results struct. With the addition of NFS V4 we can
@@ -1826,10 +1842,14 @@
if (disp->dis_resfree != nullfree && dupcached == FALSE) {
(*disp->dis_resfree)(res);
}
done:
+ if (ksp != NULL || exi_ksp != NULL) {
+ pos = XDR_GETPOS(&xprt->xp_xdrin) - pos;
+ }
+
/*
* Free arguments struct
*/
if (disp) {
if (!SVC_FREEARGS(xprt, disp->dis_xdrargs, args)) {
@@ -1841,13 +1861,39 @@
cmn_err(CE_NOTE, "%s: bad freeargs", pgmname);
error++;
}
}
+ if (exi_ksp != NULL) {
+ mutex_enter(exi_ksp->ks_lock);
+ KSTAT_IO_PTR(exi_ksp)->nwritten += pos;
+ KSTAT_IO_PTR(exi_ksp)->writes++;
+ if (rsent) {
+ KSTAT_IO_PTR(exi_ksp)->nread += rlen;
+ KSTAT_IO_PTR(exi_ksp)->reads++;
+ }
+ kstat_runq_exit(KSTAT_IO_PTR(exi_ksp));
+ mutex_exit(exi_ksp->ks_lock);
+
+ rw_exit(&ne->exported_lock);
+ }
+
if (exi != NULL)
- exi_rele(exi);
+ exi_rele(&exi);
+ if (ksp != NULL) {
+ mutex_enter(ksp->ks_lock);
+ KSTAT_IO_PTR(ksp)->nwritten += pos;
+ KSTAT_IO_PTR(ksp)->writes++;
+ if (rsent) {
+ KSTAT_IO_PTR(ksp)->nread += rlen;
+ KSTAT_IO_PTR(ksp)->reads++;
+ }
+ kstat_runq_exit(KSTAT_IO_PTR(ksp));
+ mutex_exit(ksp->ks_lock);
+ }
+
global_svstat_ptr[req->rq_vers][NFS_BADCALLS].value.ui64 += error;
global_svstat_ptr[req->rq_vers][NFS_CALLS].value.ui64++;
}
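
Taken together, the kstat changes in this dispatch path bracket each request with kstat_runq_enter()/kstat_runq_exit() and, on completion, add the XDR request-size delta to nwritten/writes and, only when a reply was actually sent, the reply size to nread/reads; the counters appear to be kept from the client's perspective. A user-space model of that accounting, with a plain struct standing in for kstat_io_t and a counter in place of the kernel's run-queue timestamps:

#include <pthread.h>
#include <stdio.h>

/* Minimal model of the kstat_io_t counters used in the diff. */
struct io_stats {
	unsigned long long nread, reads;	/* reply bytes / replies */
	unsigned long long nwritten, writes;	/* request bytes / requests */
	int rq_len;				/* run queue depth */
	pthread_mutex_t lock;
};

static void
runq_enter(struct io_stats *st)
{
	pthread_mutex_lock(&st->lock);
	st->rq_len++;			/* kstat_runq_enter() analogue */
	pthread_mutex_unlock(&st->lock);
}

/* Account one request: reqlen always, rlen only if a reply went out. */
static void
runq_exit(struct io_stats *st, unsigned long long reqlen,
    unsigned long long rlen, int reply_sent)
{
	pthread_mutex_lock(&st->lock);
	st->nwritten += reqlen;		/* request bytes (client "writes") */
	st->writes++;
	if (reply_sent) {
		st->nread += rlen;	/* reply bytes (client "reads") */
		st->reads++;
	}
	st->rq_len--;			/* kstat_runq_exit() analogue */
	pthread_mutex_unlock(&st->lock);
}

int
main(void)
{
	struct io_stats st = { 0, 0, 0, 0, 0, PTHREAD_MUTEX_INITIALIZER };

	runq_enter(&st);
	runq_exit(&st, 132, 96, 1);	/* 132-byte request, 96-byte reply */
	printf("writes=%llu nwritten=%llu reads=%llu nread=%llu\n",
	    st.writes, st.nwritten, st.reads, st.nread);
	return (0);
}
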
@@ -1969,14 +2015,14 @@
};
static struct rpc_disptable acl_disptable[] = {
{sizeof (acldisptab_v2) / sizeof (acldisptab_v2[0]),
aclcallnames_v2,
- &aclproccnt_v2_ptr, acldisptab_v2},
+ &aclproccnt_v2_ptr, &aclprocio_v2_ptr, acldisptab_v2},
{sizeof (acldisptab_v3) / sizeof (acldisptab_v3[0]),
aclcallnames_v3,
- &aclproccnt_v3_ptr, acldisptab_v3},
+ &aclproccnt_v3_ptr, &aclprocio_v3_ptr, acldisptab_v3},
};
static void
acl_dispatch(struct svc_req *req, SVCXPRT *xprt)
{
@@ -2566,35 +2612,22 @@
* once. It performs the following tasks:
* - Call sub-initialization routines (localize access to variables)
* - Initialize all locks
* - initialize the version 3 write verifier
*/
-int
+void
nfs_srvinit(void)
{
- int error;
+ /* NFS server zone-specific global variables */
+ zone_key_create(&nfssrv_zone_key, nfs_srv_zone_init,
+ NULL, nfs_srv_zone_fini);
- error = nfs_exportinit();
- if (error != 0)
- return (error);
- error = rfs4_srvrinit();
- if (error != 0) {
- nfs_exportfini();
- return (error);
- }
+ nfs_exportinit();
rfs_srvrinit();
rfs3_srvrinit();
+ rfs4_srvrinit();
nfsauth_init();
-
- /* Init the stuff to control start/stop */
- nfs_server_upordown = NFS_SERVER_STOPPED;
- mutex_init(&nfs_server_upordown_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&nfs_server_upordown_cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&rdma_wait_mutex, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&rdma_wait_cv, NULL, CV_DEFAULT, NULL);
-
- return (0);
}
/*
* NFS Server finalization routine. This routine is called to cleanup the
* initialization work previously performed if the NFS server module could
@@ -2602,24 +2635,57 @@
*/
void
nfs_srvfini(void)
{
nfsauth_fini();
+ rfs4_srvrfini();
rfs3_srvrfini();
rfs_srvrfini();
nfs_exportfini();
- mutex_destroy(&nfs_server_upordown_lock);
- cv_destroy(&nfs_server_upordown_cv);
- mutex_destroy(&rdma_wait_mutex);
- cv_destroy(&rdma_wait_cv);
+ (void) zone_key_delete(nfssrv_zone_key);
}
+/* ARGSUSED */
+static void *
+nfs_srv_zone_init(zoneid_t zoneid)
+{
+ nfs_globals_t *ng;
+
+ ng = kmem_zalloc(sizeof (*ng), KM_SLEEP);
+
+ ng->nfs_versmin = NFS_VERSMIN_DEFAULT;
+ ng->nfs_versmax = NFS_VERSMAX_DEFAULT;
+
+ /* Init the stuff to control start/stop */
+ ng->nfs_server_upordown = NFS_SERVER_STOPPED;
+ mutex_init(&ng->nfs_server_upordown_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&ng->nfs_server_upordown_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&ng->rdma_wait_mutex, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&ng->rdma_wait_cv, NULL, CV_DEFAULT, NULL);
+
+ return (ng);
+}
+
+/* ARGSUSED */
+static void
+nfs_srv_zone_fini(zoneid_t zoneid, void *data)
+{
+ nfs_globals_t *ng;
+
+ ng = (nfs_globals_t *)data;
+ mutex_destroy(&ng->nfs_server_upordown_lock);
+ cv_destroy(&ng->nfs_server_upordown_cv);
+ mutex_destroy(&ng->rdma_wait_mutex);
+ cv_destroy(&ng->rdma_wait_cv);
+
+ kmem_free(ng, sizeof (*ng));
+}
+
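+
With nfs_srvinit() registering nfssrv_zone_key, each zone gets its own nfs_globals_t: nfs_srv_zone_init() builds it when a zone boots, nfs_srv_zone_fini() tears it down at halt, and the code paths above fetch it via zone_getspecific(). A user-space model of that key/callback pattern for a single zone follows; the names and default values are illustrative, and the kernel of course keeps a per-zone table rather than one slot.

#include <stdio.h>
#include <stdlib.h>

typedef int zoneid_t;

/* Illustrative model of a zone key: init/fini callbacks plus data. */
struct zone_key {
	void *(*init)(zoneid_t);
	void (*fini)(zoneid_t, void *);
	void *data;		/* one zone modeled; kernel keeps a table */
};

struct nfs_globals {
	unsigned versmin, versmax;
};

static void *
nfs_srv_zone_init_model(zoneid_t zoneid)
{
	struct nfs_globals *ng = calloc(1, sizeof (*ng));

	(void) zoneid;
	ng->versmin = 2;	/* NFS_VERSMIN_DEFAULT, value assumed */
	ng->versmax = 4;	/* NFS_VERSMAX_DEFAULT, value assumed */
	return (ng);
}

static void
nfs_srv_zone_fini_model(zoneid_t zoneid, void *data)
{
	(void) zoneid;
	free(data);
}

int
main(void)
{
	struct zone_key key = { nfs_srv_zone_init_model,
	    nfs_srv_zone_fini_model, NULL };
	struct nfs_globals *ng;

	key.data = key.init(1);		/* zone boots */
	ng = key.data;			/* zone_getspecific() analogue */
	printf("versmin=%u versmax=%u\n", ng->versmin, ng->versmax);
	key.fini(1, key.data);		/* zone halts */
	return (0);
}
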
/*
* Set up an iovec array of up to cnt pointers.
*/
-
void
mblk_to_iov(mblk_t *m, int cnt, struct iovec *iovp)
{
while (m != NULL && cnt-- > 0) {
iovp->iov_base = (caddr_t)m->b_rptr;
@@ -2852,11 +2918,11 @@
* option argument and leads us to another filesystem
*/
/* Release the reference on the old exi value */
ASSERT(*exi != NULL);
- exi_rele(*exi);
+ exi_rele(exi);
if (error = nfs_check_vpexi(mc_dvp, *vpp, kcred, exi)) {
VN_RELE(*vpp);
goto publicfh_done;
}
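
The exi_rele() interface now takes a struct exportinfo ** so it can drop the hold and clear the caller's pointer in one step, which is why call sites changed from exi_rele(exi) to exi_rele(&exi), or, as here where exi is already a double pointer, from exi_rele(*exi) to exi_rele(exi). A minimal sketch of that double-pointer release idiom, assuming a simple non-atomic refcount:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct export_info {
	int refcnt;
};

/* Drop a hold and clear the caller's pointer, like the new exi_rele(). */
static void
exi_rele_model(struct export_info **exipp)
{
	struct export_info *exi = *exipp;

	*exipp = NULL;			/* caller cannot reuse the pointer */
	if (--exi->refcnt == 0)
		free(exi);
}

int
main(void)
{
	struct export_info *exi = malloc(sizeof (*exi));

	exi->refcnt = 1;
	exi_rele_model(&exi);
	assert(exi == NULL);		/* stale pointer is gone */
	printf("released\n");
	return (0);
}
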
@@ -2891,11 +2957,11 @@
*/
if (*path == '/') {
while (*path == '/')
path++;
- startdvp = rootdir;
+ startdvp = ZONE_ROOTVP();
}
error = pn_get_buf(path, UIO_SYSSPACE, &pn, namebuf, sizeof (namebuf));
if (error == 0) {
/*
@@ -2914,11 +2980,11 @@
if ((pn.pn_pathlen = strlen(pn.pn_path)) == 0)
return (ENOENT);
}
VN_HOLD(startdvp);
error = lookuppnvp(&pn, NULL, NO_FOLLOW, dirvpp, compvpp,
- rootdir, startdvp, cr);
+ ZONE_ROOTVP(), startdvp, cr);
}
if (error == ENAMETOOLONG) {
/*
* This thread used a pathname > TYPICALMAXPATHLEN bytes long.
*/
@@ -2931,11 +2997,11 @@
return (ENOENT);
}
}
VN_HOLD(startdvp);
error = lookuppnvp(&pn, NULL, NO_FOLLOW, dirvpp, compvpp,
- rootdir, startdvp, cr);
+ ZONE_ROOTVP(), startdvp, cr);
pn_free(&pn);
}
return (error);
}
@@ -3035,172 +3101,10 @@
}
return (error);
}
-/*
- * Do the main work of handling HA-NFSv4 Resource Group failover on
- * Sun Cluster.
- * We need to detect whether any RG admin paths have been added or removed,
- * and adjust resources accordingly.
- * Currently we're using a very inefficient algorithm, ~ 2 * O(n**2). In
- * order to scale, the list and array of paths need to be held in more
- * suitable data structures.
- */
-static void
-hanfsv4_failover(void)
-{
- int i, start_grace, numadded_paths = 0;
- char **added_paths = NULL;
- rfs4_dss_path_t *dss_path;
-
- /*
- * Note: currently, rfs4_dss_pathlist cannot be NULL, since
- * it will always include an entry for NFS4_DSS_VAR_DIR. If we
- * make the latter dynamically specified too, the following will
- * need to be adjusted.
- */
-
- /*
- * First, look for removed paths: RGs that have been failed-over
- * away from this node.
- * Walk the "currently-serving" rfs4_dss_pathlist and, for each
- * path, check if it is on the "passed-in" rfs4_dss_newpaths array
- * from nfsd. If not, that RG path has been removed.
- *
- * Note that nfsd has sorted rfs4_dss_newpaths for us, and removed
- * any duplicates.
- */
- dss_path = rfs4_dss_pathlist;
- do {
- int found = 0;
- char *path = dss_path->path;
-
- /* used only for non-HA so may not be removed */
- if (strcmp(path, NFS4_DSS_VAR_DIR) == 0) {
- dss_path = dss_path->next;
- continue;
- }
-
- for (i = 0; i < rfs4_dss_numnewpaths; i++) {
- int cmpret;
- char *newpath = rfs4_dss_newpaths[i];
-
- /*
- * Since nfsd has sorted rfs4_dss_newpaths for us,
- * once the return from strcmp is negative we know
- * we've passed the point where "path" should be,
- * and can stop searching: "path" has been removed.
- */
- cmpret = strcmp(path, newpath);
- if (cmpret < 0)
- break;
- if (cmpret == 0) {
- found = 1;
- break;
- }
- }
-
- if (found == 0) {
- unsigned index = dss_path->index;
- rfs4_servinst_t *sip = dss_path->sip;
- rfs4_dss_path_t *path_next = dss_path->next;
-
- /*
- * This path has been removed.
- * We must clear out the servinst reference to
- * it, since it's now owned by another
- * node: we should not attempt to touch it.
- */
- ASSERT(dss_path == sip->dss_paths[index]);
- sip->dss_paths[index] = NULL;
-
- /* remove from "currently-serving" list, and destroy */
- remque(dss_path);
- /* allow for NUL */
- kmem_free(dss_path->path, strlen(dss_path->path) + 1);
- kmem_free(dss_path, sizeof (rfs4_dss_path_t));
-
- dss_path = path_next;
- } else {
- /* path was found; not removed */
- dss_path = dss_path->next;
- }
- } while (dss_path != rfs4_dss_pathlist);
-
- /*
- * Now, look for added paths: RGs that have been failed-over
- * to this node.
- * Walk the "passed-in" rfs4_dss_newpaths array from nfsd and,
- * for each path, check if it is on the "currently-serving"
- * rfs4_dss_pathlist. If not, that RG path has been added.
- *
- * Note: we don't do duplicate detection here; nfsd does that for us.
- *
- * Note: numadded_paths <= rfs4_dss_numnewpaths, which gives us
- * an upper bound for the size needed for added_paths[numadded_paths].
- */
-
- /* probably more space than we need, but guaranteed to be enough */
- if (rfs4_dss_numnewpaths > 0) {
- size_t sz = rfs4_dss_numnewpaths * sizeof (char *);
- added_paths = kmem_zalloc(sz, KM_SLEEP);
- }
-
- /* walk the "passed-in" rfs4_dss_newpaths array from nfsd */
- for (i = 0; i < rfs4_dss_numnewpaths; i++) {
- int found = 0;
- char *newpath = rfs4_dss_newpaths[i];
-
- dss_path = rfs4_dss_pathlist;
- do {
- char *path = dss_path->path;
-
- /* used only for non-HA */
- if (strcmp(path, NFS4_DSS_VAR_DIR) == 0) {
- dss_path = dss_path->next;
- continue;
- }
-
- if (strncmp(path, newpath, strlen(path)) == 0) {
- found = 1;
- break;
- }
-
- dss_path = dss_path->next;
- } while (dss_path != rfs4_dss_pathlist);
-
- if (found == 0) {
- added_paths[numadded_paths] = newpath;
- numadded_paths++;
- }
- }
-
- /* did we find any added paths? */
- if (numadded_paths > 0) {
- /* create a new server instance, and start its grace period */
- start_grace = 1;
- rfs4_servinst_create(start_grace, numadded_paths, added_paths);
-
- /* read in the stable storage state from these paths */
- rfs4_dss_readstate(numadded_paths, added_paths);
-
- /*
- * Multiple failovers during a grace period will cause
- * clients of the same resource group to be partitioned
- * into different server instances, with different
- * grace periods. Since clients of the same resource
- * group must be subject to the same grace period,
- * we need to reset all currently active grace periods.
- */
- rfs4_grace_reset_all();
- }
-
- if (rfs4_dss_numnewpaths > 0)
- kmem_free(added_paths, rfs4_dss_numnewpaths * sizeof (char *));
-}
-
/*
* Used by NFSv3 and NFSv4 server to query label of
* a pathname component during lookup/access ops.
*/
ts_label_t *