11083 support NFS server in zone
Portions contributed by: Dan Kruchinin <dan.kruchinin@nexenta.com>
Portions contributed by: Stepan Zastupov <stepan.zastupov@gmail.com>
Portions contributed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Portions contributed by: Mike Zeller <mike@mikezeller.net>
Portions contributed by: Dan McDonald <danmcd@joyent.com>
Portions contributed by: Gordon Ross <gordon.w.ross@gmail.com>
Portions contributed by: Vitaliy Gusev <gusev.vitaliy@gmail.com>
Reviewed by: Rick McNeal <rick.mcneal@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Jason King <jbk@joyent.com>
Reviewed by: C Fraire <cfraire@me.com>
Change-Id: I22f289d357503f9b48a0bc2482cc4328a6d43d16

*** 20,31 ****
   */
  /*
   * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
   * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
   * Copyright (c) 2013 by Delphix. All rights reserved.
-  * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
   * Copyright (c) 2017 Joyent Inc
   */
  /*
   * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
   * All rights reserved.
--- 20,31 ----
   */
  /*
   * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
   * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
   * Copyright (c) 2013 by Delphix. All rights reserved.
   * Copyright (c) 2017 Joyent Inc
+  * Copyright 2019 Nexenta by DDN, Inc.
   */
  /*
   * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
   * All rights reserved.
*** 81,91 ****
  #include <nfs/export.h>
  #include <nfs/nfssys.h>
  #include <nfs/nfs_clnt.h>
  #include <nfs/nfs_acl.h>
  #include <nfs/nfs_log.h>
- #include <nfs/nfs_cmd.h>
  #include <nfs/lm.h>
  #include <nfs/nfs_dispatch.h>
  #include <nfs/nfs4_drc.h>
  #include <sys/modctl.h>
--- 81,90 ----
*** 107,128 ****
  static struct modlinkage modlinkage = {
  	MODREV_1, (void *)&modlmisc, NULL
  };
  
  kmem_cache_t *nfs_xuio_cache;
  int nfs_loaned_buffers = 0;
  
  int
  _init(void)
  {
  	int status;
  
! 	if ((status = nfs_srvinit()) != 0) {
! 		cmn_err(CE_WARN, "_init: nfs_srvinit failed");
! 		return (status);
! 	}
  
  	status = mod_install((struct modlinkage *)&modlinkage);
  	if (status != 0) {
  		/*
  		 * Could not load module, cleanup previous
--- 106,128 ----
  static struct modlinkage modlinkage = {
  	MODREV_1, (void *)&modlmisc, NULL
  };
  
+ zone_key_t nfssrv_zone_key;
+ list_t nfssrv_globals_list;
+ krwlock_t nfssrv_globals_rwl;
+ 
  kmem_cache_t *nfs_xuio_cache;
  int nfs_loaned_buffers = 0;
  
  int
  _init(void)
  {
  	int status;
  
! 	nfs_srvinit();
  
  	status = mod_install((struct modlinkage *)&modlinkage);
  	if (status != 0) {
  		/*
  		 * Could not load module, cleanup previous
*** 175,205 ****
   * modifying those routines to avoid the duplication. For now, we optimize
   * by calling exportmatch() only after checking that the dispatch routine
   * supports RPC_PUBLICFH_OK, and if the filesystem is explicitly exported
   * public (i.e., not the placeholder).
   */
! #define	PUBLICFH_CHECK(disp, exi, fsid, xfid) \
  		((disp->dis_flags & RPC_PUBLICFH_OK) && \
  		((exi->exi_export.ex_flags & EX_PUBLIC) || \
! 		(exi == exi_public && exportmatch(exi_root, \
  		fsid, xfid))))
  
  static void	nfs_srv_shutdown_all(int);
! static void	rfs4_server_start(int);
  static void	nullfree(void);
  static void	rfs_dispatch(struct svc_req *, SVCXPRT *);
  static void	acl_dispatch(struct svc_req *, SVCXPRT *);
- static void	common_dispatch(struct svc_req *, SVCXPRT *,
- 		rpcvers_t, rpcvers_t, char *,
- 		struct rpc_disptable *);
- static void	hanfsv4_failover(void);
  static int	checkauth(struct exportinfo *, struct svc_req *, cred_t *, int,
  		bool_t, bool_t *);
  static char	*client_name(struct svc_req *req);
  static char	*client_addr(struct svc_req *req, char *buf);
  extern	int	sec_svc_getcred(struct svc_req *, cred_t *cr, char **, int *);
  extern	bool_t	sec_svc_inrootlist(int, caddr_t, int, caddr_t *);
  
  #define	NFSLOG_COPY_NETBUF(exi, xprt, nb)	{ \
  	(nb)->maxlen = (xprt)->xp_rtaddr.maxlen; \
  	(nb)->len = (xprt)->xp_rtaddr.len; \
  	(nb)->buf = kmem_alloc((nb)->len, KM_SLEEP); \
--- 175,204 ----
   * modifying those routines to avoid the duplication. For now, we optimize
   * by calling exportmatch() only after checking that the dispatch routine
   * supports RPC_PUBLICFH_OK, and if the filesystem is explicitly exported
   * public (i.e., not the placeholder).
   */
! #define	PUBLICFH_CHECK(ne, disp, exi, fsid, xfid) \
  		((disp->dis_flags & RPC_PUBLICFH_OK) && \
  		((exi->exi_export.ex_flags & EX_PUBLIC) || \
! 		(exi == ne->exi_public && exportmatch(ne->exi_root, \
  		fsid, xfid))))
  
  static void	nfs_srv_shutdown_all(int);
! static void	rfs4_server_start(nfs_globals_t *, int);
  static void	nullfree(void);
  static void	rfs_dispatch(struct svc_req *, SVCXPRT *);
  static void	acl_dispatch(struct svc_req *, SVCXPRT *);
  static int	checkauth(struct exportinfo *, struct svc_req *, cred_t *, int,
  		bool_t, bool_t *);
  static char	*client_name(struct svc_req *req);
  static char	*client_addr(struct svc_req *req, char *buf);
  extern	int	sec_svc_getcred(struct svc_req *, cred_t *cr, char **, int *);
  extern	bool_t	sec_svc_inrootlist(int, caddr_t, int, caddr_t *);
  
+ static void	*nfs_server_zone_init(zoneid_t);
+ static void	nfs_server_zone_fini(zoneid_t, void *);
+ static void	nfs_server_zone_shutdown(zoneid_t, void *);
  
  #define	NFSLOG_COPY_NETBUF(exi, xprt, nb)	{ \
  	(nb)->maxlen = (xprt)->xp_rtaddr.maxlen; \
  	(nb)->len = (xprt)->xp_rtaddr.len; \
  	(nb)->buf = kmem_alloc((nb)->len, KM_SLEEP); \
*** 246,303 ****
  };
  
  static SVC_CALLOUT_TABLE nfs_sct_rdma = {
  	sizeof (__nfs_sc_rdma) / sizeof (__nfs_sc_rdma[0]), FALSE, __nfs_sc_rdma
  };
- rpcvers_t nfs_versmin = NFS_VERSMIN_DEFAULT;
- rpcvers_t nfs_versmax = NFS_VERSMAX_DEFAULT;
  
  /*
-  * Used to track the state of the server so that initialization
-  * can be done properly.
-  */
- typedef enum {
- 	NFS_SERVER_STOPPED,	/* server state destroyed */
- 	NFS_SERVER_STOPPING,	/* server state being destroyed */
- 	NFS_SERVER_RUNNING,
- 	NFS_SERVER_QUIESCED,	/* server state preserved */
- 	NFS_SERVER_OFFLINE	/* server pool offline */
- } nfs_server_running_t;
- 
- static nfs_server_running_t nfs_server_upordown;
- static kmutex_t nfs_server_upordown_lock;
- static kcondvar_t nfs_server_upordown_cv;
- 
- /*
   * DSS: distributed stable storage
   * lists of all DSS paths: current, and before last warmstart
   */
  nvlist_t *rfs4_dss_paths, *rfs4_dss_oldpaths;
  
  int rfs4_dispatch(struct rpcdisp *, struct svc_req *, SVCXPRT *, char *);
  bool_t rfs4_minorvers_mismatch(struct svc_req *, SVCXPRT *, void *);
  
  /*
!  * RDMA wait variables.
   */
! static kcondvar_t rdma_wait_cv;
! static kmutex_t rdma_wait_mutex;
  
  /*
   * Will be called at the point the server pool is being unregistered
   * from the pool list. From that point onwards, the pool is waiting
   * to be drained and as such the server state is stale and pertains
   * to the old instantiation of the NFS server pool.
   */
  void
  nfs_srv_offline(void)
  {
! 	mutex_enter(&nfs_server_upordown_lock);
! 	if (nfs_server_upordown == NFS_SERVER_RUNNING) {
! 		nfs_server_upordown = NFS_SERVER_OFFLINE;
  	}
! 	mutex_exit(&nfs_server_upordown_lock);
  }
  
  /*
   * Will be called at the point the server pool is being destroyed so
   * all transports have been closed and no service threads are in
--- 245,302 ----
  };
  
  static SVC_CALLOUT_TABLE nfs_sct_rdma = {
  	sizeof (__nfs_sc_rdma) / sizeof (__nfs_sc_rdma[0]), FALSE, __nfs_sc_rdma
  };
  
  /*
   * DSS: distributed stable storage
   * lists of all DSS paths: current, and before last warmstart
   */
  nvlist_t *rfs4_dss_paths, *rfs4_dss_oldpaths;
  
  int rfs4_dispatch(struct rpcdisp *, struct svc_req *, SVCXPRT *, char *);
  bool_t rfs4_minorvers_mismatch(struct svc_req *, SVCXPRT *, void *);
  
  /*
!  * Stash NFS zone globals in TSD to avoid some lock contention
!  * from frequent zone_getspecific calls.
   */
! static uint_t nfs_server_tsd_key;
  
+ nfs_globals_t *
+ nfs_srv_getzg(void)
+ {
+ 	nfs_globals_t *ng;
+ 
+ 	ng = tsd_get(nfs_server_tsd_key);
+ 	if (ng == NULL) {
+ 		ng = zone_getspecific(nfssrv_zone_key, curzone);
+ 		(void) tsd_set(nfs_server_tsd_key, ng);
+ 	}
+ 
+ 	return (ng);
+ }
+ 
  /*
   * Will be called at the point the server pool is being unregistered
   * from the pool list. From that point onwards, the pool is waiting
   * to be drained and as such the server state is stale and pertains
   * to the old instantiation of the NFS server pool.
   */
  void
  nfs_srv_offline(void)
  {
! 	nfs_globals_t *ng;
!
! 	ng = nfs_srv_getzg();
!
! 	mutex_enter(&ng->nfs_server_upordown_lock);
! 	if (ng->nfs_server_upordown == NFS_SERVER_RUNNING) {
! 		ng->nfs_server_upordown = NFS_SERVER_OFFLINE;
  	}
! 	mutex_exit(&ng->nfs_server_upordown_lock);
  }
  
  /*
   * Will be called at the point the server pool is being destroyed so
   * all transports have been closed and no service threads are in
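The hunk above replaces the old file-scope server-state variables with a per-zone nfs_globals_t, fetched through nfs_srv_getzg(), which caches the zone_getspecific() result in thread-specific data so hot paths avoid the zone lock. A minimal sketch of how a caller might use it, assuming only the names visible in this hunk (the wrapper function itself is hypothetical, not part of the change):

	/* Hypothetical helper: report the calling zone's NFS version ceiling. */
	static rpcvers_t
	nfs_srv_zone_versmax(void)
	{
		/* TSD-cached lookup of this zone's NFS server globals */
		nfs_globals_t *ng = nfs_srv_getzg();

		return (ng->nfs_versmax);
	}

Any service thread running in a zone's context picks up that zone's globals; the TSD entry is only a cache in front of zone_getspecific().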
*** 322,358 ****
  	int quiesce = 1;
  	nfs_srv_shutdown_all(quiesce);
  }
  
  static void
! nfs_srv_shutdown_all(int quiesce) {
! 	mutex_enter(&nfs_server_upordown_lock);
  	if (quiesce) {
! 		if (nfs_server_upordown == NFS_SERVER_RUNNING ||
! 		    nfs_server_upordown == NFS_SERVER_OFFLINE) {
! 			nfs_server_upordown = NFS_SERVER_QUIESCED;
! 			cv_signal(&nfs_server_upordown_cv);
!
! 			/* reset DSS state, for subsequent warm restart */
  			rfs4_dss_numnewpaths = 0;
  			rfs4_dss_newpaths = NULL;
  
  			cmn_err(CE_NOTE, "nfs_server: server is now quiesced; "
  			    "NFSv4 state has been preserved");
  		}
  	} else {
! 		if (nfs_server_upordown == NFS_SERVER_OFFLINE) {
! 			nfs_server_upordown = NFS_SERVER_STOPPING;
! 			mutex_exit(&nfs_server_upordown_lock);
! 			rfs4_state_fini();
! 			rfs4_fini_drc(nfs4_drc);
! 			mutex_enter(&nfs_server_upordown_lock);
! 			nfs_server_upordown = NFS_SERVER_STOPPED;
! 			cv_signal(&nfs_server_upordown_cv);
  		}
  	}
! 	mutex_exit(&nfs_server_upordown_lock);
  }
  
  static int
  nfs_srv_set_sc_versions(struct file *fp, SVC_CALLOUT_TABLE **sctpp,
      rpcvers_t versmin, rpcvers_t versmax)
--- 321,365 ----
  	int quiesce = 1;
  	nfs_srv_shutdown_all(quiesce);
  }
  
  static void
! nfs_srv_shutdown_all(int quiesce)
! {
! 	nfs_globals_t *ng = nfs_srv_getzg();
!
! 	mutex_enter(&ng->nfs_server_upordown_lock);
  	if (quiesce) {
! 		if (ng->nfs_server_upordown == NFS_SERVER_RUNNING ||
! 		    ng->nfs_server_upordown == NFS_SERVER_OFFLINE) {
! 			ng->nfs_server_upordown = NFS_SERVER_QUIESCED;
! 			cv_signal(&ng->nfs_server_upordown_cv);
!
! 			/* reset DSS state */
  			rfs4_dss_numnewpaths = 0;
  			rfs4_dss_newpaths = NULL;
  
  			cmn_err(CE_NOTE, "nfs_server: server is now quiesced; "
  			    "NFSv4 state has been preserved");
  		}
  	} else {
! 		if (ng->nfs_server_upordown == NFS_SERVER_OFFLINE) {
! 			ng->nfs_server_upordown = NFS_SERVER_STOPPING;
! 			mutex_exit(&ng->nfs_server_upordown_lock);
! 			rfs4_state_zone_fini();
! 			rfs4_fini_drc();
! 			mutex_enter(&ng->nfs_server_upordown_lock);
! 			ng->nfs_server_upordown = NFS_SERVER_STOPPED;
!
! 			/* reset DSS state */
! 			rfs4_dss_numnewpaths = 0;
! 			rfs4_dss_newpaths = NULL;
!
! 			cv_signal(&ng->nfs_server_upordown_cv);
  		}
  	}
! 	mutex_exit(&ng->nfs_server_upordown_lock);
  }
  
  static int
  nfs_srv_set_sc_versions(struct file *fp, SVC_CALLOUT_TABLE **sctpp,
      rpcvers_t versmin, rpcvers_t versmax)
*** 416,425 ****
--- 423,433 ----
   * uap->fd is the fd of an open transport provider
   */
  int
  nfs_svc(struct nfs_svc_args *arg, model_t model)
  {
+ 	nfs_globals_t *ng;
  	file_t *fp;
  	SVCMASTERXPRT *xprt;
  	int error;
  	int readsize;
  	char buf[KNC_STRSIZE];
*** 430,446 ****
--- 438,459 ----
  #ifdef lint
  	model = model;		/* STRUCT macros don't always refer to it */
  #endif
  
+ 	ng = nfs_srv_getzg();
  
  	STRUCT_SET_HANDLE(uap, model, arg);
  
  	/* Check privileges in nfssys() */
  
  	if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
  		return (EBADF);
  
+ 	/* Setup global file handle in nfs_export */
+ 	if ((error = nfs_export_get_rootfh(ng)) != 0)
+ 		return (error);
+ 
  	/*
  	 * Set read buffer size to rsize
  	 * and add room for RPC headers.
  	 */
  	readsize = nfs3tsize() + (RPC_MAXDATASIZE - NFS_MAXDATA);
*** 463,493 ****
  		releasef(STRUCT_FGET(uap, fd));
  		kmem_free(addrmask.buf, addrmask.maxlen);
  		return (error);
  	}
  
! 	nfs_versmin = STRUCT_FGET(uap, versmin);
! 	nfs_versmax = STRUCT_FGET(uap, versmax);
  
  	/* Double check the vers min/max ranges */
! 	if ((nfs_versmin > nfs_versmax) ||
! 	    (nfs_versmin < NFS_VERSMIN) ||
! 	    (nfs_versmax > NFS_VERSMAX)) {
! 		nfs_versmin = NFS_VERSMIN_DEFAULT;
! 		nfs_versmax = NFS_VERSMAX_DEFAULT;
  	}
  
! 	if (error =
! 	    nfs_srv_set_sc_versions(fp, &sctp, nfs_versmin, nfs_versmax)) {
  		releasef(STRUCT_FGET(uap, fd));
  		kmem_free(addrmask.buf, addrmask.maxlen);
  		return (error);
  	}
  
  	/* Initialize nfsv4 server */
! 	if (nfs_versmax == (rpcvers_t)NFS_V4)
! 		rfs4_server_start(STRUCT_FGET(uap, delegation));
  
  	/* Create a transport handle. */
  	error = svc_tli_kcreate(fp, readsize, buf, &addrmask, &xprt,
  	    sctp, NULL, NFS_SVCPOOL_ID, TRUE);
--- 476,506 ----
  		releasef(STRUCT_FGET(uap, fd));
  		kmem_free(addrmask.buf, addrmask.maxlen);
  		return (error);
  	}
  
! 	ng->nfs_versmin = STRUCT_FGET(uap, versmin);
! 	ng->nfs_versmax = STRUCT_FGET(uap, versmax);
  
  	/* Double check the vers min/max ranges */
! 	if ((ng->nfs_versmin > ng->nfs_versmax) ||
! 	    (ng->nfs_versmin < NFS_VERSMIN) ||
! 	    (ng->nfs_versmax > NFS_VERSMAX)) {
! 		ng->nfs_versmin = NFS_VERSMIN_DEFAULT;
! 		ng->nfs_versmax = NFS_VERSMAX_DEFAULT;
  	}
  
! 	if (error = nfs_srv_set_sc_versions(fp, &sctp, ng->nfs_versmin,
! 	    ng->nfs_versmax)) {
  		releasef(STRUCT_FGET(uap, fd));
  		kmem_free(addrmask.buf, addrmask.maxlen);
  		return (error);
  	}
  
  	/* Initialize nfsv4 server */
! 	if (ng->nfs_versmax == (rpcvers_t)NFS_V4)
! 		rfs4_server_start(ng, STRUCT_FGET(uap, delegation));
  
  	/* Create a transport handle. */
  	error = svc_tli_kcreate(fp, readsize, buf, &addrmask, &xprt,
  	    sctp, NULL, NFS_SVCPOOL_ID, TRUE);
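Both nfs_svc() above and rdma_start() further down validate the requested NFS version range against NFS_VERSMIN/NFS_VERSMAX before storing it in the per-zone globals. A hedged sketch of that check factored into a helper; the helper name is hypothetical, while the clamping logic simply mirrors the hunk above:

	/* Hypothetical helper: clamp and record a zone's NFS version range. */
	static void
	nfs_srv_set_versrange(nfs_globals_t *ng, rpcvers_t vmin, rpcvers_t vmax)
	{
		/* Fall back to the defaults if the requested range is bogus. */
		if (vmin > vmax || vmin < NFS_VERSMIN || vmax > NFS_VERSMAX) {
			vmin = NFS_VERSMIN_DEFAULT;
			vmax = NFS_VERSMAX_DEFAULT;
		}
		ng->nfs_versmin = vmin;
		ng->nfs_versmax = vmax;
	}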
*** 502,573 ****
  	return (error);
  }
  
  static void
! rfs4_server_start(int nfs4_srv_delegation)
  {
  	/*
  	 * Determine if the server has previously been "started" and
  	 * if not, do the per instance initialization
  	 */
! 	mutex_enter(&nfs_server_upordown_lock);
! 	if (nfs_server_upordown != NFS_SERVER_RUNNING) {
  		/* Do we need to stop and wait on the previous server? */
! 		while (nfs_server_upordown == NFS_SERVER_STOPPING ||
! 		    nfs_server_upordown == NFS_SERVER_OFFLINE)
! 			cv_wait(&nfs_server_upordown_cv,
! 			    &nfs_server_upordown_lock);
! 		if (nfs_server_upordown != NFS_SERVER_RUNNING) {
  			(void) svc_pool_control(NFS_SVCPOOL_ID,
  			    SVCPSET_UNREGISTER_PROC, (void *)&nfs_srv_offline);
  			(void) svc_pool_control(NFS_SVCPOOL_ID,
  			    SVCPSET_SHUTDOWN_PROC, (void *)&nfs_srv_stop_all);
! 			/* is this an nfsd warm start? */
! 			if (nfs_server_upordown == NFS_SERVER_QUIESCED) {
! 				cmn_err(CE_NOTE, "nfs_server: "
! 				    "server was previously quiesced; "
! 				    "existing NFSv4 state will be re-used");
! 				/*
! 				 * HA-NFSv4: this is also the signal
! 				 * that a Resource Group failover has
! 				 * occurred.
! 				 */
! 				if (cluster_bootflags & CLUSTER_BOOTED)
! 					hanfsv4_failover();
! 			} else {
! 				/* cold start */
! 				rfs4_state_init();
! 				nfs4_drc = rfs4_init_drc(nfs4_drc_max,
! 				    nfs4_drc_hash);
  			}
!
! 			/*
! 			 * Check to see if delegation is to be
! 			 * enabled at the server
! 			 */
! 			if (nfs4_srv_delegation != FALSE)
! 				rfs4_set_deleg_policy(SRV_NORMAL_DELEGATE);
!
! 			nfs_server_upordown = NFS_SERVER_RUNNING;
  		}
! 		cv_signal(&nfs_server_upordown_cv);
! 	}
! 	mutex_exit(&nfs_server_upordown_lock);
  }
  
  /*
   * If RDMA device available,
   * start RDMA listener.
   */
  int
  rdma_start(struct rdma_svc_args *rsa)
  {
  	int error;
  	rdma_xprt_group_t started_rdma_xprts;
  	rdma_stat stat;
  	int svc_state = 0;
--- 515,564 ----
  	return (error);
  }
  
  static void
! rfs4_server_start(nfs_globals_t *ng, int nfs4_srv_delegation)
  {
  	/*
  	 * Determine if the server has previously been "started" and
  	 * if not, do the per instance initialization
  	 */
! 	mutex_enter(&ng->nfs_server_upordown_lock);
! 	if (ng->nfs_server_upordown != NFS_SERVER_RUNNING) {
  		/* Do we need to stop and wait on the previous server? */
! 		while (ng->nfs_server_upordown == NFS_SERVER_STOPPING ||
! 		    ng->nfs_server_upordown == NFS_SERVER_OFFLINE)
! 			cv_wait(&ng->nfs_server_upordown_cv,
! 			    &ng->nfs_server_upordown_lock);
! 		if (ng->nfs_server_upordown != NFS_SERVER_RUNNING) {
  			(void) svc_pool_control(NFS_SVCPOOL_ID,
  			    SVCPSET_UNREGISTER_PROC, (void *)&nfs_srv_offline);
  			(void) svc_pool_control(NFS_SVCPOOL_ID,
  			    SVCPSET_SHUTDOWN_PROC, (void *)&nfs_srv_stop_all);
! 			rfs4_do_server_start(ng->nfs_server_upordown,
! 			    nfs4_srv_delegation,
! 			    cluster_bootflags & CLUSTER_BOOTED);
! 			ng->nfs_server_upordown = NFS_SERVER_RUNNING;
  		}
! 		cv_signal(&ng->nfs_server_upordown_cv);
  	}
! 	mutex_exit(&ng->nfs_server_upordown_lock);
  }
  
  /*
   * If RDMA device available,
   * start RDMA listener.
   */
  int
  rdma_start(struct rdma_svc_args *rsa)
  {
+ 	nfs_globals_t *ng;
  	int error;
  	rdma_xprt_group_t started_rdma_xprts;
  	rdma_stat stat;
  	int svc_state = 0;
*** 576,588 ****
  	    (rsa->nfs_versmin < NFS_VERSMIN) ||
  	    (rsa->nfs_versmax > NFS_VERSMAX)) {
  		rsa->nfs_versmin = NFS_VERSMIN_DEFAULT;
  		rsa->nfs_versmax = NFS_VERSMAX_DEFAULT;
  	}
- 	nfs_versmin = rsa->nfs_versmin;
- 	nfs_versmax = rsa->nfs_versmax;
  
  	/* Set the versions in the callout table */
  	__nfs_sc_rdma[0].sc_versmin = rsa->nfs_versmin;
  	__nfs_sc_rdma[0].sc_versmax = rsa->nfs_versmax;
  	/* For the NFS_ACL program, check the max version */
  	__nfs_sc_rdma[1].sc_versmin = rsa->nfs_versmin;
--- 567,581 ----
  	    (rsa->nfs_versmin < NFS_VERSMIN) ||
  	    (rsa->nfs_versmax > NFS_VERSMAX)) {
  		rsa->nfs_versmin = NFS_VERSMIN_DEFAULT;
  		rsa->nfs_versmax = NFS_VERSMAX_DEFAULT;
  	}
  
+ 	ng = nfs_srv_getzg();
+ 	ng->nfs_versmin = rsa->nfs_versmin;
+ 	ng->nfs_versmax = rsa->nfs_versmax;
+ 
  	/* Set the versions in the callout table */
  	__nfs_sc_rdma[0].sc_versmin = rsa->nfs_versmin;
  	__nfs_sc_rdma[0].sc_versmax = rsa->nfs_versmax;
  	/* For the NFS_ACL program, check the max version */
  	__nfs_sc_rdma[1].sc_versmin = rsa->nfs_versmin;
*** 591,601 ****
  	else
  		__nfs_sc_rdma[1].sc_versmax = rsa->nfs_versmax;
  
  	/* Initialize nfsv4 server */
  	if (rsa->nfs_versmax == (rpcvers_t)NFS_V4)
! 		rfs4_server_start(rsa->delegation);
  
  	started_rdma_xprts.rtg_count = 0;
  	started_rdma_xprts.rtg_listhead = NULL;
  	started_rdma_xprts.rtg_poolid = rsa->poolid;
--- 584,594 ----
  	else
  		__nfs_sc_rdma[1].sc_versmax = rsa->nfs_versmax;
  
  	/* Initialize nfsv4 server */
  	if (rsa->nfs_versmax == (rpcvers_t)NFS_V4)
! 		rfs4_server_start(ng, rsa->delegation);
  
  	started_rdma_xprts.rtg_count = 0;
  	started_rdma_xprts.rtg_listhead = NULL;
  	started_rdma_xprts.rtg_poolid = rsa->poolid;
*** 608,618 ****
  	while (!error) {
  
  		/*
  		 * wait till either interrupted by a signal on
  		 * nfs service stop/restart or signalled by a
! 		 * rdma plugin attach/detatch.
  		 */
  
  		stat = rdma_kwait();
  
  		/*
--- 601,611 ----
  	while (!error) {
  
  		/*
  		 * wait till either interrupted by a signal on
  		 * nfs service stop/restart or signalled by a
! 		 * rdma attach/detatch.
  		 */
  
  		stat = rdma_kwait();
  
  		/*
*** 659,672 ****
  /* ARGSUSED */
  void
  rpc_null_v3(caddr_t *argp, caddr_t *resp, struct exportinfo *exi,
      struct svc_req *req, cred_t *cr, bool_t ro)
  {
! 	DTRACE_NFSV3_3(op__null__start, struct svc_req *, req,
! 	    cred_t *, cr, vnode_t *, NULL);
! 	DTRACE_NFSV3_3(op__null__done, struct svc_req *, req,
! 	    cred_t *, cr, vnode_t *, NULL);
  }
  
  /* ARGSUSED */
  static void
  rfs_error(caddr_t *argp, caddr_t *resp, struct exportinfo *exi,
--- 652,665 ----
  /* ARGSUSED */
  void
  rpc_null_v3(caddr_t *argp, caddr_t *resp, struct exportinfo *exi,
      struct svc_req *req, cred_t *cr, bool_t ro)
  {
! 	DTRACE_NFSV3_4(op__null__start, struct svc_req *, req,
! 	    cred_t *, cr, vnode_t *, NULL, struct exportinfo *, exi);
! 	DTRACE_NFSV3_4(op__null__done, struct svc_req *, req,
! 	    cred_t *, cr, vnode_t *, NULL, struct exportinfo *, exi);
  }
  
  /* ARGSUSED */
  static void
  rfs_error(caddr_t *argp, caddr_t *resp, struct exportinfo *exi,
*** 1340,1356 ****
  };
  
  static struct rpc_disptable rfs_disptable[] = {
  	{sizeof (rfsdisptab_v2) / sizeof (rfsdisptab_v2[0]),
  	    rfscallnames_v2,
! 	    &rfsproccnt_v2_ptr, rfsdisptab_v2},
  	{sizeof (rfsdisptab_v3) / sizeof (rfsdisptab_v3[0]),
  	    rfscallnames_v3,
! 	    &rfsproccnt_v3_ptr, rfsdisptab_v3},
  	{sizeof (rfsdisptab_v4) / sizeof (rfsdisptab_v4[0]),
  	    rfscallnames_v4,
! 	    &rfsproccnt_v4_ptr, rfsdisptab_v4},
  };
  
  /*
   * If nfs_portmon is set, then clients are required to use privileged
   * ports (ports < IPPORT_RESERVED) in order to get NFS services.
--- 1333,1349 ----
  };
  
  static struct rpc_disptable rfs_disptable[] = {
  	{sizeof (rfsdisptab_v2) / sizeof (rfsdisptab_v2[0]),
  	    rfscallnames_v2,
! 	    rfsdisptab_v2},
  	{sizeof (rfsdisptab_v3) / sizeof (rfsdisptab_v3[0]),
  	    rfscallnames_v3,
! 	    rfsdisptab_v3},
  	{sizeof (rfsdisptab_v4) / sizeof (rfsdisptab_v4[0]),
  	    rfscallnames_v4,
! 	    rfsdisptab_v4},
  };
  
  /*
   * If nfs_portmon is set, then clients are required to use privileged
   * ports (ports < IPPORT_RESERVED) in order to get NFS services.
*** 1365,1375 ****
  #ifdef DEBUG
  static int cred_hits = 0;
  static int cred_misses = 0;
  #endif
  
- #ifdef DEBUG
  /*
   * Debug code to allow disabling of rfs_dispatch() use of
   * fastxdrargs() and fastxdrres() calls for testing purposes.
   */
--- 1358,1367 ----
*** 1469,1483 ****
  		return (TRUE);
  	}
  	return (FALSE);
  }
- 
  static void
  common_dispatch(struct svc_req *req, SVCXPRT *xprt, rpcvers_t min_vers,
! 	rpcvers_t max_vers, char *pgmname,
! 	struct rpc_disptable *disptable)
  {
  	int which;
  	rpcvers_t vers;
  	char *args;
  
  	union {
--- 1461,1473 ----
  		return (TRUE);
  	}
  	return (FALSE);
  }
  static void
  common_dispatch(struct svc_req *req, SVCXPRT *xprt, rpcvers_t min_vers,
!     rpcvers_t max_vers, char *pgmname, struct rpc_disptable *disptable)
  {
  	int which;
  	rpcvers_t vers;
  	char *args;
  
  	union {
*** 1506,1518 ****
--- 1496,1517 ----
  	bool_t logging_enabled = FALSE;
  	struct exportinfo *nfslog_exi = NULL;
  	char **procnames;
  	char cbuf[INET6_ADDRSTRLEN];	/* to hold both IPv4 and IPv6 addr */
  	bool_t ro = FALSE;
+ 	nfs_globals_t *ng = nfs_srv_getzg();
+ 	nfs_export_t *ne = ng->nfs_export;
+ 	kstat_named_t *svstat, *procstat;
+ 
+ 	ASSERT(req->rq_prog == NFS_PROGRAM || req->rq_prog == NFS_ACL_PROGRAM);
+ 
  	vers = req->rq_vers;
  
+ 	svstat = ng->svstat[req->rq_vers];
+ 	procstat = (req->rq_prog == NFS_PROGRAM) ?
+ 	    ng->rfsproccnt[vers] : ng->aclproccnt[vers];
+ 
  	if (vers < min_vers || vers > max_vers) {
  		svcerr_progvers(req->rq_xprt, min_vers, max_vers);
  		error++;
  		cmn_err(CE_NOTE, "%s: bad version number %u", pgmname, vers);
  		goto done;
*** 1524,1534 ****
  		svcerr_noproc(req->rq_xprt);
  		error++;
  		goto done;
  	}
  
! 	(*(disptable[(int)vers].dis_proccntp))[which].value.ui64++;
  
  	disp = &disptable[(int)vers].dis_table[which];
  	procnames = disptable[(int)vers].dis_procnames;
  
  	auth_flavor = req->rq_cred.oa_flavor;
--- 1523,1533 ----
  		svcerr_noproc(req->rq_xprt);
  		error++;
  		goto done;
  	}
  
! 	procstat[which].value.ui64++;
  
  	disp = &disptable[(int)vers].dis_table[which];
  	procnames = disptable[(int)vers].dis_procnames;
  
  	auth_flavor = req->rq_cred.oa_flavor;
*** 1630,1646 ****
--- 1629,1647 ----
  			anon_ok = 0;
  
  		cr = xprt->xp_cred;
  		ASSERT(cr != NULL);
  #ifdef DEBUG
+ 		{
  		if (crgetref(cr) != 1) {
  			crfree(cr);
  			cr = crget();
  			xprt->xp_cred = cr;
  			cred_misses++;
  		} else
  			cred_hits++;
+ 		}
  #else
  		if (crgetref(cr) != 1) {
  			crfree(cr);
  			cr = crget();
  			xprt->xp_cred = cr;
*** 1648,1658 ****
  #endif
  
  		exi = checkexport(fsid, xfid);
  
  		if (exi != NULL) {
! 			publicfh_ok = PUBLICFH_CHECK(disp, exi, fsid, xfid);
  
  			/*
  			 * Don't allow non-V4 clients access
  			 * to pseudo exports
  			 */
--- 1649,1659 ----
  #endif
  
  		exi = checkexport(fsid, xfid);
  
  		if (exi != NULL) {
! 			publicfh_ok = PUBLICFH_CHECK(ne, disp, exi, fsid, xfid);
  
  			/*
  			 * Don't allow non-V4 clients access
  			 * to pseudo exports
  			 */
*** 1761,1771 ****
  	 * the later writing of the log record. This is done for
  	 * the case that a lookup is done across a non-logged public
  	 * file system.
  	 */
  	if (nfslog_buffer_list != NULL) {
! 		nfslog_exi = nfslog_get_exi(exi, req, res, &nfslog_rec_id);
  
  		/*
  		 * Is logging enabled?
  		 */
  		logging_enabled = (nfslog_exi != NULL);
--- 1762,1772 ----
  	 * the later writing of the log record. This is done for
  	 * the case that a lookup is done across a non-logged public
  	 * file system.
  	 */
  	if (nfslog_buffer_list != NULL) {
! 		nfslog_exi = nfslog_get_exi(ne, exi, req, res, &nfslog_rec_id);
  
  		/*
  		 * Is logging enabled?
  		 */
  		logging_enabled = (nfslog_exi != NULL);
*** 1844,1856 ****
  	}
  
  	if (exi != NULL)
  		exi_rele(exi);
  
! 	global_svstat_ptr[req->rq_vers][NFS_BADCALLS].value.ui64 += error;
!
! 	global_svstat_ptr[req->rq_vers][NFS_CALLS].value.ui64++;
  }
  
  static void
  rfs_dispatch(struct svc_req *req, SVCXPRT *xprt)
  {
--- 1845,1856 ----
  	}
  
  	if (exi != NULL)
  		exi_rele(exi);
  
! 	svstat[NFS_BADCALLS].value.ui64 += error;
! 	svstat[NFS_CALLS].value.ui64++;
  }
  
  static void
  rfs_dispatch(struct svc_req *req, SVCXPRT *xprt)
  {
*** 1969,1982 ****
  };
  
  static struct rpc_disptable acl_disptable[] = {
  	{sizeof (acldisptab_v2) / sizeof (acldisptab_v2[0]),
  	    aclcallnames_v2,
! 	    &aclproccnt_v2_ptr, acldisptab_v2},
  	{sizeof (acldisptab_v3) / sizeof (acldisptab_v3[0]),
  	    aclcallnames_v3,
! 	    &aclproccnt_v3_ptr, acldisptab_v3},
  };
  
  static void
  acl_dispatch(struct svc_req *req, SVCXPRT *xprt)
  {
--- 1969,1982 ----
  };
  
  static struct rpc_disptable acl_disptable[] = {
  	{sizeof (acldisptab_v2) / sizeof (acldisptab_v2[0]),
  	    aclcallnames_v2,
! 	    acldisptab_v2},
  	{sizeof (acldisptab_v3) / sizeof (acldisptab_v3[0]),
  	    aclcallnames_v3,
! 	    acldisptab_v3},
  };
  
  static void
  acl_dispatch(struct svc_req *req, SVCXPRT *xprt)
  {
*** 2566,2600 ****
   * once. It performs the following tasks:
   *	- Call sub-initialization routines (localize access to variables)
   *	- Initialize all locks
   *	- initialize the version 3 write verifier
   */
! int
  nfs_srvinit(void)
  {
- 	int error;
  
! 	error = nfs_exportinit();
! 	if (error != 0)
! 		return (error);
! 	error = rfs4_srvrinit();
! 	if (error != 0) {
! 		nfs_exportfini();
! 		return (error);
! 	}
  	rfs_srvrinit();
  	rfs3_srvrinit();
  	nfsauth_init();
  
! 	/* Init the stuff to control start/stop */
! 	nfs_server_upordown = NFS_SERVER_STOPPED;
! 	mutex_init(&nfs_server_upordown_lock, NULL, MUTEX_DEFAULT, NULL);
! 	cv_init(&nfs_server_upordown_cv, NULL, CV_DEFAULT, NULL);
! 	mutex_init(&rdma_wait_mutex, NULL, MUTEX_DEFAULT, NULL);
! 	cv_init(&rdma_wait_cv, NULL, CV_DEFAULT, NULL);
!
! 	return (0);
  }
  
  /*
   * NFS Server finalization routine. This routine is called to cleanup the
   * initialization work previously performed if the NFS server module could
--- 2566,2598 ----
   * once. It performs the following tasks:
   *	- Call sub-initialization routines (localize access to variables)
   *	- Initialize all locks
   *	- initialize the version 3 write verifier
   */
! void
  nfs_srvinit(void)
  {
  
! 	/* Truly global stuff in this module (not per zone) */
! 	rw_init(&nfssrv_globals_rwl, NULL, RW_DEFAULT, NULL);
! 	list_create(&nfssrv_globals_list, sizeof (nfs_globals_t),
! 	    offsetof(nfs_globals_t, nfs_g_link));
! 	tsd_create(&nfs_server_tsd_key, NULL);
!
! 	/* The order here is important */
! 	nfs_exportinit();
  	rfs_srvrinit();
  	rfs3_srvrinit();
+ 	rfs4_srvrinit();
  	nfsauth_init();
  
! 	/*
! 	 * NFS server zone-specific global variables
! 	 * Note the zone_init is called for the GZ here.
! 	 */
! 	zone_key_create(&nfssrv_zone_key, nfs_server_zone_init,
! 	    nfs_server_zone_shutdown, nfs_server_zone_fini);
  }
  
  /*
   * NFS Server finalization routine. This routine is called to cleanup the
   * initialization work previously performed if the NFS server module could
--- 2599,2728 ----
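nfs_srvinit() now registers the per-zone constructor, shutdown, and destructor callbacks through zone_key_create(), so the hierarchy of server globals is built for the global zone immediately and for every other zone as it boots (the matching zone_key_delete() in nfs_srvfini() tears it down). A stripped-down sketch of that ZSD pattern with a throwaway struct; all demo_* names are purely illustrative and not part of the change:

	typedef struct demo_globals {
		zoneid_t	dg_zoneid;
	} demo_globals_t;

	static zone_key_t demo_zone_key;

	static void *
	demo_zone_init(zoneid_t zoneid)
	{
		demo_globals_t *dg = kmem_zalloc(sizeof (*dg), KM_SLEEP);

		dg->dg_zoneid = zoneid;
		return (dg);
	}

	static void
	demo_zone_fini(zoneid_t zoneid, void *data)
	{
		kmem_free(data, sizeof (demo_globals_t));
	}

	void
	demo_register(void)
	{
		/* Runs the init callback for existing zones (incl. GZ) too. */
		zone_key_create(&demo_zone_key, demo_zone_init,
		    NULL, demo_zone_fini);
	}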
*** 2601,2625 ****
   * not be loaded correctly.
   */
  void
  nfs_srvfini(void)
  {
  	nfsauth_fini();
  	rfs3_srvrfini();
  	rfs_srvrfini();
  	nfs_exportfini();
  
! 	mutex_destroy(&nfs_server_upordown_lock);
! 	cv_destroy(&nfs_server_upordown_cv);
! 	mutex_destroy(&rdma_wait_mutex);
! 	cv_destroy(&rdma_wait_cv);
  }
  
  /*
!  * Set up an iovec array of up to cnt pointers.
   */
  void
  mblk_to_iov(mblk_t *m, int cnt, struct iovec *iovp)
  {
  	while (m != NULL && cnt-- > 0) {
  		iovp->iov_base = (caddr_t)m->b_rptr;
--- 2599,2728 ----
   * not be loaded correctly.
   */
  void
  nfs_srvfini(void)
  {
+
+ 	/*
+ 	 * NFS server zone-specific global variables
+ 	 * Note the zone_fini is called for the GZ here.
+ 	 */
+ 	(void) zone_key_delete(nfssrv_zone_key);
+ 
+ 	/* The order here is important (reverse of init) */
  	nfsauth_fini();
+ 	rfs4_srvrfini();
  	rfs3_srvrfini();
  	rfs_srvrfini();
  	nfs_exportfini();
  
! 	/* Truly global stuff in this module (not per zone) */
! 	tsd_destroy(&nfs_server_tsd_key);
! 	list_destroy(&nfssrv_globals_list);
! 	rw_destroy(&nfssrv_globals_rwl);
  }
  
  /*
!  * Zone init, shutdown, fini functions for the NFS server
!  *
!  * This design is careful to create the entire hierarhcy of
!  * NFS server "globals" (including those created by various
!  * per-module *_zone_init functions, etc.) so that all these
!  * objects have exactly the same lifetime.
!  *
!  * These objects are also kept on a list for two reasons:
!  * 1: It makes finding these in mdb _much_ easier.
!  * 2: It allows operating across all zone globals for
!  *    functions like nfs_auth.c:exi_cache_reclaim
   */
+ static void *
+ nfs_server_zone_init(zoneid_t zoneid)
+ {
+ 	nfs_globals_t *ng;
+ 	ng = kmem_zalloc(sizeof (*ng), KM_SLEEP);
+ 
+ 	ng->nfs_versmin = NFS_VERSMIN_DEFAULT;
+ 	ng->nfs_versmax = NFS_VERSMAX_DEFAULT;
+ 
+ 	/* Init the stuff to control start/stop */
+ 	ng->nfs_server_upordown = NFS_SERVER_STOPPED;
+ 	mutex_init(&ng->nfs_server_upordown_lock, NULL, MUTEX_DEFAULT, NULL);
+ 	cv_init(&ng->nfs_server_upordown_cv, NULL, CV_DEFAULT, NULL);
+ 	mutex_init(&ng->rdma_wait_mutex, NULL, MUTEX_DEFAULT, NULL);
+ 	cv_init(&ng->rdma_wait_cv, NULL, CV_DEFAULT, NULL);
+ 
+ 	ng->nfs_zoneid = zoneid;
+ 
+ 	/*
+ 	 * Order here is important.
+ 	 * export init must precede srv init calls.
+ 	 */
+ 	nfs_export_zone_init(ng);
+ 	rfs_stat_zone_init(ng);
+ 	rfs_srv_zone_init(ng);
+ 	rfs3_srv_zone_init(ng);
+ 	rfs4_srv_zone_init(ng);
+ 	nfsauth_zone_init(ng);
+ 
+ 	rw_enter(&nfssrv_globals_rwl, RW_WRITER);
+ 	list_insert_tail(&nfssrv_globals_list, ng);
+ 	rw_exit(&nfssrv_globals_rwl);
+ 
+ 	return (ng);
+ }
+ 
+ /* ARGSUSED */
+ static void
+ nfs_server_zone_shutdown(zoneid_t zoneid, void *data)
+ {
+ 	nfs_globals_t *ng;
+ 
+ 	ng = (nfs_globals_t *)data;
+ 
+ 	/*
+ 	 * Order is like _fini, but only
+ 	 * some modules need this hook.
+ 	 */
+ 	nfsauth_zone_shutdown(ng);
+ 	nfs_export_zone_shutdown(ng);
+ }
+ 
+ /* ARGSUSED */
+ static void
+ nfs_server_zone_fini(zoneid_t zoneid, void *data)
+ {
+ 	nfs_globals_t *ng;
+ 
+ 	ng = (nfs_globals_t *)data;
+ 
+ 	rw_enter(&nfssrv_globals_rwl, RW_WRITER);
+ 	list_remove(&nfssrv_globals_list, ng);
+ 	rw_exit(&nfssrv_globals_rwl);
+ 
+ 	/*
+ 	 * Order here is important.
+ 	 * reverse order from init
+ 	 */
+ 	nfsauth_zone_fini(ng);
+ 	rfs4_srv_zone_fini(ng);
+ 	rfs3_srv_zone_fini(ng);
+ 	rfs_srv_zone_fini(ng);
+ 	rfs_stat_zone_fini(ng);
+ 	nfs_export_zone_fini(ng);
+ 
+ 	mutex_destroy(&ng->nfs_server_upordown_lock);
+ 	cv_destroy(&ng->nfs_server_upordown_cv);
+ 	mutex_destroy(&ng->rdma_wait_mutex);
+ 	cv_destroy(&ng->rdma_wait_cv);
+ 
+ 	kmem_free(ng, sizeof (*ng));
+ }
+ 
+ /*
+  * Set up an iovec array of up to cnt pointers.
+  */
  void
  mblk_to_iov(mblk_t *m, int cnt, struct iovec *iovp)
  {
  	while (m != NULL && cnt-- > 0) {
  		iovp->iov_base = (caddr_t)m->b_rptr;
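The block comment in the hunk above notes that each zone's globals are also kept on nfssrv_globals_list so that code such as nfs_auth.c:exi_cache_reclaim can act on every zone at once. A hedged sketch of such a walk, using only the list and rwlock declared earlier in this change; the callback-style helper itself is hypothetical:

	/* Hypothetical helper: apply func to every zone's NFS server globals. */
	static void
	nfssrv_walk_zone_globals(void (*func)(nfs_globals_t *))
	{
		nfs_globals_t *ng;

		rw_enter(&nfssrv_globals_rwl, RW_READER);
		for (ng = list_head(&nfssrv_globals_list); ng != NULL;
		    ng = list_next(&nfssrv_globals_list, ng))
			func(ng);
		rw_exit(&nfssrv_globals_rwl);
	}

Taking the lock as RW_READER is enough for cross-zone scans, while zone creation and teardown take it as RW_WRITER to insert or remove entries.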
*** 2853,2867 ****
--- 2956,2972 ----
  		 */
  
  		/* Release the reference on the old exi value */
  		ASSERT(*exi != NULL);
  		exi_rele(*exi);
+ 		*exi = NULL;
  
  		if (error = nfs_check_vpexi(mc_dvp, *vpp, kcred, exi)) {
  			VN_RELE(*vpp);
  			goto publicfh_done;
  		}
+ 		/* Have a new *exi */
  	}
  }
  
  publicfh_done:
  	if (mc_dvp)
*** 2884,2901 ****
  {
  	char namebuf[TYPICALMAXPATHLEN];
  	struct pathname pn;
  	int error;
  
  	/*
  	 * If pathname starts with '/', then set startdvp to root.
  	 */
  	if (*path == '/') {
  		while (*path == '/')
  			path++;
  
! 		startdvp = rootdir;
  	}
  
  	error = pn_get_buf(path, UIO_SYSSPACE, &pn, namebuf, sizeof (namebuf));
  	if (error == 0) {
  		/*
--- 2989,3008 ----
  {
  	char namebuf[TYPICALMAXPATHLEN];
  	struct pathname pn;
  	int error;
  
+ 	ASSERT3U(crgetzoneid(cr), ==, curzone->zone_id);
+ 
  	/*
  	 * If pathname starts with '/', then set startdvp to root.
  	 */
  	if (*path == '/') {
  		while (*path == '/')
  			path++;
  
! 		startdvp = ZONE_ROOTVP();
  	}
  
  	error = pn_get_buf(path, UIO_SYSSPACE, &pn, namebuf, sizeof (namebuf));
  	if (error == 0) {
  		/*
*** 2914,2924 ****
  		if ((pn.pn_pathlen = strlen(pn.pn_path)) == 0)
  			return (ENOENT);
  	}
  	VN_HOLD(startdvp);
  	error = lookuppnvp(&pn, NULL, NO_FOLLOW, dirvpp, compvpp,
! 	    rootdir, startdvp, cr);
  	}
  
  	if (error == ENAMETOOLONG) {
  		/*
  		 * This thread used a pathname > TYPICALMAXPATHLEN bytes long.
  		 */
--- 3021,3031 ----
  		if ((pn.pn_pathlen = strlen(pn.pn_path)) == 0)
  			return (ENOENT);
  	}
  	VN_HOLD(startdvp);
  	error = lookuppnvp(&pn, NULL, NO_FOLLOW, dirvpp, compvpp,
! 	    ZONE_ROOTVP(), startdvp, cr);
  	}
  
  	if (error == ENAMETOOLONG) {
  		/*
  		 * This thread used a pathname > TYPICALMAXPATHLEN bytes long.
  		 */
*** 2931,2941 ****
  			return (ENOENT);
  		}
  		VN_HOLD(startdvp);
  		error = lookuppnvp(&pn, NULL, NO_FOLLOW, dirvpp, compvpp,
! 		    rootdir, startdvp, cr);
  		pn_free(&pn);
  	}
  
  	return (error);
  }
  
--- 3038,3048 ----
  			return (ENOENT);
  		}
  		VN_HOLD(startdvp);
  		error = lookuppnvp(&pn, NULL, NO_FOLLOW, dirvpp, compvpp,
! 		    ZONE_ROOTVP(), startdvp, cr);
  		pn_free(&pn);
  	}
  
  	return (error);
  }
  
*** 3035,3206 ****
  	}
  
  	return (error);
  }
  
- /*
-  * Do the main work of handling HA-NFSv4 Resource Group failover on
-  * Sun Cluster.
-  * We need to detect whether any RG admin paths have been added or removed,
-  * and adjust resources accordingly.
-  * Currently we're using a very inefficient algorithm, ~ 2 * O(n**2). In
-  * order to scale, the list and array of paths need to be held in more
-  * suitable data structures.
-  */
- static void
- hanfsv4_failover(void)
- {
- 	int i, start_grace, numadded_paths = 0;
- 	char **added_paths = NULL;
- 	rfs4_dss_path_t *dss_path;
- 
- 	/*
- 	 * Note: currently, rfs4_dss_pathlist cannot be NULL, since
- 	 * it will always include an entry for NFS4_DSS_VAR_DIR. If we
- 	 * make the latter dynamically specified too, the following will
- 	 * need to be adjusted.
- 	 */
- 
- 	/*
- 	 * First, look for removed paths: RGs that have been failed-over
- 	 * away from this node.
- 	 * Walk the "currently-serving" rfs4_dss_pathlist and, for each
- 	 * path, check if it is on the "passed-in" rfs4_dss_newpaths array
- 	 * from nfsd. If not, that RG path has been removed.
- 	 *
- 	 * Note that nfsd has sorted rfs4_dss_newpaths for us, and removed
- 	 * any duplicates.
- 	 */
- 	dss_path = rfs4_dss_pathlist;
- 	do {
- 		int found = 0;
- 		char *path = dss_path->path;
- 
- 		/* used only for non-HA so may not be removed */
- 		if (strcmp(path, NFS4_DSS_VAR_DIR) == 0) {
- 			dss_path = dss_path->next;
- 			continue;
- 		}
- 
- 		for (i = 0; i < rfs4_dss_numnewpaths; i++) {
- 			int cmpret;
- 			char *newpath = rfs4_dss_newpaths[i];
- 
- 			/*
- 			 * Since nfsd has sorted rfs4_dss_newpaths for us,
- 			 * once the return from strcmp is negative we know
- 			 * we've passed the point where "path" should be,
- 			 * and can stop searching: "path" has been removed.
- 			 */
- 			cmpret = strcmp(path, newpath);
- 			if (cmpret < 0)
- 				break;
- 			if (cmpret == 0) {
- 				found = 1;
- 				break;
- 			}
- 		}
- 
- 		if (found == 0) {
- 			unsigned index = dss_path->index;
- 			rfs4_servinst_t *sip = dss_path->sip;
- 			rfs4_dss_path_t *path_next = dss_path->next;
- 
- 			/*
- 			 * This path has been removed.
- 			 * We must clear out the servinst reference to
- 			 * it, since it's now owned by another
- 			 * node: we should not attempt to touch it.
- 			 */
- 			ASSERT(dss_path == sip->dss_paths[index]);
- 			sip->dss_paths[index] = NULL;
- 
- 			/* remove from "currently-serving" list, and destroy */
- 			remque(dss_path);
- 			/* allow for NUL */
- 			kmem_free(dss_path->path, strlen(dss_path->path) + 1);
- 			kmem_free(dss_path, sizeof (rfs4_dss_path_t));
- 
- 			dss_path = path_next;
- 		} else {
- 			/* path was found; not removed */
- 			dss_path = dss_path->next;
- 		}
- 	} while (dss_path != rfs4_dss_pathlist);
- 
- 	/*
- 	 * Now, look for added paths: RGs that have been failed-over
- 	 * to this node.
- 	 * Walk the "passed-in" rfs4_dss_newpaths array from nfsd and,
- 	 * for each path, check if it is on the "currently-serving"
- 	 * rfs4_dss_pathlist. If not, that RG path has been added.
- 	 *
- 	 * Note: we don't do duplicate detection here; nfsd does that for us.
- 	 *
- 	 * Note: numadded_paths <= rfs4_dss_numnewpaths, which gives us
- 	 * an upper bound for the size needed for added_paths[numadded_paths].
- 	 */
- 
- 	/* probably more space than we need, but guaranteed to be enough */
- 	if (rfs4_dss_numnewpaths > 0) {
- 		size_t sz = rfs4_dss_numnewpaths * sizeof (char *);
- 		added_paths = kmem_zalloc(sz, KM_SLEEP);
- 	}
- 
- 	/* walk the "passed-in" rfs4_dss_newpaths array from nfsd */
- 	for (i = 0; i < rfs4_dss_numnewpaths; i++) {
- 		int found = 0;
- 		char *newpath = rfs4_dss_newpaths[i];
- 
- 		dss_path = rfs4_dss_pathlist;
- 		do {
- 			char *path = dss_path->path;
- 
- 			/* used only for non-HA */
- 			if (strcmp(path, NFS4_DSS_VAR_DIR) == 0) {
- 				dss_path = dss_path->next;
- 				continue;
- 			}
- 
- 			if (strncmp(path, newpath, strlen(path)) == 0) {
- 				found = 1;
- 				break;
- 			}
- 
- 			dss_path = dss_path->next;
- 		} while (dss_path != rfs4_dss_pathlist);
- 
- 		if (found == 0) {
- 			added_paths[numadded_paths] = newpath;
- 			numadded_paths++;
- 		}
- 	}
- 
- 	/* did we find any added paths? */
- 	if (numadded_paths > 0) {
- 		/* create a new server instance, and start its grace period */
- 		start_grace = 1;
- 		rfs4_servinst_create(start_grace, numadded_paths, added_paths);
- 
- 		/* read in the stable storage state from these paths */
- 		rfs4_dss_readstate(numadded_paths, added_paths);
- 
- 		/*
- 		 * Multiple failovers during a grace period will cause
- 		 * clients of the same resource group to be partitioned
- 		 * into different server instances, with different
- 		 * grace periods. Since clients of the same resource
- 		 * group must be subject to the same grace period,
- 		 * we need to reset all currently active grace periods.
- 		 */
- 		rfs4_grace_reset_all();
- 	}
- 
- 	if (rfs4_dss_numnewpaths > 0)
- 		kmem_free(added_paths, rfs4_dss_numnewpaths * sizeof (char *));
- }
- 
  /*
   * Used by NFSv3 and NFSv4 server to query label of
   * a pathname component during lookup/access ops.
   */
  ts_label_t *
--- 3142,3151 ----