Print this page
11083 support NFS server in zone
Portions contributed by: Dan Kruchinin <dan.kruchinin@nexenta.com>
Portions contributed by: Stepan Zastupov <stepan.zastupov@gmail.com>
Portions contributed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Portions contributed by: Mike Zeller <mike@mikezeller.net>
Portions contributed by: Dan McDonald <danmcd@joyent.com>
Portions contributed by: Gordon Ross <gordon.w.ross@gmail.com>
Portions contributed by: Vitaliy Gusev <gusev.vitaliy@gmail.com>
Reviewed by: Rick McNeal <rick.mcneal@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Jason King <jbk@joyent.com>
Reviewed by: C Fraire <cfraire@me.com>
Change-Id: I22f289d357503f9b48a0bc2482cc4328a6d43d16

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/nfs/nfs_server.c
          +++ new/usr/src/uts/common/fs/nfs/nfs_server.c
↓ open down ↓ 14 lines elided ↑ open up ↑
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
  24   24   * Copyright (c) 2013 by Delphix. All rights reserved.
  25      - * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  26   25   * Copyright (c) 2017 Joyent Inc
       26 + * Copyright 2019 Nexenta by DDN, Inc.
  27   27   */
  28   28  
  29   29  /*
  30   30   *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  31   31   *      All rights reserved.
  32   32   *      Use is subject to license terms.
  33   33   */
  34   34  
  35   35  #include <sys/param.h>
  36   36  #include <sys/types.h>
↓ open down ↓ 39 lines elided ↑ open up ↑
  76   76  #include <rpc/svc.h>
  77   77  #include <rpc/xdr.h>
  78   78  #include <rpc/rpc_rdma.h>
  79   79  
  80   80  #include <nfs/nfs.h>
  81   81  #include <nfs/export.h>
  82   82  #include <nfs/nfssys.h>
  83   83  #include <nfs/nfs_clnt.h>
  84   84  #include <nfs/nfs_acl.h>
  85   85  #include <nfs/nfs_log.h>
  86      -#include <nfs/nfs_cmd.h>
  87   86  #include <nfs/lm.h>
  88   87  #include <nfs/nfs_dispatch.h>
  89   88  #include <nfs/nfs4_drc.h>
  90   89  
  91   90  #include <sys/modctl.h>
  92   91  #include <sys/cladm.h>
  93   92  #include <sys/clconf.h>
  94   93  
  95   94  #include <sys/tsol/label.h>
  96   95  
↓ open down ↓ 5 lines elided ↑ open up ↑
 102  101   */
 103  102  
 104  103  static struct modlmisc modlmisc = {
 105  104          &mod_miscops, "NFS server module"
 106  105  };
 107  106  
 108  107  static struct modlinkage modlinkage = {
 109  108          MODREV_1, (void *)&modlmisc, NULL
 110  109  };
 111  110  
      111 +zone_key_t      nfssrv_zone_key;
      112 +list_t          nfssrv_globals_list;
      113 +krwlock_t       nfssrv_globals_rwl;
      114 +
 112  115  kmem_cache_t *nfs_xuio_cache;
 113  116  int nfs_loaned_buffers = 0;
 114  117  
 115  118  int
 116  119  _init(void)
 117  120  {
 118  121          int status;
 119  122  
 120      -        if ((status = nfs_srvinit()) != 0) {
 121      -                cmn_err(CE_WARN, "_init: nfs_srvinit failed");
 122      -                return (status);
 123      -        }
      123 +        nfs_srvinit();
 124  124  
 125  125          status = mod_install((struct modlinkage *)&modlinkage);
 126  126          if (status != 0) {
 127  127                  /*
 128  128                   * Could not load module, cleanup previous
 129  129                   * initialization work.
 130  130                   */
 131  131                  nfs_srvfini();
 132  132  
 133  133                  return (status);
↓ open down ↓ 36 lines elided ↑ open up ↑
 170  170  /*
 171  171   * PUBLICFH_CHECK() checks if the dispatch routine supports
 172  172   * RPC_PUBLICFH_OK, if the filesystem is exported public, and if the
 173  173   * incoming request is using the public filehandle. The check duplicates
 174  174   * the exportmatch() call done in checkexport(), and we should consider
 175  175   * modifying those routines to avoid the duplication. For now, we optimize
 176  176   * by calling exportmatch() only after checking that the dispatch routine
 177  177   * supports RPC_PUBLICFH_OK, and if the filesystem is explicitly exported
 178  178   * public (i.e., not the placeholder).
 179  179   */
 180      -#define PUBLICFH_CHECK(disp, exi, fsid, xfid) \
      180 +#define PUBLICFH_CHECK(ne, disp, exi, fsid, xfid) \
 181  181                  ((disp->dis_flags & RPC_PUBLICFH_OK) && \
 182  182                  ((exi->exi_export.ex_flags & EX_PUBLIC) || \
 183      -                (exi == exi_public && exportmatch(exi_root, \
      183 +                (exi == ne->exi_public && exportmatch(ne->exi_root, \
 184  184                  fsid, xfid))))
 185  185  
 186  186  static void     nfs_srv_shutdown_all(int);
 187      -static void     rfs4_server_start(int);
      187 +static void     rfs4_server_start(nfs_globals_t *, int);
 188  188  static void     nullfree(void);
 189  189  static void     rfs_dispatch(struct svc_req *, SVCXPRT *);
 190  190  static void     acl_dispatch(struct svc_req *, SVCXPRT *);
 191      -static void     common_dispatch(struct svc_req *, SVCXPRT *,
 192      -                rpcvers_t, rpcvers_t, char *,
 193      -                struct rpc_disptable *);
 194      -static void     hanfsv4_failover(void);
 195  191  static  int     checkauth(struct exportinfo *, struct svc_req *, cred_t *, int,
 196  192                  bool_t, bool_t *);
 197  193  static char     *client_name(struct svc_req *req);
 198  194  static char     *client_addr(struct svc_req *req, char *buf);
 199  195  extern  int     sec_svc_getcred(struct svc_req *, cred_t *cr, char **, int *);
 200  196  extern  bool_t  sec_svc_inrootlist(int, caddr_t, int, caddr_t *);
      197 +static void     *nfs_server_zone_init(zoneid_t);
      198 +static void     nfs_server_zone_fini(zoneid_t, void *);
      199 +static void     nfs_server_zone_shutdown(zoneid_t, void *);
 201  200  
 202  201  #define NFSLOG_COPY_NETBUF(exi, xprt, nb)       {               \
 203  202          (nb)->maxlen = (xprt)->xp_rtaddr.maxlen;                \
 204  203          (nb)->len = (xprt)->xp_rtaddr.len;                      \
 205  204          (nb)->buf = kmem_alloc((nb)->len, KM_SLEEP);            \
 206  205          bcopy((xprt)->xp_rtaddr.buf, (nb)->buf, (nb)->len);     \
 207  206          }
 208  207  
 209  208  /*
 210  209   * Public Filehandle common nfs routines
↓ open down ↓ 30 lines elided ↑ open up ↑
 241  240  };
 242  241  
 243  242  static SVC_CALLOUT __nfs_sc_rdma[] = {
 244  243          { NFS_PROGRAM,     NFS_VERSMIN,     NFS_VERSMAX,        rfs_dispatch },
 245  244          { NFS_ACL_PROGRAM, NFS_ACL_VERSMIN, NFS_ACL_VERSMAX,    acl_dispatch }
 246  245  };
 247  246  
 248  247  static SVC_CALLOUT_TABLE nfs_sct_rdma = {
 249  248          sizeof (__nfs_sc_rdma) / sizeof (__nfs_sc_rdma[0]), FALSE, __nfs_sc_rdma
 250  249  };
 251      -rpcvers_t nfs_versmin = NFS_VERSMIN_DEFAULT;
 252      -rpcvers_t nfs_versmax = NFS_VERSMAX_DEFAULT;
 253  250  
 254  251  /*
 255      - * Used to track the state of the server so that initialization
 256      - * can be done properly.
 257      - */
 258      -typedef enum {
 259      -        NFS_SERVER_STOPPED,     /* server state destroyed */
 260      -        NFS_SERVER_STOPPING,    /* server state being destroyed */
 261      -        NFS_SERVER_RUNNING,
 262      -        NFS_SERVER_QUIESCED,    /* server state preserved */
 263      -        NFS_SERVER_OFFLINE      /* server pool offline */
 264      -} nfs_server_running_t;
 265      -
 266      -static nfs_server_running_t nfs_server_upordown;
 267      -static kmutex_t nfs_server_upordown_lock;
 268      -static  kcondvar_t nfs_server_upordown_cv;
 269      -
 270      -/*
 271  252   * DSS: distributed stable storage
 272  253   * lists of all DSS paths: current, and before last warmstart
 273  254   */
 274  255  nvlist_t *rfs4_dss_paths, *rfs4_dss_oldpaths;
 275  256  
 276  257  int rfs4_dispatch(struct rpcdisp *, struct svc_req *, SVCXPRT *, char *);
 277  258  bool_t rfs4_minorvers_mismatch(struct svc_req *, SVCXPRT *, void *);
 278  259  
 279  260  /*
 280      - * RDMA wait variables.
      261 + * Stash NFS zone globals in TSD to avoid some lock contention
      262 + * from frequent zone_getspecific calls.
 281  263   */
 282      -static kcondvar_t rdma_wait_cv;
 283      -static kmutex_t rdma_wait_mutex;
      264 +static uint_t nfs_server_tsd_key;
 284  265  
      266 +nfs_globals_t *
      267 +nfs_srv_getzg(void)
      268 +{
      269 +        nfs_globals_t *ng;
      270 +
      271 +        ng = tsd_get(nfs_server_tsd_key);
      272 +        if (ng == NULL) {
      273 +                ng = zone_getspecific(nfssrv_zone_key, curzone);
      274 +                (void) tsd_set(nfs_server_tsd_key, ng);
      275 +        }
      276 +
      277 +        return (ng);
      278 +}
      279 +
 285  280  /*
 286  281   * Will be called at the point the server pool is being unregistered
 287  282   * from the pool list. From that point onwards, the pool is waiting
 288  283   * to be drained and as such the server state is stale and pertains
 289  284   * to the old instantiation of the NFS server pool.
 290  285   */
 291  286  void
 292  287  nfs_srv_offline(void)
 293  288  {
 294      -        mutex_enter(&nfs_server_upordown_lock);
 295      -        if (nfs_server_upordown == NFS_SERVER_RUNNING) {
 296      -                nfs_server_upordown = NFS_SERVER_OFFLINE;
      289 +        nfs_globals_t *ng;
      290 +
      291 +        ng = nfs_srv_getzg();
      292 +
      293 +        mutex_enter(&ng->nfs_server_upordown_lock);
      294 +        if (ng->nfs_server_upordown == NFS_SERVER_RUNNING) {
      295 +                ng->nfs_server_upordown = NFS_SERVER_OFFLINE;
 297  296          }
 298      -        mutex_exit(&nfs_server_upordown_lock);
      297 +        mutex_exit(&ng->nfs_server_upordown_lock);
 299  298  }
 300  299  
 301  300  /*
 302  301   * Will be called at the point the server pool is being destroyed so
 303  302   * all transports have been closed and no service threads are in
 304  303   * existence.
 305  304   *
 306  305   * If we quiesce the server, we're shutting it down without destroying the
 307  306   * server state. This allows it to warm start subsequently.
 308  307   */
↓ open down ↓ 8 lines elided ↑ open up ↑
 317  316   * This alternative shutdown routine can be requested via nfssys()
 318  317   */
 319  318  void
 320  319  nfs_srv_quiesce_all(void)
 321  320  {
 322  321          int quiesce = 1;
 323  322          nfs_srv_shutdown_all(quiesce);
 324  323  }
 325  324  
 326  325  static void
 327      -nfs_srv_shutdown_all(int quiesce) {
 328      -        mutex_enter(&nfs_server_upordown_lock);
      326 +nfs_srv_shutdown_all(int quiesce)
      327 +{
      328 +        nfs_globals_t *ng = nfs_srv_getzg();
      329 +
      330 +        mutex_enter(&ng->nfs_server_upordown_lock);
 329  331          if (quiesce) {
 330      -                if (nfs_server_upordown == NFS_SERVER_RUNNING ||
 331      -                        nfs_server_upordown == NFS_SERVER_OFFLINE) {
 332      -                        nfs_server_upordown = NFS_SERVER_QUIESCED;
 333      -                        cv_signal(&nfs_server_upordown_cv);
      332 +                if (ng->nfs_server_upordown == NFS_SERVER_RUNNING ||
      333 +                    ng->nfs_server_upordown == NFS_SERVER_OFFLINE) {
      334 +                        ng->nfs_server_upordown = NFS_SERVER_QUIESCED;
      335 +                        cv_signal(&ng->nfs_server_upordown_cv);
 334  336  
 335      -                        /* reset DSS state, for subsequent warm restart */
      337 +                        /* reset DSS state */
 336  338                          rfs4_dss_numnewpaths = 0;
 337  339                          rfs4_dss_newpaths = NULL;
 338  340  
 339  341                          cmn_err(CE_NOTE, "nfs_server: server is now quiesced; "
 340  342                              "NFSv4 state has been preserved");
 341  343                  }
 342  344          } else {
 343      -                if (nfs_server_upordown == NFS_SERVER_OFFLINE) {
 344      -                        nfs_server_upordown = NFS_SERVER_STOPPING;
 345      -                        mutex_exit(&nfs_server_upordown_lock);
 346      -                        rfs4_state_fini();
 347      -                        rfs4_fini_drc(nfs4_drc);
 348      -                        mutex_enter(&nfs_server_upordown_lock);
 349      -                        nfs_server_upordown = NFS_SERVER_STOPPED;
 350      -                        cv_signal(&nfs_server_upordown_cv);
      345 +                if (ng->nfs_server_upordown == NFS_SERVER_OFFLINE) {
      346 +                        ng->nfs_server_upordown = NFS_SERVER_STOPPING;
      347 +                        mutex_exit(&ng->nfs_server_upordown_lock);
      348 +                        rfs4_state_zone_fini();
      349 +                        rfs4_fini_drc();
      350 +                        mutex_enter(&ng->nfs_server_upordown_lock);
      351 +                        ng->nfs_server_upordown = NFS_SERVER_STOPPED;
      352 +
      353 +                        /* reset DSS state */
      354 +                        rfs4_dss_numnewpaths = 0;
      355 +                        rfs4_dss_newpaths = NULL;
      356 +
      357 +                        cv_signal(&ng->nfs_server_upordown_cv);
 351  358                  }
 352  359          }
 353      -        mutex_exit(&nfs_server_upordown_lock);
      360 +        mutex_exit(&ng->nfs_server_upordown_lock);
 354  361  }
 355  362  
 356  363  static int
 357  364  nfs_srv_set_sc_versions(struct file *fp, SVC_CALLOUT_TABLE **sctpp,
 358      -                        rpcvers_t versmin, rpcvers_t versmax)
      365 +    rpcvers_t versmin, rpcvers_t versmax)
 359  366  {
 360  367          struct strioctl strioc;
 361  368          struct T_info_ack tinfo;
 362  369          int             error, retval;
 363  370  
 364  371          /*
 365  372           * Find out what type of transport this is.
 366  373           */
 367  374          strioc.ic_cmd = TI_GETINFO;
 368  375          strioc.ic_timout = -1;
↓ open down ↓ 42 lines elided ↑ open up ↑
 411  418  }
 412  419  
 413  420  /*
 414  421   * NFS Server system call.
 415  422   * Does all of the work of running a NFS server.
 416  423   * uap->fd is the fd of an open transport provider
 417  424   */
 418  425  int
 419  426  nfs_svc(struct nfs_svc_args *arg, model_t model)
 420  427  {
      428 +        nfs_globals_t *ng;
 421  429          file_t *fp;
 422  430          SVCMASTERXPRT *xprt;
 423  431          int error;
 424  432          int readsize;
 425  433          char buf[KNC_STRSIZE];
 426  434          size_t len;
 427  435          STRUCT_HANDLE(nfs_svc_args, uap);
 428  436          struct netbuf addrmask;
 429  437          SVC_CALLOUT_TABLE *sctp = NULL;
 430  438  
 431  439  #ifdef lint
 432  440          model = model;          /* STRUCT macros don't always refer to it */
 433  441  #endif
 434  442  
      443 +        ng = nfs_srv_getzg();
 435  444          STRUCT_SET_HANDLE(uap, model, arg);
 436  445  
 437  446          /* Check privileges in nfssys() */
 438  447  
 439  448          if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
 440  449                  return (EBADF);
 441  450  
      451 +        /* Setup global file handle in nfs_export */
      452 +        if ((error = nfs_export_get_rootfh(ng)) != 0)
      453 +                return (error);
      454 +
 442  455          /*
 443  456           * Set read buffer size to rsize
 444  457           * and add room for RPC headers.
 445  458           */
 446  459          readsize = nfs3tsize() + (RPC_MAXDATASIZE - NFS_MAXDATA);
 447  460          if (readsize < RPC_MAXDATASIZE)
 448  461                  readsize = RPC_MAXDATASIZE;
 449  462  
 450  463          error = copyinstr((const char *)STRUCT_FGETP(uap, netid), buf,
 451  464              KNC_STRSIZE, &len);
↓ open down ↓ 6 lines elided ↑ open up ↑
 458  471          addrmask.maxlen = STRUCT_FGET(uap, addrmask.maxlen);
 459  472          addrmask.buf = kmem_alloc(addrmask.maxlen, KM_SLEEP);
 460  473          error = copyin(STRUCT_FGETP(uap, addrmask.buf), addrmask.buf,
 461  474              addrmask.len);
 462  475          if (error) {
 463  476                  releasef(STRUCT_FGET(uap, fd));
 464  477                  kmem_free(addrmask.buf, addrmask.maxlen);
 465  478                  return (error);
 466  479          }
 467  480  
 468      -        nfs_versmin = STRUCT_FGET(uap, versmin);
 469      -        nfs_versmax = STRUCT_FGET(uap, versmax);
      481 +        ng->nfs_versmin = STRUCT_FGET(uap, versmin);
      482 +        ng->nfs_versmax = STRUCT_FGET(uap, versmax);
 470  483  
 471  484          /* Double check the vers min/max ranges */
 472      -        if ((nfs_versmin > nfs_versmax) ||
 473      -            (nfs_versmin < NFS_VERSMIN) ||
 474      -            (nfs_versmax > NFS_VERSMAX)) {
 475      -                nfs_versmin = NFS_VERSMIN_DEFAULT;
 476      -                nfs_versmax = NFS_VERSMAX_DEFAULT;
      485 +        if ((ng->nfs_versmin > ng->nfs_versmax) ||
      486 +            (ng->nfs_versmin < NFS_VERSMIN) ||
      487 +            (ng->nfs_versmax > NFS_VERSMAX)) {
      488 +                ng->nfs_versmin = NFS_VERSMIN_DEFAULT;
      489 +                ng->nfs_versmax = NFS_VERSMAX_DEFAULT;
 477  490          }
 478  491  
 479      -        if (error =
 480      -            nfs_srv_set_sc_versions(fp, &sctp, nfs_versmin, nfs_versmax)) {
      492 +        if (error = nfs_srv_set_sc_versions(fp, &sctp, ng->nfs_versmin,
      493 +            ng->nfs_versmax)) {
 481  494                  releasef(STRUCT_FGET(uap, fd));
 482  495                  kmem_free(addrmask.buf, addrmask.maxlen);
 483  496                  return (error);
 484  497          }
 485  498  
 486  499          /* Initialize nfsv4 server */
 487      -        if (nfs_versmax == (rpcvers_t)NFS_V4)
 488      -                rfs4_server_start(STRUCT_FGET(uap, delegation));
      500 +        if (ng->nfs_versmax == (rpcvers_t)NFS_V4)
      501 +                rfs4_server_start(ng, STRUCT_FGET(uap, delegation));
 489  502  
 490  503          /* Create a transport handle. */
 491  504          error = svc_tli_kcreate(fp, readsize, buf, &addrmask, &xprt,
 492  505              sctp, NULL, NFS_SVCPOOL_ID, TRUE);
 493  506  
 494  507          if (error)
 495  508                  kmem_free(addrmask.buf, addrmask.maxlen);
 496  509  
 497  510          releasef(STRUCT_FGET(uap, fd));
 498  511  
 499  512          /* HA-NFSv4: save the cluster nodeid */
 500  513          if (cluster_bootflags & CLUSTER_BOOTED)
 501  514                  lm_global_nlmid = clconf_get_nodeid();
 502  515  
 503  516          return (error);
 504  517  }
 505  518  
 506  519  static void
 507      -rfs4_server_start(int nfs4_srv_delegation)
      520 +rfs4_server_start(nfs_globals_t *ng, int nfs4_srv_delegation)
 508  521  {
 509  522          /*
 510  523           * Determine if the server has previously been "started" and
 511  524           * if not, do the per instance initialization
 512  525           */
 513      -        mutex_enter(&nfs_server_upordown_lock);
      526 +        mutex_enter(&ng->nfs_server_upordown_lock);
 514  527  
 515      -        if (nfs_server_upordown != NFS_SERVER_RUNNING) {
      528 +        if (ng->nfs_server_upordown != NFS_SERVER_RUNNING) {
 516  529                  /* Do we need to stop and wait on the previous server? */
 517      -                while (nfs_server_upordown == NFS_SERVER_STOPPING ||
 518      -                    nfs_server_upordown == NFS_SERVER_OFFLINE)
 519      -                        cv_wait(&nfs_server_upordown_cv,
 520      -                            &nfs_server_upordown_lock);
      530 +                while (ng->nfs_server_upordown == NFS_SERVER_STOPPING ||
      531 +                    ng->nfs_server_upordown == NFS_SERVER_OFFLINE)
      532 +                        cv_wait(&ng->nfs_server_upordown_cv,
      533 +                            &ng->nfs_server_upordown_lock);
 521  534  
 522      -                if (nfs_server_upordown != NFS_SERVER_RUNNING) {
      535 +                if (ng->nfs_server_upordown != NFS_SERVER_RUNNING) {
 523  536                          (void) svc_pool_control(NFS_SVCPOOL_ID,
 524  537                              SVCPSET_UNREGISTER_PROC, (void *)&nfs_srv_offline);
 525  538                          (void) svc_pool_control(NFS_SVCPOOL_ID,
 526  539                              SVCPSET_SHUTDOWN_PROC, (void *)&nfs_srv_stop_all);
 527  540  
 528      -                        /* is this an nfsd warm start? */
 529      -                        if (nfs_server_upordown == NFS_SERVER_QUIESCED) {
 530      -                                cmn_err(CE_NOTE, "nfs_server: "
 531      -                                    "server was previously quiesced; "
 532      -                                    "existing NFSv4 state will be re-used");
      541 +                        rfs4_do_server_start(ng->nfs_server_upordown,
      542 +                            nfs4_srv_delegation,
      543 +                            cluster_bootflags & CLUSTER_BOOTED);
 533  544  
 534      -                                /*
 535      -                                 * HA-NFSv4: this is also the signal
 536      -                                 * that a Resource Group failover has
 537      -                                 * occurred.
 538      -                                 */
 539      -                                if (cluster_bootflags & CLUSTER_BOOTED)
 540      -                                        hanfsv4_failover();
 541      -                        } else {
 542      -                                /* cold start */
 543      -                                rfs4_state_init();
 544      -                                nfs4_drc = rfs4_init_drc(nfs4_drc_max,
 545      -                                    nfs4_drc_hash);
 546      -                        }
 547      -
 548      -                        /*
 549      -                         * Check to see if delegation is to be
 550      -                         * enabled at the server
 551      -                         */
 552      -                        if (nfs4_srv_delegation != FALSE)
 553      -                                rfs4_set_deleg_policy(SRV_NORMAL_DELEGATE);
 554      -
 555      -                        nfs_server_upordown = NFS_SERVER_RUNNING;
      545 +                        ng->nfs_server_upordown = NFS_SERVER_RUNNING;
 556  546                  }
 557      -                cv_signal(&nfs_server_upordown_cv);
      547 +                cv_signal(&ng->nfs_server_upordown_cv);
 558  548          }
 559      -        mutex_exit(&nfs_server_upordown_lock);
      549 +        mutex_exit(&ng->nfs_server_upordown_lock);
 560  550  }
 561  551  
 562  552  /*
 563  553   * If RDMA device available,
 564  554   * start RDMA listener.
 565  555   */
 566  556  int
 567  557  rdma_start(struct rdma_svc_args *rsa)
 568  558  {
      559 +        nfs_globals_t *ng;
 569  560          int error;
 570  561          rdma_xprt_group_t started_rdma_xprts;
 571  562          rdma_stat stat;
 572  563          int svc_state = 0;
 573  564  
 574  565          /* Double check the vers min/max ranges */
 575  566          if ((rsa->nfs_versmin > rsa->nfs_versmax) ||
 576  567              (rsa->nfs_versmin < NFS_VERSMIN) ||
 577  568              (rsa->nfs_versmax > NFS_VERSMAX)) {
 578  569                  rsa->nfs_versmin = NFS_VERSMIN_DEFAULT;
 579  570                  rsa->nfs_versmax = NFS_VERSMAX_DEFAULT;
 580  571          }
 581      -        nfs_versmin = rsa->nfs_versmin;
 582      -        nfs_versmax = rsa->nfs_versmax;
 583  572  
      573 +        ng = nfs_srv_getzg();
      574 +        ng->nfs_versmin = rsa->nfs_versmin;
      575 +        ng->nfs_versmax = rsa->nfs_versmax;
      576 +
 584  577          /* Set the versions in the callout table */
 585  578          __nfs_sc_rdma[0].sc_versmin = rsa->nfs_versmin;
 586  579          __nfs_sc_rdma[0].sc_versmax = rsa->nfs_versmax;
 587  580          /* For the NFS_ACL program, check the max version */
 588  581          __nfs_sc_rdma[1].sc_versmin = rsa->nfs_versmin;
 589  582          if (rsa->nfs_versmax > NFS_ACL_VERSMAX)
 590  583                  __nfs_sc_rdma[1].sc_versmax = NFS_ACL_VERSMAX;
 591  584          else
 592  585                  __nfs_sc_rdma[1].sc_versmax = rsa->nfs_versmax;
 593  586  
 594  587          /* Initialize nfsv4 server */
 595  588          if (rsa->nfs_versmax == (rpcvers_t)NFS_V4)
 596      -                rfs4_server_start(rsa->delegation);
      589 +                rfs4_server_start(ng, rsa->delegation);
 597  590  
 598  591          started_rdma_xprts.rtg_count = 0;
 599  592          started_rdma_xprts.rtg_listhead = NULL;
 600  593          started_rdma_xprts.rtg_poolid = rsa->poolid;
 601  594  
 602  595  restart:
 603  596          error = svc_rdma_kcreate(rsa->netid, &nfs_sct_rdma, rsa->poolid,
 604  597              &started_rdma_xprts);
 605  598  
 606  599          svc_state = !error;
 607  600  
 608  601          while (!error) {
 609  602  
 610  603                  /*
 611  604                   * wait till either interrupted by a signal on
 612  605                   * nfs service stop/restart or signalled by a
 613      -                 * rdma plugin attach/detatch.
      606 +                 * rdma attach/detatch.
 614  607                   */
 615  608  
 616  609                  stat = rdma_kwait();
 617  610  
 618  611                  /*
 619  612                   * stop services if running -- either on a HCA detach event
 620  613                   * or if the nfs service is stopped/restarted.
 621  614                   */
 622  615  
 623  616                  if ((stat == RDMA_HCA_DETACH || stat == RDMA_INTR) &&
↓ open down ↓ 30 lines elided ↑ open up ↑
 654  647  rpc_null(caddr_t *argp, caddr_t *resp, struct exportinfo *exi,
 655  648      struct svc_req *req, cred_t *cr, bool_t ro)
 656  649  {
 657  650  }
 658  651  
 659  652  /* ARGSUSED */
 660  653  void
 661  654  rpc_null_v3(caddr_t *argp, caddr_t *resp, struct exportinfo *exi,
 662  655      struct svc_req *req, cred_t *cr, bool_t ro)
 663  656  {
 664      -        DTRACE_NFSV3_3(op__null__start, struct svc_req *, req,
 665      -            cred_t *, cr, vnode_t *, NULL);
 666      -        DTRACE_NFSV3_3(op__null__done, struct svc_req *, req,
 667      -            cred_t *, cr, vnode_t *, NULL);
      657 +        DTRACE_NFSV3_4(op__null__start, struct svc_req *, req,
      658 +            cred_t *, cr, vnode_t *, NULL, struct exportinfo *, exi);
      659 +        DTRACE_NFSV3_4(op__null__done, struct svc_req *, req,
      660 +            cred_t *, cr, vnode_t *, NULL, struct exportinfo *, exi);
 668  661  }
 669  662  
 670  663  /* ARGSUSED */
 671  664  static void
 672  665  rfs_error(caddr_t *argp, caddr_t *resp, struct exportinfo *exi,
 673  666      struct svc_req *req, cred_t *cr, bool_t ro)
 674  667  {
 675  668          /* return (EOPNOTSUPP); */
 676  669  }
 677  670  
↓ open down ↓ 657 lines elided ↑ open up ↑
1335 1328          /* RFS_NULL = 0 */
1336 1329  
1337 1330          /* RFS4_COMPOUND = 1 */
1338 1331          COMPOUND4res nfs4_compound_res;
1339 1332  
1340 1333  };
1341 1334  
1342 1335  static struct rpc_disptable rfs_disptable[] = {
1343 1336          {sizeof (rfsdisptab_v2) / sizeof (rfsdisptab_v2[0]),
1344 1337              rfscallnames_v2,
1345      -            &rfsproccnt_v2_ptr, rfsdisptab_v2},
     1338 +            rfsdisptab_v2},
1346 1339          {sizeof (rfsdisptab_v3) / sizeof (rfsdisptab_v3[0]),
1347 1340              rfscallnames_v3,
1348      -            &rfsproccnt_v3_ptr, rfsdisptab_v3},
     1341 +            rfsdisptab_v3},
1349 1342          {sizeof (rfsdisptab_v4) / sizeof (rfsdisptab_v4[0]),
1350 1343              rfscallnames_v4,
1351      -            &rfsproccnt_v4_ptr, rfsdisptab_v4},
     1344 +            rfsdisptab_v4},
1352 1345  };
1353 1346  
1354 1347  /*
1355 1348   * If nfs_portmon is set, then clients are required to use privileged
1356 1349   * ports (ports < IPPORT_RESERVED) in order to get NFS services.
1357 1350   *
1358 1351   * N.B.: this attempt to carry forward the already ill-conceived notion
1359 1352   * of privileged ports for TCP/UDP is really quite ineffectual.  Not only
1360 1353   * is it transport-dependent, it's laughably easy to spoof.  If you're
1361 1354   * really interested in security, you must start with secure RPC instead.
1362 1355   */
1363 1356  static int nfs_portmon = 0;
1364 1357  
1365 1358  #ifdef DEBUG
1366 1359  static int cred_hits = 0;
1367 1360  static int cred_misses = 0;
1368 1361  #endif
1369 1362  
1370      -
1371 1363  #ifdef DEBUG
1372 1364  /*
1373 1365   * Debug code to allow disabling of rfs_dispatch() use of
1374 1366   * fastxdrargs() and fastxdrres() calls for testing purposes.
1375 1367   */
1376 1368  static int rfs_no_fast_xdrargs = 0;
1377 1369  static int rfs_no_fast_xdrres = 0;
1378 1370  #endif
1379 1371  
1380 1372  union acl_args {
↓ open down ↓ 83 lines elided ↑ open up ↑
1464 1456                  if ((enum wnfsstat)dr->dr_status == WNFSERR_CLNT_FLAVOR)
1465 1457                          return (TRUE);
1466 1458          } else if (req->rq_vers == NFS_V3 && req->rq_proc == NFSPROC3_LOOKUP) {
1467 1459                  LOOKUP3res *resp = (LOOKUP3res *)res;
1468 1460                  if ((enum wnfsstat)resp->status == WNFSERR_CLNT_FLAVOR)
1469 1461                          return (TRUE);
1470 1462          }
1471 1463          return (FALSE);
1472 1464  }
1473 1465  
1474      -
1475 1466  static void
1476 1467  common_dispatch(struct svc_req *req, SVCXPRT *xprt, rpcvers_t min_vers,
1477      -                rpcvers_t max_vers, char *pgmname,
1478      -                struct rpc_disptable *disptable)
     1468 +    rpcvers_t max_vers, char *pgmname, struct rpc_disptable *disptable)
1479 1469  {
1480 1470          int which;
1481 1471          rpcvers_t vers;
1482 1472          char *args;
1483 1473          union {
1484 1474                          union rfs_args ra;
1485 1475                          union acl_args aa;
1486 1476                  } args_buf;
1487 1477          char *res;
1488 1478          union {
↓ open down ↓ 12 lines elided ↑ open up ↑
1501 1491          int authres;
1502 1492          bool_t publicfh_ok = FALSE;
1503 1493          enum_t auth_flavor;
1504 1494          bool_t dupcached = FALSE;
1505 1495          struct netbuf   nb;
1506 1496          bool_t logging_enabled = FALSE;
1507 1497          struct exportinfo *nfslog_exi = NULL;
1508 1498          char **procnames;
1509 1499          char cbuf[INET6_ADDRSTRLEN];    /* to hold both IPv4 and IPv6 addr */
1510 1500          bool_t ro = FALSE;
     1501 +        nfs_globals_t *ng = nfs_srv_getzg();
     1502 +        nfs_export_t *ne = ng->nfs_export;
     1503 +        kstat_named_t *svstat, *procstat;
1511 1504  
     1505 +        ASSERT(req->rq_prog == NFS_PROGRAM || req->rq_prog == NFS_ACL_PROGRAM);
     1506 +
1512 1507          vers = req->rq_vers;
1513 1508  
     1509 +        svstat = ng->svstat[req->rq_vers];
     1510 +        procstat = (req->rq_prog == NFS_PROGRAM) ?
     1511 +            ng->rfsproccnt[vers] : ng->aclproccnt[vers];
     1512 +
1514 1513          if (vers < min_vers || vers > max_vers) {
1515 1514                  svcerr_progvers(req->rq_xprt, min_vers, max_vers);
1516 1515                  error++;
1517 1516                  cmn_err(CE_NOTE, "%s: bad version number %u", pgmname, vers);
1518 1517                  goto done;
1519 1518          }
1520 1519          vers -= min_vers;
1521 1520  
1522 1521          which = req->rq_proc;
1523 1522          if (which < 0 || which >= disptable[(int)vers].dis_nprocs) {
1524 1523                  svcerr_noproc(req->rq_xprt);
1525 1524                  error++;
1526 1525                  goto done;
1527 1526          }
1528 1527  
1529      -        (*(disptable[(int)vers].dis_proccntp))[which].value.ui64++;
     1528 +        procstat[which].value.ui64++;
1530 1529  
1531 1530          disp = &disptable[(int)vers].dis_table[which];
1532 1531          procnames = disptable[(int)vers].dis_procnames;
1533 1532  
1534 1533          auth_flavor = req->rq_cred.oa_flavor;
1535 1534  
1536 1535          /*
1537 1536           * Deserialize into the args struct.
1538 1537           */
1539 1538          args = (char *)&args_buf;
↓ open down ↓ 85 lines elided ↑ open up ↑
1625 1624                   */
1626 1625  
1627 1626                  if ((dis_flags & RPC_ALLOWANON) && EQFID(fid, xfid))
1628 1627                          anon_ok = 1;
1629 1628                  else
1630 1629                          anon_ok = 0;
1631 1630  
1632 1631                  cr = xprt->xp_cred;
1633 1632                  ASSERT(cr != NULL);
1634 1633  #ifdef DEBUG
1635      -                if (crgetref(cr) != 1) {
1636      -                        crfree(cr);
1637      -                        cr = crget();
1638      -                        xprt->xp_cred = cr;
1639      -                        cred_misses++;
1640      -                } else
1641      -                        cred_hits++;
     1634 +                {
     1635 +                        if (crgetref(cr) != 1) {
     1636 +                                crfree(cr);
     1637 +                                cr = crget();
     1638 +                                xprt->xp_cred = cr;
     1639 +                                cred_misses++;
     1640 +                        } else
     1641 +                                cred_hits++;
     1642 +                }
1642 1643  #else
1643 1644                  if (crgetref(cr) != 1) {
1644 1645                          crfree(cr);
1645 1646                          cr = crget();
1646 1647                          xprt->xp_cred = cr;
1647 1648                  }
1648 1649  #endif
1649 1650  
1650 1651                  exi = checkexport(fsid, xfid);
1651 1652  
1652 1653                  if (exi != NULL) {
1653      -                        publicfh_ok = PUBLICFH_CHECK(disp, exi, fsid, xfid);
     1654 +                        publicfh_ok = PUBLICFH_CHECK(ne, disp, exi, fsid, xfid);
1654 1655  
1655 1656                          /*
1656 1657                           * Don't allow non-V4 clients access
1657 1658                           * to pseudo exports
1658 1659                           */
1659 1660                          if (PSEUDO(exi)) {
1660 1661                                  svcerr_weakauth(xprt);
1661 1662                                  error++;
1662 1663                                  goto done;
1663 1664                          }
↓ open down ↓ 92 lines elided ↑ open up ↑
1756 1757          }
1757 1758  
1758 1759          /*
1759 1760           * Check to see if logging has been enabled on the server.
1760 1761           * If so, then obtain the export info struct to be used for
1761 1762           * the later writing of the log record.  This is done for
1762 1763           * the case that a lookup is done across a non-logged public
1763 1764           * file system.
1764 1765           */
1765 1766          if (nfslog_buffer_list != NULL) {
1766      -                nfslog_exi = nfslog_get_exi(exi, req, res, &nfslog_rec_id);
     1767 +                nfslog_exi = nfslog_get_exi(ne, exi, req, res, &nfslog_rec_id);
1767 1768                  /*
1768 1769                   * Is logging enabled?
1769 1770                   */
1770 1771                  logging_enabled = (nfslog_exi != NULL);
1771 1772  
1772 1773                  /*
1773 1774                   * Copy the netbuf for logging purposes, before it is
1774 1775                   * freed by svc_sendreply().
1775 1776                   */
1776 1777                  if (logging_enabled) {
↓ open down ↓ 62 lines elided ↑ open up ↑
1839 1840          } else {
1840 1841                  if (!SVC_FREEARGS(xprt, (xdrproc_t)0, (caddr_t)0)) {
1841 1842                          cmn_err(CE_NOTE, "%s: bad freeargs", pgmname);
1842 1843                          error++;
1843 1844                  }
1844 1845          }
1845 1846  
1846 1847          if (exi != NULL)
1847 1848                  exi_rele(exi);
1848 1849  
1849      -        global_svstat_ptr[req->rq_vers][NFS_BADCALLS].value.ui64 += error;
1850      -
1851      -        global_svstat_ptr[req->rq_vers][NFS_CALLS].value.ui64++;
     1850 +        svstat[NFS_BADCALLS].value.ui64 += error;
     1851 +        svstat[NFS_CALLS].value.ui64++;
1852 1852  }
1853 1853  
1854 1854  static void
1855 1855  rfs_dispatch(struct svc_req *req, SVCXPRT *xprt)
1856 1856  {
1857 1857          common_dispatch(req, xprt, NFS_VERSMIN, NFS_VERSMAX,
1858 1858              "NFS", rfs_disptable);
1859 1859  }
1860 1860  
1861 1861  static char *aclcallnames_v2[] = {
↓ open down ↓ 102 lines elided ↑ open up ↑
1964 1964          {acl3_getxattrdir,
1965 1965              xdr_GETXATTRDIR3args, NULL_xdrproc_t, sizeof (GETXATTRDIR3args),
1966 1966              xdr_GETXATTRDIR3res, NULL_xdrproc_t, sizeof (GETXATTRDIR3res),
1967 1967              nullfree, RPC_IDEMPOTENT,
1968 1968              acl3_getxattrdir_getfh},
1969 1969  };
1970 1970  
1971 1971  static struct rpc_disptable acl_disptable[] = {
1972 1972          {sizeof (acldisptab_v2) / sizeof (acldisptab_v2[0]),
1973 1973                  aclcallnames_v2,
1974      -                &aclproccnt_v2_ptr, acldisptab_v2},
     1974 +                acldisptab_v2},
1975 1975          {sizeof (acldisptab_v3) / sizeof (acldisptab_v3[0]),
1976 1976                  aclcallnames_v3,
1977      -                &aclproccnt_v3_ptr, acldisptab_v3},
     1977 +                acldisptab_v3},
1978 1978  };
1979 1979  
1980 1980  static void
1981 1981  acl_dispatch(struct svc_req *req, SVCXPRT *xprt)
1982 1982  {
1983 1983          common_dispatch(req, xprt, NFS_ACL_VERSMIN, NFS_ACL_VERSMAX,
1984 1984              "ACL", acl_disptable);
1985 1985  }
1986 1986  
1987 1987  int
↓ open down ↓ 573 lines elided ↑ open up ↑
2561 2561          return (buf);
2562 2562  }
2563 2563  
2564 2564  /*
2565 2565   * NFS Server initialization routine.  This routine should only be called
2566 2566   * once.  It performs the following tasks:
2567 2567   *      - Call sub-initialization routines (localize access to variables)
2568 2568   *      - Initialize all locks
2569 2569   *      - initialize the version 3 write verifier
2570 2570   */
2571      -int
     2571 +void
2572 2572  nfs_srvinit(void)
2573 2573  {
2574      -        int error;
2575 2574  
2576      -        error = nfs_exportinit();
2577      -        if (error != 0)
2578      -                return (error);
2579      -        error = rfs4_srvrinit();
2580      -        if (error != 0) {
2581      -                nfs_exportfini();
2582      -                return (error);
2583      -        }
     2575 +        /* Truly global stuff in this module (not per zone) */
     2576 +        rw_init(&nfssrv_globals_rwl, NULL, RW_DEFAULT, NULL);
     2577 +        list_create(&nfssrv_globals_list, sizeof (nfs_globals_t),
     2578 +            offsetof(nfs_globals_t, nfs_g_link));
     2579 +        tsd_create(&nfs_server_tsd_key, NULL);
     2580 +
     2581 +        /* The order here is important */
     2582 +        nfs_exportinit();
2584 2583          rfs_srvrinit();
2585 2584          rfs3_srvrinit();
     2585 +        rfs4_srvrinit();
2586 2586          nfsauth_init();
2587 2587  
2588      -        /* Init the stuff to control start/stop */
2589      -        nfs_server_upordown = NFS_SERVER_STOPPED;
2590      -        mutex_init(&nfs_server_upordown_lock, NULL, MUTEX_DEFAULT, NULL);
2591      -        cv_init(&nfs_server_upordown_cv, NULL, CV_DEFAULT, NULL);
2592      -        mutex_init(&rdma_wait_mutex, NULL, MUTEX_DEFAULT, NULL);
2593      -        cv_init(&rdma_wait_cv, NULL, CV_DEFAULT, NULL);
2594      -
2595      -        return (0);
     2588 +        /*
     2589 +         * NFS server zone-specific global variables
     2590 +         * Note the zone_init is called for the GZ here.
     2591 +         */
     2592 +        zone_key_create(&nfssrv_zone_key, nfs_server_zone_init,
     2593 +            nfs_server_zone_shutdown, nfs_server_zone_fini);
2596 2594  }
2597 2595  
2598 2596  /*
2599 2597   * NFS Server finalization routine. This routine is called to cleanup the
2600 2598   * initialization work previously performed if the NFS server module could
2601 2599   * not be loaded correctly.
2602 2600   */
2603 2601  void
2604 2602  nfs_srvfini(void)
2605 2603  {
     2604 +
     2605 +        /*
     2606 +         * NFS server zone-specific global variables
     2607 +         * Note the zone_fini is called for the GZ here.
     2608 +         */
     2609 +        (void) zone_key_delete(nfssrv_zone_key);
     2610 +
     2611 +        /* The order here is important (reverse of init) */
2606 2612          nfsauth_fini();
     2613 +        rfs4_srvrfini();
2607 2614          rfs3_srvrfini();
2608 2615          rfs_srvrfini();
2609 2616          nfs_exportfini();
2610 2617  
2611      -        mutex_destroy(&nfs_server_upordown_lock);
2612      -        cv_destroy(&nfs_server_upordown_cv);
2613      -        mutex_destroy(&rdma_wait_mutex);
2614      -        cv_destroy(&rdma_wait_cv);
     2618 +        /* Truly global stuff in this module (not per zone) */
     2619 +        tsd_destroy(&nfs_server_tsd_key);
     2620 +        list_destroy(&nfssrv_globals_list);
     2621 +        rw_destroy(&nfssrv_globals_rwl);
2615 2622  }
2616 2623  
2617 2624  /*
2618      - * Set up an iovec array of up to cnt pointers.
     2625 + * Zone init, shutdown, fini functions for the NFS server
     2626 + *
     2627 + * This design is careful to create the entire hierarchy of
     2628 + * NFS server "globals" (including those created by various
     2629 + * per-module *_zone_init functions, etc.) so that all these
     2630 + * objects have exactly the same lifetime.
     2631 + *
     2632 + * These objects are also kept on a list for two reasons:
     2633 + * 1: It makes finding these in mdb _much_ easier.
     2634 + * 2: It allows operating across all zone globals for
     2635 + *    functions like nfs_auth.c:exi_cache_reclaim
2619 2636   */
     2637 +static void *
     2638 +nfs_server_zone_init(zoneid_t zoneid)
     2639 +{
     2640 +        nfs_globals_t *ng;
2620 2641  
     2642 +        ng = kmem_zalloc(sizeof (*ng), KM_SLEEP);
     2643 +
     2644 +        ng->nfs_versmin = NFS_VERSMIN_DEFAULT;
     2645 +        ng->nfs_versmax = NFS_VERSMAX_DEFAULT;
     2646 +
     2647 +        /* Init the stuff to control start/stop */
     2648 +        ng->nfs_server_upordown = NFS_SERVER_STOPPED;
     2649 +        mutex_init(&ng->nfs_server_upordown_lock, NULL, MUTEX_DEFAULT, NULL);
     2650 +        cv_init(&ng->nfs_server_upordown_cv, NULL, CV_DEFAULT, NULL);
     2651 +        mutex_init(&ng->rdma_wait_mutex, NULL, MUTEX_DEFAULT, NULL);
     2652 +        cv_init(&ng->rdma_wait_cv, NULL, CV_DEFAULT, NULL);
     2653 +
     2654 +        ng->nfs_zoneid = zoneid;
     2655 +
     2656 +        /*
     2657 +         * Order here is important.
     2658 +         * export init must precede srv init calls.
     2659 +         */
     2660 +        nfs_export_zone_init(ng);
     2661 +        rfs_stat_zone_init(ng);
     2662 +        rfs_srv_zone_init(ng);
     2663 +        rfs3_srv_zone_init(ng);
     2664 +        rfs4_srv_zone_init(ng);
     2665 +        nfsauth_zone_init(ng);
     2666 +
     2667 +        rw_enter(&nfssrv_globals_rwl, RW_WRITER);
     2668 +        list_insert_tail(&nfssrv_globals_list, ng);
     2669 +        rw_exit(&nfssrv_globals_rwl);
     2670 +
     2671 +        return (ng);
     2672 +}
     2673 +
     2674 +/* ARGSUSED */
     2675 +static void
     2676 +nfs_server_zone_shutdown(zoneid_t zoneid, void *data)
     2677 +{
     2678 +        nfs_globals_t *ng;
     2679 +
     2680 +        ng = (nfs_globals_t *)data;
     2681 +
     2682 +        /*
     2683 +         * Order is like _fini, but only
     2684 +         * some modules need this hook.
     2685 +         */
     2686 +        nfsauth_zone_shutdown(ng);
     2687 +        nfs_export_zone_shutdown(ng);
     2688 +}
     2689 +
     2690 +/* ARGSUSED */
     2691 +static void
     2692 +nfs_server_zone_fini(zoneid_t zoneid, void *data)
     2693 +{
     2694 +        nfs_globals_t *ng;
     2695 +
     2696 +        ng = (nfs_globals_t *)data;
     2697 +
     2698 +        rw_enter(&nfssrv_globals_rwl, RW_WRITER);
     2699 +        list_remove(&nfssrv_globals_list, ng);
     2700 +        rw_exit(&nfssrv_globals_rwl);
     2701 +
     2702 +        /*
     2703 +         * Order here is important.
     2704 +         * reverse order from init
     2705 +         */
     2706 +        nfsauth_zone_fini(ng);
     2707 +        rfs4_srv_zone_fini(ng);
     2708 +        rfs3_srv_zone_fini(ng);
     2709 +        rfs_srv_zone_fini(ng);
     2710 +        rfs_stat_zone_fini(ng);
     2711 +        nfs_export_zone_fini(ng);
     2712 +
     2713 +        mutex_destroy(&ng->nfs_server_upordown_lock);
     2714 +        cv_destroy(&ng->nfs_server_upordown_cv);
     2715 +        mutex_destroy(&ng->rdma_wait_mutex);
     2716 +        cv_destroy(&ng->rdma_wait_cv);
     2717 +
     2718 +        kmem_free(ng, sizeof (*ng));
     2719 +}
     2720 +
     2721 +/*
     2722 + * Set up an iovec array of up to cnt pointers.
     2723 + */
2621 2724  void
2622 2725  mblk_to_iov(mblk_t *m, int cnt, struct iovec *iovp)
2623 2726  {
2624 2727          while (m != NULL && cnt-- > 0) {
2625 2728                  iovp->iov_base = (caddr_t)m->b_rptr;
2626 2729                  iovp->iov_len = (m->b_wptr - m->b_rptr);
2627 2730                  iovp++;
2628 2731                  m = m->b_cont;
2629 2732          }
2630 2733  }
↓ open down ↓ 217 lines elided ↑ open up ↑
2848 2951  
2849 2952                          /*
2850 2953                           * Found a valid vp for index "filename". Sanity check
2851 2954                           * for odd case where a directory is provided as index
2852 2955                           * option argument and leads us to another filesystem
2853 2956                           */
2854 2957  
2855 2958                          /* Release the reference on the old exi value */
2856 2959                          ASSERT(*exi != NULL);
2857 2960                          exi_rele(*exi);
     2961 +                        *exi = NULL;
2858 2962  
2859 2963                          if (error = nfs_check_vpexi(mc_dvp, *vpp, kcred, exi)) {
2860 2964                                  VN_RELE(*vpp);
2861 2965                                  goto publicfh_done;
2862 2966                          }
     2967 +                        /* Have a new *exi */
2863 2968                  }
2864 2969          }
2865 2970  
2866 2971  publicfh_done:
2867 2972          if (mc_dvp)
2868 2973                  VN_RELE(mc_dvp);
2869 2974  
2870 2975          return (error);
2871 2976  }
2872 2977  
↓ open down ↓ 6 lines elided ↑ open up ↑
2879 2984          vnode_t **dirvpp,               /* ret for ptr to parent dir vnode */
2880 2985          vnode_t **compvpp,              /* ret for ptr to component vnode */
2881 2986          vnode_t *startdvp,              /* starting vnode */
2882 2987          cred_t *cr,                     /* user's credential */
2883 2988          int pathflag)                   /* flag to identify path, e.g. URL */
2884 2989  {
2885 2990          char namebuf[TYPICALMAXPATHLEN];
2886 2991          struct pathname pn;
2887 2992          int error;
2888 2993  
     2994 +        ASSERT3U(crgetzoneid(cr), ==, curzone->zone_id);
     2995 +
2889 2996          /*
2890 2997           * If pathname starts with '/', then set startdvp to root.
2891 2998           */
2892 2999          if (*path == '/') {
2893 3000                  while (*path == '/')
2894 3001                          path++;
2895 3002  
2896      -                startdvp = rootdir;
     3003 +                startdvp = ZONE_ROOTVP();
2897 3004          }
2898 3005  
2899 3006          error = pn_get_buf(path, UIO_SYSSPACE, &pn, namebuf, sizeof (namebuf));
2900 3007          if (error == 0) {
2901 3008                  /*
2902 3009                   * Call the URL parser for URL paths to modify the original
2903 3010                   * string to handle any '%' encoded characters that exist.
2904 3011                   * Done here to avoid an extra bcopy in the lookup.
2905 3012                   * We need to be careful about pathlen's. We know that
2906 3013                   * rfs_pathname() is called with a non-empty path. However,
↓ open down ↓ 2 lines elided ↑ open up ↑
2909 3016                   * URL parser finding an encoded null character at the
2910 3017                   * beginning of path which should not proceed with the lookup.
2911 3018                   */
2912 3019                  if (pn.pn_pathlen != 0 && pathflag == URLPATH) {
2913 3020                          URLparse(pn.pn_path);
2914 3021                          if ((pn.pn_pathlen = strlen(pn.pn_path)) == 0)
2915 3022                                  return (ENOENT);
2916 3023                  }
2917 3024                  VN_HOLD(startdvp);
2918 3025                  error = lookuppnvp(&pn, NULL, NO_FOLLOW, dirvpp, compvpp,
2919      -                    rootdir, startdvp, cr);
     3026 +                    ZONE_ROOTVP(), startdvp, cr);
2920 3027          }
2921 3028          if (error == ENAMETOOLONG) {
2922 3029                  /*
2923 3030                   * This thread used a pathname > TYPICALMAXPATHLEN bytes long.
2924 3031                   */
2925 3032                  if (error = pn_get(path, UIO_SYSSPACE, &pn))
2926 3033                          return (error);
2927 3034                  if (pn.pn_pathlen != 0 && pathflag == URLPATH) {
2928 3035                          URLparse(pn.pn_path);
2929 3036                          if ((pn.pn_pathlen = strlen(pn.pn_path)) == 0) {
2930 3037                                  pn_free(&pn);
2931 3038                                  return (ENOENT);
2932 3039                          }
2933 3040                  }
2934 3041                  VN_HOLD(startdvp);
2935 3042                  error = lookuppnvp(&pn, NULL, NO_FOLLOW, dirvpp, compvpp,
2936      -                    rootdir, startdvp, cr);
     3043 +                    ZONE_ROOTVP(), startdvp, cr);
2937 3044                  pn_free(&pn);
2938 3045          }
2939 3046  
2940 3047          return (error);
2941 3048  }
2942 3049  
2943 3050  /*
2944 3051   * Adapt the multicomponent lookup path depending on the pathtype
2945 3052   */
2946 3053  static int
↓ open down ↓ 83 lines elided ↑ open up ↑
3030 3137                   * must not terminate below the
3031 3138                   * exported directory.
3032 3139                   */
3033 3140                  if ((*exi)->exi_export.ex_flags & EX_NOSUB && walk > 0)
3034 3141                          error = EACCES;
3035 3142          }
3036 3143  
3037 3144          return (error);
3038 3145  }
3039 3146  
3040      -/*
3041      - * Do the main work of handling HA-NFSv4 Resource Group failover on
3042      - * Sun Cluster.
3043      - * We need to detect whether any RG admin paths have been added or removed,
3044      - * and adjust resources accordingly.
3045      - * Currently we're using a very inefficient algorithm, ~ 2 * O(n**2). In
3046      - * order to scale, the list and array of paths need to be held in more
3047      - * suitable data structures.
3048      - */
3049      -static void
3050      -hanfsv4_failover(void)
3051      -{
3052      -        int i, start_grace, numadded_paths = 0;
3053      -        char **added_paths = NULL;
3054      -        rfs4_dss_path_t *dss_path;
3055      -
3056      -        /*
3057      -         * Note: currently, rfs4_dss_pathlist cannot be NULL, since
3058      -         * it will always include an entry for NFS4_DSS_VAR_DIR. If we
3059      -         * make the latter dynamically specified too, the following will
3060      -         * need to be adjusted.
3061      -         */
3062      -
3063      -        /*
3064      -         * First, look for removed paths: RGs that have been failed-over
3065      -         * away from this node.
3066      -         * Walk the "currently-serving" rfs4_dss_pathlist and, for each
3067      -         * path, check if it is on the "passed-in" rfs4_dss_newpaths array
3068      -         * from nfsd. If not, that RG path has been removed.
3069      -         *
3070      -         * Note that nfsd has sorted rfs4_dss_newpaths for us, and removed
3071      -         * any duplicates.
3072      -         */
3073      -        dss_path = rfs4_dss_pathlist;
3074      -        do {
3075      -                int found = 0;
3076      -                char *path = dss_path->path;
3077      -
3078      -                /* used only for non-HA so may not be removed */
3079      -                if (strcmp(path, NFS4_DSS_VAR_DIR) == 0) {
3080      -                        dss_path = dss_path->next;
3081      -                        continue;
3082      -                }
3083      -
3084      -                for (i = 0; i < rfs4_dss_numnewpaths; i++) {
3085      -                        int cmpret;
3086      -                        char *newpath = rfs4_dss_newpaths[i];
3087      -
3088      -                        /*
3089      -                         * Since nfsd has sorted rfs4_dss_newpaths for us,
3090      -                         * once the return from strcmp is negative we know
3091      -                         * we've passed the point where "path" should be,
3092      -                         * and can stop searching: "path" has been removed.
3093      -                         */
3094      -                        cmpret = strcmp(path, newpath);
3095      -                        if (cmpret < 0)
3096      -                                break;
3097      -                        if (cmpret == 0) {
3098      -                                found = 1;
3099      -                                break;
3100      -                        }
3101      -                }
3102      -
3103      -                if (found == 0) {
3104      -                        unsigned index = dss_path->index;
3105      -                        rfs4_servinst_t *sip = dss_path->sip;
3106      -                        rfs4_dss_path_t *path_next = dss_path->next;
3107      -
3108      -                        /*
3109      -                         * This path has been removed.
3110      -                         * We must clear out the servinst reference to
3111      -                         * it, since it's now owned by another
3112      -                         * node: we should not attempt to touch it.
3113      -                         */
3114      -                        ASSERT(dss_path == sip->dss_paths[index]);
3115      -                        sip->dss_paths[index] = NULL;
3116      -
3117      -                        /* remove from "currently-serving" list, and destroy */
3118      -                        remque(dss_path);
3119      -                        /* allow for NUL */
3120      -                        kmem_free(dss_path->path, strlen(dss_path->path) + 1);
3121      -                        kmem_free(dss_path, sizeof (rfs4_dss_path_t));
3122      -
3123      -                        dss_path = path_next;
3124      -                } else {
3125      -                        /* path was found; not removed */
3126      -                        dss_path = dss_path->next;
3127      -                }
3128      -        } while (dss_path != rfs4_dss_pathlist);
3129      -
3130      -        /*
3131      -         * Now, look for added paths: RGs that have been failed-over
3132      -         * to this node.
3133      -         * Walk the "passed-in" rfs4_dss_newpaths array from nfsd and,
3134      -         * for each path, check if it is on the "currently-serving"
3135      -         * rfs4_dss_pathlist. If not, that RG path has been added.
3136      -         *
3137      -         * Note: we don't do duplicate detection here; nfsd does that for us.
3138      -         *
3139      -         * Note: numadded_paths <= rfs4_dss_numnewpaths, which gives us
3140      -         * an upper bound for the size needed for added_paths[numadded_paths].
3141      -         */
3142      -
3143      -        /* probably more space than we need, but guaranteed to be enough */
3144      -        if (rfs4_dss_numnewpaths > 0) {
3145      -                size_t sz = rfs4_dss_numnewpaths * sizeof (char *);
3146      -                added_paths = kmem_zalloc(sz, KM_SLEEP);
3147      -        }
3148      -
3149      -        /* walk the "passed-in" rfs4_dss_newpaths array from nfsd */
3150      -        for (i = 0; i < rfs4_dss_numnewpaths; i++) {
3151      -                int found = 0;
3152      -                char *newpath = rfs4_dss_newpaths[i];
3153      -
3154      -                dss_path = rfs4_dss_pathlist;
3155      -                do {
3156      -                        char *path = dss_path->path;
3157      -
3158      -                        /* used only for non-HA */
3159      -                        if (strcmp(path, NFS4_DSS_VAR_DIR) == 0) {
3160      -                                dss_path = dss_path->next;
3161      -                                continue;
3162      -                        }
3163      -
3164      -                        if (strncmp(path, newpath, strlen(path)) == 0) {
3165      -                                found = 1;
3166      -                                break;
3167      -                        }
3168      -
3169      -                        dss_path = dss_path->next;
3170      -                } while (dss_path != rfs4_dss_pathlist);
3171      -
3172      -                if (found == 0) {
3173      -                        added_paths[numadded_paths] = newpath;
3174      -                        numadded_paths++;
3175      -                }
3176      -        }
3177      -
3178      -        /* did we find any added paths? */
3179      -        if (numadded_paths > 0) {
3180      -                /* create a new server instance, and start its grace period */
3181      -                start_grace = 1;
3182      -                rfs4_servinst_create(start_grace, numadded_paths, added_paths);
3183      -
3184      -                /* read in the stable storage state from these paths */
3185      -                rfs4_dss_readstate(numadded_paths, added_paths);
3186      -
3187      -                /*
3188      -                 * Multiple failovers during a grace period will cause
3189      -                 * clients of the same resource group to be partitioned
3190      -                 * into different server instances, with different
3191      -                 * grace periods.  Since clients of the same resource
3192      -                 * group must be subject to the same grace period,
3193      -                 * we need to reset all currently active grace periods.
3194      -                 */
3195      -                rfs4_grace_reset_all();
3196      -        }
3197      -
3198      -        if (rfs4_dss_numnewpaths > 0)
3199      -                kmem_free(added_paths, rfs4_dss_numnewpaths * sizeof (char *));
3200      -}
3201      -
3202 3147  /*
3203 3148   * Used by NFSv3 and NFSv4 server to query label of
3204 3149   * a pathname component during lookup/access ops.
3205 3150   */
3206 3151  ts_label_t *
3207 3152  nfs_getflabel(vnode_t *vp, struct exportinfo *exi)
3208 3153  {
3209 3154          zone_t *zone;
3210 3155          ts_label_t *zone_label;
3211 3156          char *path;
↓ open down ↓ 260 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX