XXXXX convert NLM's single-count semaphore to a mutex
--- old/usr/src/uts/common/brand/lx/os/lx_lockd.c
+++ new/usr/src/uts/common/brand/lx/os/lx_lockd.c
1 1 /*
2 2 * This file and its contents are supplied under the terms of the
3 3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 4 * You may only use this file in accordance with the terms of version
5 5 * 1.0 of the CDDL.
6 6 *
7 7 * A full copy of the text of the CDDL should have accompanied this
8 8 * source. A copy of the CDDL is also available via the Internet at
9 9 * http://www.illumos.org/license/CDDL.
10 10 */
11 11
12 12 /*
13 13 * Copyright 2018 Joyent, Inc.
14 14 */
15 15
16 16 /*
17 17 * lx_start_nfs_lockd() starts an NFS lockd (lx_lockd) process inside the zone.
18 18 * This uses the same technique our lx cgroupfs uses to launch a release
19 19 * agent process. This is called implicitly when an NFS mount syscall occurs
20 20 * within the zone. See the user-level lx_lockd source for the "big theory"
21 21 * comment behind this.
22 22 *
23 23 * lx_upcall_statd() is a brand hook that interposes on the rpc.statd RPC
24 24 * handling so that we can interface to a Linux rpc.statd that must run
25 25 * when NFSv3 locking is in use. The rpc.statd handles server or client reboots
26 26 * and interacts with the lockd to reclaim locks after the server reboots. The
27 27 * rpc.statd also informs the server when we reboot, so the server can release
28 28 * the locks we held.
29 29 */
30 30
31 31 #include <sys/types.h>
32 32 #include <sys/param.h>
33 33 #include <sys/sysmacros.h>
34 34 #include <sys/errno.h>
35 35 #include <sys/cred.h>
36 36 #include <sys/systm.h>
37 37 #include <sys/policy.h>
38 38 #include <sys/vmparam.h>
39 39 #include <sys/contract_impl.h>
40 40 #include <sys/pool.h>
41 41 #include <sys/stack.h>
42 42 #include <sys/var.h>
43 43 #include <sys/rt.h>
44 44 #include <sys/fx.h>
45 45 #include <sys/brand.h>
46 46 #include <sys/lx_brand.h>
47 47 #include <sys/pathname.h>
48 48 #include <rpcsvc/nlm_prot.h>
49 49 #include <rpcsvc/sm_inter.h>
50 50 #include <klm/nlm_impl.h>
51 51
52 52 #define LX_LOCKD_PATH "/native/usr/lib/brand/lx/lx_lockd"
53 53
54 54 /* Linux lockd RPC called by statd when it detects an NFS server reboot */
55 55 #define LX_NLMPROC_NSM_NOTIFY 16
56 56
57 57 /* From uts/common/klm/nlm_impl.c */
58 58 extern void nlm_netbuf_to_netobj(struct netbuf *, int *, netobj *);
59 59 extern void nlm_nsm_clnt_init(CLIENT *, struct nlm_nsm *);
60 60
61 61 /*
62 62 * Check if the current lockd is still running.
63 63 */
64 64 static boolean_t
65 65 lx_lockd_alive(pid_t lockd_pid)
66 66 {
67 67 boolean_t ret = B_FALSE;
68 68 proc_t *p;
69 69 vnode_t *vp;
70 70 char path[MAXPATHLEN];
71 71
72 72 mutex_enter(&pidlock);
73 73 p = prfind(lockd_pid);
74 74 if (p == NULL) {
75 75 mutex_exit(&pidlock);
76 76 return (B_FALSE);
77 77 }
78 78
79 79 mutex_enter(&p->p_lock);
80 80 if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) {
81 81 mutex_exit(&p->p_lock);
82 82 mutex_exit(&pidlock);
83 83 return (B_FALSE);
84 84 }
85 85 vp = p->p_exec;
86 86 VN_HOLD(vp);
87 87 mutex_exit(&p->p_lock);
88 88 mutex_exit(&pidlock);
89 89
90 90 if (vnodetopath(NULL, vp, path, sizeof (path), CRED()) == 0 &&
91 91 strcmp(path, LX_LOCKD_PATH) == 0) {
92 92 ret = B_TRUE;
93 93 }
94 94
95 95 VN_RELE(vp);
96 96 return (ret);
97 97 }
98 98
99 99 static void
100 100 lx_run_lockd(void *a)
101 101 {
102 102 proc_t *p = curproc;
103 103 zone_t *z = curzone;
104 104 struct core_globals *cg;
105 105 lx_zone_data_t *lxzd = ztolxzd(z);
106 106 int res;
107 107
108 108 ASSERT(!INGLOBALZONE(p));
109 109 VERIFY(lxzd != NULL);
110 110
111 111 /* The following block is derived from start_init_common */
112 112 ASSERT_STACK_ALIGNED();
113 113
114 114 p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0;
115 115 p->p_usrstack = (caddr_t)USRSTACK32;
116 116 p->p_model = DATAMODEL_ILP32;
117 117 p->p_stkprot = PROT_ZFOD & ~PROT_EXEC;
118 118 p->p_datprot = PROT_ZFOD & ~PROT_EXEC;
119 119 p->p_stk_ctl = INT32_MAX;
120 120
121 121 p->p_as = as_alloc();
122 122 p->p_as->a_proc = p;
123 123 p->p_as->a_userlimit = (caddr_t)USERLIMIT32;
124 124 (void) hat_setup(p->p_as->a_hat, HAT_INIT);
125 125
126 126 VERIFY((cg = zone_getspecific(core_zone_key, z)) != NULL);
127 127
128 128 corectl_path_hold(cg->core_default_path);
129 129 corectl_content_hold(cg->core_default_content);
130 130
131 131 p->p_corefile = cg->core_default_path;
132 132 p->p_content = cg->core_default_content;
133 133
134 134 init_mstate(curthread, LMS_SYSTEM);
135 135 res = exec_init(LX_LOCKD_PATH, NULL);
136 136
137 137 /* End of code derived from start_init_common */
138 138
139 139 /* The following is derived from zone_start_init - see comments there */
140 140 if (res != 0 || zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN) {
141 141 if (proc_exit(CLD_EXITED, res) != 0) {
142 142 mutex_enter(&p->p_lock);
143 143 ASSERT(p->p_flag & SEXITLWPS);
144 144 lwp_exit();
145 145 }
146 146 } else {
147 147 id_t cid = curthread->t_cid;
148 148
149 149 mutex_enter(&class_lock);
150 150 ASSERT(cid < loaded_classes);
151 151 if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
152 152 z->zone_fixed_hipri) {
153 153 pcparms_t pcparms;
154 154
155 155 pcparms.pc_cid = cid;
156 156 ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
157 157 ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
158 158 FXMAXUPRI;
159 159 ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
160 160 FX_DOUPRILIM | FX_DOUPRI;
161 161
162 162 mutex_enter(&pidlock);
163 163 mutex_enter(&p->p_lock);
164 164 (void) parmsset(&pcparms, curthread);
165 165 mutex_exit(&p->p_lock);
166 166 mutex_exit(&pidlock);
167 167 } else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
168 168 curthread->t_pri = RTGPPRIO0;
169 169 }
170 170 mutex_exit(&class_lock);
171 171
172 172 /*
173 173 * Set our pid as the lockd pid in the zone data, or exit
174 174 * if another process raced and already did so.
175 175 */
176 176 mutex_enter(&lxzd->lxzd_lock);
177 177 if (lxzd->lxzd_lockd_pid != 0) {
178 178 /* another mount raced and created a new lockd */
179 179 mutex_exit(&lxzd->lxzd_lock);
180 180 if (proc_exit(CLD_EXITED, 0) != 0) {
181 181 mutex_enter(&p->p_lock);
182 182 ASSERT(p->p_flag & SEXITLWPS);
183 183 lwp_exit();
184 184 }
185 185 return;
186 186 }
187 187 lxzd->lxzd_lockd_pid = p->p_pid;
188 188 mutex_exit(&lxzd->lxzd_lock);
189 189
190 190 /* cause the process to return to userland. */
191 191 lwp_rtt();
192 192 }
193 193 }
194 194
195 195 /*
196 196 * Launch the user-level, native, lx_lockd process.
197 197 */
198 198 int
199 199 lx_start_nfs_lockd()
200 200 {
201 201 id_t cid;
202 202 proc_t *p = ttoproc(curthread);
203 203 zone_t *z = p->p_zone;
204 204 lx_zone_data_t *lxzd = ztolxzd(z);
205 205
206 206 ASSERT(!INGLOBALZONE(p));
207 207 ASSERT(lxzd != NULL);
208 208
209 209 /*
210 210 * This should only be called by the mount emulation, which must have
211 211 * 'root' privileges in order to have performed a mount, but
212 212 * double-check.
213 213 */
214 214 if (crgetuid(CRED()) != 0)
215 215 return (EPERM);
216 216
217 217 mutex_enter(&lxzd->lxzd_lock);
218 218 if (lxzd->lxzd_lockd_pid != 0) {
219 219 /* verify lockd is still alive */
220 220 pid_t lockd_pid;
221 221
222 222 lockd_pid = lxzd->lxzd_lockd_pid;
223 223 mutex_exit(&lxzd->lxzd_lock);
224 224
225 225 if (lx_lockd_alive(lockd_pid))
226 226 return (EEXIST);
227 227
228 228 mutex_enter(&lxzd->lxzd_lock);
229 229 if (lxzd->lxzd_lockd_pid != lockd_pid) {
230 230 /* another mount raced and created a new lockd */
231 231 mutex_exit(&lxzd->lxzd_lock);
232 232 return (EEXIST);
233 233 }
234 234
235 235 /* old lockd is dead, launch a new one */
236 236 lxzd->lxzd_lockd_pid = 0;
237 237 }
238 238 mutex_exit(&lxzd->lxzd_lock);
239 239
240 240 if (z->zone_defaultcid > 0) {
241 241 cid = z->zone_defaultcid;
242 242 } else {
243 243 pool_lock();
244 244 cid = pool_get_class(z->zone_pool);
245 245 pool_unlock();
246 246 }
247 247 if (cid == -1)
248 248 cid = defaultcid;
249 249
250 250 /*
251 251 * There's nothing to do here if creating the proc fails, but we
252 252 * return the result to make it obvious while DTracing.
253 253 */
254 254 return (newproc(lx_run_lockd, NULL, cid, minclsyspri - 1, NULL, -1));
255 255 }
256 256
257 257 void
258 258 lx_upcall_statd(int op, struct nlm_globals *g, struct nlm_host *host)
259 259 {
260 260 struct nlm_nsm *nsm;
261 261 struct mon args;
262 262 struct mon_id *mip = &args.mon_id;
263 263 int family;
264 264 netobj obj;
265 265 enum clnt_stat stat;
266 266
267 267 /*
268 268 * For Linux rpc.statd monitor registration, the Linux NSMPROC_MON and
269 269 * NSMPROC_UNMON RPC upcalls correspond almost directly to the native
270 270 * SM_MON and SM_UNMON RPC upcalls. The key differences with the native
271 271 * registration is that in our nlm_host_monitor function we make two
272 272 * RPC calls:
273 273 * - the first RPC (nsmaddrproc1_reg_1) uses our private 'nsm_addr'
274 274 * RPC protocol to register the lockd RPC information that statd
275 275 * should call when it detects that the remote server rebooted
276 276 * - the second RPC (sm_mon_1) tells statd the information about the
277 277 * remote server to be monitored
278 278 * For Linux, there is only a single RPC from the kernel to the local
279 279 * statd. This RPC is equivalent to our sm_mon_1 code, but it uses the
280 280 * Linux-private NLMPROC_NSM_NOTIFY lockd procedure in the 'my_proc'
281 281 * RPC parameter. This corresponds to our private 'nsm_addr' code, and
282 282 * tells statd which lockd RPC to call when it detects a server reboot.
283 283 *
284 284 * Because our sm_mon_1 RPC is so similar to the Linux RPC, we can use
285 285 * that directly and simply set the expected value in the 'my_proc'
286 286 * argument.
287 287 *
288 288 * Within the kernel lockd RPC handling, the nlm_prog_3_dtable dispatch
289 289 * table has an entry for each lockd RPC function. Thus, this table also
290 290 * contains an entry for the Linux NLMPROC_NSM_NOTIFY procedure. That
291 291 * procedure number is unused by the native lockd code, so there is no
292 292 * conflict with dispatching that procedure. The implementation of the
293 293 * procedure corresponds to the native, private NLM_SM_NOTIFY1
294 294 * procedure which is called by the native rpc.statd.
295 295 *
296 296 * The Linux RPC call to "unmonitor" a host expects the same arguments
297 297 * as we pass to monitor, so that is also handled here by this same
298 298 * brand hook.
299 299 */
300 300 nlm_netbuf_to_netobj(&host->nh_addr, &family, &obj);
301 301 nsm = &g->nlm_nsm;
302 302
303 303 bzero(&args, sizeof (args));
304 304
305 305 mip->mon_name = host->nh_name;
306 306 mip->my_id.my_name = uts_nodename();
307 307 mip->my_id.my_prog = NLM_PROG;
308 308 mip->my_id.my_vers = NLM_SM;
309 309 mip->my_id.my_proc = LX_NLMPROC_NSM_NOTIFY;
310 310 if (op == SM_MON) {
311 311 bcopy(&host->nh_sysid, args.priv, sizeof (uint16_t));
312 312 }
313 313
314 - sema_p(&nsm->ns_sem);
314 + mutex_enter(&nsm->ns_lock);
315 315 nlm_nsm_clnt_init(nsm->ns_handle, nsm);
316 316 if (op == SM_MON) {
317 317 struct sm_stat_res mres;
318 318
319 319 bzero(&mres, sizeof (mres));
320 320 stat = sm_mon_1(&args, &mres, nsm->ns_handle);
321 321 } else {
322 322 struct sm_stat ures;
323 323
324 324 ASSERT(op == SM_UNMON);
325 325 bzero(&ures, sizeof (ures));
326 326 stat = sm_unmon_1(mip, &ures, nsm->ns_handle);
327 327 }
328 - sema_v(&nsm->ns_sem);
328 + mutex_exit(&nsm->ns_lock);
329 329
330 330 if (stat != RPC_SUCCESS) {
331 331 NLM_WARN("Failed to contact local statd, stat=%d", stat);
332 332 if (op == SM_MON) {
333 333 mutex_enter(&g->lock);
334 334 host->nh_flags &= ~NLM_NH_MONITORED;
335 335 mutex_exit(&g->lock);
336 336 }
337 337 }
338 338 }
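
Editor's note on the change itself: the only functional delta in this file is at lines 314 and 328, where the sema_p()/sema_v() pair guarding the statd upcall is replaced by mutex_enter()/mutex_exit() on a new ns_lock field, per the webrev title's conversion of NLM's single-count semaphore to a mutex. The struct nlm_nsm definition and its initialization live in uts/common/klm/nlm_impl.[ch], which this section does not show; the sketch below uses a hypothetical stand-in struct and function names to illustrate the corresponding init-side change, assuming the semaphore was initialized with a count of one.

#include <sys/ksynch.h>	/* kmutex_t, mutex_init(), mutex_enter(), mutex_exit() */

struct nsm_sketch {			/* hypothetical stand-in for struct nlm_nsm */
	kmutex_t	ns_lock;	/* was: ksema_t ns_sem */
	/* ... RPC client handle, statd address, etc. ... */
};

static void
nsm_sketch_init(struct nsm_sketch *nsm)
{
	/* was: sema_init(&nsm->ns_sem, 1, NULL, SEMA_DEFAULT, NULL); */
	mutex_init(&nsm->ns_lock, NULL, MUTEX_DEFAULT, NULL);
}

static void
nsm_sketch_upcall(struct nsm_sketch *nsm)
{
	/* was: sema_p(&nsm->ns_sem); ... sema_v(&nsm->ns_sem); */
	mutex_enter(&nsm->ns_lock);
	/* the serialized statd RPC (e.g. sm_mon_1()) would run here */
	mutex_exit(&nsm->ns_lock);
}

Because the semaphore's count was fixed at one and it served purely for mutual exclusion, a kmutex is the more idiomatic illumos primitive: an adaptive mutex records its owner, which enables adaptive spinning and better deadlock diagnosis than a counting semaphore, whose ownerless nature offers neither.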