/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2018 Joyent, Inc.
 */

/*
 * lx_start_nfs_lockd() starts an NFS lockd (lx_lockd) process inside the zone.
 * This uses the same technique that our lx cgroupfs uses to launch a release
 * agent process. It is called implicitly when an NFS mount syscall occurs
 * within the zone. See the user-level lx_lockd source for the "big theory"
 * comment behind this.
 *
 * lx_upcall_statd() is a brand hook that interposes on the rpc.statd RPC
 * handling so that we can interface to the Linux rpc.statd that must run
 * when NFSv3 locking is in use. rpc.statd handles server or client reboots
 * and interacts with lockd to reclaim locks after a server reboot. rpc.statd
 * also informs the server when we reboot, so the server can release the
 * locks we held.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/errno.h>
#include <sys/cred.h>
#include <sys/systm.h>
#include <sys/policy.h>
#include <sys/vmparam.h>
#include <sys/contract_impl.h>
#include <sys/pool.h>
#include <sys/stack.h>
#include <sys/var.h>
#include <sys/rt.h>
#include <sys/fx.h>
#include <sys/brand.h>
#include <sys/lx_brand.h>
#include <sys/pathname.h>
#include <rpcsvc/nlm_prot.h>
#include <rpcsvc/sm_inter.h>
#include <klm/nlm_impl.h>

#define	LX_LOCKD_PATH	"/native/usr/lib/brand/lx/lx_lockd"

/* Linux lockd RPC called by statd when it detects an NFS server reboot */
#define	LX_NLMPROC_NSM_NOTIFY	16

/* From uts/common/klm/nlm_impl.c */
extern void nlm_netbuf_to_netobj(struct netbuf *, int *, netobj *);
extern void nlm_nsm_clnt_init(CLIENT *, struct nlm_nsm *);

/*
 * Check if the current lockd is still running.
 */
static boolean_t
lx_lockd_alive(pid_t lockd_pid)
{
	boolean_t ret = B_FALSE;
	proc_t *p;
	vnode_t *vp;
	char path[MAXPATHLEN];

	mutex_enter(&pidlock);
	p = prfind(lockd_pid);
	if (p == NULL) {
		mutex_exit(&pidlock);
		return (B_FALSE);
	}

	mutex_enter(&p->p_lock);
	if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) {
		mutex_exit(&p->p_lock);
		mutex_exit(&pidlock);
		return (B_FALSE);
	}
	vp = p->p_exec;
	VN_HOLD(vp);
	mutex_exit(&p->p_lock);
	mutex_exit(&pidlock);

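	/*
	 * The pid may have been recycled by an unrelated process, so only
	 * treat it as our lockd if the executable resolves to LX_LOCKD_PATH.
	 */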
	if (vnodetopath(NULL, vp, path, sizeof (path), CRED()) == 0 &&
	    strcmp(path, LX_LOCKD_PATH) == 0) {
		ret = B_TRUE;
	}

	VN_RELE(vp);
	return (ret);
}

static void
lx_run_lockd(void *a)
{
	proc_t *p = curproc;
	zone_t *z = curzone;
	struct core_globals *cg;
	lx_zone_data_t *lxzd = ztolxzd(z);
	int res;

	ASSERT(!INGLOBALZONE(p));
	VERIFY(lxzd != NULL);

	/* The following block is derived from start_init_common */
	ASSERT_STACK_ALIGNED();

	p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0;
	p->p_usrstack = (caddr_t)USRSTACK32;
	p->p_model = DATAMODEL_ILP32;
	p->p_stkprot = PROT_ZFOD & ~PROT_EXEC;
	p->p_datprot = PROT_ZFOD & ~PROT_EXEC;
	p->p_stk_ctl = INT32_MAX;

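	/*
	 * Give this kernel-created process its own user address space, sized
	 * for the 32-bit lx_lockd binary we are about to exec.
	 */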
	p->p_as = as_alloc();
	p->p_as->a_proc = p;
	p->p_as->a_userlimit = (caddr_t)USERLIMIT32;
	(void) hat_setup(p->p_as->a_hat, HAT_INIT);

	VERIFY((cg = zone_getspecific(core_zone_key, z)) != NULL);

	corectl_path_hold(cg->core_default_path);
	corectl_content_hold(cg->core_default_content);

	p->p_corefile = cg->core_default_path;
	p->p_content = cg->core_default_content;

	init_mstate(curthread, LMS_SYSTEM);
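	/* Exec the native lx_lockd binary in place of this process. */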
	res = exec_init(LX_LOCKD_PATH, NULL);

	/* End of code derived from start_init_common */

	/* The following is derived from zone_start_init - see comments there */
	if (res != 0 || zone_status_get(z) >= ZONE_IS_SHUTTING_DOWN) {
		if (proc_exit(CLD_EXITED, res) != 0) {
			mutex_enter(&p->p_lock);
			ASSERT(p->p_flag & SEXITLWPS);
			lwp_exit();
		}
	} else {
		id_t cid = curthread->t_cid;

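		/*
		 * As in zone_start_init(), adjust our scheduling priority:
		 * give an FX-class thread the maximum user priority when the
		 * zone is configured for fixed high priority, or give an
		 * RT-class thread the base RT global priority.
		 */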
		mutex_enter(&class_lock);
		ASSERT(cid < loaded_classes);
		if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
		    z->zone_fixed_hipri) {
			pcparms_t pcparms;

			pcparms.pc_cid = cid;
			((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
			((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
			    FXMAXUPRI;
			((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
			    FX_DOUPRILIM | FX_DOUPRI;

			mutex_enter(&pidlock);
			mutex_enter(&p->p_lock);
			(void) parmsset(&pcparms, curthread);
			mutex_exit(&p->p_lock);
			mutex_exit(&pidlock);
		} else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
			curthread->t_pri = RTGPPRIO0;
		}
		mutex_exit(&class_lock);

		/*
		 * Set our pid as the lockd pid in the zone data, or exit
		 * if another process raced and already did so.
		 */
		mutex_enter(&lxzd->lxzd_lock);
		if (lxzd->lxzd_lockd_pid != 0) {
			/* another mount raced and created a new lockd */
			mutex_exit(&lxzd->lxzd_lock);
			if (proc_exit(CLD_EXITED, 0) != 0) {
				mutex_enter(&p->p_lock);
				ASSERT(p->p_flag & SEXITLWPS);
				lwp_exit();
			}
			return;
		}
		lxzd->lxzd_lockd_pid = p->p_pid;
		mutex_exit(&lxzd->lxzd_lock);

		/* Cause the process to return to userland. */
		lwp_rtt();
	}
}

/*
 * Launch the user-level, native, lx_lockd process.
 */
int
lx_start_nfs_lockd()
{
	id_t cid;
	proc_t *p = ttoproc(curthread);
	zone_t *z = p->p_zone;
	lx_zone_data_t *lxzd = ztolxzd(z);

	ASSERT(!INGLOBALZONE(p));
	ASSERT(lxzd != NULL);

	/*
	 * This should only be called by the mount emulation, which must have
	 * 'root' privileges in order to have performed a mount, but
	 * double-check.
	 */
	if (crgetuid(CRED()) != 0)
		return (EPERM);

	mutex_enter(&lxzd->lxzd_lock);
	if (lxzd->lxzd_lockd_pid != 0) {
		/* verify lockd is still alive */
		pid_t lockd_pid;

		lockd_pid = lxzd->lxzd_lockd_pid;
		mutex_exit(&lxzd->lxzd_lock);

		if (lx_lockd_alive(lockd_pid))
			return (EEXIST);

		mutex_enter(&lxzd->lxzd_lock);
		if (lxzd->lxzd_lockd_pid != lockd_pid) {
			/* another mount raced and created a new lockd */
			mutex_exit(&lxzd->lxzd_lock);
			return (EEXIST);
		}

		/* old lockd is dead, launch a new one */
		lxzd->lxzd_lockd_pid = 0;
	}
	mutex_exit(&lxzd->lxzd_lock);

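	/*
	 * Pick the scheduling class for the new process: the zone's default
	 * class if one is set, otherwise the class associated with the
	 * zone's pool, falling back to the system default class.
	 */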
	if (z->zone_defaultcid > 0) {
		cid = z->zone_defaultcid;
	} else {
		pool_lock();
		cid = pool_get_class(z->zone_pool);
		pool_unlock();
	}
	if (cid == -1)
		cid = defaultcid;

	/*
	 * There's nothing to do here if creating the proc fails, but we
	 * return the result to make it obvious while DTracing.
	 */
	return (newproc(lx_run_lockd, NULL, cid, minclsyspri - 1, NULL, -1));
}

void
lx_upcall_statd(int op, struct nlm_globals *g, struct nlm_host *host)
{
	struct nlm_nsm *nsm;
	struct mon args;
	struct mon_id *mip = &args.mon_id;
	int family;
	netobj obj;
	enum clnt_stat stat;

	/*
	 * For Linux rpc.statd monitor registration, the Linux NSMPROC_MON and
	 * NSMPROC_UNMON RPC upcalls correspond almost directly to the native
	 * SM_MON and SM_UNMON RPC upcalls. The key difference from the native
	 * registration is that in our nlm_host_monitor function we make two
	 * RPC calls:
	 *    - the first RPC (nsmaddrproc1_reg_1) uses our private 'nsm_addr'
	 *      RPC protocol to register the lockd RPC information that statd
	 *      should call when it detects that the remote server rebooted
	 *    - the second RPC (sm_mon_1) gives statd the information about the
	 *      remote server to be monitored
	 * For Linux, there is only a single RPC from the kernel to the local
	 * statd. This RPC is equivalent to our sm_mon_1 code, but it uses the
	 * Linux-private NLMPROC_NSM_NOTIFY lockd procedure in the 'my_proc'
	 * RPC parameter. This corresponds to our private 'nsm_addr' code, and
	 * tells statd which lockd RPC to call when it detects a server reboot.
	 *
	 * Because our sm_mon_1 RPC is so similar to the Linux RPC, we can use
	 * it directly and simply set the expected value in the 'my_proc'
	 * argument.
	 *
	 * Within the kernel lockd RPC handling, the nlm_prog_3_dtable dispatch
	 * table has an entry for each lockd RPC function. Thus, this table
	 * also contains an entry for the Linux NLMPROC_NSM_NOTIFY procedure.
	 * That procedure number is unused by the native lockd code, so there
	 * is no conflict with dispatching that procedure. The implementation
	 * of the procedure corresponds to the native, private NLM_SM_NOTIFY1
	 * procedure which is called by the native rpc.statd.
	 *
	 * The Linux RPC call to "unmonitor" a host expects the same arguments
	 * as we pass to monitor, so it is also handled here by this same
	 * brand hook.
	 */
	nlm_netbuf_to_netobj(&host->nh_addr, &family, &obj);
	nsm = &g->nlm_nsm;

	bzero(&args, sizeof (args));

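	/*
	 * Point statd's reboot notification at the Linux-private
	 * NLMPROC_NSM_NOTIFY procedure of our kernel lockd.
	 */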
	mip->mon_name = host->nh_name;
	mip->my_id.my_name = uts_nodename();
	mip->my_id.my_prog = NLM_PROG;
	mip->my_id.my_vers = NLM_SM;
	mip->my_id.my_proc = LX_NLMPROC_NSM_NOTIFY;
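	/*
	 * For monitor registration, save the host's sysid in the opaque
	 * private data, matching what the native monitor registration does.
	 */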
	if (op == SM_MON) {
		bcopy(&host->nh_sysid, args.priv, sizeof (uint16_t));
	}

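	/* Refresh the RPC client handle for the local statd, then upcall. */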
	mutex_enter(&nsm->ns_lock);
	nlm_nsm_clnt_init(nsm->ns_handle, nsm);
	if (op == SM_MON) {
		struct sm_stat_res mres;

		bzero(&mres, sizeof (mres));
		stat = sm_mon_1(&args, &mres, nsm->ns_handle);
	} else {
		struct sm_stat ures;

		ASSERT(op == SM_UNMON);
		bzero(&ures, sizeof (ures));
		stat = sm_unmon_1(mip, &ures, nsm->ns_handle);
	}
	mutex_exit(&nsm->ns_lock);

	if (stat != RPC_SUCCESS) {
		NLM_WARN("Failed to contact local statd, stat=%d", stat);
		if (op == SM_MON) {
			mutex_enter(&g->lock);
			host->nh_flags &= ~NLM_NH_MONITORED;
			mutex_exit(&g->lock);
		}
	}
}