/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2018 Joyent, Inc.
 */

/*
 * lx_start_nfs_lockd() starts an NFS lockd (lx_lockd) process inside the
 * zone, using the same technique our lx cgroupfs uses to launch a release
 * agent process. It is called implicitly when an NFS mount syscall occurs
 * within the zone. See the user-level lx_lockd source for the "big theory"
 * comment behind this.
 *
 * lx_upcall_statd() is a brand hook that interposes on the rpc.statd RPC
 * handling so that we can interface with a Linux rpc.statd, which must be
 * running when NFSv3 locking is in use. The rpc.statd handles server or
 * client reboots and interacts with the lockd to reclaim locks after the
 * server reboots. The rpc.statd also informs the server when we reboot, so
 * that the server can release the locks we held.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/errno.h>
#include <sys/cred.h>
#include <sys/systm.h>
#include <sys/policy.h>
#include <sys/vmparam.h>
#include <sys/contract_impl.h>
#include <sys/pool.h>
#include <sys/stack.h>
#include <sys/var.h>
#include <sys/rt.h>
#include <sys/fx.h>
#include <sys/brand.h>
#include <sys/lx_brand.h>
#include <sys/pathname.h>
#include <rpcsvc/nlm_prot.h>
#include <rpcsvc/sm_inter.h>
#include <klm/nlm_impl.h>

#define LX_LOCKD_PATH "/native/usr/lib/brand/lx/lx_lockd"

/* Linux lockd RPC called by statd when it detects an NFS server reboot */
#define LX_NLMPROC_NSM_NOTIFY 16

/* From uts/common/klm/nlm_impl.c */
extern void nlm_netbuf_to_netobj(struct netbuf *, int *, netobj *);
extern void nlm_nsm_clnt_init(CLIENT *, struct nlm_nsm *);

/*
 * Check if the current lockd is still running.
 */
static boolean_t
lx_lockd_alive(pid_t lockd_pid)
{
        boolean_t ret = B_FALSE;
        proc_t *p;
        vnode_t *vp;
        char path[MAXPATHLEN];

        mutex_enter(&pidlock);
        p = prfind(lockd_pid);
        if (p == NULL) {
                mutex_exit(&pidlock);
                return (B_FALSE);
        }

        mutex_enter(&p->p_lock);
        if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) {
                mutex_exit(&p->p_lock);
                mutex_exit(&pidlock);
                return (B_FALSE);
        }
        vp = p->p_exec;
        VN_HOLD(vp);
        mutex_exit(&p->p_lock);
        mutex_exit(&pidlock);

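        /*
         * The pid still exists; confirm it is actually our lockd (and not a
         * recycled pid) by checking that the process is executing the
         * expected lx_lockd binary.
         */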
        if (vnodetopath(NULL, vp, path, sizeof (path), CRED()) == 0 &&
            strcmp(path, LX_LOCKD_PATH) == 0) {
                ret = B_TRUE;
        }

        VN_RELE(vp);
        return (ret);
}

static void
lx_run_lockd(void *a)
{
        proc_t *p = curproc;
        zone_t *z = curzone;
        struct core_globals *cg;
        lx_zone_data_t *lxzd = ztolxzd(z);
        int res;

        ASSERT(!INGLOBALZONE(p));
        VERIFY(lxzd != NULL);

        /* The following block is derived from start_init_common */
        ASSERT_STACK_ALIGNED();

        p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0;
        p->p_usrstack = (caddr_t)USRSTACK32;
        p->p_model = DATAMODEL_ILP32;
        p->p_stkprot = PROT_ZFOD & ~PROT_EXEC;
        p->p_datprot = PROT_ZFOD & ~PROT_EXEC;
        p->p_stk_ctl = INT32_MAX;

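        /*
         * Give this kernel-created process a fresh 32-bit address space
         * before exec'ing the native lx_lockd binary into it.
         */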
        p->p_as = as_alloc();
        p->p_as->a_proc = p;
        p->p_as->a_userlimit = (caddr_t)USERLIMIT32;
        (void) hat_setup(p->p_as->a_hat, HAT_INIT);

        VERIFY((cg = zone_getspecific(core_zone_key, z)) != NULL);

        corectl_path_hold(cg->core_default_path);
        corectl_content_hold(cg->core_default_content);

        p->p_corefile = cg->core_default_path;
        p->p_content = cg->core_default_content;

        init_mstate(curthread, LMS_SYSTEM);
        res = exec_init(LX_LOCKD_PATH, NULL);

        /* End of code derived from start_init_common */

        /* The following is derived from zone_start_init - see comments there */
        if (res != 0 || zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN) {
                if (proc_exit(CLD_EXITED, res) != 0) {
                        mutex_enter(&p->p_lock);
                        ASSERT(p->p_flag & SEXITLWPS);
                        lwp_exit();
                }
        } else {
                id_t cid = curthread->t_cid;

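                /*
                 * As in zone_start_init(), adjust the new thread's priority:
                 * run at the maximum FX user priority if the zone uses FX at
                 * a fixed high priority, or at the base RT priority if the
                 * zone is in the RT class.
                 */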
                mutex_enter(&class_lock);
                ASSERT(cid < loaded_classes);
                if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
                    z->zone_fixed_hipri) {
                        pcparms_t pcparms;

                        pcparms.pc_cid = cid;
                        ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
                        ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
                            FXMAXUPRI;
                        ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
                            FX_DOUPRILIM | FX_DOUPRI;

                        mutex_enter(&pidlock);
                        mutex_enter(&p->p_lock);
                        (void) parmsset(&pcparms, curthread);
                        mutex_exit(&p->p_lock);
                        mutex_exit(&pidlock);
                } else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
                        curthread->t_pri = RTGPPRIO0;
                }
                mutex_exit(&class_lock);

                /*
                 * Set our pid as the lockd pid in the zone data, or exit
                 * if another process raced and already did so.
                 */
                mutex_enter(&lxzd->lxzd_lock);
                if (lxzd->lxzd_lockd_pid != 0) {
                        /* another mount raced and created a new lockd */
                        mutex_exit(&lxzd->lxzd_lock);
                        if (proc_exit(CLD_EXITED, 0) != 0) {
                                mutex_enter(&p->p_lock);
                                ASSERT(p->p_flag & SEXITLWPS);
                                lwp_exit();
                        }
                        return;
                }
                lxzd->lxzd_lockd_pid = p->p_pid;
                mutex_exit(&lxzd->lxzd_lock);

                /* cause the process to return to userland. */
                lwp_rtt();
        }
}

/*
 * Launch the user-level, native, lx_lockd process.
 */
int
lx_start_nfs_lockd(void)
{
        id_t cid;
        proc_t *p = ttoproc(curthread);
        zone_t *z = p->p_zone;
        lx_zone_data_t *lxzd = ztolxzd(z);

        ASSERT(!INGLOBALZONE(p));
        ASSERT(lxzd != NULL);

        /*
         * This should only be called by the mount emulation, which must have
         * 'root' privileges in order to have performed a mount, but
         * double-check.
         */
        if (crgetuid(CRED()) != 0)
                return (EPERM);

        mutex_enter(&lxzd->lxzd_lock);
        if (lxzd->lxzd_lockd_pid != 0) {
                /* verify lockd is still alive */
                pid_t lockd_pid;

                lockd_pid = lxzd->lxzd_lockd_pid;
                mutex_exit(&lxzd->lxzd_lock);

                if (lx_lockd_alive(lockd_pid))
                        return (EEXIST);

                mutex_enter(&lxzd->lxzd_lock);
                if (lxzd->lxzd_lockd_pid != lockd_pid) {
                        /* another mount raced and created a new lockd */
                        mutex_exit(&lxzd->lxzd_lock);
                        return (EEXIST);
                }

                /* old lockd is dead, launch a new one */
                lxzd->lxzd_lockd_pid = 0;
        }
        mutex_exit(&lxzd->lxzd_lock);

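        /*
         * Pick the scheduling class for the new lockd process: the zone's
         * default class if one is set, otherwise the class of the zone's
         * pool, falling back to the system default class.
         */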
        if (z->zone_defaultcid > 0) {
                cid = z->zone_defaultcid;
        } else {
                pool_lock();
                cid = pool_get_class(z->zone_pool);
                pool_unlock();
        }
        if (cid == -1)
                cid = defaultcid;

        /*
         * There's nothing to do here if creating the proc fails, but we
         * return the result to make it obvious while DTracing.
         */
        return (newproc(lx_run_lockd, NULL, cid, minclsyspri - 1, NULL, -1));
}

void
lx_upcall_statd(int op, struct nlm_globals *g, struct nlm_host *host)
{
        struct nlm_nsm *nsm;
        struct mon args;
        struct mon_id *mip = &args.mon_id;
        int family;
        netobj obj;
        enum clnt_stat stat;

        /*
         * For Linux rpc.statd monitor registration, the Linux NSMPROC_MON
         * and NSMPROC_UNMON RPC upcalls correspond almost directly to the
         * native SM_MON and SM_UNMON RPC upcalls. The key difference from
         * the native registration is that our nlm_host_monitor function
         * makes two RPC calls:
         * - the first RPC (nsmaddrproc1_reg_1) uses our private 'nsm_addr'
         *   RPC protocol to register the lockd RPC information that statd
         *   should call when it detects that the remote server rebooted
         * - the second RPC (sm_mon_1) tells statd the information about the
         *   remote server to be monitored
         * For Linux, there is only a single RPC from the kernel to the local
         * statd. This RPC is equivalent to our sm_mon_1 code, but it uses
         * the Linux-private NLMPROC_NSM_NOTIFY lockd procedure in the
         * 'my_proc' RPC parameter. This corresponds to our private
         * 'nsm_addr' code, and tells statd which lockd RPC to call when it
         * detects a server reboot.
         *
         * Because our sm_mon_1 RPC is so similar to the Linux RPC, we can
         * use it directly and simply set the expected value in the
         * 'my_proc' argument.
         *
         * Within the kernel lockd RPC handling, the nlm_prog_3_dtable
         * dispatch table has an entry for each lockd RPC function, so it
         * also contains an entry for the Linux NLMPROC_NSM_NOTIFY procedure.
         * That procedure number is unused by the native lockd code, so there
         * is no conflict in dispatching it. The implementation of the
         * procedure corresponds to the native, private NLM_SM_NOTIFY1
         * procedure, which is called by the native rpc.statd.
         *
         * The Linux RPC call to "unmonitor" a host expects the same
         * arguments we pass to monitor, so it is also handled here by this
         * same brand hook.
         */
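        /*
         * To summarize the correspondence described above:
         *   Linux NSMPROC_MON upcall         ~ native SM_MON (sm_mon_1)
         *   Linux NSMPROC_UNMON upcall       ~ native SM_UNMON (sm_unmon_1)
         *   my_proc = LX_NLMPROC_NSM_NOTIFY  ~ native 'nsm_addr' registration;
         *       its kernel implementation parallels NLM_SM_NOTIFY1
         */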
        nlm_netbuf_to_netobj(&host->nh_addr, &family, &obj);
        nsm = &g->nlm_nsm;

        bzero(&args, sizeof (args));

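        /*
         * Fill in the mon/mon_id arguments much as the native code would,
         * except that my_proc names the Linux NSM-notify procedure in lockd
         * and, for SM_MON, the host's sysid is copied into the private
         * cookie.
         */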
        mip->mon_name = host->nh_name;
        mip->my_id.my_name = uts_nodename();
        mip->my_id.my_prog = NLM_PROG;
        mip->my_id.my_vers = NLM_SM;
        mip->my_id.my_proc = LX_NLMPROC_NSM_NOTIFY;
        if (op == SM_MON) {
                bcopy(&host->nh_sysid, args.priv, sizeof (uint16_t));
        }

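        /*
         * Refresh the statd client handle and make the monitor or unmonitor
         * RPC, holding ns_lock across the call.
         */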
        mutex_enter(&nsm->ns_lock);
        nlm_nsm_clnt_init(nsm->ns_handle, nsm);
        if (op == SM_MON) {
                struct sm_stat_res mres;

                bzero(&mres, sizeof (mres));
                stat = sm_mon_1(&args, &mres, nsm->ns_handle);
        } else {
                struct sm_stat ures;

                ASSERT(op == SM_UNMON);
                bzero(&ures, sizeof (ures));
                stat = sm_unmon_1(mip, &ures, nsm->ns_handle);
        }
        mutex_exit(&nsm->ns_lock);

        if (stat != RPC_SUCCESS) {
                NLM_WARN("Failed to contact local statd, stat=%d", stat);
                if (op == SM_MON) {
                        mutex_enter(&g->lock);
                        host->nh_flags &= ~NLM_NH_MONITORED;
                        mutex_exit(&g->lock);
                }
        }
}