XXXXX convert NLM's single-count semaphore to a mutex
--- old/usr/src/uts/common/brand/lx/os/lx_lockd.c
+++ new/usr/src/uts/common/brand/lx/os/lx_lockd.c
1 1 /*
2 2 * This file and its contents are supplied under the terms of the
3 3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 4 * You may only use this file in accordance with the terms of version
5 5 * 1.0 of the CDDL.
6 6 *
7 7 * A full copy of the text of the CDDL should have accompanied this
8 8 * source. A copy of the CDDL is also available via the Internet at
9 9 * http://www.illumos.org/license/CDDL.
10 10 */
11 11
12 12 /*
13 13 * Copyright 2018 Joyent, Inc.
14 14 */
15 15
16 16 /*
17 17 * lx_start_nfs_lockd() starts an NFS lockd (lx_lockd) process inside the zone.
18 18 * This uses the same technique our lx cgroupfs uses to launch a release
19 19 * agent process. This is called implicitly when an NFS mount syscall occurs
20 20 * within the zone. See the user-level lx_lockd source for the "big theory"
21 21 * comment behind this.
22 22 *
23 23 * lx_upcall_statd() is a brand hook that interposes on the rpc.statd RPC
24 24 * handling so that we can interface to a Linux rpc.statd that must run
25 25 * when NFSv3 locking is in use. The rpc.statd handles server or client reboots
26 26 * and interacts with the lockd to reclaim locks after the server reboots. The
27 27 * rpc.statd also informs the server when we reboot, so the server can release
28 28 * the locks we held.
29 29 */
30 30
31 31 #include <sys/types.h>
32 32 #include <sys/param.h>
33 33 #include <sys/sysmacros.h>
34 34 #include <sys/errno.h>
35 35 #include <sys/cred.h>
36 36 #include <sys/systm.h>
37 37 #include <sys/policy.h>
38 38 #include <sys/vmparam.h>
39 39 #include <sys/contract_impl.h>
40 40 #include <sys/pool.h>
41 41 #include <sys/stack.h>
42 42 #include <sys/var.h>
43 43 #include <sys/rt.h>
44 44 #include <sys/fx.h>
45 45 #include <sys/brand.h>
46 46 #include <sys/lx_brand.h>
47 47 #include <sys/pathname.h>
48 48 #include <rpcsvc/nlm_prot.h>
49 49 #include <rpcsvc/sm_inter.h>
50 50 #include <klm/nlm_impl.h>
51 51
52 52 #define LX_LOCKD_PATH "/native/usr/lib/brand/lx/lx_lockd"
53 53
54 54 /* Linux lockd RPC called by statd when it detects an NFS server reboot */
55 55 #define LX_NLMPROC_NSM_NOTIFY 16
56 56
57 57 /* From uts/common/klm/nlm_impl.c */
58 58 extern void nlm_netbuf_to_netobj(struct netbuf *, int *, netobj *);
59 59 extern void nlm_nsm_clnt_init(CLIENT *, struct nlm_nsm *);
60 60
61 61 /*
62 62 * Check if the current lockd is still running.
63 63 */
64 64 static boolean_t
65 65 lx_lockd_alive(pid_t lockd_pid)
66 66 {
67 67 boolean_t ret = B_FALSE;
68 68 proc_t *p;
69 69 vnode_t *vp;
70 70 char path[MAXPATHLEN];
71 71
72 72 mutex_enter(&pidlock);
73 73 p = prfind(lockd_pid);
74 74 if (p == NULL) {
75 75 mutex_exit(&pidlock);
76 76 return (B_FALSE);
77 77 }
78 78
79 79 mutex_enter(&p->p_lock);
80 80 if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) {
81 81 mutex_exit(&p->p_lock);
82 82 mutex_exit(&pidlock);
83 83 return (B_FALSE);
84 84 }
85 85 vp = p->p_exec;
86 86 VN_HOLD(vp);
87 87 mutex_exit(&p->p_lock);
88 88 mutex_exit(&pidlock);
89 89
90 90 if (vnodetopath(NULL, vp, path, sizeof (path), CRED()) == 0 &&
91 91 strcmp(path, LX_LOCKD_PATH) == 0) {
92 92 ret = B_TRUE;
93 93 }
94 94
95 95 VN_RELE(vp);
96 96 return (ret);
97 97 }
98 98
99 99 static void
100 100 lx_run_lockd(void *a)
101 101 {
102 102 proc_t *p = curproc;
103 103 zone_t *z = curzone;
104 104 struct core_globals *cg;
105 105 lx_zone_data_t *lxzd = ztolxzd(z);
106 106 int res;
107 107
108 108 ASSERT(!INGLOBALZONE(p));
109 109 VERIFY(lxzd != NULL);
110 110
111 111 /* The following block is derived from start_init_common */
112 112 ASSERT_STACK_ALIGNED();
113 113
114 114 p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0;
115 115 p->p_usrstack = (caddr_t)USRSTACK32;
116 116 p->p_model = DATAMODEL_ILP32;
117 117 p->p_stkprot = PROT_ZFOD & ~PROT_EXEC;
118 118 p->p_datprot = PROT_ZFOD & ~PROT_EXEC;
119 119 p->p_stk_ctl = INT32_MAX;
120 120
121 121 p->p_as = as_alloc();
122 122 p->p_as->a_proc = p;
123 123 p->p_as->a_userlimit = (caddr_t)USERLIMIT32;
124 124 (void) hat_setup(p->p_as->a_hat, HAT_INIT);
125 125
126 126 VERIFY((cg = zone_getspecific(core_zone_key, z)) != NULL);
127 127
128 128 corectl_path_hold(cg->core_default_path);
129 129 corectl_content_hold(cg->core_default_content);
130 130
131 131 p->p_corefile = cg->core_default_path;
132 132 p->p_content = cg->core_default_content;
133 133
134 134 init_mstate(curthread, LMS_SYSTEM);
135 135 res = exec_init(LX_LOCKD_PATH, NULL);
136 136
137 137 /* End of code derived from start_init_common */
138 138
139 139 /* The following is derived from zone_start_init - see comments there */
140 140 if (res != 0 || zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN) {
141 141 if (proc_exit(CLD_EXITED, res) != 0) {
142 142 mutex_enter(&p->p_lock);
143 143 ASSERT(p->p_flag & SEXITLWPS);
144 144 lwp_exit();
145 145 }
146 146 } else {
147 147 id_t cid = curthread->t_cid;
148 148
149 149 mutex_enter(&class_lock);
150 150 ASSERT(cid < loaded_classes);
151 151 if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
152 152 z->zone_fixed_hipri) {
153 153 pcparms_t pcparms;
154 154
155 155 pcparms.pc_cid = cid;
156 156 ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
157 157 ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
158 158 FXMAXUPRI;
159 159 ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
160 160 FX_DOUPRILIM | FX_DOUPRI;
161 161
162 162 mutex_enter(&pidlock);
163 163 mutex_enter(&p->p_lock);
164 164 (void) parmsset(&pcparms, curthread);
165 165 mutex_exit(&p->p_lock);
166 166 mutex_exit(&pidlock);
167 167 } else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
168 168 curthread->t_pri = RTGPPRIO0;
169 169 }
170 170 mutex_exit(&class_lock);
171 171
172 172 /*
173 173 * Set our pid as the lockd pid in the zone data, or exit
174 174 * if another process raced and already did so.
175 175 */
176 176 mutex_enter(&lxzd->lxzd_lock);
177 177 if (lxzd->lxzd_lockd_pid != 0) {
178 178 /* another mount raced and created a new lockd */
179 179 mutex_exit(&lxzd->lxzd_lock);
180 180 if (proc_exit(CLD_EXITED, 0) != 0) {
181 181 mutex_enter(&p->p_lock);
182 182 ASSERT(p->p_flag & SEXITLWPS);
183 183 lwp_exit();
184 184 }
185 185 return;
186 186 }
187 187 lxzd->lxzd_lockd_pid = p->p_pid;
188 188 mutex_exit(&lxzd->lxzd_lock);
189 189
190 190 /* cause the process to return to userland. */
191 191 lwp_rtt();
192 192 }
193 193 }
194 194
195 195 /*
196 196 * Launch the user-level, native, lx_lockd process.
197 197 */
198 198 int
199 199 lx_start_nfs_lockd()
200 200 {
201 201 id_t cid;
202 202 proc_t *p = ttoproc(curthread);
203 203 zone_t *z = p->p_zone;
204 204 lx_zone_data_t *lxzd = ztolxzd(z);
205 205
206 206 ASSERT(!INGLOBALZONE(p));
207 207 ASSERT(lxzd != NULL);
208 208
209 209 /*
210 210 * This should only be called by the mount emulation, which must have
211 211 * 'root' privileges in order to have performed a mount, but
212 212 * double-check.
213 213 */
214 214 if (crgetuid(CRED()) != 0)
215 215 return (EPERM);
216 216
217 217 mutex_enter(&lxzd->lxzd_lock);
218 218 if (lxzd->lxzd_lockd_pid != 0) {
219 219 /* verify lockd is still alive */
220 220 pid_t lockd_pid;
221 221
222 222 lockd_pid = lxzd->lxzd_lockd_pid;
223 223 mutex_exit(&lxzd->lxzd_lock);
224 224
225 225 if (lx_lockd_alive(lockd_pid))
226 226 return (EEXIST);
227 227
228 228 mutex_enter(&lxzd->lxzd_lock);
229 229 if (lxzd->lxzd_lockd_pid != lockd_pid) {
230 230 /* another mount raced and created a new lockd */
231 231 mutex_exit(&lxzd->lxzd_lock);
232 232 return (EEXIST);
233 233 }
234 234
235 235 /* old lockd is dead, launch a new one */
236 236 lxzd->lxzd_lockd_pid = 0;
237 237 }
238 238 mutex_exit(&lxzd->lxzd_lock);
239 239
240 240 if (z->zone_defaultcid > 0) {
241 241 cid = z->zone_defaultcid;
242 242 } else {
243 243 pool_lock();
244 244 cid = pool_get_class(z->zone_pool);
245 245 pool_unlock();
246 246 }
247 247 if (cid == -1)
248 248 cid = defaultcid;
249 249
250 250 /*
251 251 * There's nothing to do here if creating the proc fails, but we
252 252 * return the result to make it obvious while DTracing.
253 253 */
254 254 return (newproc(lx_run_lockd, NULL, cid, minclsyspri - 1, NULL, -1));
255 255 }
256 256
257 257 void
258 258 lx_upcall_statd(int op, struct nlm_globals *g, struct nlm_host *host)
259 259 {
260 260 struct nlm_nsm *nsm;
261 261 struct mon args;
262 262 struct mon_id *mip = &args.mon_id;
263 263 int family;
264 264 netobj obj;
265 265 enum clnt_stat stat;
266 266
267 267 /*
268 268 * For Linux rpc.statd monitor registration, the Linux NSMPROC_MON and
269 269 * NSMPROC_UNMON RPC upcalls correspond almost directly to the native
270 270 * SM_MON and SM_UNMON RPC upcalls. The key differences with the native
271 271 * registration is that in our nlm_host_monitor function we make two
272 272 * RPC calls:
273 273 * - the first RPC (nsmaddrproc1_reg_1) uses our private 'nsm_addr'
274 274 * RPC protocol to register the lockd RPC information that statd
275 275 * should call when it detects that the remote server rebooted
276 276 * - the second RPC (sm_mon_1) tells statd the information about the
277 277 * remote server to be monitored
278 278 * For Linux, there is only a single RPC from the kernel to the local
279 279 * statd. This RPC is equivalent to our sm_mon_1 code, but it uses the
280 280 * Linux-private NLMPROC_NSM_NOTIFY lockd procedure in the 'my_proc'
281 281 * RPC parameter. This corresponds to our private 'nsm_addr' code, and
282 282 * tells statd which lockd RPC to call when it detects a server reboot.
283 283 *
284 284 * Because our sm_mon_1 RPC is so similar to the Linux RPC, we can use
285 285 * that directly and simply set the expected value in the 'my_proc'
286 286 * argument.
287 287 *
288 288 * Within the kernel lockd RPC handling, the nlm_prog_3_dtable dispatch
289 289 * table has an entry for each lockd RPC function. Thus, this table also
290 290 * contains an entry for the Linux NLMPROC_NSM_NOTIFY procedure. That
291 291 * procedure number is unused by the native lockd code, so there is no
292 292 * conflict with dispatching that procedure. The implementation of the
293 293 * procedure corresponds to the native, private NLM_SM_NOTIFY1
294 294 * procedure which is called by the native rpc.statd.
295 295 *
296 296 * The Linux RPC call to "unmonitor" a host expects the same arguments
297 297 * as we pass to monitor, so that is also handled here by this same
298 298 * brand hook.
299 299 */
300 300 nlm_netbuf_to_netobj(&host->nh_addr, &family, &obj);
301 301 nsm = &g->nlm_nsm;
302 302
303 303 bzero(&args, sizeof (args));
304 304
305 305 mip->mon_name = host->nh_name;
306 306 mip->my_id.my_name = uts_nodename();
307 307 mip->my_id.my_prog = NLM_PROG;
308 308 mip->my_id.my_vers = NLM_SM;
309 309 mip->my_id.my_proc = LX_NLMPROC_NSM_NOTIFY;
310 310 if (op == SM_MON) {
311 311 bcopy(&host->nh_sysid, args.priv, sizeof (uint16_t));
312 312 }
313 313
314 - sema_p(&nsm->ns_sem);
314 + mutex_enter(&nsm->ns_lock);
315 315 nlm_nsm_clnt_init(nsm->ns_handle, nsm);
316 316 if (op == SM_MON) {
317 317 struct sm_stat_res mres;
318 318
319 319 bzero(&mres, sizeof (mres));
320 320 stat = sm_mon_1(&args, &mres, nsm->ns_handle);
321 321 } else {
322 322 struct sm_stat ures;
323 323
324 324 ASSERT(op == SM_UNMON);
325 325 bzero(&ures, sizeof (ures));
326 326 stat = sm_unmon_1(mip, &ures, nsm->ns_handle);
327 327 }
328 - sema_v(&nsm->ns_sem);
328 + mutex_exit(&nsm->ns_lock);
329 329
330 330 if (stat != RPC_SUCCESS) {
331 331 NLM_WARN("Failed to contact local statd, stat=%d", stat);
332 332 if (op == SM_MON) {
333 333 mutex_enter(&g->lock);
334 334 host->nh_flags &= ~NLM_NH_MONITORED;
335 335 mutex_exit(&g->lock);
336 336 }
337 337 }
338 338 }
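
Editor's note on the change itself: the only functional delta in this file is at lines 314 and 328, where the sema_p()/sema_v() pair guarding the statd upcall is replaced by mutex_enter()/mutex_exit() on a new ns_lock field, per the webrev title's conversion of NLM's single-count semaphore to a mutex. The struct nlm_nsm definition and its initialization live in uts/common/klm/nlm_impl.[ch], which this section does not show; the sketch below uses a hypothetical stand-in struct and function names to illustrate the corresponding init-side change, assuming the semaphore was initialized with a count of one.

#include <sys/ksynch.h>	/* kmutex_t, mutex_init(), mutex_enter(), mutex_exit() */

struct nsm_sketch {			/* hypothetical stand-in for struct nlm_nsm */
	kmutex_t	ns_lock;	/* was: ksema_t ns_sem */
	/* ... RPC client handle, statd address, etc. ... */
};

static void
nsm_sketch_init(struct nsm_sketch *nsm)
{
	/* was: sema_init(&nsm->ns_sem, 1, NULL, SEMA_DEFAULT, NULL); */
	mutex_init(&nsm->ns_lock, NULL, MUTEX_DEFAULT, NULL);
}

static void
nsm_sketch_upcall(struct nsm_sketch *nsm)
{
	/* was: sema_p(&nsm->ns_sem); ... sema_v(&nsm->ns_sem); */
	mutex_enter(&nsm->ns_lock);
	/* the serialized statd RPC (e.g. sm_mon_1()) would run here */
	mutex_exit(&nsm->ns_lock);
}

Because the semaphore's count was fixed at one and it served purely for mutual exclusion, a kmutex is the more idiomatic illumos primitive: an adaptive mutex records its owner, which enables adaptive spinning and better deadlock diagnosis than a counting semaphore, whose ownerless nature offers neither.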