OS-5591 Double flock(3C) causes undue block
OS-5585 fcntl(F_OFD_GETLK) should return EINVAL on bad parameters
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
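For context, a minimal userland sketch of the two behaviors under review; the temporary path, open flags, and the specific bad parameter are illustrative assumptions, not taken from this webrev:

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/file.h>
	#include <unistd.h>

	int
	main(void)
	{
		struct flock fl = { 0 };
		int fd = open("/tmp/flktest", O_RDWR | O_CREAT, 0644);

		/*
		 * OS-5591: a second flock() on the same fd converts the lock
		 * the fd already holds; it must not block against itself.
		 */
		(void) flock(fd, LOCK_EX);
		(void) flock(fd, LOCK_EX);

		/*
		 * OS-5585: OFD-style locks are currently whole-file only, so a
		 * request with a nonzero range should be rejected with EINVAL
		 * by fcntl() rather than tripping a kernel assertion.
		 */
		fl.l_type = F_WRLCK;
		fl.l_start = 1;		/* not a whole-file request */
		if (fcntl(fd, F_OFD_GETLK, &fl) == -1 && errno == EINVAL)
			(void) printf("bad OFD parameters rejected\n");

		(void) close(fd);
		return (0);
	}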
--- old/usr/src/uts/common/os/flock.c
+++ new/usr/src/uts/common/os/flock.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 28 /* All Rights Reserved */
29 29
30 30 /*
31 31 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
32 32 * Copyright 2015 Joyent, Inc.
33 33 */
34 34
35 35 #include <sys/flock_impl.h>
36 36 #include <sys/vfs.h>
37 37 #include <sys/t_lock.h> /* for <sys/callb.h> */
38 38 #include <sys/callb.h>
39 39 #include <sys/clconf.h>
40 40 #include <sys/cladm.h>
41 41 #include <sys/nbmlock.h>
42 42 #include <sys/cred.h>
43 43 #include <sys/policy.h>
44 44
45 45 /*
46 46 * The following four variables are for statistics purposes and they are
47 47 * not protected by locks. They may not be accurate but will at least be
48 48 * close to the actual value.
49 49 */
50 50
51 51 int flk_lock_allocs;
52 52 int flk_lock_frees;
53 53 int edge_allocs;
54 54 int edge_frees;
55 55 int flk_proc_vertex_allocs;
56 56 int flk_proc_edge_allocs;
57 57 int flk_proc_vertex_frees;
58 58 int flk_proc_edge_frees;
59 59
60 60 static kmutex_t flock_lock;
61 61
62 62 #ifdef DEBUG
63 63 int check_debug = 0;
64 64 #define CHECK_ACTIVE_LOCKS(gp) if (check_debug) \
65 65 check_active_locks(gp);
66 66 #define CHECK_SLEEPING_LOCKS(gp) if (check_debug) \
67 67 check_sleeping_locks(gp);
68 68 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp) \
69 69 if (check_debug) \
70 70 check_owner_locks(gp, pid, sysid, vp);
71 71 #define CHECK_LOCK_TRANSITION(old_state, new_state) \
72 72 { \
73 73 if (check_lock_transition(old_state, new_state)) { \
74 74 cmn_err(CE_PANIC, "Illegal lock transition \
75 75 from %d to %d", old_state, new_state); \
76 76 } \
77 77 }
78 78 #else
79 79
80 80 #define CHECK_ACTIVE_LOCKS(gp)
81 81 #define CHECK_SLEEPING_LOCKS(gp)
82 82 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
83 83 #define CHECK_LOCK_TRANSITION(old_state, new_state)
84 84
85 85 #endif /* DEBUG */
86 86
87 87 struct kmem_cache *flk_edge_cache;
88 88
89 89 graph_t *lock_graph[HASH_SIZE];
90 90 proc_graph_t pgraph;
91 91
92 92 /*
93 93 * Clustering.
94 94 *
95 95 * NLM REGISTRY TYPE IMPLEMENTATION
96 96 *
97 97 * Assumptions:
98 98 * 1. Nodes in a cluster are numbered starting at 1; always non-negative
99 99 * integers; maximum node id is returned by clconf_maximum_nodeid().
100 100 * 2. We use this node id to identify the node an NLM server runs on.
101 101 */
102 102
103 103 /*
104 104 * NLM registry object keeps track of NLM servers via their
105 105 * nlmids (which are the node ids of the node in the cluster they run on)
106 106 * that have requested locks at this LLM with which this registry is
107 107 * associated.
108 108 *
109 109 * Representation of abstraction:
110 110 * rep = record[ states: array[nlm_state],
111 111 * lock: mutex]
112 112 *
113 113 * Representation invariants:
114 114 * 1. index i of rep.states is between 0 and n - 1 where n is number
115 115 * of elements in the array, which happen to be the maximum number
116 116 * of nodes in the cluster configuration + 1.
117 117 * 2. map nlmid to index i of rep.states
118 118 * 0 -> 0
119 119 * 1 -> 1
120 120 * 2 -> 2
121 121 * n-1 -> clconf_maximum_nodeid()+1
122 122 * 3. This 1-1 mapping is quite convenient and it avoids errors resulting
123 123 * from forgetting to subtract 1 from the index.
124 124 * 4. The reason we keep the 0th index is the following. A legitimate
125 125 * cluster configuration includes making a UFS file system NFS
126 126 * exportable. The code is structured so that if you're in a cluster
127 127 * you do one thing; otherwise, you do something else. The problem
128 128 * is what to do if you think you're in a cluster with PXFS loaded,
129 129 * but you're using UFS not PXFS? The upper two bytes of the sysid
130 130 * encode the node id of the node where NLM server runs; these bytes
131 131 * are zero for UFS. Since the nodeid is used to index into the
132 132 * registry, we can record the NLM server state information at index
133 133 * 0 using the same mechanism used for PXFS file locks!
134 134 */
135 135 static flk_nlm_status_t *nlm_reg_status = NULL; /* state array 0..N-1 */
136 136 static kmutex_t nlm_reg_lock; /* lock to protect array */
137 137 static uint_t nlm_status_size; /* size of state array */
138 138
139 139 /*
140 140 * Although we need a global lock dependency graph (and associated data
141 141 * structures), we also need a per-zone notion of whether the lock manager is
142 142 * running, and so whether to allow lock manager requests or not.
143 143 *
144 144 * Thus, on a per-zone basis we maintain a ``global'' variable
145 145 * (flk_lockmgr_status), protected by flock_lock, and set when the lock
146 146 * manager is determined to be changing state (starting or stopping).
147 147 *
148 148 * Each graph/zone pair also has a copy of this variable, which is protected by
149 149 * the graph's mutex.
150 150 *
151 151 * The per-graph copies are used to synchronize lock requests with shutdown
152 152 * requests. The global copy is used to initialize the per-graph field when a
153 153 * new graph is created.
154 154 */
155 155 struct flock_globals {
156 156 flk_lockmgr_status_t flk_lockmgr_status;
157 157 flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
158 158 };
159 159
160 160 zone_key_t flock_zone_key;
161 161
162 162 static void create_flock(lock_descriptor_t *, flock64_t *);
163 163 static lock_descriptor_t *flk_get_lock(void);
164 164 static void flk_free_lock(lock_descriptor_t *lock);
165 165 static void flk_get_first_blocking_lock(lock_descriptor_t *request);
166 166 static int flk_process_request(lock_descriptor_t *);
167 167 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
168 168 static edge_t *flk_get_edge(void);
169 169 static int flk_wait_execute_request(lock_descriptor_t *);
170 170 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
171 171 static void flk_insert_active_lock(lock_descriptor_t *);
172 172 static void flk_delete_active_lock(lock_descriptor_t *, int);
173 173 static void flk_insert_sleeping_lock(lock_descriptor_t *);
174 174 static void flk_graph_uncolor(graph_t *);
175 175 static void flk_wakeup(lock_descriptor_t *, int);
176 176 static void flk_free_edge(edge_t *);
177 177 static void flk_recompute_dependencies(lock_descriptor_t *,
178 178 lock_descriptor_t **, int, int);
179 179 static int flk_find_barriers(lock_descriptor_t *);
180 180 static void flk_update_barriers(lock_descriptor_t *);
181 181 static int flk_color_reachables(lock_descriptor_t *);
182 182 static int flk_canceled(lock_descriptor_t *);
183 183 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
184 184 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
185 185 static void wait_for_lock(lock_descriptor_t *);
186 186 static void unlock_lockmgr_granted(struct flock_globals *);
187 187 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
188 188
189 189 /* Clustering hooks */
190 190 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
191 191 static void cl_flk_wakeup_sleeping_nlm_locks(int);
192 192 static void cl_flk_unlock_nlm_granted(int);
193 193
194 194 #ifdef DEBUG
195 195 static int check_lock_transition(int, int);
196 196 static void check_sleeping_locks(graph_t *);
197 197 static void check_active_locks(graph_t *);
198 198 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
199 199 static void path(lock_descriptor_t *, lock_descriptor_t *);
200 200 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
201 201 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
202 202 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
203 203 #endif
204 204
205 205 /* proc_graph function definitions */
206 206 static int flk_check_deadlock(lock_descriptor_t *);
207 207 static void flk_proc_graph_uncolor(void);
208 208 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
209 209 static proc_edge_t *flk_get_proc_edge(void);
210 210 static void flk_proc_release(proc_vertex_t *);
211 211 static void flk_free_proc_edge(proc_edge_t *);
212 212 static void flk_update_proc_graph(edge_t *, int);
213 213
214 214 /* Non-blocking mandatory locking */
215 215 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
216 216 u_offset_t);
217 217
218 218 static struct flock_globals *
219 219 flk_get_globals(void)
220 220 {
221 221 /*
222 222 * The KLM module had better be loaded if we're attempting to handle
223 223 * lockmgr requests.
224 224 */
225 225 ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
226 226 return (zone_getspecific(flock_zone_key, curproc->p_zone));
227 227 }
228 228
229 229 static flk_lockmgr_status_t
230 230 flk_get_lockmgr_status(void)
231 231 {
232 232 struct flock_globals *fg;
233 233
234 234 ASSERT(MUTEX_HELD(&flock_lock));
235 235
236 236 if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
237 237 /*
238 238 * KLM module not loaded; lock manager definitely not running.
239 239 */
240 240 return (FLK_LOCKMGR_DOWN);
241 241 }
242 242 fg = flk_get_globals();
243 243 return (fg->flk_lockmgr_status);
244 244 }
245 245
246 246 /*
247 247 * This implements Open File Description (not descriptor) style record locking.
248 248 * These locks can also be thought of as pid-less since they are not tied to a
249 249 * specific process, thus they're preserved across fork.
250 250 *
251 251 * Called directly from fcntl.
252 252 *
253 253 * See reclock() for the implementation of the traditional POSIX style record
254 254 * locking scheme (pid-ful). This function is derived from reclock() but
255 255 * simplified and modified to work for OFD style locking.
256 256 *
257 257 * The two primary advantages of OFD style of locking are:
258 258 * 1) It is per-file description, so closing a file descriptor that refers to a
259 259 * different file description for the same file will not drop the lock (i.e.
260 260 * two open's of the same file get different descriptions but a dup or fork
261 261 * will refer to the same description).
262 262 * 2) Locks are preserved across fork(2).
263 263 *
264 264 * Because these locks are per-description a lock ptr lives at the f_filocks
265 265 * member of the file_t and the lock_descriptor includes a file_t pointer
266 266 * to enable unique lock identification and management.
267 267 *
268 268 * Since these locks are pid-less we cannot do deadlock detection with the
269 269 * current process-oriented implementation. This is consistent with OFD locking
270 270 * behavior on other operating systems such as Linux. Since we don't do
271 271 * deadlock detection we never interact with the process graph that is
272 272 * maintained for deadlock detection on the traditional POSIX-style locks.
273 273 *
274 274 * Future Work:
275 275 *
276 276 * The current implementation does not support record locks. That is,
277 277 * currently the single lock must cover the entire file. This is validated in
278 278 * fcntl. To support record locks the f_filock pointer in the file_t needs to
279 279 * be changed to a list of pointers to the locks. That list needs to be
280 280 * managed independently of the lock list on the vnode itself and it needs to
281 281 * be maintained as record locks are created, split, coalesced and deleted.
282 282 *
283 283 * The current implementation does not support remote file systems (e.g.
284 284 * NFS or CIFS). This is handled in fs_frlock(). The design of how OFD locks
285 285 * interact with the NLM is not clear since the NLM protocol/implementation
286 286 * appears to be oriented around locks associated with a process. A further
287 287 * problem is that a design is needed for what nlm_send_siglost() should do and
288 288 * where it will send SIGLOST. More recent versions of Linux apparently try to
289 289 * emulate OFD locks on NFS by converting them to traditional POSIX style locks
290 290 * that work with the NLM. It is not clear that this provides the correct
291 291 * semantics in all cases.
292 292 */
293 293 int
294 294 ofdlock(file_t *fp, int fcmd, flock64_t *lckdat, int flag, u_offset_t offset)
295 295 {
296 296 int cmd = 0;
297 297 vnode_t *vp;
298 298 lock_descriptor_t stack_lock_request;
299 299 lock_descriptor_t *lock_request;
300 300 int error = 0;
301 301 graph_t *gp;
302 302 int serialize = 0;
303 303
304 304 if (fcmd != F_OFD_GETLK)
305 305 cmd = SETFLCK;
306 306
307 307 if (fcmd == F_OFD_SETLKW || fcmd == F_FLOCKW)
308 308 cmd |= SLPFLCK;
309 309
310 310 /* see block comment */
311 311 VERIFY(lckdat->l_whence == 0);
312 312 VERIFY(lckdat->l_start == 0);
313 313 VERIFY(lckdat->l_len == 0);
314 314
315 315 vp = fp->f_vnode;
316 316
317 317 /*
318 318 * For reclock fs_frlock() would normally have set these in a few
319 319 * places but for us it's cleaner to centralize it here. Note that
320 320 * IGN_PID is -1. We use 0 for our pid-less locks.
321 321 */
322 322 lckdat->l_pid = 0;
323 323 lckdat->l_sysid = 0;
324 324
325 325 /*
326 326 * Check access permissions
327 327 */
328 328 if ((fcmd == F_OFD_SETLK || fcmd == F_OFD_SETLKW) &&
329 329 ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
330 330 (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
331 331 return (EBADF);
332 332
333 333 /*
334 334 * for query and unlock we use the stack_lock_request
335 335 */
336 336 if (lckdat->l_type == F_UNLCK || !(cmd & SETFLCK)) {
337 337 lock_request = &stack_lock_request;
338 338 (void) bzero((caddr_t)lock_request,
339 339 sizeof (lock_descriptor_t));
340 340
341 341 /*
342 342 * following is added to make the assertions in
343 343 * flk_execute_request() pass
344 344 */
345 345 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
346 346 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
347 347 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
348 348 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
349 349 lock_request->l_status = FLK_INITIAL_STATE;
350 350 } else {
351 351 lock_request = flk_get_lock();
352 352 fp->f_filock = (struct filock *)lock_request;
353 353 }
354 354 lock_request->l_state = 0;
355 355 lock_request->l_vnode = vp;
356 356 lock_request->l_zoneid = getzoneid();
357 357 lock_request->l_ofd = fp;
358 358
359 359 /*
360 360 * Convert the request range into the canonical start and end
361 361 * values then check the validity of the lock range.
362 362 */
363 363 error = flk_convert_lock_data(vp, lckdat, &lock_request->l_start,
364 364 &lock_request->l_end, offset);
365 365 if (error)
366 366 goto done;
367 367
368 368 error = flk_check_lock_data(lock_request->l_start, lock_request->l_end,
369 369 MAXEND);
370 370 if (error)
371 371 goto done;
372 372
373 373 ASSERT(lock_request->l_end >= lock_request->l_start);
374 374
375 375 lock_request->l_type = lckdat->l_type;
376 376 if (cmd & SLPFLCK)
377 377 lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
378 378
379 379 if (!(cmd & SETFLCK)) {
380 380 if (lock_request->l_type == F_RDLCK ||
381 381 lock_request->l_type == F_WRLCK)
382 382 lock_request->l_state |= QUERY_LOCK;
383 383 }
384 384 lock_request->l_flock = (*lckdat);
385 385
386 386 /*
387 387 * We are ready for processing the request
388 388 */
389 389
390 390 if (fcmd != F_OFD_GETLK && lock_request->l_type != F_UNLCK &&
391 391 nbl_need_check(vp)) {
392 392 nbl_start_crit(vp, RW_WRITER);
393 393 serialize = 1;
394 394 }
395 395
396 396 /* Get the lock graph for a particular vnode */
397 397 gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
398 398
399 399 mutex_enter(&gp->gp_mutex);
400 400
401 401 lock_request->l_state |= REFERENCED_LOCK;
402 402 lock_request->l_graph = gp;
403 403
404 404 switch (lock_request->l_type) {
405 405 case F_RDLCK:
406 406 case F_WRLCK:
407 407 if (IS_QUERY_LOCK(lock_request)) {
408 408 flk_get_first_blocking_lock(lock_request);
409 409 if (lock_request->l_ofd != NULL)
410 410 lock_request->l_flock.l_pid = -1;
411 411 (*lckdat) = lock_request->l_flock;
412 412 } else {
413 413 /* process the request now */
414 414 error = flk_process_request(lock_request);
415 415 }
416 416 break;
417 417
418 418 case F_UNLCK:
419 419 /* unlock request will not block so execute it immediately */
420 420 error = flk_execute_request(lock_request);
421 421 break;
422 422
423 423 default:
424 424 error = EINVAL;
425 425 break;
426 426 }
427 427
428 428 if (lock_request == &stack_lock_request) {
429 429 flk_set_state(lock_request, FLK_DEAD_STATE);
430 430 } else {
431 431 lock_request->l_state &= ~REFERENCED_LOCK;
432 432 if ((error != 0) || IS_DELETED(lock_request)) {
433 433 flk_set_state(lock_request, FLK_DEAD_STATE);
434 434 flk_free_lock(lock_request);
435 435 }
436 436 }
437 437
438 438 mutex_exit(&gp->gp_mutex);
439 439 if (serialize)
440 440 nbl_end_crit(vp);
441 441
442 442 return (error);
443 443
444 444 done:
445 445 flk_set_state(lock_request, FLK_DEAD_STATE);
446 446 if (lock_request != &stack_lock_request)
447 447 flk_free_lock(lock_request);
448 448 return (error);
449 449 }
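A minimal userland sketch (not part of flock.c) of the per-description semantics described in the block comment above ofdlock(); the path and the printed diagnostics are illustrative assumptions:

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		struct flock fl = { 0 };
		int fd1 = open("/tmp/ofdtest", O_RDWR | O_CREAT, 0644);
		int fd2, fd3;

		fl.l_type = F_WRLCK;	/* whole file: l_whence/l_start/l_len stay 0 */
		(void) fcntl(fd1, F_OFD_SETLK, &fl);

		fd2 = dup(fd1);		/* same file description as fd1 */
		(void) close(fd1);	/* lock survives; fd2 still refers to it */

		fd3 = open("/tmp/ofdtest", O_RDWR);	/* a distinct description */
		fl.l_type = F_WRLCK;
		if (fcntl(fd3, F_OFD_SETLK, &fl) == -1)
			(void) printf("conflict as expected: errno %d\n", errno);

		(void) close(fd3);
		(void) close(fd2);	/* last close of the description drops the lock */
		return (0);
	}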
450 450
451 451 /*
452 452 * Remove any lock on the vnode belonging to the given file_t.
453 453 * Called from closef on last close, file_t is locked.
454 454 *
455 455 * This is modeled on the cleanlocks() function but only removes the single
456 456 * lock associated with fp.
457 457 */
458 458 void
459 459 ofdcleanlock(file_t *fp)
460 460 {
461 461 lock_descriptor_t *fplock, *lock, *nlock;
462 462 vnode_t *vp;
463 463 graph_t *gp;
464 464
465 465 ASSERT(MUTEX_HELD(&fp->f_tlock));
466 466
467 467 if ((fplock = (lock_descriptor_t *)fp->f_filock) == NULL)
468 468 return;
469 469
470 470 fp->f_filock = NULL;
471 471 vp = fp->f_vnode;
472 472
473 473 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
474 474
475 475 if (gp == NULL)
476 476 return;
477 477 mutex_enter(&gp->gp_mutex);
478 478
479 479 CHECK_SLEEPING_LOCKS(gp);
480 480 CHECK_ACTIVE_LOCKS(gp);
481 481
482 482 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
483 483
484 484 if (lock) {
485 485 do {
486 486 nlock = lock->l_next;
487 487 if (fplock == lock) {
488 488 CANCEL_WAKEUP(lock);
489 489 break;
490 490 }
491 491 lock = nlock;
492 492 } while (lock->l_vnode == vp);
493 493 }
494 494
495 495 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
496 496
497 497 if (lock) {
498 498 do {
499 499 nlock = lock->l_next;
500 500 if (fplock == lock) {
501 501 flk_delete_active_lock(lock, 0);
502 502 flk_wakeup(lock, 1);
503 503 flk_free_lock(lock);
504 504 break;
505 505 }
506 506 lock = nlock;
507 507 } while (lock->l_vnode == vp);
508 508 }
509 509
510 510 CHECK_SLEEPING_LOCKS(gp);
511 511 CHECK_ACTIVE_LOCKS(gp);
512 512 mutex_exit(&gp->gp_mutex);
513 513 }
514 514
515 515 /*
516 516 * Routine called from fs_frlock in fs/fs_subr.c
517 517 *
518 518 * This implements traditional POSIX style record locking. The two primary
519 519 * drawbacks to this style of locking are:
520 520 * 1) It is per-process, so any close of a file descriptor that refers to the
521 521 * file will drop the lock (e.g. lock /etc/passwd, call a library function
522 522 * which opens /etc/passwd to read the file, when the library closes its
523 523 * file descriptor the application loses its lock and does not know).
524 524 * 2) Locks are not preserved across fork(2).
525 525 *
526 - * Because these locks are only assoiciated with a pid they are per-process.
527 - * This is why any close will drop the lock and is also why once the process
528 - * forks then the lock is no longer related to the new process. These locks can
529 - * be considered as pid-ful.
526 + * Because these locks are only associated with a PID, they are per-process.
527 + * This is why any close will drop the lock and is also why, once the process
528 + * forks, the lock is no longer related to the new process. These locks can
529 + * be considered as PID-ful.
530 530 *
531 531 * See ofdlock() for the implementation of a similar but improved locking
532 532 * scheme.
533 533 */
534 534 int
535 535 reclock(vnode_t *vp,
536 536 flock64_t *lckdat,
537 537 int cmd,
538 538 int flag,
539 539 u_offset_t offset,
540 540 flk_callback_t *flk_cbp)
541 541 {
542 542 lock_descriptor_t stack_lock_request;
543 543 lock_descriptor_t *lock_request;
544 544 int error = 0;
545 545 graph_t *gp;
546 546 int nlmid;
547 547
548 548 /*
549 549 * Check access permissions
550 550 */
551 551 if ((cmd & SETFLCK) &&
552 552 ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
553 553 (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
554 554 return (EBADF);
555 555
556 556 /*
557 557 * for query and unlock we use the stack_lock_request
558 558 */
559 559
560 560 if ((lckdat->l_type == F_UNLCK) ||
561 561 !((cmd & INOFLCK) || (cmd & SETFLCK))) {
562 562 lock_request = &stack_lock_request;
563 563 (void) bzero((caddr_t)lock_request,
564 564 sizeof (lock_descriptor_t));
565 565
566 566 /*
567 567 * following is added to make the assertions in
568 568 * flk_execute_request() pass
569 569 */
570 570
571 571 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
572 572 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
573 573 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
574 574 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
575 575 lock_request->l_status = FLK_INITIAL_STATE;
576 576 } else {
577 577 lock_request = flk_get_lock();
578 578 }
579 579 lock_request->l_state = 0;
580 580 lock_request->l_vnode = vp;
581 581 lock_request->l_zoneid = getzoneid();
582 582
583 583 /*
584 584 * Convert the request range into the canonical start and end
585 585 * values. The NLM protocol supports locking over the entire
586 586 * 32-bit range, so there's no range checking for remote requests,
587 587 * but we still need to verify that local requests obey the rules.
588 588 */
589 589 /* Clustering */
590 590 if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
591 591 ASSERT(lckdat->l_whence == 0);
592 592 lock_request->l_start = lckdat->l_start;
593 593 lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
594 594 lckdat->l_start + (lckdat->l_len - 1);
595 595 } else {
596 596 /* check the validity of the lock range */
597 597 error = flk_convert_lock_data(vp, lckdat,
598 598 &lock_request->l_start, &lock_request->l_end,
599 599 offset);
600 600 if (error) {
601 601 goto done;
602 602 }
603 603 error = flk_check_lock_data(lock_request->l_start,
604 604 lock_request->l_end, MAXEND);
605 605 if (error) {
606 606 goto done;
607 607 }
608 608 }
609 609
610 610 ASSERT(lock_request->l_end >= lock_request->l_start);
611 611
612 612 lock_request->l_type = lckdat->l_type;
613 613 if (cmd & INOFLCK)
614 614 lock_request->l_state |= IO_LOCK;
615 615 if (cmd & SLPFLCK)
616 616 lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
617 617 if (cmd & RCMDLCK)
618 618 lock_request->l_state |= LOCKMGR_LOCK;
619 619 if (cmd & NBMLCK)
620 620 lock_request->l_state |= NBMAND_LOCK;
621 621 /*
622 622 * Clustering: set flag for PXFS locks
623 623 * We do not _only_ check for the PCMDLCK flag because PXFS locks could
624 624 * also be of type 'RCMDLCK'.
625 625 * We do not _only_ check the GETPXFSID() macro because local PXFS
626 626 * clients use a pxfsid of zero to permit deadlock detection in the LLM.
627 627 */
628 628
629 629 if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
630 630 lock_request->l_state |= PXFS_LOCK;
631 631 }
632 632 if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
633 633 if (lock_request->l_type == F_RDLCK ||
634 634 lock_request->l_type == F_WRLCK)
635 635 lock_request->l_state |= QUERY_LOCK;
636 636 }
637 637 lock_request->l_flock = (*lckdat);
638 638 lock_request->l_callbacks = flk_cbp;
639 639
640 640 /*
641 641 * We are ready for processing the request
642 642 */
643 643 if (IS_LOCKMGR(lock_request)) {
644 644 /*
645 645 * If the lock request is an NLM server request ....
646 646 */
647 647 if (nlm_status_size == 0) { /* not booted as cluster */
648 648 mutex_enter(&flock_lock);
649 649 /*
650 650 * Bail out if this is a lock manager request and the
651 651 * lock manager is not supposed to be running.
652 652 */
653 653 if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
654 654 mutex_exit(&flock_lock);
655 655 error = ENOLCK;
656 656 goto done;
657 657 }
658 658 mutex_exit(&flock_lock);
659 659 } else { /* booted as a cluster */
660 660 nlmid = GETNLMID(lock_request->l_flock.l_sysid);
661 661 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
662 662
663 663 mutex_enter(&nlm_reg_lock);
664 664 /*
665 665 * If the NLM registry does not know about this
666 666 * NLM server making the request, add its nlmid
667 667 * to the registry.
668 668 */
669 669 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
670 670 nlmid)) {
671 671 FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
672 672 } else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
673 673 nlmid)) {
674 674 /*
675 675 * If the NLM server is already known (has made
676 676 * previous lock requests) and its state is
677 677 * not NLM_UP (means that NLM server is
678 678 * shutting down), then bail out with an
679 679 * error to deny the lock request.
680 680 */
681 681 mutex_exit(&nlm_reg_lock);
682 682 error = ENOLCK;
683 683 goto done;
684 684 }
685 685 mutex_exit(&nlm_reg_lock);
686 686 }
687 687 }
688 688
689 689 /* Now get the lock graph for a particular vnode */
690 690 gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
691 691
692 692 /*
693 693 * We drop rwlock here otherwise this might end up causing a
694 694 * deadlock if this IOLOCK sleeps. (bugid # 1183392).
695 695 */
696 696
697 697 if (IS_IO_LOCK(lock_request)) {
698 698 VOP_RWUNLOCK(vp,
699 699 (lock_request->l_type == F_RDLCK) ?
700 700 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
701 701 }
702 702 mutex_enter(&gp->gp_mutex);
703 703
704 704 lock_request->l_state |= REFERENCED_LOCK;
705 705 lock_request->l_graph = gp;
706 706
707 707 switch (lock_request->l_type) {
708 708 case F_RDLCK:
709 709 case F_WRLCK:
710 710 if (IS_QUERY_LOCK(lock_request)) {
711 711 flk_get_first_blocking_lock(lock_request);
712 712 if (lock_request->l_ofd != NULL)
713 713 lock_request->l_flock.l_pid = -1;
714 714 (*lckdat) = lock_request->l_flock;
715 715 break;
716 716 }
717 717
718 718 /* process the request now */
719 719
720 720 error = flk_process_request(lock_request);
721 721 break;
722 722
723 723 case F_UNLCK:
724 724 /* unlock request will not block so execute it immediately */
725 725
726 726 if (IS_LOCKMGR(lock_request) &&
727 727 flk_canceled(lock_request)) {
728 728 error = 0;
729 729 } else {
730 730 error = flk_execute_request(lock_request);
731 731 }
732 732 break;
733 733
734 734 case F_UNLKSYS:
735 735 /*
736 736 * Recovery mechanism to release lock manager locks when an
737 737 * NFS client crashes and restarts. The NFS server will clear
738 738 * old locks and grant new locks.
739 739 */
740 740
741 741 if (lock_request->l_flock.l_sysid == 0) {
742 742 mutex_exit(&gp->gp_mutex);
743 743 return (EINVAL);
744 744 }
745 745 if (secpolicy_nfs(CRED()) != 0) {
746 746 mutex_exit(&gp->gp_mutex);
747 747 return (EPERM);
748 748 }
749 749 flk_delete_locks_by_sysid(lock_request);
750 750 lock_request->l_state &= ~REFERENCED_LOCK;
751 751 flk_set_state(lock_request, FLK_DEAD_STATE);
752 752 flk_free_lock(lock_request);
753 753 mutex_exit(&gp->gp_mutex);
754 754 return (0);
755 755
756 756 default:
757 757 error = EINVAL;
758 758 break;
759 759 }
760 760
761 761 /* Clustering: For blocked PXFS locks, return */
762 762 if (error == PXFS_LOCK_BLOCKED) {
763 763 lock_request->l_state &= ~REFERENCED_LOCK;
764 764 mutex_exit(&gp->gp_mutex);
765 765 return (error);
766 766 }
767 767
768 768 /*
769 769 * Now that we have seen the status of locks in the system for
770 770 * this vnode we acquire the rwlock if it is an IO_LOCK.
771 771 */
772 772
773 773 if (IS_IO_LOCK(lock_request)) {
774 774 (void) VOP_RWLOCK(vp,
775 775 (lock_request->l_type == F_RDLCK) ?
776 776 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
777 777 if (!error) {
778 778 lckdat->l_type = F_UNLCK;
779 779
780 780 /*
781 781 * This wake up is needed otherwise
782 782 * if IO_LOCK has slept the dependents on this
783 783 * will not be woken up at all. (bugid # 1185482).
784 784 */
785 785
786 786 flk_wakeup(lock_request, 1);
787 787 flk_set_state(lock_request, FLK_DEAD_STATE);
788 788 flk_free_lock(lock_request);
789 789 }
790 790 /*
791 791 * else if error had occurred either flk_process_request()
792 792 * has returned EDEADLK in which case there will be no
793 793 * dependents for this lock or EINTR from flk_wait_execute_
794 794 * request() in which case flk_cancel_sleeping_lock()
795 795 * would have been done. same is true with EBADF.
796 796 */
797 797 }
798 798
799 799 if (lock_request == &stack_lock_request) {
800 800 flk_set_state(lock_request, FLK_DEAD_STATE);
801 801 } else {
802 802 lock_request->l_state &= ~REFERENCED_LOCK;
803 803 if ((error != 0) || IS_DELETED(lock_request)) {
804 804 flk_set_state(lock_request, FLK_DEAD_STATE);
805 805 flk_free_lock(lock_request);
806 806 }
807 807 }
808 808
809 809 mutex_exit(&gp->gp_mutex);
810 810 return (error);
811 811
812 812 done:
813 813 flk_set_state(lock_request, FLK_DEAD_STATE);
814 814 if (lock_request != &stack_lock_request)
815 815 flk_free_lock(lock_request);
816 816 return (error);
817 817 }
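A minimal userland sketch (not part of flock.c) of drawback (1) described in the comment above reclock(): because POSIX locks are per-process, any close of any descriptor for the file drops the lock. The /etc/passwd path follows the comment's own example; the rest is an illustrative assumption:

	#include <fcntl.h>
	#include <unistd.h>

	int
	main(void)
	{
		struct flock fl = { 0 };	/* whole file: SEEK_SET, start 0, len 0 */
		int fd = open("/etc/passwd", O_RDONLY);
		int tmp;

		fl.l_type = F_RDLCK;
		(void) fcntl(fd, F_SETLK, &fl);	/* the process now holds the lock */

		/* What a library routine might do internally: */
		tmp = open("/etc/passwd", O_RDONLY);
		(void) close(tmp);	/* silently releases the lock taken above */

		(void) close(fd);
		return (0);
	}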
818 818
819 819 /*
820 820 * Invoke the callbacks in the given list. If before sleeping, invoke in
821 821 * list order. If after sleeping, invoke in reverse order.
822 822 *
823 823 * CPR (suspend/resume) support: if one of the callbacks returns a
824 824 * callb_cpr_t, return it. This will be used to make the thread CPR-safe
825 825 * while it is sleeping. There should be at most one callb_cpr_t for the
826 826 * thread.
827 827 * XXX This is unnecessarily complicated. The CPR information should just
828 828 * get passed in directly through VOP_FRLOCK and reclock, rather than
829 829 * sneaking it in via a callback.
830 830 */
831 831
832 832 callb_cpr_t *
833 833 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
834 834 {
835 835 callb_cpr_t *cpr_callbackp = NULL;
836 836 callb_cpr_t *one_result;
837 837 flk_callback_t *cb;
838 838
839 839 if (cblist == NULL)
840 840 return (NULL);
841 841
842 842 if (when == FLK_BEFORE_SLEEP) {
843 843 cb = cblist;
844 844 do {
845 845 one_result = (*cb->cb_callback)(when, cb->cb_data);
846 846 if (one_result != NULL) {
847 847 ASSERT(cpr_callbackp == NULL);
848 848 cpr_callbackp = one_result;
849 849 }
850 850 cb = cb->cb_next;
851 851 } while (cb != cblist);
852 852 } else {
853 853 cb = cblist->cb_prev;
854 854 do {
855 855 one_result = (*cb->cb_callback)(when, cb->cb_data);
856 856 if (one_result != NULL) {
857 857 cpr_callbackp = one_result;
858 858 }
859 859 cb = cb->cb_prev;
860 860 } while (cb != cblist->cb_prev);
861 861 }
862 862
863 863 return (cpr_callbackp);
864 864 }
865 865
866 866 /*
867 867 * Initialize a flk_callback_t to hold the given callback.
868 868 */
869 869
870 870 void
871 871 flk_init_callback(flk_callback_t *flk_cb,
872 872 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
873 873 {
874 874 flk_cb->cb_next = flk_cb;
875 875 flk_cb->cb_prev = flk_cb;
876 876 flk_cb->cb_callback = cb_fcn;
877 877 flk_cb->cb_data = cbdata;
878 878 }
879 879
880 880 /*
881 881 * Initialize an flk_callback_t and then link it into the head of an
882 882 * existing list (which may be NULL).
883 883 */
884 884
885 885 void
886 886 flk_add_callback(flk_callback_t *newcb,
887 887 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
888 888 void *cbdata, flk_callback_t *cblist)
889 889 {
890 890 flk_init_callback(newcb, cb_fcn, cbdata);
891 891
892 892 if (cblist == NULL)
893 893 return;
894 894
895 895 newcb->cb_prev = cblist->cb_prev;
896 896 newcb->cb_next = cblist;
897 897 cblist->cb_prev->cb_next = newcb;
898 898 cblist->cb_prev = newcb;
899 899 }
900 900
901 901 /*
902 902 * Initialize the flk_edge_cache data structure and create the
903 903 * nlm_reg_status array.
904 904 */
905 905
906 906 void
907 907 flk_init(void)
908 908 {
909 909 uint_t i;
910 910
911 911 flk_edge_cache = kmem_cache_create("flk_edges",
912 912 sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
913 913 if (flk_edge_cache == NULL) {
914 914 cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
915 915 }
916 916 /*
917 917 * Create the NLM registry object.
918 918 */
919 919
920 920 if (cluster_bootflags & CLUSTER_BOOTED) {
921 921 /*
922 922 * This routine tells you the maximum node id that will be used
923 923 * in the cluster. This number will be the size of the nlm
924 924 * registry status array. We add 1 because we will be using
925 925 * all entries indexed from 0 to maxnodeid; e.g., from 0
926 926 * to 64, for a total of 65 entries.
927 927 */
928 928 nlm_status_size = clconf_maximum_nodeid() + 1;
929 929 } else {
930 930 nlm_status_size = 0;
931 931 }
932 932
933 933 if (nlm_status_size != 0) { /* booted as a cluster */
934 934 nlm_reg_status = (flk_nlm_status_t *)
935 935 kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
936 936 KM_SLEEP);
937 937
938 938 /* initialize all NLM states in array to NLM_UNKNOWN */
939 939 for (i = 0; i < nlm_status_size; i++) {
940 940 nlm_reg_status[i] = FLK_NLM_UNKNOWN;
941 941 }
942 942 }
943 943 }
944 944
945 945 /*
946 946 * Zone constructor/destructor callbacks to be executed when a zone is
947 947 * created/destroyed.
948 948 */
949 949 /* ARGSUSED */
950 950 void *
951 951 flk_zone_init(zoneid_t zoneid)
952 952 {
953 953 struct flock_globals *fg;
954 954 uint_t i;
955 955
956 956 fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
957 957 fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
958 958 for (i = 0; i < HASH_SIZE; i++)
959 959 fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
960 960 return (fg);
961 961 }
962 962
963 963 /* ARGSUSED */
964 964 void
965 965 flk_zone_fini(zoneid_t zoneid, void *data)
966 966 {
967 967 struct flock_globals *fg = data;
968 968
969 969 kmem_free(fg, sizeof (*fg));
970 970 }
971 971
972 972 /*
973 973 * Get a lock_descriptor structure with initialization of edge lists.
974 974 */
975 975
976 976 static lock_descriptor_t *
977 977 flk_get_lock(void)
978 978 {
979 979 lock_descriptor_t *l;
980 980
981 981 l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
982 982
983 983 cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
984 984 l->l_edge.edge_in_next = &l->l_edge;
985 985 l->l_edge.edge_in_prev = &l->l_edge;
986 986 l->l_edge.edge_adj_next = &l->l_edge;
987 987 l->l_edge.edge_adj_prev = &l->l_edge;
988 988 l->pvertex = -1;
989 989 l->l_status = FLK_INITIAL_STATE;
990 990 flk_lock_allocs++;
991 991 return (l);
992 992 }
993 993
994 994 /*
995 995 * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag
996 996 * when some thread has a reference to it as in reclock().
997 997 */
998 998
999 999 void
1000 1000 flk_free_lock(lock_descriptor_t *lock)
1001 1001 {
1002 1002 file_t *fp;
1003 1003
1004 1004 ASSERT(IS_DEAD(lock));
1005 1005
1006 - if ((fp = lock->l_ofd) != NULL)
1006 + if ((fp = lock->l_ofd) != NULL && fp->f_filock == (struct filock *)lock)
1007 1007 fp->f_filock = NULL;
1008 1008
1009 1009 if (IS_REFERENCED(lock)) {
1010 1010 lock->l_state |= DELETED_LOCK;
1011 1011 return;
1012 1012 }
1013 1013 flk_lock_frees++;
1014 1014 kmem_free((void *)lock, sizeof (lock_descriptor_t));
1015 1015 }
1016 1016
1017 1017 void
1018 1018 flk_set_state(lock_descriptor_t *lock, int new_state)
1019 1019 {
1020 1020 /*
1021 1021 * Locks in the sleeping list may be woken up in a number of ways,
1022 1022 * and more than once. If a sleeping lock is signaled awake more
1023 1023 * than once, then it may or may not change state depending on its
1024 1024 * current state.
1025 1025 * Also note that NLM locks that are sleeping could be moved to an
1026 1026 * interrupted state more than once if the unlock request is
1027 1027 * retransmitted by the NLM client - the second time around, this is
1028 1028 * just a nop.
1029 1029 * The ordering of being signaled awake is:
1030 1030 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
1031 1031 * The checks below implement this ordering.
1032 1032 */
1033 1033 if (IS_INTERRUPTED(lock)) {
1034 1034 if ((new_state == FLK_CANCELLED_STATE) ||
1035 1035 (new_state == FLK_GRANTED_STATE) ||
1036 1036 (new_state == FLK_INTERRUPTED_STATE)) {
1037 1037 return;
1038 1038 }
1039 1039 }
1040 1040 if (IS_CANCELLED(lock)) {
1041 1041 if ((new_state == FLK_GRANTED_STATE) ||
1042 1042 (new_state == FLK_CANCELLED_STATE)) {
1043 1043 return;
1044 1044 }
1045 1045 }
1046 1046 CHECK_LOCK_TRANSITION(lock->l_status, new_state);
1047 1047 if (IS_PXFS(lock)) {
1048 1048 cl_flk_state_transition_notify(lock, lock->l_status, new_state);
1049 1049 }
1050 1050 lock->l_status = new_state;
1051 1051 }
1052 1052
1053 1053 /*
1054 1054 * Routine that checks whether there are any blocking locks in the system.
1055 1055 *
1056 1056 * The policy followed is if a write lock is sleeping we don't allow read
1057 1057 * locks before this write lock even though there may not be any active
1058 1058 * locks corresponding to the read locks' region.
1059 1059 *
1060 1060 * flk_add_edge() function adds an edge between l1 and l2 iff there
1061 1061 * is no path between l1 and l2. This is done to have a "minimum
1062 1062 * storage representation" of the dependency graph.
1063 1063 *
1064 1064 * Another property of the graph is since only the new request throws
1065 1065 * edges to the existing locks in the graph, the graph is always topologically
1066 1066 * ordered.
1067 1067 */
1068 1068
1069 1069 static int
1070 1070 flk_process_request(lock_descriptor_t *request)
1071 1071 {
1072 1072 graph_t *gp = request->l_graph;
1073 1073 lock_descriptor_t *lock;
1074 1074 int request_blocked_by_active = 0;
1075 1075 int request_blocked_by_granted = 0;
1076 1076 int request_blocked_by_sleeping = 0;
1077 1077 vnode_t *vp = request->l_vnode;
1078 1078 int error = 0;
1079 1079 int request_will_wait = 0;
1080 1080 int found_covering_lock = 0;
1081 1081 lock_descriptor_t *covered_by = NULL;
1082 1082
1083 1083 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1084 1084 request_will_wait = IS_WILLING_TO_SLEEP(request);
1085 1085
1086 1086 /*
1087 1087 * check active locks
1088 1088 */
1089 1089
1090 1090 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1091 1091
1092 1092
1093 1093 if (lock) {
1094 1094 do {
1095 1095 if (BLOCKS(lock, request)) {
1096 1096 if (!request_will_wait)
1097 1097 return (EAGAIN);
1098 1098 request_blocked_by_active = 1;
1099 1099 break;
1100 1100 }
1101 1101 /*
1102 1102 * Grant lock if it is for the same owner holding active
1103 1103 * lock that covers the request.
1104 1104 */
1105 1105
1106 1106 if (SAME_OWNER(lock, request) &&
1107 1107 COVERS(lock, request) &&
1108 1108 (request->l_type == F_RDLCK))
1109 1109 return (flk_execute_request(request));
1110 1110 lock = lock->l_next;
1111 1111 } while (lock->l_vnode == vp);
1112 1112 }
1113 1113
1114 1114 if (!request_blocked_by_active) {
1115 1115 lock_descriptor_t *lk[1];
1116 1116 lock_descriptor_t *first_glock = NULL;
1117 1117 /*
1118 1118 * Shall we grant this?! NO!!
1119 1119 * What about those locks that were just granted and still
1120 1120 * in sleep queue. Those threads are woken up and so locks
1121 1121 * are almost active.
1122 1122 */
1123 1123 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1124 1124 if (lock) {
1125 1125 do {
1126 1126 if (BLOCKS(lock, request)) {
1127 1127 if (IS_GRANTED(lock)) {
1128 1128 request_blocked_by_granted = 1;
1129 1129 } else {
1130 1130 request_blocked_by_sleeping = 1;
1131 1131 }
1132 1132 }
1133 1133
1134 1134 lock = lock->l_next;
1135 1135 } while ((lock->l_vnode == vp));
1136 1136 first_glock = lock->l_prev;
1137 1137 ASSERT(first_glock->l_vnode == vp);
1138 1138 }
1139 1139
1140 1140 if (request_blocked_by_granted)
1141 1141 goto block;
1142 1142
1143 1143 if (!request_blocked_by_sleeping) {
1144 1144 /*
1145 1145 * If the request isn't going to be blocked by a
1146 1146 * sleeping request, we know that it isn't going to
1147 1147 * be blocked; we can just execute the request --
1148 1148 * without performing costly deadlock detection.
1149 1149 */
1150 1150 ASSERT(!request_blocked_by_active);
1151 1151 return (flk_execute_request(request));
1152 1152 } else if (request->l_type == F_RDLCK) {
1153 1153 /*
1154 1154 * If we have a sleeping writer in the requested
1155 1155 * lock's range, block.
1156 1156 */
1157 1157 goto block;
1158 1158 }
1159 1159
1160 1160 lk[0] = request;
1161 1161 request->l_state |= RECOMPUTE_LOCK;
1162 1162 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1163 1163 if (lock) {
1164 1164 do {
1165 1165 flk_recompute_dependencies(lock, lk, 1, 0);
1166 1166 lock = lock->l_next;
1167 1167 } while (lock->l_vnode == vp);
1168 1168 }
1169 1169 lock = first_glock;
1170 1170 if (lock) {
1171 1171 do {
1172 1172 if (IS_GRANTED(lock)) {
1173 1173 flk_recompute_dependencies(lock, lk, 1, 0);
1174 1174 }
1175 1175 lock = lock->l_prev;
1176 1176 } while ((lock->l_vnode == vp));
1177 1177 }
1178 1178 request->l_state &= ~RECOMPUTE_LOCK;
1179 1179 if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
1180 1180 return (EDEADLK);
1181 1181 return (flk_execute_request(request));
1182 1182 }
1183 1183
1184 1184 block:
1185 1185 if (request_will_wait)
1186 1186 flk_graph_uncolor(gp);
1187 1187
1188 1188 /* check sleeping locks */
1189 1189
1190 1190 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1191 1191
1192 1192 /*
1193 1193 * If we find a sleeping write lock that is a superset of the
1194 1194 * region wanted by request we can be assured that by adding an
1195 1195 * edge to this write lock we have paths to all locks in the
1196 1196 * graph that blocks the request except in one case and that is why
1197 1197 * another check for SAME_OWNER in the loop below. The exception
1198 1198 * case is when this process that owns the sleeping write lock 'l1'
1199 1199 * has other locks l2, l3, l4 that are in the system and arrived
1200 1200 * before l1. l1 does not have path to these locks as they are from
1201 1201 * same process. We break when we find a second covering sleeping
1202 1202 * lock l5 owned by a process different from that owning l1, because
1203 1203 * none of l2, l3, l4, etc. can have arrived before l5, and if one
1204 1204 * had, l1 would already have produced a deadlock.
1205 1205 */
1206 1206
1207 1207 if (lock) {
1208 1208 do {
1209 1209 if (BLOCKS(lock, request)) {
1210 1210 if (!request_will_wait)
1211 1211 return (EAGAIN);
1212 1212 if (COVERS(lock, request) &&
1213 1213 lock->l_type == F_WRLCK) {
1214 1214 if (found_covering_lock &&
1215 1215 !SAME_OWNER(lock, covered_by)) {
1216 1216 found_covering_lock++;
1217 1217 break;
1218 1218 }
1219 1219 found_covering_lock = 1;
1220 1220 covered_by = lock;
1221 1221 }
1222 1222 if (found_covering_lock &&
1223 1223 !SAME_OWNER(lock, covered_by)) {
1224 1224 lock = lock->l_next;
1225 1225 continue;
1226 1226 }
1227 1227 if ((error = flk_add_edge(request, lock,
1228 1228 !found_covering_lock, 0)))
1229 1229 return (error);
1230 1230 }
1231 1231 lock = lock->l_next;
1232 1232 } while (lock->l_vnode == vp);
1233 1233 }
1234 1234
1235 1235 /*
1236 1236 * found_covering_lock == 2 iff at this point 'request' has paths
1237 1237 * to all locks that blocks 'request'. found_covering_lock == 1 iff at this
1238 1238 * point 'request' has paths to all locks that blocks 'request' whose owners
1239 1239 * are not same as the one that covers 'request' (covered_by above) and
1240 1240 * we can have locks whose owner is same as covered_by in the active list.
1241 1241 */
1242 1242
1243 1243 if (request_blocked_by_active && found_covering_lock != 2) {
1244 1244 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1245 1245 ASSERT(lock != NULL);
1246 1246 do {
1247 1247 if (BLOCKS(lock, request)) {
1248 1248 if (found_covering_lock &&
1249 1249 !SAME_OWNER(lock, covered_by)) {
1250 1250 lock = lock->l_next;
1251 1251 continue;
1252 1252 }
1253 1253 if ((error = flk_add_edge(request, lock,
1254 1254 CHECK_CYCLE, 0)))
1255 1255 return (error);
1256 1256 }
1257 1257 lock = lock->l_next;
1258 1258 } while (lock->l_vnode == vp);
1259 1259 }
1260 1260
1261 1261 if (NOT_BLOCKED(request)) {
1262 1262 /*
1263 1263 * request not dependent on any other locks
1264 1264 * so execute this request
1265 1265 */
1266 1266 return (flk_execute_request(request));
1267 1267 } else {
1268 1268 /*
1269 1269 * check for deadlock
1270 1270 */
1271 1271 if (flk_check_deadlock(request))
1272 1272 return (EDEADLK);
1273 1273 /*
1274 1274 * this thread has to sleep
1275 1275 */
1276 1276 return (flk_wait_execute_request(request));
1277 1277 }
1278 1278 }
1279 1279
1280 1280 /*
1281 1281 * The actual execution of the request in the simple case is only to
1282 1282 * insert the 'request' in the list of active locks if it is not an
1283 1283 * UNLOCK.
1284 1284 * We have to consider the existing active locks' relation to
1285 1285 * this 'request' if they are owned by same process. flk_relation() does
1286 1286 * this job and sees to that the dependency graph information is maintained
1287 1287 * properly.
1288 1288 */
1289 1289
1290 1290 int
1291 1291 flk_execute_request(lock_descriptor_t *request)
1292 1292 {
1293 1293 graph_t *gp = request->l_graph;
1294 1294 vnode_t *vp = request->l_vnode;
1295 1295 lock_descriptor_t *lock, *lock1;
1296 1296 int done_searching = 0;
1297 1297
1298 1298 CHECK_SLEEPING_LOCKS(gp);
1299 1299 CHECK_ACTIVE_LOCKS(gp);
1300 1300
1301 1301 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1302 1302
1303 1303 flk_set_state(request, FLK_START_STATE);
1304 1304
1305 1305 ASSERT(NOT_BLOCKED(request));
1306 1306
1307 1307 /* IO_LOCK requests are only to check status */
1308 1308
1309 1309 if (IS_IO_LOCK(request))
1310 1310 return (0);
1311 1311
1312 1312 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1313 1313
1314 1314 if (lock == NULL && request->l_type == F_UNLCK)
1315 1315 return (0);
1316 1316 if (lock == NULL) {
1317 1317 flk_insert_active_lock(request);
1318 1318 return (0);
1319 1319 }
1320 1320
1321 1321 do {
1322 1322 lock1 = lock->l_next;
1323 1323 if (SAME_OWNER(request, lock)) {
1324 1324 done_searching = flk_relation(lock, request);
1325 1325 }
1326 1326 lock = lock1;
1327 1327 } while (lock->l_vnode == vp && !done_searching);
1328 1328
1329 1329 /*
1330 1330 * insert in active queue
1331 1331 */
1332 1332
1333 1333 if (request->l_type != F_UNLCK)
1334 1334 flk_insert_active_lock(request);
1335 1335
1336 1336 return (0);
1337 1337 }
1338 1338
1339 1339 /*
1340 1340 * 'request' is blocked by someone; therefore we put it into the sleep queue.
1341 1341 */
1342 1342 static int
1343 1343 flk_wait_execute_request(lock_descriptor_t *request)
1344 1344 {
1345 1345 graph_t *gp = request->l_graph;
1346 1346 callb_cpr_t *cprp; /* CPR info from callback */
1347 1347 struct flock_globals *fg;
1348 1348 int index;
1349 1349
1350 1350 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1351 1351 ASSERT(IS_WILLING_TO_SLEEP(request));
1352 1352
1353 1353 flk_insert_sleeping_lock(request);
1354 1354
1355 1355 if (IS_LOCKMGR(request)) {
1356 1356 index = HASH_INDEX(request->l_vnode);
1357 1357 fg = flk_get_globals();
1358 1358
1359 1359 if (nlm_status_size == 0) { /* not booted as a cluster */
1360 1360 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1361 1361 flk_cancel_sleeping_lock(request, 1);
1362 1362 return (ENOLCK);
1363 1363 }
1364 1364 } else { /* booted as a cluster */
1365 1365 /*
1366 1366 * If the request is an NLM server lock request,
1367 1367 * and the NLM state of the lock request is not
1368 1368 * NLM_UP (because the NLM server is shutting
1369 1369 * down), then cancel the sleeping lock and
1370 1370 * return error ENOLCK that will encourage the
1371 1371 * client to retransmit.
1372 1372 */
1373 1373 if (!IS_NLM_UP(request)) {
1374 1374 flk_cancel_sleeping_lock(request, 1);
1375 1375 return (ENOLCK);
1376 1376 }
1377 1377 }
1378 1378 }
1379 1379
1380 1380 /* Clustering: For blocking PXFS locks, return */
1381 1381 if (IS_PXFS(request)) {
1382 1382 /*
1383 1383 * PXFS locks sleep on the client side.
1384 1384 * The callback argument is used to wake up the sleeper
1385 1385 * when the lock is granted.
1386 1386 * We return -1 (rather than an errno value) to indicate
1387 1387 * the client side should sleep
1388 1388 */
1389 1389 return (PXFS_LOCK_BLOCKED);
1390 1390 }
1391 1391
1392 1392 if (request->l_callbacks != NULL) {
1393 1393 /*
1394 1394 * To make sure the shutdown code works correctly, either
1395 1395 * the callback must happen after putting the lock on the
1396 1396 * sleep list, or we must check the shutdown status after
1397 1397 * returning from the callback (and before sleeping). At
1398 1398 * least for now, we'll use the first option. If a
1399 1399 * shutdown or signal or whatever happened while the graph
1400 1400 * mutex was dropped, that will be detected by
1401 1401 * wait_for_lock().
1402 1402 */
1403 1403 mutex_exit(&gp->gp_mutex);
1404 1404
1405 1405 cprp = flk_invoke_callbacks(request->l_callbacks,
1406 1406 FLK_BEFORE_SLEEP);
1407 1407
1408 1408 mutex_enter(&gp->gp_mutex);
1409 1409
1410 1410 if (cprp == NULL) {
1411 1411 wait_for_lock(request);
1412 1412 } else {
1413 1413 mutex_enter(cprp->cc_lockp);
1414 1414 CALLB_CPR_SAFE_BEGIN(cprp);
1415 1415 mutex_exit(cprp->cc_lockp);
1416 1416 wait_for_lock(request);
1417 1417 mutex_enter(cprp->cc_lockp);
1418 1418 CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1419 1419 mutex_exit(cprp->cc_lockp);
1420 1420 }
1421 1421
1422 1422 mutex_exit(&gp->gp_mutex);
1423 1423 (void) flk_invoke_callbacks(request->l_callbacks,
1424 1424 FLK_AFTER_SLEEP);
1425 1425 mutex_enter(&gp->gp_mutex);
1426 1426 } else {
1427 1427 wait_for_lock(request);
1428 1428 }
1429 1429
1430 1430 if (IS_LOCKMGR(request)) {
1431 1431 /*
1432 1432 * If the lock manager is shutting down, return an
1433 1433 * error that will encourage the client to retransmit.
1434 1434 */
1435 1435 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1436 1436 !IS_GRANTED(request)) {
1437 1437 flk_cancel_sleeping_lock(request, 1);
1438 1438 return (ENOLCK);
1439 1439 }
1440 1440 }
1441 1441
1442 1442 if (IS_INTERRUPTED(request)) {
1443 1443 /* we got a signal, or act like we did */
1444 1444 flk_cancel_sleeping_lock(request, 1);
1445 1445 return (EINTR);
1446 1446 }
1447 1447
1448 1448 /* Cancelled if some other thread has closed the file */
1449 1449
1450 1450 if (IS_CANCELLED(request)) {
1451 1451 flk_cancel_sleeping_lock(request, 1);
1452 1452 return (EBADF);
1453 1453 }
1454 1454
1455 1455 request->l_state &= ~GRANTED_LOCK;
1456 1456 REMOVE_SLEEP_QUEUE(request);
1457 1457 return (flk_execute_request(request));
1458 1458 }
1459 1459
1460 1460 /*
1461 1461 * This routine adds an edge between from and to because from depends
1462 1462 * on to. If asked to check for deadlock it checks whether there are any
1463 1463 * reachable locks from "from_lock" that are owned by the same process
1464 1464 * as "from_lock".
1465 1465 * NOTE: It is the caller's responsibility to make sure that the color
1466 1466 * of the graph is consistent between the calls to flk_add_edge as done
1467 1467 * in flk_process_request. This routine does not color and check for
1468 1468 * deadlock explicitly.
1469 1469 */
1470 1470
1471 1471 static int
1472 1472 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1473 1473 int check_cycle, int update_graph)
1474 1474 {
1475 1475 edge_t *edge;
1476 1476 edge_t *ep;
1477 1477 lock_descriptor_t *vertex;
1478 1478 lock_descriptor_t *vertex_stack;
1479 1479
1480 1480 STACK_INIT(vertex_stack);
1481 1481
1482 1482 /*
1483 1483 * if to vertex already has mark_color just return
1484 1484 * don't add an edge as it is reachable from from vertex
1485 1485 * before itself.
1486 1486 */
1487 1487
1488 1488 if (COLORED(to_lock))
1489 1489 return (0);
1490 1490
1491 1491 edge = flk_get_edge();
1492 1492
1493 1493 /*
1494 1494 * set the from and to vertex
1495 1495 */
1496 1496
1497 1497 edge->from_vertex = from_lock;
1498 1498 edge->to_vertex = to_lock;
1499 1499
1500 1500 /*
1501 1501 * put in adjacency list of from vertex
1502 1502 */
1503 1503
1504 1504 from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1505 1505 edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1506 1506 edge->edge_adj_prev = &from_lock->l_edge;
1507 1507 from_lock->l_edge.edge_adj_next = edge;
1508 1508
1509 1509 /*
1510 1510 * put in list of to vertex
1511 1511 */
1512 1512
1513 1513 to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1514 1514 edge->edge_in_next = to_lock->l_edge.edge_in_next;
1515 1515 to_lock->l_edge.edge_in_next = edge;
1516 1516 edge->edge_in_prev = &to_lock->l_edge;
1517 1517
1518 1518
1519 1519 if (update_graph) {
1520 1520 flk_update_proc_graph(edge, 0);
1521 1521 return (0);
1522 1522 }
1523 1523 if (!check_cycle) {
1524 1524 return (0);
1525 1525 }
1526 1526
1527 1527 STACK_PUSH(vertex_stack, from_lock, l_stack);
1528 1528
1529 1529 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1530 1530
1531 1531 STACK_POP(vertex_stack, l_stack);
1532 1532
1533 1533 for (ep = FIRST_ADJ(vertex);
1534 1534 ep != HEAD(vertex);
1535 1535 ep = NEXT_ADJ(ep)) {
1536 1536 if (COLORED(ep->to_vertex))
1537 1537 continue;
1538 1538 COLOR(ep->to_vertex);
1539 1539 if (SAME_OWNER(ep->to_vertex, from_lock))
1540 1540 goto dead_lock;
1541 1541 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1542 1542 }
1543 1543 }
1544 1544 return (0);
1545 1545
1546 1546 dead_lock:
1547 1547
1548 1548 /*
1549 1549 * remove all edges
1550 1550 */
1551 1551
1552 1552 ep = FIRST_ADJ(from_lock);
1553 1553
1554 1554 while (ep != HEAD(from_lock)) {
1555 1555 IN_LIST_REMOVE(ep);
1556 1556 from_lock->l_sedge = NEXT_ADJ(ep);
1557 1557 ADJ_LIST_REMOVE(ep);
1558 1558 flk_free_edge(ep);
1559 1559 ep = from_lock->l_sedge;
1560 1560 }
1561 1561 return (EDEADLK);
1562 1562 }
1563 1563
1564 1564 /*
1565 1565 * Get an edge structure for representing the dependency between two locks.
1566 1566 */
1567 1567
1568 1568 static edge_t *
1569 1569 flk_get_edge()
1570 1570 {
1571 1571 edge_t *ep;
1572 1572
1573 1573 ASSERT(flk_edge_cache != NULL);
1574 1574
1575 1575 ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1576 1576 edge_allocs++;
1577 1577 return (ep);
1578 1578 }
1579 1579
1580 1580 /*
1581 1581 * Free the edge structure.
1582 1582 */
1583 1583
1584 1584 static void
1585 1585 flk_free_edge(edge_t *ep)
1586 1586 {
1587 1587 edge_frees++;
1588 1588 kmem_cache_free(flk_edge_cache, (void *)ep);
1589 1589 }
1590 1590
1591 1591 /*
1592 1592 * Check the relationship of request with lock and perform the
1593 1593 * recomputation of dependencies, break lock if required, and return
1594 1594 * 1 if request cannot have any more relationship with the next
1595 1595 * active locks.
1596 1596 * The 'lock' and 'request' are compared and in case of overlap we
1597 1597 * delete the 'lock' and form new locks to represent the non-overlapped
1598 1598 * portion of original 'lock'. This function has side effects such as
1599 1599 * 'lock' will be freed, new locks will be added to the active list.
1600 1600 */
1601 1601
1602 1602 static int
1603 1603 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1604 1604 {
1605 1605 int lock_effect;
1606 1606 lock_descriptor_t *lock1, *lock2;
1607 1607 lock_descriptor_t *topology[3];
1608 1608 int nvertex = 0;
1609 1609 int i;
1610 1610 edge_t *ep;
1611 1611 graph_t *gp = (lock->l_graph);
1612 1612
1613 1613
1614 1614 CHECK_SLEEPING_LOCKS(gp);
1615 1615 CHECK_ACTIVE_LOCKS(gp);
1616 1616
1617 1617 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1618 1618
1619 1619 topology[0] = topology[1] = topology[2] = NULL;
1620 1620
1621 1621 if (request->l_type == F_UNLCK)
1622 1622 lock_effect = FLK_UNLOCK;
1623 1623 else if (request->l_type == F_RDLCK &&
1624 1624 lock->l_type == F_WRLCK)
1625 1625 lock_effect = FLK_DOWNGRADE;
1626 1626 else if (request->l_type == F_WRLCK &&
1627 1627 lock->l_type == F_RDLCK)
1628 1628 lock_effect = FLK_UPGRADE;
1629 1629 else
1630 1630 lock_effect = FLK_STAY_SAME;
1631 1631
1632 1632 if (lock->l_end < request->l_start) {
1633 1633 if (lock->l_end == request->l_start - 1 &&
1634 1634 lock_effect == FLK_STAY_SAME) {
1635 1635 topology[0] = request;
1636 1636 request->l_start = lock->l_start;
1637 1637 nvertex = 1;
1638 1638 goto recompute;
1639 1639 } else {
1640 1640 return (0);
1641 1641 }
1642 1642 }
1643 1643
1644 1644 if (lock->l_start > request->l_end) {
1645 1645 if (request->l_end == lock->l_start - 1 &&
1646 1646 lock_effect == FLK_STAY_SAME) {
1647 1647 topology[0] = request;
1648 1648 request->l_end = lock->l_end;
1649 1649 nvertex = 1;
1650 1650 goto recompute;
1651 1651 } else {
1652 1652 return (1);
1653 1653 }
1654 1654 }
1655 1655
1656 1656 if (request->l_end < lock->l_end) {
1657 1657 if (request->l_start > lock->l_start) {
1658 1658 if (lock_effect == FLK_STAY_SAME) {
1659 1659 request->l_start = lock->l_start;
1660 1660 request->l_end = lock->l_end;
1661 1661 topology[0] = request;
1662 1662 nvertex = 1;
1663 1663 } else {
1664 1664 lock1 = flk_get_lock();
1665 1665 lock2 = flk_get_lock();
1666 1666 COPY(lock1, lock);
1667 1667 COPY(lock2, lock);
1668 1668 lock1->l_start = lock->l_start;
1669 1669 lock1->l_end = request->l_start - 1;
1670 1670 lock2->l_start = request->l_end + 1;
1671 1671 lock2->l_end = lock->l_end;
1672 1672 topology[0] = lock1;
1673 1673 topology[1] = lock2;
1674 1674 topology[2] = request;
1675 1675 nvertex = 3;
1676 1676 }
1677 1677 } else if (request->l_start < lock->l_start) {
1678 1678 if (lock_effect == FLK_STAY_SAME) {
1679 1679 request->l_end = lock->l_end;
1680 1680 topology[0] = request;
1681 1681 nvertex = 1;
1682 1682 } else {
1683 1683 lock1 = flk_get_lock();
1684 1684 COPY(lock1, lock);
1685 1685 lock1->l_start = request->l_end + 1;
1686 1686 topology[0] = lock1;
1687 1687 topology[1] = request;
1688 1688 nvertex = 2;
1689 1689 }
1690 1690 } else {
1691 1691 if (lock_effect == FLK_STAY_SAME) {
1692 1692 request->l_start = lock->l_start;
1693 1693 request->l_end = lock->l_end;
1694 1694 topology[0] = request;
1695 1695 nvertex = 1;
1696 1696 } else {
1697 1697 lock1 = flk_get_lock();
1698 1698 COPY(lock1, lock);
1699 1699 lock1->l_start = request->l_end + 1;
1700 1700 topology[0] = lock1;
1701 1701 topology[1] = request;
1702 1702 nvertex = 2;
1703 1703 }
1704 1704 }
1705 1705 } else if (request->l_end > lock->l_end) {
1706 1706 if (request->l_start > lock->l_start) {
1707 1707 if (lock_effect == FLK_STAY_SAME) {
1708 1708 request->l_start = lock->l_start;
1709 1709 topology[0] = request;
1710 1710 nvertex = 1;
1711 1711 } else {
1712 1712 lock1 = flk_get_lock();
1713 1713 COPY(lock1, lock);
1714 1714 lock1->l_end = request->l_start - 1;
1715 1715 topology[0] = lock1;
1716 1716 topology[1] = request;
1717 1717 nvertex = 2;
1718 1718 }
1719 1719 } else if (request->l_start < lock->l_start) {
1720 1720 topology[0] = request;
1721 1721 nvertex = 1;
1722 1722 } else {
1723 1723 topology[0] = request;
1724 1724 nvertex = 1;
1725 1725 }
1726 1726 } else {
1727 1727 if (request->l_start > lock->l_start) {
1728 1728 if (lock_effect == FLK_STAY_SAME) {
1729 1729 request->l_start = lock->l_start;
1730 1730 topology[0] = request;
1731 1731 nvertex = 1;
1732 1732 } else {
1733 1733 lock1 = flk_get_lock();
1734 1734 COPY(lock1, lock);
1735 1735 lock1->l_end = request->l_start - 1;
1736 1736 topology[0] = lock1;
1737 1737 topology[1] = request;
1738 1738 nvertex = 2;
1739 1739 }
1740 1740 } else if (request->l_start < lock->l_start) {
1741 1741 topology[0] = request;
1742 1742 nvertex = 1;
1743 1743 } else {
1744 1744 if (lock_effect != FLK_UNLOCK) {
1745 1745 topology[0] = request;
1746 1746 nvertex = 1;
1747 1747 } else {
1748 1748 flk_delete_active_lock(lock, 0);
1749 1749 flk_wakeup(lock, 1);
1750 1750 flk_free_lock(lock);
1751 1751 CHECK_SLEEPING_LOCKS(gp);
1752 1752 CHECK_ACTIVE_LOCKS(gp);
1753 1753 return (1);
1754 1754 }
1755 1755 }
1756 1756 }
1757 1757
1758 1758 recompute:
1759 1759
1760 1760 /*
1761 1761 * For an unlock we don't send the 'request' for recomputation of
1762 1762 * dependencies because no lock will add an edge to it.
1763 1763 */
1764 1764
1765 1765 if (lock_effect == FLK_UNLOCK) {
1766 1766 topology[nvertex-1] = NULL;
1767 1767 nvertex--;
1768 1768 }
1769 1769 for (i = 0; i < nvertex; i++) {
1770 1770 topology[i]->l_state |= RECOMPUTE_LOCK;
1771 1771 topology[i]->l_color = NO_COLOR;
1772 1772 }
1773 1773
1774 1774 ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1775 1775
1776 1776 /*
1777 1777 * Remove the adjacency-list entries of all edges coming in to this
1778 1778 * vertex, 'lock'.
1779 1779 */
1780 1780
1781 1781 ep = FIRST_IN(lock);
1782 1782 while (ep != HEAD(lock)) {
1783 1783 ADJ_LIST_REMOVE(ep);
1784 1784 ep = NEXT_IN(ep);
1785 1785 }
1786 1786
1787 1787 flk_delete_active_lock(lock, 0);
1788 1788
1789 1789 /* We are ready for recomputing the dependencies now */
1790 1790
1791 1791 flk_recompute_dependencies(lock, topology, nvertex, 1);
1792 1792
1793 1793 for (i = 0; i < nvertex; i++) {
1794 1794 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1795 1795 topology[i]->l_color = NO_COLOR;
1796 1796 }
1797 1797
1798 1798
1799 1799 if (lock_effect == FLK_UNLOCK) {
1800 1800 nvertex++;
1801 1801 }
1802 1802 for (i = 0; i < nvertex - 1; i++) {
1803 1803 flk_insert_active_lock(topology[i]);
1804 1804 }
1805 1805
1806 1806
1807 1807 if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1808 1808 flk_wakeup(lock, 0);
1809 1809 } else {
1810 1810 ep = FIRST_IN(lock);
1811 1811 while (ep != HEAD(lock)) {
1812 1812 lock->l_sedge = NEXT_IN(ep);
1813 1813 IN_LIST_REMOVE(ep);
1814 1814 flk_update_proc_graph(ep, 1);
1815 1815 flk_free_edge(ep);
1816 1816 ep = lock->l_sedge;
1817 1817 }
1818 1818 }
1819 1819 flk_free_lock(lock);
1820 1820
1821 1821 CHECK_SLEEPING_LOCKS(gp);
1822 1822 CHECK_ACTIVE_LOCKS(gp);
1823 1823 return (0);
1824 1824 }
1825 1825
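The range arithmetic above is what a caller observes when a process re-locks part of a range it already holds. A minimal userland sketch of that behaviour follows; the file path "/tmp/flk_demo" and the helper setlk() are illustrative only, and error handling is reduced to the essentials. Taking a read lock over bytes 0-99 and then a write lock over bytes 40-59 from the same process should leave three active locks: read 0-39, write 40-59, read 60-99.

/*
 * Minimal userland sketch (assumes a scratch file "/tmp/flk_demo"
 * exists and is at least 100 bytes long); illustrative only.
 */
#include <sys/types.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static void
setlk(int fd, short type, off_t start, off_t len)
{
	struct flock fl;

	fl.l_type = type;		/* F_RDLCK or F_WRLCK */
	fl.l_whence = SEEK_SET;
	fl.l_start = start;
	fl.l_len = len;
	if (fcntl(fd, F_SETLK, &fl) == -1) {
		perror("F_SETLK");
		exit(1);
	}
}

int
main(void)
{
	int fd = open("/tmp/flk_demo", O_RDWR);

	if (fd == -1) {
		perror("open");
		return (1);
	}
	setlk(fd, F_RDLCK, 0, 100);	/* read lock bytes 0-99 */
	setlk(fd, F_WRLCK, 40, 20);	/* write lock bytes 40-59 */
	/*
	 * The overlap handling above splits the original read lock,
	 * leaving read 0-39, write 40-59 and read 60-99 active for
	 * this process.
	 */
	(void) close(fd);	/* releases this process's locks on the file */
	return (0);
}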
1826 1826 /*
1827 1827 * Insert a lock into the active queue.
1828 1828 */
1829 1829
1830 1830 static void
1831 1831 flk_insert_active_lock(lock_descriptor_t *new_lock)
1832 1832 {
1833 1833 graph_t *gp = new_lock->l_graph;
1834 1834 vnode_t *vp = new_lock->l_vnode;
1835 1835 lock_descriptor_t *first_lock, *lock;
1836 1836
1837 1837 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1838 1838
1839 1839 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1840 1840 first_lock = lock;
1841 1841
1842 1842 if (first_lock != NULL) {
1843 1843 for (; (lock->l_vnode == vp &&
1844 1844 lock->l_start < new_lock->l_start); lock = lock->l_next)
1845 1845 ;
1846 1846 } else {
1847 1847 lock = ACTIVE_HEAD(gp);
1848 1848 }
1849 1849
1850 1850 lock->l_prev->l_next = new_lock;
1851 1851 new_lock->l_next = lock;
1852 1852 new_lock->l_prev = lock->l_prev;
1853 1853 lock->l_prev = new_lock;
1854 1854
1855 1855 if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
1856 1856 vp->v_filocks = (struct filock *)new_lock;
1857 1857 }
1858 1858 flk_set_state(new_lock, FLK_ACTIVE_STATE);
1859 1859 new_lock->l_state |= ACTIVE_LOCK;
1860 1860
1861 1861 CHECK_ACTIVE_LOCKS(gp);
1862 1862 CHECK_SLEEPING_LOCKS(gp);
1863 1863 }
1864 1864
1865 1865 /*
1866 1866 * Delete the active lock: performs one of two functions depending on
1867 1867 * the value of the second parameter. One is to remove it from the active
1868 1868 * lists only; the other is to both remove and free the lock.
1869 1869 */
1870 1870
1871 1871 static void
1872 1872 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1873 1873 {
1874 1874 vnode_t *vp = lock->l_vnode;
1875 1875 graph_t *gp = lock->l_graph;
1876 1876
1877 1877 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1878 1878 if (free_lock)
1879 1879 ASSERT(NO_DEPENDENTS(lock));
1880 1880 ASSERT(NOT_BLOCKED(lock));
1881 1881 ASSERT(IS_ACTIVE(lock));
1882 1882
1883 1883 ASSERT((vp->v_filocks != NULL));
1884 1884
1885 1885 if (vp->v_filocks == (struct filock *)lock) {
1886 1886 vp->v_filocks = (struct filock *)
1887 1887 ((lock->l_next->l_vnode == vp) ? lock->l_next :
1888 1888 NULL);
1889 1889 }
1890 1890 lock->l_next->l_prev = lock->l_prev;
1891 1891 lock->l_prev->l_next = lock->l_next;
1892 1892 lock->l_next = lock->l_prev = NULL;
1893 1893 flk_set_state(lock, FLK_DEAD_STATE);
1894 1894 lock->l_state &= ~ACTIVE_LOCK;
1895 1895
1896 1896 if (free_lock)
1897 1897 flk_free_lock(lock);
1898 1898 CHECK_ACTIVE_LOCKS(gp);
1899 1899 CHECK_SLEEPING_LOCKS(gp);
1900 1900 }
1901 1901
1902 1902 /*
1903 1903 * Insert into the sleep queue.
1904 1904 */
1905 1905
1906 1906 static void
1907 1907 flk_insert_sleeping_lock(lock_descriptor_t *request)
1908 1908 {
1909 1909 graph_t *gp = request->l_graph;
1910 1910 vnode_t *vp = request->l_vnode;
1911 1911 lock_descriptor_t *lock;
1912 1912
1913 1913 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1914 1914 ASSERT(IS_INITIAL(request));
1915 1915
1916 1916 for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
1917 1917 lock->l_vnode < vp); lock = lock->l_next)
1918 1918 ;
1919 1919
1920 1920 lock->l_prev->l_next = request;
1921 1921 request->l_prev = lock->l_prev;
1922 1922 lock->l_prev = request;
1923 1923 request->l_next = lock;
1924 1924 flk_set_state(request, FLK_SLEEPING_STATE);
1925 1925 request->l_state |= SLEEPING_LOCK;
1926 1926 }
1927 1927
1928 1928 /*
1929 1929 * Cancelling a sleeping lock implies removing a vertex from the
1930 1930 * dependency graph and therefore we should recompute the dependencies
1931 1931 * of all vertices that have a path to this vertex, w.r.t. all
1932 1932 * vertices reachable from this vertex.
1933 1933 */
1934 1934
1935 1935 void
1936 1936 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
1937 1937 {
1938 1938 graph_t *gp = request->l_graph;
1939 1939 vnode_t *vp = request->l_vnode;
1940 1940 lock_descriptor_t **topology = NULL;
1941 1941 edge_t *ep;
1942 1942 lock_descriptor_t *vertex, *lock;
1943 1943 int nvertex = 0;
1944 1944 int i;
1945 1945 lock_descriptor_t *vertex_stack;
1946 1946
1947 1947 STACK_INIT(vertex_stack);
1948 1948
1949 1949 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1950 1950 /*
1951 1951 * Count the number of vertex pointers that have to be allocated,
1952 1952 * i.e., all vertices that are reachable from request.
1953 1953 */
1954 1954
1955 1955 STACK_PUSH(vertex_stack, request, l_stack);
1956 1956
1957 1957 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1958 1958 STACK_POP(vertex_stack, l_stack);
1959 1959 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
1960 1960 ep = NEXT_ADJ(ep)) {
1961 1961 if (IS_RECOMPUTE(ep->to_vertex))
1962 1962 continue;
1963 1963 ep->to_vertex->l_state |= RECOMPUTE_LOCK;
1964 1964 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1965 1965 nvertex++;
1966 1966 }
1967 1967 }
1968 1968
1969 1969 /*
1970 1970 * allocate memory for holding the vertex pointers
1971 1971 */
1972 1972
1973 1973 if (nvertex) {
1974 1974 topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
1975 1975 KM_SLEEP);
1976 1976 }
1977 1977
1978 1978 /*
1979 1979 * one more pass to actually store the vertices in the
1980 1980 * allocated array.
1981 1981 * We first check sleeping locks and then active locks
1982 1982 * so that the topology array will be in topological
1983 1983 * order.
1984 1984 */
1985 1985
1986 1986 nvertex = 0;
1987 1987 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1988 1988
1989 1989 if (lock) {
1990 1990 do {
1991 1991 if (IS_RECOMPUTE(lock)) {
1992 1992 lock->l_index = nvertex;
1993 1993 topology[nvertex++] = lock;
1994 1994 }
1995 1995 lock->l_color = NO_COLOR;
1996 1996 lock = lock->l_next;
1997 1997 } while (lock->l_vnode == vp);
1998 1998 }
1999 1999
2000 2000 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2001 2001
2002 2002 if (lock) {
2003 2003 do {
2004 2004 if (IS_RECOMPUTE(lock)) {
2005 2005 lock->l_index = nvertex;
2006 2006 topology[nvertex++] = lock;
2007 2007 }
2008 2008 lock->l_color = NO_COLOR;
2009 2009 lock = lock->l_next;
2010 2010 } while (lock->l_vnode == vp);
2011 2011 }
2012 2012
2013 2013 /*
2014 2014 * Remove the in and out edges of request.
2015 2015 * They are freed after updating the proc graph below.
2016 2016 */
2017 2017
2018 2018 for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
2019 2019 ADJ_LIST_REMOVE(ep);
2020 2020 }
2021 2021
2022 2022
2023 2023 if (remove_from_queue)
2024 2024 REMOVE_SLEEP_QUEUE(request);
2025 2025
2026 2026 /* we are ready to recompute */
2027 2027
2028 2028 flk_recompute_dependencies(request, topology, nvertex, 1);
2029 2029
2030 2030 ep = FIRST_ADJ(request);
2031 2031 while (ep != HEAD(request)) {
2032 2032 IN_LIST_REMOVE(ep);
2033 2033 request->l_sedge = NEXT_ADJ(ep);
2034 2034 ADJ_LIST_REMOVE(ep);
2035 2035 flk_update_proc_graph(ep, 1);
2036 2036 flk_free_edge(ep);
2037 2037 ep = request->l_sedge;
2038 2038 }
2039 2039
2040 2040
2041 2041 /*
2042 2042 * unset the RECOMPUTE flag in those vertices
2043 2043 */
2044 2044
2045 2045 for (i = 0; i < nvertex; i++) {
2046 2046 topology[i]->l_state &= ~RECOMPUTE_LOCK;
2047 2047 }
2048 2048
2049 2049 /*
2050 2050 * free the topology
2051 2051 */
2052 2052 if (nvertex)
2053 2053 kmem_free((void *)topology,
2054 2054 (nvertex * sizeof (lock_descriptor_t *)));
2055 2055 /*
2056 2056 * Some sleeping locks may be unblocked now; wake them up.
2057 2057 */
2058 2058
2059 2059 flk_wakeup(request, 0);
2060 2060
2061 2061 /*
2062 2062 * we expect to have a correctly recomputed graph now.
2063 2063 */
2064 2064 flk_set_state(request, FLK_DEAD_STATE);
2065 2065 flk_free_lock(request);
2066 2066 CHECK_SLEEPING_LOCKS(gp);
2067 2067 CHECK_ACTIVE_LOCKS(gp);
2068 2068
2069 2069 }
2070 2070
2071 2071 /*
2072 2072 * Uncoloring the graph simply increments the mark value of the graph;
2073 2073 * only when wraparound takes place do we explicitly reset the color of
2074 2074 * every vertex in the graph.
2075 2075 */
2076 2076
2077 2077 static void
2078 2078 flk_graph_uncolor(graph_t *gp)
2079 2079 {
2080 2080 lock_descriptor_t *lock;
2081 2081
2082 2082 if (gp->mark == UINT_MAX) {
2083 2083 gp->mark = 1;
2084 2084 for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
2085 2085 lock = lock->l_next)
2086 2086 lock->l_color = 0;
2087 2087
2088 2088 for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp);
2089 2089 lock = lock->l_next)
2090 2090 lock->l_color = 0;
2091 2091 } else {
2092 2092 gp->mark++;
2093 2093 }
2094 2094 }
2095 2095
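The mark value acts as a generation counter, so "uncoloring" the whole graph is normally O(1). A generic sketch of the same trick follows; graph_uncolor(), NODE_COLOR() and NODE_COLORED() are illustrative names and not part of this code.

/*
 * Generic sketch of the mark/generation trick used by COLOR()/COLORED()
 * and flk_graph_uncolor(); names are illustrative only.
 */
#include <limits.h>

struct node {
	unsigned int n_color;	/* generation at which node was last visited */
};

struct graph {
	unsigned int g_mark;	/* current generation */
	struct node *g_nodes;
	int g_nnodes;
};

#define	NODE_COLORED(gp, np)	((np)->n_color == (gp)->g_mark)
#define	NODE_COLOR(gp, np)	((np)->n_color = (gp)->g_mark)

/* "Uncolor" every node in O(1) by bumping the generation. */
static void
graph_uncolor(struct graph *gp)
{
	int i;

	if (gp->g_mark == UINT_MAX) {
		/* Wraparound: explicitly reset every node once. */
		for (i = 0; i < gp->g_nnodes; i++)
			gp->g_nodes[i].n_color = 0;
		gp->g_mark = 1;
	} else {
		gp->g_mark++;
	}
}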
2096 2096 /*
2097 2097 * Wake up locks that are blocked on the given lock.
2098 2098 */
2099 2099
2100 2100 static void
2101 2101 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
2102 2102 {
2103 2103 edge_t *ep;
2104 2104 graph_t *gp = lock->l_graph;
2105 2105 lock_descriptor_t *lck;
2106 2106
2107 2107 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2108 2108 if (NO_DEPENDENTS(lock))
2109 2109 return;
2110 2110 ep = FIRST_IN(lock);
2111 2111 do {
2112 2112 /*
2113 2113 * Delete the edge from the adjacency list of the
2114 2114 * from vertex. If that vertex has no more adjacent
2115 2115 * edges, wake up its process.
2116 2116 */
2117 2117 lck = ep->from_vertex;
2118 2118 if (adj_list_remove)
2119 2119 ADJ_LIST_REMOVE(ep);
2120 2120 flk_update_proc_graph(ep, 1);
2121 2121 if (NOT_BLOCKED(lck)) {
2122 2122 GRANT_WAKEUP(lck);
2123 2123 }
2124 2124 lock->l_sedge = NEXT_IN(ep);
2125 2125 IN_LIST_REMOVE(ep);
2126 2126 flk_free_edge(ep);
2127 2127 ep = lock->l_sedge;
2128 2128 } while (ep != HEAD(lock));
2129 2129 ASSERT(NO_DEPENDENTS(lock));
2130 2130 }
2131 2131
2132 2132 /*
2133 2133 * Each dependent of 'request' is checked for dependencies against the
2134 2134 * locks in topology (called topology because the array is, and should be,
2135 2135 * in topological order for this algorithm; if it is not, the inner loop
2136 2136 * below may add more edges than necessary. Topological ordering of the
2137 2137 * vertices satisfies the property that all edges go from left to
2138 2138 * right, i.e., topology[i] can have an edge to topology[j] only if i < j).
2139 2139 * If lock l1 in the dependent set of request is dependent on (blocked by)
2140 2140 * lock l2 in topology but does not already have a path to it, we add an
2141 2141 * edge in the inner loop below.
2142 2142 *
2143 2143 * We don't want to add an edge between l1 and l2 if there already
2144 2144 * exists a path from l1 to l2, so care has to be taken with those vertices
2145 2145 * that have more than one path to 'request'. These vertices are referred
2146 2146 * to here as barrier locks.
2147 2147 *
2148 2148 * The barriers have to be found (those vertices that originally had more
2149 2149 * than one path to request) because otherwise we may end up adding edges
2150 2150 * unnecessarily to vertices in topology; without barrier tracking a
2151 2151 * vertex could end up with both an edge and a path to a vertex in topology.
2152 2152 */
2153 2153
2154 2154 static void
2155 2155 flk_recompute_dependencies(lock_descriptor_t *request,
2156 2156 lock_descriptor_t **topology,
2157 2157 int nvertex, int update_graph)
2158 2158 {
2159 2159 lock_descriptor_t *vertex, *lock;
2160 2160 graph_t *gp = request->l_graph;
2161 2161 int i, count;
2162 2162 int barrier_found = 0;
2163 2163 edge_t *ep;
2164 2164 lock_descriptor_t *vertex_stack;
2165 2165
2166 2166 STACK_INIT(vertex_stack);
2167 2167
2168 2168 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2169 2169 if (nvertex == 0)
2170 2170 return;
2171 2171 flk_graph_uncolor(request->l_graph);
2172 2172 barrier_found = flk_find_barriers(request);
2173 2173 request->l_state |= RECOMPUTE_DONE;
2174 2174
2175 2175 STACK_PUSH(vertex_stack, request, l_stack);
2176 2176 request->l_sedge = FIRST_IN(request);
2177 2177
2178 2178
2179 2179 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2180 2180 if (vertex->l_state & RECOMPUTE_DONE) {
2181 2181 count = 0;
2182 2182 goto next_in_edge;
2183 2183 }
2184 2184 if (IS_BARRIER(vertex)) {
2185 2185 /* decrement the barrier count */
2186 2186 if (vertex->l_index) {
2187 2187 vertex->l_index--;
2188 2188 /* this guy will be pushed again anyway ? */
2189 2189 STACK_POP(vertex_stack, l_stack);
2190 2190 if (vertex->l_index == 0) {
2191 2191 /*
2192 2192 * The barrier is over; we can recompute
2193 2193 * dependencies for this lock on the
2194 2194 * next stack pop.
2195 2195 */
2196 2196 vertex->l_state &= ~BARRIER_LOCK;
2197 2197 }
2198 2198 continue;
2199 2199 }
2200 2200 }
2201 2201 vertex->l_state |= RECOMPUTE_DONE;
2202 2202 flk_graph_uncolor(gp);
2203 2203 count = flk_color_reachables(vertex);
2204 2204 for (i = 0; i < nvertex; i++) {
2205 2205 lock = topology[i];
2206 2206 if (COLORED(lock))
2207 2207 continue;
2208 2208 if (BLOCKS(lock, vertex)) {
2209 2209 (void) flk_add_edge(vertex, lock,
2210 2210 NO_CHECK_CYCLE, update_graph);
2211 2211 COLOR(lock);
2212 2212 count++;
2213 2213 count += flk_color_reachables(lock);
2214 2214 }
2215 2215
2216 2216 }
2217 2217
2218 2218 next_in_edge:
2219 2219 if (count == nvertex ||
2220 2220 vertex->l_sedge == HEAD(vertex)) {
2221 2221 /* prune the tree below this */
2222 2222 STACK_POP(vertex_stack, l_stack);
2223 2223 vertex->l_state &= ~RECOMPUTE_DONE;
2224 2224 /* update the barrier locks below this! */
2225 2225 if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
2226 2226 flk_graph_uncolor(gp);
2227 2227 flk_update_barriers(vertex);
2228 2228 }
2229 2229 continue;
2230 2230 }
2231 2231
2232 2232 ep = vertex->l_sedge;
2233 2233 lock = ep->from_vertex;
2234 2234 STACK_PUSH(vertex_stack, lock, l_stack);
2235 2235 lock->l_sedge = FIRST_IN(lock);
2236 2236 vertex->l_sedge = NEXT_IN(ep);
2237 2237 }
2238 2238
2239 2239 }
2240 2240
2241 2241 /*
2242 2242 * Color all vertices reachable from 'vertex' that belong to the topology
2243 2243 * (those with RECOMPUTE_LOCK set in their state) and are not yet colored.
2244 2244 *
2245 2245 * Note: we need to use a different stack_link l_stack1 because this is
2246 2246 * called from flk_recompute_dependencies() that already uses a stack with
2247 2247 * l_stack as stack_link.
2248 2248 */
2249 2249
2250 2250 static int
2251 2251 flk_color_reachables(lock_descriptor_t *vertex)
2252 2252 {
2253 2253 lock_descriptor_t *ver, *lock;
2254 2254 int count;
2255 2255 edge_t *ep;
2256 2256 lock_descriptor_t *vertex_stack;
2257 2257
2258 2258 STACK_INIT(vertex_stack);
2259 2259
2260 2260 STACK_PUSH(vertex_stack, vertex, l_stack1);
2261 2261 count = 0;
2262 2262 while ((ver = STACK_TOP(vertex_stack)) != NULL) {
2263 2263
2264 2264 STACK_POP(vertex_stack, l_stack1);
2265 2265 for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
2266 2266 ep = NEXT_ADJ(ep)) {
2267 2267 lock = ep->to_vertex;
2268 2268 if (COLORED(lock))
2269 2269 continue;
2270 2270 COLOR(lock);
2271 2271 if (IS_RECOMPUTE(lock))
2272 2272 count++;
2273 2273 STACK_PUSH(vertex_stack, lock, l_stack1);
2274 2274 }
2275 2275
2276 2276 }
2277 2277 return (count);
2278 2278 }
2279 2279
2280 2280 /*
2281 2281 * Called from flk_recompute_dependencies(), this routine decrements
2282 2282 * the barrier count of barrier vertices that are reachable from lock.
2283 2283 */
2284 2284
2285 2285 static void
2286 2286 flk_update_barriers(lock_descriptor_t *lock)
2287 2287 {
2288 2288 lock_descriptor_t *vertex, *lck;
2289 2289 edge_t *ep;
2290 2290 lock_descriptor_t *vertex_stack;
2291 2291
2292 2292 STACK_INIT(vertex_stack);
2293 2293
2294 2294 STACK_PUSH(vertex_stack, lock, l_stack1);
2295 2295
2296 2296 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2297 2297 STACK_POP(vertex_stack, l_stack1);
2298 2298 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2299 2299 ep = NEXT_IN(ep)) {
2300 2300 lck = ep->from_vertex;
2301 2301 if (COLORED(lck)) {
2302 2302 if (IS_BARRIER(lck)) {
2303 2303 ASSERT(lck->l_index > 0);
2304 2304 lck->l_index--;
2305 2305 if (lck->l_index == 0)
2306 2306 lck->l_state &= ~BARRIER_LOCK;
2307 2307 }
2308 2308 continue;
2309 2309 }
2310 2310 COLOR(lck);
2311 2311 if (IS_BARRIER(lck)) {
2312 2312 ASSERT(lck->l_index > 0);
2313 2313 lck->l_index--;
2314 2314 if (lck->l_index == 0)
2315 2315 lck->l_state &= ~BARRIER_LOCK;
2316 2316 }
2317 2317 STACK_PUSH(vertex_stack, lck, l_stack1);
2318 2318 }
2319 2319 }
2320 2320 }
2321 2321
2322 2322 /*
2323 2323 * Find all vertices that are reachable from 'lock' more than once,
2324 2324 * mark them as barrier vertices, and increment their barrier count.
2325 2325 * The barrier count is the total number of paths from lock to that
2326 2326 * vertex, minus one.
2327 2327 */
2328 2328
2329 2329 static int
2330 2330 flk_find_barriers(lock_descriptor_t *lock)
2331 2331 {
2332 2332 lock_descriptor_t *vertex, *lck;
2333 2333 int found = 0;
2334 2334 edge_t *ep;
2335 2335 lock_descriptor_t *vertex_stack;
2336 2336
2337 2337 STACK_INIT(vertex_stack);
2338 2338
2339 2339 STACK_PUSH(vertex_stack, lock, l_stack1);
2340 2340
2341 2341 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2342 2342 STACK_POP(vertex_stack, l_stack1);
2343 2343 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2344 2344 ep = NEXT_IN(ep)) {
2345 2345 lck = ep->from_vertex;
2346 2346 if (COLORED(lck)) {
2347 2347 /* this is a barrier */
2348 2348 lck->l_state |= BARRIER_LOCK;
2349 2349 /* index will have barrier count */
2350 2350 lck->l_index++;
2351 2351 if (!found)
2352 2352 found = 1;
2353 2353 continue;
2354 2354 }
2355 2355 COLOR(lck);
2356 2356 lck->l_index = 0;
2357 2357 STACK_PUSH(vertex_stack, lck, l_stack1);
2358 2358 }
2359 2359 }
2360 2360 return (found);
2361 2361 }
2362 2362
2363 2363 /*
2364 2364 * Finds the first lock that is mainly responsible for blocking this
2365 2365 * request. If there is no such lock, request->l_flock.l_type is set to
2366 2366 * F_UNLCK. Otherwise, request->l_flock is filled in with the particulars
2367 2367 * of the blocking lock.
2368 2368 *
2369 2369 * Note: It is possible a request is blocked by a sleeping lock because
2370 2370 * of the fairness policy used in flk_process_request() to construct the
2371 2371 * dependencies. (see comments before flk_process_request()).
2372 2372 */
2373 2373
2374 2374 static void
2375 2375 flk_get_first_blocking_lock(lock_descriptor_t *request)
2376 2376 {
2377 2377 graph_t *gp = request->l_graph;
2378 2378 vnode_t *vp = request->l_vnode;
2379 2379 lock_descriptor_t *lock, *blocker;
2380 2380
2381 2381 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2382 2382 blocker = NULL;
2383 2383 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2384 2384
2385 2385 if (lock) {
2386 2386 do {
2387 2387 if (BLOCKS(lock, request)) {
2388 2388 blocker = lock;
2389 2389 break;
2390 2390 }
2391 2391 lock = lock->l_next;
2392 2392 } while (lock->l_vnode == vp);
2393 2393 }
2394 2394
2395 2395 if (blocker == NULL && request->l_flock.l_type == F_RDLCK) {
2396 2396 /*
2397 2397 * No active lock is blocking this request, but if a read
2398 2398 * lock is requested, it may also get blocked by a waiting
2399 2399 * writer. So search all sleeping locks and see if there is
2400 2400 * a writer waiting.
2401 2401 */
2402 2402 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2403 2403 if (lock) {
2404 2404 do {
2405 2405 if (BLOCKS(lock, request)) {
2406 2406 blocker = lock;
2407 2407 break;
2408 2408 }
2409 2409 lock = lock->l_next;
2410 2410 } while (lock->l_vnode == vp);
2411 2411 }
2412 2412 }
2413 2413
2414 2414 if (blocker) {
2415 2415 report_blocker(blocker, request);
2416 2416 } else
2417 2417 request->l_flock.l_type = F_UNLCK;
2418 2418 }
2419 2419
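The particulars filled in here are what fcntl(F_GETLK) (and F_OFD_GETLK) hand back to userland. A small sketch of that consumer side follows; who_blocks() is an illustrative helper and the probed range (the whole file) is arbitrary.

/*
 * Userland sketch of the F_GETLK path that consumes the blocker
 * information filled in by flk_get_first_blocking_lock().
 */
#include <fcntl.h>
#include <stdio.h>

void
who_blocks(int fd)
{
	struct flock fl;

	fl.l_type = F_WRLCK;		/* the lock we would like to take */
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 0;			/* 0 == to end of file */

	if (fcntl(fd, F_GETLK, &fl) == -1) {
		perror("F_GETLK");
		return;
	}
	if (fl.l_type == F_UNLCK) {
		(void) printf("no lock would block this request\n");
	} else {
		(void) printf("blocked by pid %ld (sysid %d), "
		    "range %lld+%lld\n", (long)fl.l_pid, fl.l_sysid,
		    (long long)fl.l_start, (long long)fl.l_len);
	}
}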
2420 2420 /*
2421 2421 * Get the graph_t structure associated with a vnode.
2422 2422 * If 'initialize' is non-zero, and the graph_t structure for this vnode has
2423 2423 * not yet been initialized, then a new element is allocated and returned.
2424 2424 */
2425 2425 graph_t *
2426 2426 flk_get_lock_graph(vnode_t *vp, int initialize)
2427 2427 {
2428 2428 graph_t *gp;
2429 2429 graph_t *gp_alloc = NULL;
2430 2430 int index = HASH_INDEX(vp);
2431 2431
2432 2432 if (initialize == FLK_USE_GRAPH) {
2433 2433 mutex_enter(&flock_lock);
2434 2434 gp = lock_graph[index];
2435 2435 mutex_exit(&flock_lock);
2436 2436 return (gp);
2437 2437 }
2438 2438
2439 2439 ASSERT(initialize == FLK_INIT_GRAPH);
2440 2440
2441 2441 if (lock_graph[index] == NULL) {
2442 2442
2443 2443 gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2444 2444
2445 2445 /* Initialize the graph */
2446 2446
2447 2447 gp_alloc->active_locks.l_next =
2448 2448 gp_alloc->active_locks.l_prev =
2449 2449 (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2450 2450 gp_alloc->sleeping_locks.l_next =
2451 2451 gp_alloc->sleeping_locks.l_prev =
2452 2452 (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2453 2453 gp_alloc->index = index;
2454 2454 mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2455 2455 }
2456 2456
2457 2457 mutex_enter(&flock_lock);
2458 2458
2459 2459 gp = lock_graph[index];
2460 2460
2461 2461 /* Recheck the value within flock_lock */
2462 2462 if (gp == NULL) {
2463 2463 struct flock_globals *fg;
2464 2464
2465 2465 /* We must have previously allocated the graph_t structure */
2466 2466 ASSERT(gp_alloc != NULL);
2467 2467 lock_graph[index] = gp = gp_alloc;
2468 2468 /*
2469 2469 * The lockmgr status is only needed if KLM is loaded.
2470 2470 */
2471 2471 if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2472 2472 fg = flk_get_globals();
2473 2473 fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2474 2474 }
2475 2475 }
2476 2476
2477 2477 mutex_exit(&flock_lock);
2478 2478
2479 2479 if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2480 2480 /* There was a race to allocate the graph_t and we lost */
2481 2481 mutex_destroy(&gp_alloc->gp_mutex);
2482 2482 kmem_free(gp_alloc, sizeof (graph_t));
2483 2483 }
2484 2484
2485 2485 return (gp);
2486 2486 }
2487 2487
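The initialization path above allocates the graph_t before taking flock_lock, rechecks the slot under the lock, and frees the allocation if another thread won the race, so the global mutex is never held across a sleeping allocation. A stripped-down userland sketch of the same pattern follows; get_thing(), slot and slot_lock are illustrative names only.

/*
 * Generic sketch of the allocate-outside, recheck-under-lock pattern
 * used by flk_get_lock_graph(); names are illustrative only.
 */
#include <stdlib.h>
#include <pthread.h>

struct thing {
	int t_dummy;
};

static struct thing *slot;
static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;

struct thing *
get_thing(void)
{
	struct thing *new = NULL, *ret;

	if (slot == NULL)		/* unlocked peek; may race */
		new = calloc(1, sizeof (*new));

	(void) pthread_mutex_lock(&slot_lock);
	if (slot == NULL && new != NULL)
		slot = new;		/* we won (or nobody raced) */
	ret = slot;
	(void) pthread_mutex_unlock(&slot_lock);

	if (new != NULL && ret != new)
		free(new);		/* lost the race; discard our copy */
	return (ret);
}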
2488 2488 /*
2489 2489 * PSARC case 1997/292
2490 2490 */
2491 2491 int
2492 2492 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2493 2493 {
2494 2494 lock_descriptor_t *lock;
2495 2495 int result = 0;
2496 2496 graph_t *gp;
2497 2497 int lock_nlmid;
2498 2498
2499 2499 /*
2500 2500 * Check to see if node is booted as a cluster. If not, return.
2501 2501 */
2502 2502 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2503 2503 return (0);
2504 2504 }
2505 2505
2506 2506 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2507 2507 if (gp == NULL) {
2508 2508 return (0);
2509 2509 }
2510 2510
2511 2511 mutex_enter(&gp->gp_mutex);
2512 2512
2513 2513 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2514 2514
2515 2515 if (lock) {
2516 2516 while (lock->l_vnode == vp) {
2517 2517 /* get NLM id from sysid */
2518 2518 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2519 2519
2520 2520 /*
2521 2521 * If NLM server request _and_ nlmid of lock matches
2522 2522 * nlmid of argument, then we've found a remote lock.
2523 2523 */
2524 2524 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2525 2525 result = 1;
2526 2526 goto done;
2527 2527 }
2528 2528 lock = lock->l_next;
2529 2529 }
2530 2530 }
2531 2531
2532 2532 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2533 2533
2534 2534 if (lock) {
2535 2535 while (lock->l_vnode == vp) {
2536 2536 /* get NLM id from sysid */
2537 2537 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2538 2538
2539 2539 /*
2540 2540 * If NLM server request _and_ nlmid of lock matches
2541 2541 * nlmid of argument, then we've found a remote lock.
2542 2542 */
2543 2543 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2544 2544 result = 1;
2545 2545 goto done;
2546 2546 }
2547 2547 lock = lock->l_next;
2548 2548 }
2549 2549 }
2550 2550
2551 2551 done:
2552 2552 mutex_exit(&gp->gp_mutex);
2553 2553 return (result);
2554 2554 }
2555 2555
2556 2556 /*
2557 2557 * Determine whether there are any locks for the given vnode with a remote
2558 2558 * sysid. Returns zero if not, non-zero if there are.
2559 2559 *
2560 2560 * Note that the return value from this function is potentially invalid
2561 2561 * once it has been returned. The caller is responsible for providing its
2562 2562 * own synchronization mechanism to ensure that the return value is useful
2563 2563 * (e.g., see nfs_lockcompletion()).
2564 2564 */
2565 2565 int
2566 2566 flk_has_remote_locks(vnode_t *vp)
2567 2567 {
2568 2568 lock_descriptor_t *lock;
2569 2569 int result = 0;
2570 2570 graph_t *gp;
2571 2571
2572 2572 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2573 2573 if (gp == NULL) {
2574 2574 return (0);
2575 2575 }
2576 2576
2577 2577 mutex_enter(&gp->gp_mutex);
2578 2578
2579 2579 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2580 2580
2581 2581 if (lock) {
2582 2582 while (lock->l_vnode == vp) {
2583 2583 if (IS_REMOTE(lock)) {
2584 2584 result = 1;
2585 2585 goto done;
2586 2586 }
2587 2587 lock = lock->l_next;
2588 2588 }
2589 2589 }
2590 2590
2591 2591 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2592 2592
2593 2593 if (lock) {
2594 2594 while (lock->l_vnode == vp) {
2595 2595 if (IS_REMOTE(lock)) {
2596 2596 result = 1;
2597 2597 goto done;
2598 2598 }
2599 2599 lock = lock->l_next;
2600 2600 }
2601 2601 }
2602 2602
2603 2603 done:
2604 2604 mutex_exit(&gp->gp_mutex);
2605 2605 return (result);
2606 2606 }
2607 2607
2608 2608 /*
2609 2609 * Determine whether there are any locks for the given vnode with a remote
2610 2610 * sysid matching the given sysid.
2611 2611 * Used by the new (open source) NFS Lock Manager (NLM).
2612 2612 */
2613 2613 int
2614 2614 flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid)
2615 2615 {
2616 2616 lock_descriptor_t *lock;
2617 2617 int result = 0;
2618 2618 graph_t *gp;
2619 2619
2620 2620 if (sysid == 0)
2621 2621 return (0);
2622 2622
2623 2623 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2624 2624 if (gp == NULL) {
2625 2625 return (0);
2626 2626 }
2627 2627
2628 2628 mutex_enter(&gp->gp_mutex);
2629 2629
2630 2630 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2631 2631
2632 2632 if (lock) {
2633 2633 while (lock->l_vnode == vp) {
2634 2634 if (lock->l_flock.l_sysid == sysid) {
2635 2635 result = 1;
2636 2636 goto done;
2637 2637 }
2638 2638 lock = lock->l_next;
2639 2639 }
2640 2640 }
2641 2641
2642 2642 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2643 2643
2644 2644 if (lock) {
2645 2645 while (lock->l_vnode == vp) {
2646 2646 if (lock->l_flock.l_sysid == sysid) {
2647 2647 result = 1;
2648 2648 goto done;
2649 2649 }
2650 2650 lock = lock->l_next;
2651 2651 }
2652 2652 }
2653 2653
2654 2654 done:
2655 2655 mutex_exit(&gp->gp_mutex);
2656 2656 return (result);
2657 2657 }
2658 2658
2659 2659 /*
2660 2660 * Determine if there are any locks owned by the given sysid.
2661 2661 * Returns zero if not, non-zero if there are. Note that this return code
2662 2662 * could be derived from flk_get_{sleeping,active}_locks, but this routine
2663 2663 * avoids all the memory allocations of those routines.
2664 2664 *
2665 2665 * This routine has the same synchronization issues as
2666 2666 * flk_has_remote_locks.
2667 2667 */
2668 2668
2669 2669 int
2670 2670 flk_sysid_has_locks(int sysid, int lck_type)
2671 2671 {
2672 2672 int has_locks = 0;
2673 2673 lock_descriptor_t *lock;
2674 2674 graph_t *gp;
2675 2675 int i;
2676 2676
2677 2677 for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2678 2678 mutex_enter(&flock_lock);
2679 2679 gp = lock_graph[i];
2680 2680 mutex_exit(&flock_lock);
2681 2681 if (gp == NULL) {
2682 2682 continue;
2683 2683 }
2684 2684
2685 2685 mutex_enter(&gp->gp_mutex);
2686 2686
2687 2687 if (lck_type & FLK_QUERY_ACTIVE) {
2688 2688 for (lock = ACTIVE_HEAD(gp)->l_next;
2689 2689 lock != ACTIVE_HEAD(gp) && !has_locks;
2690 2690 lock = lock->l_next) {
2691 2691 if (lock->l_flock.l_sysid == sysid)
2692 2692 has_locks = 1;
2693 2693 }
2694 2694 }
2695 2695
2696 2696 if (lck_type & FLK_QUERY_SLEEPING) {
2697 2697 for (lock = SLEEPING_HEAD(gp)->l_next;
2698 2698 lock != SLEEPING_HEAD(gp) && !has_locks;
2699 2699 lock = lock->l_next) {
2700 2700 if (lock->l_flock.l_sysid == sysid)
2701 2701 has_locks = 1;
2702 2702 }
2703 2703 }
2704 2704 mutex_exit(&gp->gp_mutex);
2705 2705 }
2706 2706
2707 2707 return (has_locks);
2708 2708 }
2709 2709
2710 2710
2711 2711 /*
2712 2712 * PSARC case 1997/292
2713 2713 *
2714 2714 * Requires: "sysid" is a pair [nlmid, sysid]. The lower half is a 16-bit
2715 2715 * quantity, the real sysid generated by the NLM server; the upper half
2716 2716 * identifies the node of the cluster where the NLM server ran.
2717 2717 * This routine is only called by an NLM server running in a cluster.
2718 2718 * Effects: Remove all locks held on behalf of the client identified
2719 2719 * by "sysid."
2720 2720 */
2721 2721 void
2722 2722 cl_flk_remove_locks_by_sysid(int sysid)
2723 2723 {
2724 2724 graph_t *gp;
2725 2725 int i;
2726 2726 lock_descriptor_t *lock, *nlock;
2727 2727
2728 2728 /*
2729 2729 * Check to see if node is booted as a cluster. If not, return.
2730 2730 */
2731 2731 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2732 2732 return;
2733 2733 }
2734 2734
2735 2735 ASSERT(sysid != 0);
2736 2736 for (i = 0; i < HASH_SIZE; i++) {
2737 2737 mutex_enter(&flock_lock);
2738 2738 gp = lock_graph[i];
2739 2739 mutex_exit(&flock_lock);
2740 2740
2741 2741 if (gp == NULL)
2742 2742 continue;
2743 2743
2744 2744 mutex_enter(&gp->gp_mutex); /* get mutex on lock graph */
2745 2745
2746 2746 /* signal sleeping requests so that they bail out */
2747 2747 lock = SLEEPING_HEAD(gp)->l_next;
2748 2748 while (lock != SLEEPING_HEAD(gp)) {
2749 2749 nlock = lock->l_next;
2750 2750 if (lock->l_flock.l_sysid == sysid) {
2751 2751 INTERRUPT_WAKEUP(lock);
2752 2752 }
2753 2753 lock = nlock;
2754 2754 }
2755 2755
2756 2756 /* delete active locks */
2757 2757 lock = ACTIVE_HEAD(gp)->l_next;
2758 2758 while (lock != ACTIVE_HEAD(gp)) {
2759 2759 nlock = lock->l_next;
2760 2760 if (lock->l_flock.l_sysid == sysid) {
2761 2761 flk_delete_active_lock(lock, 0);
2762 2762 flk_wakeup(lock, 1);
2763 2763 flk_free_lock(lock);
2764 2764 }
2765 2765 lock = nlock;
2766 2766 }
2767 2767 mutex_exit(&gp->gp_mutex); /* release mutex on lock graph */
2768 2768 }
2769 2769 }
2770 2770
2771 2771 /*
2772 2772 * Delete all locks in the system that belong to the sysid of the request.
2773 2773 */
2774 2774
2775 2775 static void
2776 2776 flk_delete_locks_by_sysid(lock_descriptor_t *request)
2777 2777 {
2778 2778 int sysid = request->l_flock.l_sysid;
2779 2779 lock_descriptor_t *lock, *nlock;
2780 2780 graph_t *gp;
2781 2781 int i;
2782 2782
2783 2783 ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
2784 2784 ASSERT(sysid != 0);
2785 2785
2786 2786 mutex_exit(&request->l_graph->gp_mutex);
2787 2787
2788 2788 for (i = 0; i < HASH_SIZE; i++) {
2789 2789 mutex_enter(&flock_lock);
2790 2790 gp = lock_graph[i];
2791 2791 mutex_exit(&flock_lock);
2792 2792
2793 2793 if (gp == NULL)
2794 2794 continue;
2795 2795
2796 2796 mutex_enter(&gp->gp_mutex);
2797 2797
2798 2798 /* signal sleeping requests so that they bail out */
2799 2799 lock = SLEEPING_HEAD(gp)->l_next;
2800 2800 while (lock != SLEEPING_HEAD(gp)) {
2801 2801 nlock = lock->l_next;
2802 2802 if (lock->l_flock.l_sysid == sysid) {
2803 2803 INTERRUPT_WAKEUP(lock);
2804 2804 }
2805 2805 lock = nlock;
2806 2806 }
2807 2807
2808 2808 /* delete active locks */
2809 2809 lock = ACTIVE_HEAD(gp)->l_next;
2810 2810 while (lock != ACTIVE_HEAD(gp)) {
2811 2811 nlock = lock->l_next;
2812 2812 if (lock->l_flock.l_sysid == sysid) {
2813 2813 flk_delete_active_lock(lock, 0);
2814 2814 flk_wakeup(lock, 1);
2815 2815 flk_free_lock(lock);
2816 2816 }
2817 2817 lock = nlock;
2818 2818 }
2819 2819 mutex_exit(&gp->gp_mutex);
2820 2820 }
2821 2821
2822 2822 mutex_enter(&request->l_graph->gp_mutex);
2823 2823 }
2824 2824
2825 2825 /*
2826 2826 * Clustering: Deletes PXFS locks
2827 2827 * Effects: Delete all locks on files in the given file system and with the
2828 2828 * given PXFS id.
2829 2829 */
2830 2830 void
2831 2831 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
2832 2832 {
2833 2833 lock_descriptor_t *lock, *nlock;
2834 2834 graph_t *gp;
2835 2835 int i;
2836 2836
2837 2837 for (i = 0; i < HASH_SIZE; i++) {
2838 2838 mutex_enter(&flock_lock);
2839 2839 gp = lock_graph[i];
2840 2840 mutex_exit(&flock_lock);
2841 2841
2842 2842 if (gp == NULL)
2843 2843 continue;
2844 2844
2845 2845 mutex_enter(&gp->gp_mutex);
2846 2846
2847 2847 /* signal sleeping requests so that they bail out */
2848 2848 lock = SLEEPING_HEAD(gp)->l_next;
2849 2849 while (lock != SLEEPING_HEAD(gp)) {
2850 2850 nlock = lock->l_next;
2851 2851 if (lock->l_vnode->v_vfsp == vfsp) {
2852 2852 ASSERT(IS_PXFS(lock));
2853 2853 if (GETPXFSID(lock->l_flock.l_sysid) ==
2854 2854 pxfsid) {
2855 2855 flk_set_state(lock,
2856 2856 FLK_CANCELLED_STATE);
2857 2857 flk_cancel_sleeping_lock(lock, 1);
2858 2858 }
2859 2859 }
2860 2860 lock = nlock;
2861 2861 }
2862 2862
2863 2863 /* delete active locks */
2864 2864 lock = ACTIVE_HEAD(gp)->l_next;
2865 2865 while (lock != ACTIVE_HEAD(gp)) {
2866 2866 nlock = lock->l_next;
2867 2867 if (lock->l_vnode->v_vfsp == vfsp) {
2868 2868 ASSERT(IS_PXFS(lock));
2869 2869 if (GETPXFSID(lock->l_flock.l_sysid) ==
2870 2870 pxfsid) {
2871 2871 flk_delete_active_lock(lock, 0);
2872 2872 flk_wakeup(lock, 1);
2873 2873 flk_free_lock(lock);
2874 2874 }
2875 2875 }
2876 2876 lock = nlock;
2877 2877 }
2878 2878 mutex_exit(&gp->gp_mutex);
2879 2879 }
2880 2880 }
2881 2881
2882 2882 /*
2883 2883 * Search for a sleeping lock manager lock which matches exactly this lock
2884 2884 * request; if one is found, fake a signal to cancel it.
2885 2885 *
2886 2886 * Return 1 if a matching lock was found, 0 otherwise.
2887 2887 */
2888 2888
2889 2889 static int
2890 2890 flk_canceled(lock_descriptor_t *request)
2891 2891 {
2892 2892 lock_descriptor_t *lock, *nlock;
2893 2893 graph_t *gp = request->l_graph;
2894 2894 vnode_t *vp = request->l_vnode;
2895 2895
2896 2896 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2897 2897 ASSERT(IS_LOCKMGR(request));
2898 2898 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2899 2899
2900 2900 if (lock) {
2901 2901 while (lock->l_vnode == vp) {
2902 2902 nlock = lock->l_next;
2903 2903 if (SAME_OWNER(lock, request) &&
2904 2904 lock->l_start == request->l_start &&
2905 2905 lock->l_end == request->l_end) {
2906 2906 INTERRUPT_WAKEUP(lock);
2907 2907 return (1);
2908 2908 }
2909 2909 lock = nlock;
2910 2910 }
2911 2911 }
2912 2912 return (0);
2913 2913 }
2914 2914
2915 2915 /*
2916 2916 * Remove all non-OFD locks for the vnode belonging to the given pid and sysid.
2917 2917 * That is, since OFD locks are pid-less we'll never match on the incoming
2918 2918 * pid. OFD locks are removed earlier in the close() path via closef() and
2919 2919 * ofdcleanlock().
2920 2920 */
2921 2921 void
2922 2922 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
2923 2923 {
2924 2924 graph_t *gp;
2925 2925 lock_descriptor_t *lock, *nlock;
2926 2926 lock_descriptor_t *link_stack;
2927 2927
2928 2928 STACK_INIT(link_stack);
2929 2929
2930 2930 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2931 2931
2932 2932 if (gp == NULL)
2933 2933 return;
2934 2934 mutex_enter(&gp->gp_mutex);
2935 2935
2936 2936 CHECK_SLEEPING_LOCKS(gp);
2937 2937 CHECK_ACTIVE_LOCKS(gp);
2938 2938
2939 2939 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2940 2940
2941 2941 if (lock) {
2942 2942 do {
2943 2943 nlock = lock->l_next;
2944 2944 if ((lock->l_flock.l_pid == pid ||
2945 2945 pid == IGN_PID) &&
2946 2946 lock->l_flock.l_sysid == sysid) {
2947 2947 CANCEL_WAKEUP(lock);
2948 2948 }
2949 2949 lock = nlock;
2950 2950 } while (lock->l_vnode == vp);
2951 2951 }
2952 2952
2953 2953 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2954 2954
2955 2955 if (lock) {
2956 2956 do {
2957 2957 nlock = lock->l_next;
2958 2958 if ((lock->l_flock.l_pid == pid ||
2959 2959 pid == IGN_PID) &&
2960 2960 lock->l_flock.l_sysid == sysid) {
2961 2961 flk_delete_active_lock(lock, 0);
2962 2962 STACK_PUSH(link_stack, lock, l_stack);
2963 2963 }
2964 2964 lock = nlock;
2965 2965 } while (lock->l_vnode == vp);
2966 2966 }
2967 2967
2968 2968 while ((lock = STACK_TOP(link_stack)) != NULL) {
2969 2969 STACK_POP(link_stack, l_stack);
2970 2970 flk_wakeup(lock, 1);
2971 2971 flk_free_lock(lock);
2972 2972 }
2973 2973
2974 2974 CHECK_SLEEPING_LOCKS(gp);
2975 2975 CHECK_ACTIVE_LOCKS(gp);
2976 2976 CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2977 2977 mutex_exit(&gp->gp_mutex);
2978 2978 }
2979 2979
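cleanlocks() is what gives non-OFD record locks their well-known close() semantics: the locks are owned by the (pid, sysid) pair rather than by a file descriptor, so closing any descriptor for the file drops them. A short userland sketch of that pitfall follows; the path is illustrative and error handling is omitted.

/*
 * Userland sketch of the POSIX-lock pitfall enforced here: closing
 * *any* fd for the file releases the process's record locks on it.
 * "/tmp/flk_demo" is an illustrative path.
 */
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	struct flock fl = { 0 };
	int fd1 = open("/tmp/flk_demo", O_RDWR | O_CREAT, 0644);
	int fd2 = open("/tmp/flk_demo", O_RDWR);

	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 0;

	(void) fcntl(fd1, F_SETLK, &fl);	/* lock via fd1 */
	(void) close(fd2);			/* drops the lock taken via fd1 */

	/*
	 * OFD locks (F_OFD_SETLK) do not behave this way; they belong to
	 * the open file description and are cleaned up in closef()/
	 * ofdcleanlock() rather than here.
	 */
	(void) close(fd1);
	return (0);
}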
2980 2980
2981 2981 /*
2982 2982 * Called from 'fs' read and write routines for files that have mandatory
2983 2983 * locking enabled.
2984 2984 */
2985 2985
2986 2986 int
2987 2987 chklock(
2988 2988 struct vnode *vp,
2989 2989 int iomode,
2990 2990 u_offset_t offset,
2991 2991 ssize_t len,
2992 2992 int fmode,
2993 2993 caller_context_t *ct)
2994 2994 {
2995 2995 register int i;
2996 2996 struct flock64 bf;
2997 2997 int error = 0;
2998 2998
2999 2999 bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
3000 3000 bf.l_whence = 0;
3001 3001 bf.l_start = offset;
3002 3002 bf.l_len = len;
3003 3003 if (ct == NULL) {
3004 3004 bf.l_pid = curproc->p_pid;
3005 3005 bf.l_sysid = 0;
3006 3006 } else {
3007 3007 bf.l_pid = ct->cc_pid;
3008 3008 bf.l_sysid = ct->cc_sysid;
3009 3009 }
3010 3010 i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
3011 3011 if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
3012 3012 bf.l_type != F_UNLCK)
3013 3013 error = i ? i : EAGAIN;
3014 3014 return (error);
3015 3015 }
3016 3016
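From userland, chklock() is visible only on files with mandatory locking enabled: read(2) or write(2) over a region locked by another process either blocks or, when the descriptor has FNDELAY/FNONBLOCK set, fails with EAGAIN. A sketch of the non-blocking case follows, assuming the (illustrative) file already has mandatory locking enabled and is write-locked by another process.

/*
 * Sketch of the userland-visible effect of chklock(): assumes
 * "/tmp/mand_demo" already has mandatory locking enabled (setgid set,
 * group execute clear) and is write-locked elsewhere.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char c = 'x';
	int fd = open("/tmp/mand_demo", O_WRONLY | O_NONBLOCK);

	if (fd == -1) {
		perror("open");
		return (1);
	}
	if (write(fd, &c, 1) == -1 && errno == EAGAIN) {
		/* chklock() found a conflicting record lock */
		(void) printf("write blocked by a mandatory lock\n");
	}
	(void) close(fd);
	return (0);
}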
3017 3017 /*
3018 3018 * convoff - converts the given data (start, whence) to the
3019 3019 * given whence.
3020 3020 */
3021 3021 int
3022 3022 convoff(vp, lckdat, whence, offset)
3023 3023 struct vnode *vp;
3024 3024 struct flock64 *lckdat;
3025 3025 int whence;
3026 3026 offset_t offset;
3027 3027 {
3028 3028 int error;
3029 3029 struct vattr vattr;
3030 3030
3031 3031 if ((lckdat->l_whence == 2) || (whence == 2)) {
3032 3032 vattr.va_mask = AT_SIZE;
3033 3033 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
3034 3034 return (error);
3035 3035 }
3036 3036
3037 3037 switch (lckdat->l_whence) {
3038 3038 case 1:
3039 3039 lckdat->l_start += offset;
3040 3040 break;
3041 3041 case 2:
3042 3042 lckdat->l_start += vattr.va_size;
3043 3043 /* FALLTHRU */
3044 3044 case 0:
3045 3045 break;
3046 3046 default:
3047 3047 return (EINVAL);
3048 3048 }
3049 3049
3050 3050 if (lckdat->l_start < 0)
3051 3051 return (EINVAL);
3052 3052
3053 3053 switch (whence) {
3054 3054 case 1:
3055 3055 lckdat->l_start -= offset;
3056 3056 break;
3057 3057 case 2:
3058 3058 lckdat->l_start -= vattr.va_size;
3059 3059 /* FALLTHRU */
3060 3060 case 0:
3061 3061 break;
3062 3062 default:
3063 3063 return (EINVAL);
3064 3064 }
3065 3065
3066 3066 lckdat->l_whence = (short)whence;
3067 3067 return (0);
3068 3068 }
3069 3069
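convoff() rewrites the (l_start, l_whence) pair of a struct flock64 relative to a new whence, consulting the file size only when SEEK_END is involved. A userland sketch of the same conversion to SEEK_SET follows; conv_to_set() is an illustrative helper, not a system interface.

/*
 * Userland sketch of the whence normalization performed by convoff();
 * conv_to_set() is illustrative only.
 */
#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>
#include <fcntl.h>

/* Rewrite lk so that l_whence == SEEK_SET, like convoff(vp, lk, 0, off). */
int
conv_to_set(int fd, struct flock *lk, off_t cur_offset)
{
	struct stat st;

	switch (lk->l_whence) {
	case SEEK_SET:
		break;
	case SEEK_CUR:
		lk->l_start += cur_offset;
		break;
	case SEEK_END:
		if (fstat(fd, &st) == -1)
			return (errno);
		lk->l_start += st.st_size;
		break;
	default:
		return (EINVAL);
	}
	if (lk->l_start < 0)
		return (EINVAL);
	lk->l_whence = SEEK_SET;
	return (0);
}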
3070 3070
3071 3071 /* proc_graph function definitions */
3072 3072
3073 3073 /*
3074 3074 * Check for a deadlock due to the new 'lock'. If a deadlock is found,
3075 3075 * the edges of this lock are freed and 1 is returned.
3076 3076 */
3077 3077
3078 3078 static int
3079 3079 flk_check_deadlock(lock_descriptor_t *lock)
3080 3080 {
3081 3081 proc_vertex_t *start_vertex, *pvertex;
3082 3082 proc_vertex_t *dvertex;
3083 3083 proc_edge_t *pep, *ppep;
3084 3084 edge_t *ep, *nep;
3085 3085 proc_vertex_t *process_stack;
3086 3086
3087 3087 /*
3088 3088 * OFD style locks are not associated with any process so there is
3089 3089 * no proc graph for these. Thus we cannot, and do not, do deadlock
3090 3090 * detection.
3091 3091 */
3092 3092 if (lock->l_ofd != NULL)
3093 3093 return (0);
3094 3094
3095 3095 STACK_INIT(process_stack);
3096 3096
3097 3097 mutex_enter(&flock_lock);
3098 3098 start_vertex = flk_get_proc_vertex(lock);
3099 3099 ASSERT(start_vertex != NULL);
3100 3100
3101 3101 /* construct the edges from this process to other processes */
3102 3102
3103 3103 ep = FIRST_ADJ(lock);
3104 3104 while (ep != HEAD(lock)) {
3105 3105 proc_vertex_t *adj_proc;
3106 3106
3107 3107 adj_proc = flk_get_proc_vertex(ep->to_vertex);
3108 3108 for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
3109 3109 if (pep->to_proc == adj_proc) {
3110 3110 ASSERT(pep->refcount);
3111 3111 pep->refcount++;
3112 3112 break;
3113 3113 }
3114 3114 }
3115 3115 if (pep == NULL) {
3116 3116 pep = flk_get_proc_edge();
3117 3117 pep->to_proc = adj_proc;
3118 3118 pep->refcount = 1;
3119 3119 adj_proc->incount++;
3120 3120 pep->next = start_vertex->edge;
3121 3121 start_vertex->edge = pep;
3122 3122 }
3123 3123 ep = NEXT_ADJ(ep);
3124 3124 }
3125 3125
3126 3126 ep = FIRST_IN(lock);
3127 3127
3128 3128 while (ep != HEAD(lock)) {
3129 3129 proc_vertex_t *in_proc;
3130 3130
3131 3131 in_proc = flk_get_proc_vertex(ep->from_vertex);
3132 3132
3133 3133 for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
3134 3134 if (pep->to_proc == start_vertex) {
3135 3135 ASSERT(pep->refcount);
3136 3136 pep->refcount++;
3137 3137 break;
3138 3138 }
3139 3139 }
3140 3140 if (pep == NULL) {
3141 3141 pep = flk_get_proc_edge();
3142 3142 pep->to_proc = start_vertex;
3143 3143 pep->refcount = 1;
3144 3144 start_vertex->incount++;
3145 3145 pep->next = in_proc->edge;
3146 3146 in_proc->edge = pep;
3147 3147 }
3148 3148 ep = NEXT_IN(ep);
3149 3149 }
3150 3150
3151 3151 if (start_vertex->incount == 0) {
3152 3152 mutex_exit(&flock_lock);
3153 3153 return (0);
3154 3154 }
3155 3155
3156 3156 flk_proc_graph_uncolor();
3157 3157
3158 3158 start_vertex->p_sedge = start_vertex->edge;
3159 3159
3160 3160 STACK_PUSH(process_stack, start_vertex, p_stack);
3161 3161
3162 3162 while ((pvertex = STACK_TOP(process_stack)) != NULL) {
3163 3163 for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
3164 3164 dvertex = pep->to_proc;
3165 3165 if (!PROC_ARRIVED(dvertex)) {
3166 3166 STACK_PUSH(process_stack, dvertex, p_stack);
3167 3167 dvertex->p_sedge = dvertex->edge;
3168 3168 PROC_ARRIVE(pvertex);
3169 3169 pvertex->p_sedge = pep->next;
3170 3170 break;
3171 3171 }
3172 3172 if (!PROC_DEPARTED(dvertex))
3173 3173 goto deadlock;
3174 3174 }
3175 3175 if (pep == NULL) {
3176 3176 PROC_DEPART(pvertex);
3177 3177 STACK_POP(process_stack, p_stack);
3178 3178 }
3179 3179 }
3180 3180 mutex_exit(&flock_lock);
3181 3181 return (0);
3182 3182
3183 3183 deadlock:
3184 3184
3185 3185 /* we remove all lock edges and proc edges */
3186 3186
3187 3187 ep = FIRST_ADJ(lock);
3188 3188 while (ep != HEAD(lock)) {
3189 3189 proc_vertex_t *adj_proc;
3190 3190 adj_proc = flk_get_proc_vertex(ep->to_vertex);
3191 3191 nep = NEXT_ADJ(ep);
3192 3192 IN_LIST_REMOVE(ep);
3193 3193 ADJ_LIST_REMOVE(ep);
3194 3194 flk_free_edge(ep);
3195 3195 ppep = start_vertex->edge;
3196 3196 for (pep = start_vertex->edge; pep != NULL; ppep = pep,
3197 3197 pep = ppep->next) {
3198 3198 if (pep->to_proc == adj_proc) {
3199 3199 pep->refcount--;
3200 3200 if (pep->refcount == 0) {
3201 3201 if (pep == ppep) {
3202 3202 start_vertex->edge = pep->next;
3203 3203 } else {
3204 3204 ppep->next = pep->next;
3205 3205 }
3206 3206 adj_proc->incount--;
3207 3207 flk_proc_release(adj_proc);
3208 3208 flk_free_proc_edge(pep);
3209 3209 }
3210 3210 break;
3211 3211 }
3212 3212 }
3213 3213 ep = nep;
3214 3214 }
3215 3215 ep = FIRST_IN(lock);
3216 3216 while (ep != HEAD(lock)) {
3217 3217 proc_vertex_t *in_proc;
3218 3218 in_proc = flk_get_proc_vertex(ep->from_vertex);
3219 3219 nep = NEXT_IN(ep);
3220 3220 IN_LIST_REMOVE(ep);
3221 3221 ADJ_LIST_REMOVE(ep);
3222 3222 flk_free_edge(ep);
3223 3223 ppep = in_proc->edge;
3224 3224 for (pep = in_proc->edge; pep != NULL; ppep = pep,
3225 3225 pep = ppep->next) {
3226 3226 if (pep->to_proc == start_vertex) {
3227 3227 pep->refcount--;
3228 3228 if (pep->refcount == 0) {
3229 3229 if (pep == ppep) {
3230 3230 in_proc->edge = pep->next;
3231 3231 } else {
3232 3232 ppep->next = pep->next;
3233 3233 }
3234 3234 start_vertex->incount--;
3235 3235 flk_proc_release(in_proc);
3236 3236 flk_free_proc_edge(pep);
3237 3237 }
3238 3238 break;
3239 3239 }
3240 3240 }
3241 3241 ep = nep;
3242 3242 }
3243 3243 flk_proc_release(start_vertex);
3244 3244 mutex_exit(&flock_lock);
3245 3245 return (1);
3246 3246 }
3247 3247
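When this routine finds a cycle, the request that would have completed it fails and the blocked fcntl(F_SETLKW) caller gets EDEADLK instead of sleeping forever. A two-process sketch follows; the sleep()-based ordering is deliberately crude and the path is illustrative only.

/*
 * Two-process sketch of the deadlock that flk_check_deadlock() breaks;
 * ordering via sleep() is intentionally crude.
 */
#include <sys/types.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int
lockbyte(int fd, off_t off, int cmd)	/* cmd: F_SETLK or F_SETLKW */
{
	struct flock fl;

	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_SET;
	fl.l_start = off;
	fl.l_len = 1;
	return (fcntl(fd, cmd, &fl));
}

int
main(void)
{
	int fd = open("/tmp/flk_demo", O_RDWR | O_CREAT, 0644);

	if (fork() == 0) {				/* child */
		(void) lockbyte(fd, 1, F_SETLK);	/* hold byte 1 */
		(void) sleep(2);
		/* This request completes the cycle, so the child fails. */
		if (lockbyte(fd, 0, F_SETLKW) == -1 && errno == EDEADLK)
			(void) printf("child: EDEADLK as expected\n");
		_exit(0);
	}
	(void) lockbyte(fd, 0, F_SETLK);		/* parent holds byte 0 */
	(void) sleep(1);
	(void) lockbyte(fd, 1, F_SETLKW);		/* blocks on the child */
	return (0);
}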
3248 3248 /*
3249 3249 * Get a proc vertex. If the lock's pvertex value indexes a matching proc
3250 3250 * vertex in the list we return that; otherwise we allocate one. If
3251 3251 * necessary, we also grow the list of vertices.
3252 3252 */
3253 3253
3254 3254 static proc_vertex_t *
3255 3255 flk_get_proc_vertex(lock_descriptor_t *lock)
3256 3256 {
3257 3257 int i;
3258 3258 proc_vertex_t *pv;
3259 3259 proc_vertex_t **palloc;
3260 3260
3261 3261 ASSERT(MUTEX_HELD(&flock_lock));
3262 3262 if (lock->pvertex != -1) {
3263 3263 ASSERT(lock->pvertex >= 0);
3264 3264 pv = pgraph.proc[lock->pvertex];
3265 3265 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
3266 3266 return (pv);
3267 3267 }
3268 3268 }
3269 3269 for (i = 0; i < pgraph.gcount; i++) {
3270 3270 pv = pgraph.proc[i];
3271 3271 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
3272 3272 lock->pvertex = pv->index = i;
3273 3273 return (pv);
3274 3274 }
3275 3275 }
3276 3276 pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
3277 3277 pv->pid = lock->l_flock.l_pid;
3278 3278 pv->sysid = lock->l_flock.l_sysid;
3279 3279 flk_proc_vertex_allocs++;
3280 3280 if (pgraph.free != 0) {
3281 3281 for (i = 0; i < pgraph.gcount; i++) {
3282 3282 if (pgraph.proc[i] == NULL) {
3283 3283 pgraph.proc[i] = pv;
3284 3284 lock->pvertex = pv->index = i;
3285 3285 pgraph.free--;
3286 3286 return (pv);
3287 3287 }
3288 3288 }
3289 3289 }
3290 3290 palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
3291 3291 sizeof (proc_vertex_t *), KM_SLEEP);
3292 3292
3293 3293 if (pgraph.proc) {
3294 3294 bcopy(pgraph.proc, palloc,
3295 3295 pgraph.gcount * sizeof (proc_vertex_t *));
3296 3296
3297 3297 kmem_free(pgraph.proc,
3298 3298 pgraph.gcount * sizeof (proc_vertex_t *));
3299 3299 }
3300 3300 pgraph.proc = palloc;
3301 3301 pgraph.free += (PROC_CHUNK - 1);
3302 3302 pv->index = lock->pvertex = pgraph.gcount;
3303 3303 pgraph.gcount += PROC_CHUNK;
3304 3304 pgraph.proc[pv->index] = pv;
3305 3305 return (pv);
3306 3306 }
3307 3307
3308 3308 /*
3309 3309 * Allocate a proc edge.
3310 3310 */
3311 3311
3312 3312 static proc_edge_t *
3313 3313 flk_get_proc_edge()
3314 3314 {
3315 3315 proc_edge_t *pep;
3316 3316
3317 3317 pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
3318 3318 flk_proc_edge_allocs++;
3319 3319 return (pep);
3320 3320 }
3321 3321
3322 3322 /*
3323 3323 * Free the proc edge. Called whenever its reference count goes to zero.
3324 3324 */
3325 3325
3326 3326 static void
3327 3327 flk_free_proc_edge(proc_edge_t *pep)
3328 3328 {
3329 3329 ASSERT(pep->refcount == 0);
3330 3330 kmem_free((void *)pep, sizeof (proc_edge_t));
3331 3331 flk_proc_edge_frees++;
3332 3332 }
3333 3333
3334 3334 /*
3335 3335 * Uncolor the proc graph; explicit reset happens only when the mark value wraps.
3336 3336 */
3337 3337
3338 3338 static void
3339 3339 flk_proc_graph_uncolor()
3340 3340 {
3341 3341 int i;
3342 3342
3343 3343 if (pgraph.mark == UINT_MAX) {
3344 3344 for (i = 0; i < pgraph.gcount; i++)
3345 3345 if (pgraph.proc[i] != NULL) {
3346 3346 pgraph.proc[i]->atime = 0;
3347 3347 pgraph.proc[i]->dtime = 0;
3348 3348 }
3349 3349 pgraph.mark = 1;
3350 3350 } else {
3351 3351 pgraph.mark++;
3352 3352 }
3353 3353 }
3354 3354
3355 3355 /*
3356 3356 * Release the proc vertex iff it has neither in edges nor out edges.
3357 3357 */
3358 3358
3359 3359 static void
3360 3360 flk_proc_release(proc_vertex_t *proc)
3361 3361 {
3362 3362 ASSERT(MUTEX_HELD(&flock_lock));
3363 3363 if (proc->edge == NULL && proc->incount == 0) {
3364 3364 pgraph.proc[proc->index] = NULL;
3365 3365 pgraph.free++;
3366 3366 kmem_free(proc, sizeof (proc_vertex_t));
3367 3367 flk_proc_vertex_frees++;
3368 3368 }
3369 3369 }
3370 3370
3371 3371 /*
3372 3372 * Update the process graph to reflect a change in a lock graph.
3373 3373 * Note: We should call this function only after we have a correctly
3374 3374 * recomputed lock graph; otherwise we might miss a deadlock detection.
3375 3375 * E.g., in flk_relation() we call this function after flk_recompute_
3376 3376 * dependencies(); otherwise, if a process tried to lock a vnode hashed
3377 3377 * into another graph, it might sleep forever.
3378 3378 */
3379 3379
3380 3380 static void
3381 3381 flk_update_proc_graph(edge_t *ep, int delete)
3382 3382 {
3383 3383 proc_vertex_t *toproc, *fromproc;
3384 3384 proc_edge_t *pep, *prevpep;
3385 3385
3386 3386 mutex_enter(&flock_lock);
3387 3387
3388 3388 /*
3389 3389 * OFD style locks are not associated with any process so there is
3390 3390 * no proc graph for these.
3391 3391 */
3392 3392 if (ep->from_vertex->l_ofd != NULL) {
3393 3393 mutex_exit(&flock_lock);
3394 3394 return;
3395 3395 }
3396 3396
3397 3397 toproc = flk_get_proc_vertex(ep->to_vertex);
3398 3398 fromproc = flk_get_proc_vertex(ep->from_vertex);
3399 3399
3400 3400 if (!delete)
3401 3401 goto add;
3402 3402 pep = prevpep = fromproc->edge;
3403 3403
3404 3404 ASSERT(pep != NULL);
3405 3405 while (pep != NULL) {
3406 3406 if (pep->to_proc == toproc) {
3407 3407 ASSERT(pep->refcount > 0);
3408 3408 pep->refcount--;
3409 3409 if (pep->refcount == 0) {
3410 3410 if (pep == prevpep) {
3411 3411 fromproc->edge = pep->next;
3412 3412 } else {
3413 3413 prevpep->next = pep->next;
3414 3414 }
3415 3415 toproc->incount--;
3416 3416 flk_proc_release(toproc);
3417 3417 flk_free_proc_edge(pep);
3418 3418 }
3419 3419 break;
3420 3420 }
3421 3421 prevpep = pep;
3422 3422 pep = pep->next;
3423 3423 }
3424 3424 flk_proc_release(fromproc);
3425 3425 mutex_exit(&flock_lock);
3426 3426 return;
3427 3427 add:
3428 3428
3429 3429 pep = fromproc->edge;
3430 3430
3431 3431 while (pep != NULL) {
3432 3432 if (pep->to_proc == toproc) {
3433 3433 ASSERT(pep->refcount > 0);
3434 3434 pep->refcount++;
3435 3435 break;
3436 3436 }
3437 3437 pep = pep->next;
3438 3438 }
3439 3439 if (pep == NULL) {
3440 3440 pep = flk_get_proc_edge();
3441 3441 pep->to_proc = toproc;
3442 3442 pep->refcount = 1;
3443 3443 toproc->incount++;
3444 3444 pep->next = fromproc->edge;
3445 3445 fromproc->edge = pep;
3446 3446 }
3447 3447 mutex_exit(&flock_lock);
3448 3448 }
3449 3449
3450 3450 /*
3451 3451 * Set the control status for lock manager requests.
3452 3452 *
3453 3453 */
3454 3454
3455 3455 /*
3456 3456 * PSARC case 1997/292
3457 3457 *
3458 3458 * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3459 3459 * Effects: Set the state of the NLM server identified by "nlmid"
3460 3460 * in the NLM registry to state "nlm_state."
3461 3461 * Raises exception no_such_nlm if "nlmid" doesn't identify a known
3462 3462 * NLM server to this LLM.
3463 3463 * Note that when this routine is called with NLM_SHUTTING_DOWN there
3464 3464 * may be locks requests that have gotten started but not finished. In
3465 3465 * particular, there may be blocking requests that are in the callback code
3466 3466 * before sleeping (so they're not holding the lock for the graph). If
3467 3467 * such a thread reacquires the graph's lock (to go to sleep) after
3468 3468 * NLM state in the NLM registry is set to a non-up value,
3469 3469 * it will notice the status and bail out. If the request gets
3470 3470 * granted before the thread can check the NLM registry, let it
3471 3471 * continue normally. It will get flushed when we are called with NLM_DOWN.
3472 3472 *
3473 3473 * Modifies: nlm_reg_obj (global)
3474 3474 * Arguments:
3475 3475 * nlmid (IN): id uniquely identifying an NLM server
3476 3476 * nlm_state (IN): NLM server state to change "nlmid" to
3477 3477 */
3478 3478 void
3479 3479 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3480 3480 {
3481 3481 /*
3482 3482 * Check to see if node is booted as a cluster. If not, return.
3483 3483 */
3484 3484 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3485 3485 return;
3486 3486 }
3487 3487
3488 3488 /*
3489 3489 * Check for development/debugging. It is possible to boot a node
3490 3490 * in non-cluster mode, and then run a special script, currently
3491 3491 * available only to developers, to bring up the node as part of a
3492 3492 * cluster. The problem is that running such a script does not
3493 3493 * result in the routine flk_init() being called and hence global array
3494 3494 * nlm_reg_status is NULL. The NLM thinks it's in cluster mode,
3495 3495 * but the LLM needs to do an additional check to see if the global
3496 3496 * array has been created or not. If nlm_reg_status is NULL, then
3497 3497 * return, else continue.
3498 3498 */
3499 3499 if (nlm_reg_status == NULL) {
3500 3500 return;
3501 3501 }
3502 3502
3503 3503 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3504 3504 mutex_enter(&nlm_reg_lock);
3505 3505
3506 3506 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3507 3507 /*
3508 3508 * If the NLM server "nlmid" is unknown in the NLM registry,
3509 3509 * add it to the registry in the nlm shutting down state.
3510 3510 */
3511 3511 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3512 3512 FLK_NLM_SHUTTING_DOWN);
3513 3513 } else {
3514 3514 /*
3515 3515 * Change the state of the NLM server identified by "nlmid"
3516 3516 * in the NLM registry to the argument "nlm_state."
3517 3517 */
3518 3518 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3519 3519 nlm_state);
3520 3520 }
3521 3521
3522 3522 /*
3523 3523 * The reason we must register the NLM server that is shutting down
3524 3524 * with an LLM that doesn't already know about it (never sent a lock
3525 3525 * request) is to handle correctly a race between shutdown and a new
3526 3526 * lock request. Suppose that a shutdown request from the NLM server
3527 3527 * invokes this routine at the LLM, and a thread is spawned to
3528 3528 * service the request. Now suppose a new lock request is in
3529 3529 * progress and has already passed the first line of defense in
3530 3530  * reclock(), which denies new lock requests from NLM servers
3531 3531 * that are not in the NLM_UP state. After the current routine
3532 3532 * is invoked for both phases of shutdown, the routine will return,
3533 3533 * having done nothing, and the lock request will proceed and
3534 3534 * probably be granted. The problem is that the shutdown was ignored
3535 3535 * by the lock request because there was no record of that NLM server
3536 3536 * shutting down. We will be in the peculiar position of thinking
3537 3537  * that we've shut down the NLM server and all locks at all LLMs have
3538 3538 * been discarded, but in fact there's still one lock held.
3539 3539 * The solution is to record the existence of NLM server and change
3540 3540 * its state immediately to NLM_SHUTTING_DOWN. The lock request in
3541 3541 * progress may proceed because the next phase NLM_DOWN will catch
3542 3542 * this lock and discard it.
3543 3543 */
3544 3544 mutex_exit(&nlm_reg_lock);
3545 3545
3546 3546 switch (nlm_state) {
3547 3547 case FLK_NLM_UP:
3548 3548 /*
3549 3549 * Change the NLM state of all locks still held on behalf of
3550 3550 * the NLM server identified by "nlmid" to NLM_UP.
3551 3551 */
3552 3552 cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3553 3553 break;
3554 3554
3555 3555 case FLK_NLM_SHUTTING_DOWN:
3556 3556 /*
3557 3557 * Wake up all sleeping locks for the NLM server identified
3558 3558 * by "nlmid." Note that eventually all woken threads will
3559 3559 * have their lock requests cancelled and descriptors
3560 3560 * removed from the sleeping lock list. Note that the NLM
3561 3561 * server state associated with each lock descriptor is
3562 3562 * changed to FLK_NLM_SHUTTING_DOWN.
3563 3563 */
3564 3564 cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3565 3565 break;
3566 3566
3567 3567 case FLK_NLM_DOWN:
3568 3568 /*
3569 3569 * Discard all active, granted locks for this NLM server
3570 3570 * identified by "nlmid."
3571 3571 */
3572 3572 cl_flk_unlock_nlm_granted(nlmid);
3573 3573 break;
3574 3574
3575 3575 default:
3576 3576 		panic("cl_flk_set_nlm_status: bad status (%d)", nlm_state);
3577 3577 }
3578 3578 }
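/*
 * Sketch of the shutdown/restart sequence described above ("nlmid" is
 * a hypothetical server id; the cluster framework that issues these
 * calls is not shown):
 *
 *	cl_flk_set_nlm_status(nlmid, FLK_NLM_SHUTTING_DOWN);
 *		sleeping requests for nlmid are woken so they can be
 *		cancelled; new requests notice the non-up state and bail
 *	cl_flk_set_nlm_status(nlmid, FLK_NLM_DOWN);
 *		any remaining granted locks for nlmid are discarded
 *	cl_flk_set_nlm_status(nlmid, FLK_NLM_UP);
 *		the server is back; locks held on its behalf are marked
 *		up again
 */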
3579 3579
3580 3580 /*
3581 3581 * Set the control status for lock manager requests.
3582 3582 *
3583 3583 * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3584 3584  * may be lock requests that have gotten started but not finished. In
3585 3585 * particular, there may be blocking requests that are in the callback code
3586 3586 * before sleeping (so they're not holding the lock for the graph). If
3587 3587 * such a thread reacquires the graph's lock (to go to sleep) after
3588 3588 * flk_lockmgr_status is set to a non-up value, it will notice the status
3589 3589 * and bail out. If the request gets granted before the thread can check
3590 3590 * flk_lockmgr_status, let it continue normally. It will get flushed when
3591 3591 * we are called with FLK_LOCKMGR_DOWN.
3592 3592 */
3593 3593
3594 3594 void
3595 3595 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3596 3596 {
3597 3597 int i;
3598 3598 graph_t *gp;
3599 3599 struct flock_globals *fg;
3600 3600
3601 3601 fg = flk_get_globals();
3602 3602 ASSERT(fg != NULL);
3603 3603
3604 3604 mutex_enter(&flock_lock);
3605 3605 fg->flk_lockmgr_status = status;
3606 3606 mutex_exit(&flock_lock);
3607 3607
3608 3608 /*
3609 3609 * If the lock manager is coming back up, all that's needed is to
3610 3610 * propagate this information to the graphs. If the lock manager
3611 3611 * is going down, additional action is required, and each graph's
3612 3612 * copy of the state is updated atomically with this other action.
3613 3613 */
3614 3614 switch (status) {
3615 3615 case FLK_LOCKMGR_UP:
3616 3616 for (i = 0; i < HASH_SIZE; i++) {
3617 3617 mutex_enter(&flock_lock);
3618 3618 gp = lock_graph[i];
3619 3619 mutex_exit(&flock_lock);
3620 3620 if (gp == NULL)
3621 3621 continue;
3622 3622 mutex_enter(&gp->gp_mutex);
3623 3623 fg->lockmgr_status[i] = status;
3624 3624 mutex_exit(&gp->gp_mutex);
3625 3625 }
3626 3626 break;
3627 3627 case FLK_WAKEUP_SLEEPERS:
3628 3628 wakeup_sleeping_lockmgr_locks(fg);
3629 3629 break;
3630 3630 case FLK_LOCKMGR_DOWN:
3631 3631 unlock_lockmgr_granted(fg);
3632 3632 break;
3633 3633 default:
3634 3634 panic("flk_set_lockmgr_status: bad status (%d)", status);
3635 3635 break;
3636 3636 }
3637 3637 }
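/*
 * A minimal sketch of the calling sequence implied above (the lock
 * manager start/stop code that actually drives this is not shown):
 *
 *	flk_set_lockmgr_status(FLK_WAKEUP_SLEEPERS);
 *		wake blocked lock-manager requests so they can be
 *		cancelled
 *	flk_set_lockmgr_status(FLK_LOCKMGR_DOWN);
 *		discard all granted lock-manager locks
 *	flk_set_lockmgr_status(FLK_LOCKMGR_UP);
 *		the lock manager is serving requests again
 */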
3638 3638
3639 3639 /*
3640 3640 * This routine returns all the locks that are active or sleeping and are
3641 3641 * associated with a particular set of identifiers. If lock_state != 0, then
3642 3642 * only locks that match the lock_state are returned. If lock_state == 0, then
3643 3643 * all locks are returned. If pid == NOPID, the pid is ignored. If
3644 3644 * use_sysid is FALSE, then the sysid is ignored. If vp is NULL, then the
3645 3645 * vnode pointer is ignored.
3646 3646 *
3647 3647 * A list containing the vnode pointer and an flock structure
3648 3648 * describing the lock is returned. Each element in the list is
3649 3649 * dynamically allocated and must be freed by the caller. The
3650 3650 * last item in the list is denoted by a NULL value in the ll_next
3651 3651 * field.
3652 3652 *
3653 3653 * The vnode pointers returned are held. The caller is responsible
3654 3654 * for releasing these. Note that the returned list is only a snapshot of
3655 3655 * the current lock information, and that it is a snapshot of a moving
3656 3656 * target (only one graph is locked at a time).
3657 3657 */
3658 3658
3659 3659 locklist_t *
3660 3660 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3661 3661 pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3662 3662 {
3663 3663 lock_descriptor_t *lock;
3664 3664 lock_descriptor_t *graph_head;
3665 3665 locklist_t listhead;
3666 3666 locklist_t *llheadp;
3667 3667 locklist_t *llp;
3668 3668 locklist_t *lltp;
3669 3669 graph_t *gp;
3670 3670 int i;
3671 3671 int first_index; /* graph index */
3672 3672 int num_indexes; /* graph index */
3673 3673
3674 3674 ASSERT((list_type == FLK_ACTIVE_STATE) ||
3675 3675 (list_type == FLK_SLEEPING_STATE));
3676 3676
3677 3677 /*
3678 3678 * Get a pointer to something to use as a list head while building
3679 3679 * the rest of the list.
3680 3680 */
3681 3681 llheadp = &listhead;
3682 3682 lltp = llheadp;
3683 3683 llheadp->ll_next = (locklist_t *)NULL;
3684 3684
3685 3685 /* Figure out which graphs we want to look at. */
3686 3686 if (vp == NULL) {
3687 3687 first_index = 0;
3688 3688 num_indexes = HASH_SIZE;
3689 3689 } else {
3690 3690 first_index = HASH_INDEX(vp);
3691 3691 num_indexes = 1;
3692 3692 }
3693 3693
3694 3694 for (i = first_index; i < first_index + num_indexes; i++) {
3695 3695 mutex_enter(&flock_lock);
3696 3696 gp = lock_graph[i];
3697 3697 mutex_exit(&flock_lock);
3698 3698 if (gp == NULL) {
3699 3699 continue;
3700 3700 }
3701 3701
3702 3702 mutex_enter(&gp->gp_mutex);
3703 3703 graph_head = (list_type == FLK_ACTIVE_STATE) ?
3704 3704 ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3705 3705 for (lock = graph_head->l_next;
3706 3706 lock != graph_head;
3707 3707 lock = lock->l_next) {
3708 3708 if (use_sysid && lock->l_flock.l_sysid != sysid)
3709 3709 continue;
3710 3710 if (pid != NOPID && lock->l_flock.l_pid != pid)
3711 3711 continue;
3712 3712 if (vp != NULL && lock->l_vnode != vp)
3713 3713 continue;
3714 3714 if (lock_state && !(lock_state & lock->l_state))
3715 3715 continue;
3716 3716 if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3717 3717 continue;
3718 3718 /*
3719 3719 * A matching lock was found. Allocate
3720 3720 * space for a new locklist entry and fill
3721 3721 * it in.
3722 3722 */
3723 3723 llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3724 3724 lltp->ll_next = llp;
3725 3725 VN_HOLD(lock->l_vnode);
3726 3726 llp->ll_vp = lock->l_vnode;
3727 3727 create_flock(lock, &(llp->ll_flock));
3728 3728 llp->ll_next = (locklist_t *)NULL;
3729 3729 lltp = llp;
3730 3730 }
3731 3731 mutex_exit(&gp->gp_mutex);
3732 3732 }
3733 3733
3734 3734 llp = llheadp->ll_next;
3735 3735 return (llp);
3736 3736 }
3737 3737
3738 3738 /*
3739 3739 * These two functions are simply interfaces to get_lock_list. They return
3740 3740 * a list of sleeping or active locks for the given sysid and pid. See
3741 3741 * get_lock_list for details.
3742 3742 *
3743 3743 * In either case we don't particularly care to specify the zone of interest;
3744 3744 * the sysid-space is global across zones, so the sysid will map to exactly one
3745 3745 * zone, and we'll return information for that zone.
3746 3746 */
3747 3747
3748 3748 locklist_t *
3749 3749 flk_get_sleeping_locks(int sysid, pid_t pid)
3750 3750 {
3751 3751 return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
3752 3752 ALL_ZONES));
3753 3753 }
3754 3754
3755 3755 locklist_t *
3756 3756 flk_get_active_locks(int sysid, pid_t pid)
3757 3757 {
3758 3758 return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
3759 3759 ALL_ZONES));
3760 3760 }
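/*
 * Usage sketch for the wrappers above ("sysid" and "pid" are
 * hypothetical values).  Each returned entry holds its vnode, so the
 * caller must release the list with flk_free_locklist():
 *
 *	locklist_t *llp, *lp;
 *
 *	llp = flk_get_active_locks(sysid, pid);
 *	for (lp = llp; lp != NULL; lp = lp->ll_next) {
 *		... examine lp->ll_vp and lp->ll_flock ...
 *	}
 *	flk_free_locklist(llp);		releases the vnode holds and
 *					frees every element
 */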
3761 3761
3762 3762 /*
3763 3763 * Another interface to get_lock_list. This one returns all the active
3764 3764 * locks for a given vnode. Again, see get_lock_list for details.
3765 3765 *
3766 3766 * We don't need to specify which zone's locks we're interested in. The matter
3767 3767 * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
3768 3768 * be used by multiple zones, so the list of locks will all be from the right
3769 3769 * zone.
3770 3770 */
3771 3771
3772 3772 locklist_t *
3773 3773 flk_active_locks_for_vp(const vnode_t *vp)
3774 3774 {
3775 3775 return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
3776 3776 ALL_ZONES));
3777 3777 }
3778 3778
3779 3779 /*
3780 3780 * Another interface to get_lock_list. This one returns all the active
3781 3781 * nbmand locks for a given vnode. Again, see get_lock_list for details.
3782 3782 *
3783 3783 * See the comment for flk_active_locks_for_vp() for why we don't care to
3784 3784 * specify the particular zone of interest.
3785 3785 */
3786 3786 locklist_t *
3787 3787 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
3788 3788 {
3789 3789 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3790 3790 NOPID, vp, ALL_ZONES));
3791 3791 }
3792 3792
3793 3793 /*
3794 3794 * Another interface to get_lock_list. This one returns all the active
3795 3795 * nbmand locks for a given pid. Again, see get_lock_list for details.
3796 3796 *
3797 3797 * The zone doesn't need to be specified here; the locks held by a
3798 3798 * particular process will either be local (ie, non-NFS) or from the zone
3799 3799 * the process is executing in. This is because other parts of the system
3800 3800 * ensure that an NFS vnode can't be used in a zone other than that in
3801 3801 * which it was opened.
3802 3802 */
3803 3803 locklist_t *
3804 3804 flk_active_nbmand_locks(pid_t pid)
3805 3805 {
3806 3806 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3807 3807 pid, NULL, ALL_ZONES));
3808 3808 }
3809 3809
3810 3810 /*
3811 3811 * Free up all entries in the locklist.
3812 3812 */
3813 3813 void
3814 3814 flk_free_locklist(locklist_t *llp)
3815 3815 {
3816 3816 locklist_t *next_llp;
3817 3817
3818 3818 while (llp) {
3819 3819 next_llp = llp->ll_next;
3820 3820 VN_RELE(llp->ll_vp);
3821 3821 kmem_free(llp, sizeof (*llp));
3822 3822 llp = next_llp;
3823 3823 }
3824 3824 }
3825 3825
3826 3826 static void
3827 3827 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
3828 3828 {
3829 3829 /*
3830 3830 * For each graph "lg" in the hash table lock_graph do
3831 3831 * a. Get the list of sleeping locks
3832 3832 * b. For each lock descriptor in the list do
3833 3833 * i. If the requested lock is an NLM server request AND
3834 3834 * the nlmid is the same as the routine argument then
3835 3835 * change the lock descriptor's state field to
3836 3836 * "nlm_state."
3837 3837 * c. Get the list of active locks
3838 3838 * d. For each lock descriptor in the list do
3839 3839 * i. If the requested lock is an NLM server request AND
3840 3840 * the nlmid is the same as the routine argument then
3841 3841 * change the lock descriptor's state field to
3842 3842 * "nlm_state."
3843 3843 */
3844 3844
3845 3845 int i;
3846 3846 graph_t *gp; /* lock graph */
3847 3847 lock_descriptor_t *lock; /* lock */
3848 3848 lock_descriptor_t *nlock = NULL; /* next lock */
3849 3849 int lock_nlmid;
3850 3850
3851 3851 for (i = 0; i < HASH_SIZE; i++) {
3852 3852 mutex_enter(&flock_lock);
3853 3853 gp = lock_graph[i];
3854 3854 mutex_exit(&flock_lock);
3855 3855 if (gp == NULL) {
3856 3856 continue;
3857 3857 }
3858 3858
3859 3859 /* Get list of sleeping locks in current lock graph. */
3860 3860 mutex_enter(&gp->gp_mutex);
3861 3861 for (lock = SLEEPING_HEAD(gp)->l_next;
3862 3862 lock != SLEEPING_HEAD(gp);
3863 3863 lock = nlock) {
3864 3864 nlock = lock->l_next;
3865 3865 /* get NLM id */
3866 3866 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3867 3867
3868 3868 /*
3869 3869 * If NLM server request AND nlmid of lock matches
3870 3870 * nlmid of argument, then set the NLM state of the
3871 3871 * lock to "nlm_state."
3872 3872 */
3873 3873 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3874 3874 SET_NLM_STATE(lock, nlm_state);
3875 3875 }
3876 3876 }
3877 3877
3878 3878 /* Get list of active locks in current lock graph. */
3879 3879 for (lock = ACTIVE_HEAD(gp)->l_next;
3880 3880 lock != ACTIVE_HEAD(gp);
3881 3881 lock = nlock) {
3882 3882 nlock = lock->l_next;
3883 3883 /* get NLM id */
3884 3884 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3885 3885
3886 3886 /*
3887 3887 * If NLM server request AND nlmid of lock matches
3888 3888 * nlmid of argument, then set the NLM state of the
3889 3889 * lock to "nlm_state."
3890 3890 */
3891 3891 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3892 3892 ASSERT(IS_ACTIVE(lock));
3893 3893 SET_NLM_STATE(lock, nlm_state);
3894 3894 }
3895 3895 }
3896 3896 mutex_exit(&gp->gp_mutex);
3897 3897 }
3898 3898 }
3899 3899
3900 3900 /*
3901 3901 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
3902 3902 * Effects: Find all sleeping lock manager requests _only_ for the NLM server
3903 3903 * identified by "nlmid." Poke those lock requests.
3904 3904 */
3905 3905 static void
3906 3906 cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
3907 3907 {
3908 3908 lock_descriptor_t *lock;
3909 3909 lock_descriptor_t *nlock = NULL; /* next lock */
3910 3910 int i;
3911 3911 graph_t *gp;
3912 3912 int lock_nlmid;
3913 3913
3914 3914 for (i = 0; i < HASH_SIZE; i++) {
3915 3915 mutex_enter(&flock_lock);
3916 3916 gp = lock_graph[i];
3917 3917 mutex_exit(&flock_lock);
3918 3918 if (gp == NULL) {
3919 3919 continue;
3920 3920 }
3921 3921
3922 3922 mutex_enter(&gp->gp_mutex);
3923 3923 for (lock = SLEEPING_HEAD(gp)->l_next;
3924 3924 lock != SLEEPING_HEAD(gp);
3925 3925 lock = nlock) {
3926 3926 nlock = lock->l_next;
3927 3927 /*
3928 3928 * If NLM server request _and_ nlmid of lock matches
3929 3929 * nlmid of argument, then set the NLM state of the
3930 3930 * lock to NLM_SHUTTING_DOWN, and wake up sleeping
3931 3931 * request.
3932 3932 */
3933 3933 if (IS_LOCKMGR(lock)) {
3934 3934 /* get NLM id */
3935 3935 lock_nlmid =
3936 3936 GETNLMID(lock->l_flock.l_sysid);
3937 3937 if (nlmid == lock_nlmid) {
3938 3938 SET_NLM_STATE(lock,
3939 3939 FLK_NLM_SHUTTING_DOWN);
3940 3940 INTERRUPT_WAKEUP(lock);
3941 3941 }
3942 3942 }
3943 3943 }
3944 3944 mutex_exit(&gp->gp_mutex);
3945 3945 }
3946 3946 }
3947 3947
3948 3948 /*
3949 3949 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
3950 3950 * Effects: Find all active (granted) lock manager locks _only_ for the
3951 3951 * NLM server identified by "nlmid" and release them.
3952 3952 */
3953 3953 static void
3954 3954 cl_flk_unlock_nlm_granted(int nlmid)
3955 3955 {
3956 3956 lock_descriptor_t *lock;
3957 3957 lock_descriptor_t *nlock = NULL; /* next lock */
3958 3958 int i;
3959 3959 graph_t *gp;
3960 3960 int lock_nlmid;
3961 3961
3962 3962 for (i = 0; i < HASH_SIZE; i++) {
3963 3963 mutex_enter(&flock_lock);
3964 3964 gp = lock_graph[i];
3965 3965 mutex_exit(&flock_lock);
3966 3966 if (gp == NULL) {
3967 3967 continue;
3968 3968 }
3969 3969
3970 3970 mutex_enter(&gp->gp_mutex);
3971 3971 for (lock = ACTIVE_HEAD(gp)->l_next;
3972 3972 lock != ACTIVE_HEAD(gp);
3973 3973 lock = nlock) {
3974 3974 nlock = lock->l_next;
3975 3975 ASSERT(IS_ACTIVE(lock));
3976 3976
3977 3977 /*
3978 3978 * If it's an NLM server request _and_ nlmid of
3979 3979 * the lock matches nlmid of argument, then
3980 3980 			 * remove the active lock from the list, wake up blocked
3981 3981 * threads, and free the storage for the lock.
3982 3982 * Note that there's no need to mark the NLM state
3983 3983 * of this lock to NLM_DOWN because the lock will
3984 3984 * be deleted anyway and its storage freed.
3985 3985 */
3986 3986 if (IS_LOCKMGR(lock)) {
3987 3987 /* get NLM id */
3988 3988 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3989 3989 if (nlmid == lock_nlmid) {
3990 3990 flk_delete_active_lock(lock, 0);
3991 3991 flk_wakeup(lock, 1);
3992 3992 flk_free_lock(lock);
3993 3993 }
3994 3994 }
3995 3995 }
3996 3996 mutex_exit(&gp->gp_mutex);
3997 3997 }
3998 3998 }
3999 3999
4000 4000 /*
4001 4001 * Find all sleeping lock manager requests and poke them.
4002 4002 */
4003 4003 static void
4004 4004 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
4005 4005 {
4006 4006 lock_descriptor_t *lock;
4007 4007 lock_descriptor_t *nlock = NULL; /* next lock */
4008 4008 int i;
4009 4009 graph_t *gp;
4010 4010 zoneid_t zoneid = getzoneid();
4011 4011
4012 4012 for (i = 0; i < HASH_SIZE; i++) {
4013 4013 mutex_enter(&flock_lock);
4014 4014 gp = lock_graph[i];
4015 4015 mutex_exit(&flock_lock);
4016 4016 if (gp == NULL) {
4017 4017 continue;
4018 4018 }
4019 4019
4020 4020 mutex_enter(&gp->gp_mutex);
4021 4021 fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
4022 4022 for (lock = SLEEPING_HEAD(gp)->l_next;
4023 4023 lock != SLEEPING_HEAD(gp);
4024 4024 lock = nlock) {
4025 4025 nlock = lock->l_next;
4026 4026 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
4027 4027 INTERRUPT_WAKEUP(lock);
4028 4028 }
4029 4029 }
4030 4030 mutex_exit(&gp->gp_mutex);
4031 4031 }
4032 4032 }
4033 4033
4034 4034
4035 4035 /*
4036 4036 * Find all active (granted) lock manager locks and release them.
4037 4037 */
4038 4038 static void
4039 4039 unlock_lockmgr_granted(struct flock_globals *fg)
4040 4040 {
4041 4041 lock_descriptor_t *lock;
4042 4042 lock_descriptor_t *nlock = NULL; /* next lock */
4043 4043 int i;
4044 4044 graph_t *gp;
4045 4045 zoneid_t zoneid = getzoneid();
4046 4046
4047 4047 for (i = 0; i < HASH_SIZE; i++) {
4048 4048 mutex_enter(&flock_lock);
4049 4049 gp = lock_graph[i];
4050 4050 mutex_exit(&flock_lock);
4051 4051 if (gp == NULL) {
4052 4052 continue;
4053 4053 }
4054 4054
4055 4055 mutex_enter(&gp->gp_mutex);
4056 4056 fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
4057 4057 for (lock = ACTIVE_HEAD(gp)->l_next;
4058 4058 lock != ACTIVE_HEAD(gp);
4059 4059 lock = nlock) {
4060 4060 nlock = lock->l_next;
4061 4061 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
4062 4062 ASSERT(IS_ACTIVE(lock));
4063 4063 flk_delete_active_lock(lock, 0);
4064 4064 flk_wakeup(lock, 1);
4065 4065 flk_free_lock(lock);
4066 4066 }
4067 4067 }
4068 4068 mutex_exit(&gp->gp_mutex);
4069 4069 }
4070 4070 }
4071 4071
4072 4072
4073 4073 /*
4074 4074 * Wait until a lock is granted, cancelled, or interrupted.
4075 4075 */
4076 4076
4077 4077 static void
4078 4078 wait_for_lock(lock_descriptor_t *request)
4079 4079 {
4080 4080 graph_t *gp = request->l_graph;
4081 4081
4082 4082 ASSERT(MUTEX_HELD(&gp->gp_mutex));
4083 4083
4084 4084 while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
4085 4085 !(IS_INTERRUPTED(request))) {
4086 4086 if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
4087 4087 flk_set_state(request, FLK_INTERRUPTED_STATE);
4088 4088 request->l_state |= INTERRUPTED_LOCK;
4089 4089 }
4090 4090 }
4091 4091 }
4092 4092
4093 4093 /*
4094 4094 * Create an flock structure from the existing lock information
4095 4095 *
4096 4096 * This routine is used to create flock structures for the lock manager
4097 4097 * to use in a reclaim request. Since the lock was originated on this
4098 4098  * host, it must conform to UNIX semantics, so no checking is
4099 4099 * done to make sure it falls within the lower half of the 32-bit range.
4100 4100 */
4101 4101
4102 4102 static void
4103 4103 create_flock(lock_descriptor_t *lp, flock64_t *flp)
4104 4104 {
4105 4105 ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
4106 4106 ASSERT(lp->l_end >= lp->l_start);
4107 4107
4108 4108 flp->l_type = lp->l_type;
4109 4109 flp->l_whence = 0;
4110 4110 flp->l_start = lp->l_start;
4111 4111 flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
4112 4112 (lp->l_end - lp->l_start + 1);
4113 4113 flp->l_sysid = lp->l_flock.l_sysid;
4114 4114 flp->l_pid = lp->l_flock.l_pid;
4115 4115 }
4116 4116
4117 4117 /*
4118 4118 * Convert flock_t data describing a lock range into unsigned long starting
4119 4119 * and ending points, which are put into lock_request. Returns 0 or an
4120 4120 * errno value.
4121 4121 * Large Files: max is passed by the caller and we return EOVERFLOW
4122 4122 * as defined by LFS API.
4123 4123 */
4124 4124
4125 4125 int
4126 4126 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
4127 4127 u_offset_t *start, u_offset_t *end, offset_t offset)
4128 4128 {
4129 4129 struct vattr vattr;
4130 4130 int error;
4131 4131
4132 4132 /*
4133 4133 * Determine the starting point of the request
4134 4134 */
4135 4135 switch (flp->l_whence) {
4136 4136 case 0: /* SEEK_SET */
4137 4137 *start = (u_offset_t)flp->l_start;
4138 4138 break;
4139 4139 case 1: /* SEEK_CUR */
4140 4140 *start = (u_offset_t)(flp->l_start + offset);
4141 4141 break;
4142 4142 case 2: /* SEEK_END */
4143 4143 vattr.va_mask = AT_SIZE;
4144 4144 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
4145 4145 return (error);
4146 4146 *start = (u_offset_t)(flp->l_start + vattr.va_size);
4147 4147 break;
4148 4148 default:
4149 4149 return (EINVAL);
4150 4150 }
4151 4151
4152 4152 /*
4153 4153 * Determine the range covered by the request.
4154 4154 */
4155 4155 if (flp->l_len == 0)
4156 4156 *end = MAX_U_OFFSET_T;
4157 4157 else if ((offset_t)flp->l_len > 0) {
4158 4158 *end = (u_offset_t)(*start + (flp->l_len - 1));
4159 4159 } else {
4160 4160 /*
4161 4161 		 * Negative length; why do we even allow this?
4162 4162 * Because this allows easy specification of
4163 4163 * the last n bytes of the file.
4164 4164 */
4165 4165 *end = *start;
4166 4166 *start += (u_offset_t)flp->l_len;
4167 4167 (*start)++;
4168 4168 }
4169 4169 return (0);
4170 4170 }
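/*
 * Worked example of the arithmetic above (hypothetical values): with
 * l_whence = 0 (SEEK_SET), l_start = 100 and l_len = -10, we get
 * *end = 100 and *start = 100 + (-10) + 1 = 91, i.e. the ten bytes
 * 91..100 ending at l_start.  With l_len = 10 instead, *start = 100
 * and *end = 100 + 10 - 1 = 109.  With l_len = 0 the lock runs from
 * *start to end of file (*end = MAX_U_OFFSET_T).
 */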
4171 4171
4172 4172 /*
4173 4173  * Check the validity of lock data. This can be used by the NFS
4174 4174 * frlock routines to check data before contacting the server. The
4175 4175 * server must support semantics that aren't as restrictive as
4176 4176 * the UNIX API, so the NFS client is required to check.
4177 4177 * The maximum is now passed in by the caller.
4178 4178 */
4179 4179
4180 4180 int
4181 4181 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
4182 4182 {
4183 4183 /*
4184 4184 * The end (length) for local locking should never be greater
4185 4185 * than MAXEND. However, the representation for
4186 4186 * the entire file is MAX_U_OFFSET_T.
4187 4187 */
4188 4188 if ((start > max) ||
4189 4189 ((end > max) && (end != MAX_U_OFFSET_T))) {
4190 4190 return (EINVAL);
4191 4191 }
4192 4192 if (start > end) {
4193 4193 return (EINVAL);
4194 4194 }
4195 4195 return (0);
4196 4196 }
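/*
 * For example (hypothetical values, with max = MAXEND as for a local
 * request): start = 0 with end = MAXEND or end = MAX_U_OFFSET_T
 * (whole file) passes, while start > max, start > end, or any end
 * beyond max other than MAX_U_OFFSET_T returns EINVAL.
 */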
4197 4197
4198 4198 /*
4199 4199 * Fill in request->l_flock with information about the lock blocking the
4200 4200 * request. The complexity here is that lock manager requests are allowed
4201 4201 * to see into the upper part of the 32-bit address range, whereas local
4202 4202 * requests are only allowed to see signed values.
4203 4203 *
4204 4204 * What should be done when "blocker" is a lock manager lock that uses the
4205 4205 * upper portion of the 32-bit range, but "request" is local? Since the
4206 4206 * request has already been determined to have been blocked by the blocker,
4207 4207 * at least some portion of "blocker" must be in the range of the request,
4208 4208 * or the request extends to the end of file. For the first case, the
4209 4209 * portion in the lower range is returned with the indication that it goes
4210 4210 * "to EOF." For the second case, the last byte of the lower range is
4211 4211 * returned with the indication that it goes "to EOF."
4212 4212 */
4213 4213
4214 4214 static void
4215 4215 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
4216 4216 {
4217 4217 flock64_t *flrp; /* l_flock portion of request */
4218 4218
4219 4219 ASSERT(blocker != NULL);
4220 4220
4221 4221 flrp = &request->l_flock;
4222 4222 flrp->l_whence = 0;
4223 4223 flrp->l_type = blocker->l_type;
4224 4224 flrp->l_pid = blocker->l_flock.l_pid;
4225 4225 flrp->l_sysid = blocker->l_flock.l_sysid;
4226 4226 request->l_ofd = blocker->l_ofd;
4227 4227
4228 4228 if (IS_LOCKMGR(request)) {
4229 4229 flrp->l_start = blocker->l_start;
4230 4230 if (blocker->l_end == MAX_U_OFFSET_T)
4231 4231 flrp->l_len = 0;
4232 4232 else
4233 4233 flrp->l_len = blocker->l_end - blocker->l_start + 1;
4234 4234 } else {
4235 4235 if (blocker->l_start > MAXEND) {
4236 4236 flrp->l_start = MAXEND;
4237 4237 flrp->l_len = 0;
4238 4238 } else {
4239 4239 flrp->l_start = blocker->l_start;
4240 4240 if (blocker->l_end == MAX_U_OFFSET_T)
4241 4241 flrp->l_len = 0;
4242 4242 else
4243 4243 flrp->l_len = blocker->l_end -
4244 4244 blocker->l_start + 1;
4245 4245 }
4246 4246 }
4247 4247 }
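/*
 * Worked example of the clamping described above (hypothetical
 * values): if a local request is blocked by a lock-manager lock whose
 * l_start is beyond MAXEND, it is reported as l_start = MAXEND,
 * l_len = 0 ("to EOF").  If the blocker starts at or below MAXEND but
 * extends to MAX_U_OFFSET_T, its real l_start is reported with
 * l_len = 0.
 */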
4248 4248
4249 4249 /*
4250 4250 * PSARC case 1997/292
4251 4251 */
4252 4252 /*
4253 4253 * This is the public routine exported by flock.h.
4254 4254 */
4255 4255 void
4256 4256 cl_flk_change_nlm_state_to_unknown(int nlmid)
4257 4257 {
4258 4258 /*
4259 4259 * Check to see if node is booted as a cluster. If not, return.
4260 4260 */
4261 4261 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
4262 4262 return;
4263 4263 }
4264 4264
4265 4265 /*
4266 4266 * See comment in cl_flk_set_nlm_status().
4267 4267 */
4268 4268 if (nlm_reg_status == NULL) {
4269 4269 return;
4270 4270 }
4271 4271
4272 4272 /*
4273 4273 * protect NLM registry state with a mutex.
4274 4274 */
4275 4275 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
4276 4276 mutex_enter(&nlm_reg_lock);
4277 4277 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
4278 4278 mutex_exit(&nlm_reg_lock);
4279 4279 }
4280 4280
4281 4281 /*
4282 4282 * Return non-zero if the given I/O request conflicts with an active NBMAND
4283 4283 * lock.
4284 4284 * If svmand is non-zero, it means look at all active locks, not just NBMAND
4285 4285 * locks.
4286 4286 */
4287 4287
4288 4288 int
4289 4289 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
4290 4290 ssize_t length, int svmand, caller_context_t *ct)
4291 4291 {
4292 4292 int conflict = 0;
4293 4293 graph_t *gp;
4294 4294 lock_descriptor_t *lock;
4295 4295 pid_t pid;
4296 4296 int sysid;
4297 4297
4298 4298 if (ct == NULL) {
4299 4299 pid = curproc->p_pid;
4300 4300 sysid = 0;
4301 4301 } else {
4302 4302 pid = ct->cc_pid;
4303 4303 sysid = ct->cc_sysid;
4304 4304 }
4305 4305
4306 4306 mutex_enter(&flock_lock);
4307 4307 gp = lock_graph[HASH_INDEX(vp)];
4308 4308 mutex_exit(&flock_lock);
4309 4309 if (gp == NULL)
4310 4310 return (0);
4311 4311
4312 4312 mutex_enter(&gp->gp_mutex);
4313 4313 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4314 4314
4315 4315 for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
4316 4316 if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
4317 4317 (lock->l_flock.l_sysid != sysid ||
4318 4318 lock->l_flock.l_pid != pid) &&
4319 4319 lock_blocks_io(op, offset, length,
4320 4320 lock->l_type, lock->l_start, lock->l_end)) {
4321 4321 conflict = 1;
4322 4322 break;
4323 4323 }
4324 4324 }
4325 4325 mutex_exit(&gp->gp_mutex);
4326 4326
4327 4327 return (conflict);
4328 4328 }
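/*
 * Usage sketch (a simplification: real callers typically derive
 * "svmand" via nbl_svmand() and hold the vnode's nbmand critical
 * region around both the check and the subsequent I/O):
 *
 *	if (nbl_lock_conflict(vp, NBL_WRITE, uiop->uio_loffset,
 *	    uiop->uio_resid, svmand, ct) != 0)
 *		return (EACCES);
 */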
4329 4329
4330 4330 /*
4331 4331 * Return non-zero if the given I/O request conflicts with the given lock.
4332 4332 */
4333 4333
4334 4334 static int
4335 4335 lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
4336 4336 int lock_type, u_offset_t lock_start, u_offset_t lock_end)
4337 4337 {
4338 4338 ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
4339 4339 ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
4340 4340
4341 4341 if (op == NBL_READ && lock_type == F_RDLCK)
4342 4342 return (0);
4343 4343
4344 4344 if (offset <= lock_start && lock_start < offset + length)
4345 4345 return (1);
4346 4346 if (lock_start <= offset && offset <= lock_end)
4347 4347 return (1);
4348 4348
4349 4349 return (0);
4350 4350 }
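/*
 * For example (hypothetical values): a write of length 50 at offset
 * 100 conflicts with a lock covering [120, 200], since
 * 100 <= 120 < 150, and with a lock covering [50, 100], since
 * 50 <= 100 <= 100; a read never conflicts with an F_RDLCK lock.
 */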
4351 4351
4352 4352 #ifdef DEBUG
4353 4353 static void
4354 4354 check_active_locks(graph_t *gp)
4355 4355 {
4356 4356 lock_descriptor_t *lock, *lock1;
4357 4357 edge_t *ep;
4358 4358
4359 4359 for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
4360 4360 lock = lock->l_next) {
4361 4361 ASSERT(IS_ACTIVE(lock));
4362 4362 ASSERT(NOT_BLOCKED(lock));
4363 4363 ASSERT(!IS_BARRIER(lock));
4364 4364
4365 4365 ep = FIRST_IN(lock);
4366 4366
4367 4367 while (ep != HEAD(lock)) {
4368 4368 ASSERT(IS_SLEEPING(ep->from_vertex));
4369 4369 ASSERT(!NOT_BLOCKED(ep->from_vertex));
4370 4370 ep = NEXT_IN(ep);
4371 4371 }
4372 4372
4373 4373 for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
4374 4374 lock1 = lock1->l_next) {
4375 4375 if (lock1->l_vnode == lock->l_vnode) {
4376 4376 if (BLOCKS(lock1, lock)) {
4377 4377 cmn_err(CE_PANIC,
4378 4378 "active lock %p blocks %p",
4379 4379 (void *)lock1, (void *)lock);
4380 4380 } else if (BLOCKS(lock, lock1)) {
4381 4381 cmn_err(CE_PANIC,
4382 4382 "active lock %p blocks %p",
4383 4383 (void *)lock, (void *)lock1);
4384 4384 }
4385 4385 }
4386 4386 }
4387 4387 }
4388 4388 }
4389 4389
4390 4390 /*
4391 4391  * Effect: This function checks whether the transition from 'old_state' to
4392 4392 * 'new_state' is a valid one. It returns 0 if the transition is valid
4393 4393 * and 1 if it is not.
4394 4394 * For a map of valid transitions, see sys/flock_impl.h
4395 4395 */
4396 4396 static int
4397 4397 check_lock_transition(int old_state, int new_state)
4398 4398 {
4399 4399 switch (old_state) {
4400 4400 case FLK_INITIAL_STATE:
4401 4401 if ((new_state == FLK_START_STATE) ||
4402 4402 (new_state == FLK_SLEEPING_STATE) ||
4403 4403 (new_state == FLK_ACTIVE_STATE) ||
4404 4404 (new_state == FLK_DEAD_STATE)) {
4405 4405 return (0);
4406 4406 } else {
4407 4407 return (1);
4408 4408 }
4409 4409 case FLK_START_STATE:
4410 4410 if ((new_state == FLK_ACTIVE_STATE) ||
4411 4411 (new_state == FLK_DEAD_STATE)) {
4412 4412 return (0);
4413 4413 } else {
4414 4414 return (1);
4415 4415 }
4416 4416 case FLK_ACTIVE_STATE:
4417 4417 if (new_state == FLK_DEAD_STATE) {
4418 4418 return (0);
4419 4419 } else {
4420 4420 return (1);
4421 4421 }
4422 4422 case FLK_SLEEPING_STATE:
4423 4423 if ((new_state == FLK_GRANTED_STATE) ||
4424 4424 (new_state == FLK_INTERRUPTED_STATE) ||
4425 4425 (new_state == FLK_CANCELLED_STATE)) {
4426 4426 return (0);
4427 4427 } else {
4428 4428 return (1);
4429 4429 }
4430 4430 case FLK_GRANTED_STATE:
4431 4431 if ((new_state == FLK_START_STATE) ||
4432 4432 (new_state == FLK_INTERRUPTED_STATE) ||
4433 4433 (new_state == FLK_CANCELLED_STATE)) {
4434 4434 return (0);
4435 4435 } else {
4436 4436 return (1);
4437 4437 }
4438 4438 case FLK_CANCELLED_STATE:
4439 4439 if ((new_state == FLK_INTERRUPTED_STATE) ||
4440 4440 (new_state == FLK_DEAD_STATE)) {
4441 4441 return (0);
4442 4442 } else {
4443 4443 return (1);
4444 4444 }
4445 4445 case FLK_INTERRUPTED_STATE:
4446 4446 if (new_state == FLK_DEAD_STATE) {
4447 4447 return (0);
4448 4448 } else {
4449 4449 return (1);
4450 4450 }
4451 4451 case FLK_DEAD_STATE:
4452 4452 /* May be set more than once */
4453 4453 if (new_state == FLK_DEAD_STATE) {
4454 4454 return (0);
4455 4455 } else {
4456 4456 return (1);
4457 4457 }
4458 4458 default:
4459 4459 return (1);
4460 4460 }
4461 4461 }
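/*
 * The switch above encodes the following map (derived directly from
 * the code; see also sys/flock_impl.h):
 *
 *	INITIAL     -> START, SLEEPING, ACTIVE, DEAD
 *	START       -> ACTIVE, DEAD
 *	ACTIVE      -> DEAD
 *	SLEEPING    -> GRANTED, INTERRUPTED, CANCELLED
 *	GRANTED     -> START, INTERRUPTED, CANCELLED
 *	CANCELLED   -> INTERRUPTED, DEAD
 *	INTERRUPTED -> DEAD
 *	DEAD        -> DEAD (may be set more than once)
 */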
4462 4462
4463 4463 static void
4464 4464 check_sleeping_locks(graph_t *gp)
4465 4465 {
4466 4466 lock_descriptor_t *lock1, *lock2;
4467 4467 edge_t *ep;
4468 4468 for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
4469 4469 lock1 = lock1->l_next) {
4470 4470 ASSERT(!IS_BARRIER(lock1));
4471 4471 for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
4472 4472 lock2 = lock2->l_next) {
4473 4473 if (lock1->l_vnode == lock2->l_vnode) {
4474 4474 if (BLOCKS(lock2, lock1)) {
4475 4475 ASSERT(!IS_GRANTED(lock1));
4476 4476 ASSERT(!NOT_BLOCKED(lock1));
4477 4477 path(lock1, lock2);
4478 4478 }
4479 4479 }
4480 4480 }
4481 4481
4482 4482 for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
4483 4483 lock2 = lock2->l_next) {
4484 4484 ASSERT(!IS_BARRIER(lock1));
4485 4485 if (lock1->l_vnode == lock2->l_vnode) {
4486 4486 if (BLOCKS(lock2, lock1)) {
4487 4487 ASSERT(!IS_GRANTED(lock1));
4488 4488 ASSERT(!NOT_BLOCKED(lock1));
4489 4489 path(lock1, lock2);
4490 4490 }
4491 4491 }
4492 4492 }
4493 4493 ep = FIRST_ADJ(lock1);
4494 4494 while (ep != HEAD(lock1)) {
4495 4495 ASSERT(BLOCKS(ep->to_vertex, lock1));
4496 4496 ep = NEXT_ADJ(ep);
4497 4497 }
4498 4498 }
4499 4499 }
4500 4500
4501 4501 static int
4502 4502 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
4503 4503 {
4504 4504 edge_t *ep;
4505 4505 lock_descriptor_t *vertex;
4506 4506 lock_descriptor_t *vertex_stack;
4507 4507
4508 4508 STACK_INIT(vertex_stack);
4509 4509
4510 4510 flk_graph_uncolor(lock1->l_graph);
4511 4511 ep = FIRST_ADJ(lock1);
4512 4512 ASSERT(ep != HEAD(lock1));
4513 4513 while (ep != HEAD(lock1)) {
4514 4514 if (no_path)
4515 4515 ASSERT(ep->to_vertex != lock2);
4516 4516 STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4517 4517 COLOR(ep->to_vertex);
4518 4518 ep = NEXT_ADJ(ep);
4519 4519 }
4520 4520
4521 4521 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
4522 4522 STACK_POP(vertex_stack, l_dstack);
4523 4523 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
4524 4524 ep = NEXT_ADJ(ep)) {
4525 4525 if (COLORED(ep->to_vertex))
4526 4526 continue;
4527 4527 COLOR(ep->to_vertex);
4528 4528 if (ep->to_vertex == lock2)
4529 4529 return (1);
4530 4530
4531 4531 STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4532 4532 }
4533 4533 }
4534 4534 return (0);
4535 4535 }
4536 4536
4537 4537 static void
4538 4538 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
4539 4539 {
4540 4540 lock_descriptor_t *lock;
4541 4541
4542 4542 /* Ignore OFD style locks since they're not process-wide. */
4543 4543 if (pid == 0)
4544 4544 return;
4545 4545
4546 4546 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4547 4547
4548 4548 if (lock) {
4549 4549 while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
4550 4550 if (lock->l_flock.l_pid == pid &&
4551 4551 lock->l_flock.l_sysid == sysid)
4552 4552 cmn_err(CE_PANIC,
4553 4553 "owner pid %d's lock %p in active queue",
4554 4554 pid, (void *)lock);
4555 4555 lock = lock->l_next;
4556 4556 }
4557 4557 }
4558 4558 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
4559 4559
4560 4560 if (lock) {
4561 4561 while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
4562 4562 if (lock->l_flock.l_pid == pid &&
4563 4563 lock->l_flock.l_sysid == sysid)
4564 4564 cmn_err(CE_PANIC,
4565 4565 "owner pid %d's lock %p in sleep queue",
4566 4566 pid, (void *)lock);
4567 4567 lock = lock->l_next;
4568 4568 }
4569 4569 }
4570 4570 }
4571 4571
4572 4572 static int
4573 4573 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4574 4574 {
4575 4575 edge_t *ep = FIRST_ADJ(lock1);
4576 4576
4577 4577 while (ep != HEAD(lock1)) {
4578 4578 if (ep->to_vertex == lock2)
4579 4579 return (1);
4580 4580 else
4581 4581 ep = NEXT_ADJ(ep);
4582 4582 }
4583 4583 return (0);
4584 4584 }
4585 4585
4586 4586 static int
4587 4587 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4588 4588 {
4589 4589 return (!level_two_path(lock1, lock2, 1));
4590 4590 }
4591 4591
4592 4592 static void
4593 4593 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4594 4594 {
4595 4595 if (level_one_path(lock1, lock2)) {
4596 4596 if (level_two_path(lock1, lock2, 0) != 0) {
4597 4597 cmn_err(CE_WARN,
4598 4598 "one edge one path from lock1 %p lock2 %p",
4599 4599 (void *)lock1, (void *)lock2);
4600 4600 }
4601 4601 } else if (no_path(lock1, lock2)) {
4602 4602 cmn_err(CE_PANIC,
4603 4603 "No path from lock1 %p to lock2 %p",
4604 4604 (void *)lock1, (void *)lock2);
4605 4605 }
4606 4606 }
4607 4607 #endif /* DEBUG */