1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 /*
31 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
32 * Copyright 2015 Joyent, Inc.
33 */
34
35 #include <sys/flock_impl.h>
36 #include <sys/vfs.h>
37 #include <sys/t_lock.h> /* for <sys/callb.h> */
38 #include <sys/callb.h>
39 #include <sys/clconf.h>
40 #include <sys/cladm.h>
41 #include <sys/nbmlock.h>
42 #include <sys/cred.h>
43 #include <sys/policy.h>
44 #include <sys/list.h>
45 #include <sys/sysmacros.h>
46 #include <sys/socket.h>
47 #include <inet/ip.h>
48
49 /*
50 * The following variables are for statistics purposes only and they are
51 * not protected by locks. They may not be perfectly accurate but will at
52 * least be close to the actual values.
53 */
54
55 int flk_lock_allocs;
56 int flk_lock_frees;
57 int edge_allocs;
58 int edge_frees;
59 int flk_proc_vertex_allocs;
60 int flk_proc_edge_allocs;
61 int flk_proc_vertex_frees;
62 int flk_proc_edge_frees;
63
64 static kmutex_t flock_lock;
65
66 #ifdef DEBUG
67 int check_debug = 0;
68 #define CHECK_ACTIVE_LOCKS(gp) if (check_debug) \
69 check_active_locks(gp);
70 #define CHECK_SLEEPING_LOCKS(gp) if (check_debug) \
71 check_sleeping_locks(gp);
72 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp) \
73 if (check_debug) \
74 check_owner_locks(gp, pid, sysid, vp);
75 #define CHECK_LOCK_TRANSITION(old_state, new_state) \
76 { \
77 if (check_lock_transition(old_state, new_state)) { \
78 cmn_err(CE_PANIC, "Illegal lock transition \
79 from %d to %d", old_state, new_state); \
80 } \
81 }
82 #else
83
84 #define CHECK_ACTIVE_LOCKS(gp)
85 #define CHECK_SLEEPING_LOCKS(gp)
86 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
87 #define CHECK_LOCK_TRANSITION(old_state, new_state)
88
89 #endif /* DEBUG */
90
91 struct kmem_cache *flk_edge_cache;
92
93 graph_t *lock_graph[HASH_SIZE];
94 proc_graph_t pgraph;
95
96 /*
97 * Clustering.
98 *
99 * NLM REGISTRY TYPE IMPLEMENTATION
100 *
101 * Assumptions:
102 * 1. Nodes in a cluster are numbered starting at 1; always non-negative
103 * integers; maximum node id is returned by clconf_maximum_nodeid().
104 * 2. We use this node id to identify the node an NLM server runs on.
105 */
106
107 /*
108 * The NLM registry object keeps track of NLM servers via their
109 * nlmids (which are the node ids of the nodes in the cluster they run on)
110 * that have requested locks at this LLM with which this registry is
111 * associated.
112 *
113 * Representation of abstraction:
114 * rep = record[ states: array[nlm_state],
115 * lock: mutex]
116 *
117 * Representation invariants:
118 * 1. index i of rep.states is between 0 and n - 1 where n is the number
119 * of elements in the array, which happens to be the maximum node id
120 * in the cluster configuration + 1.
121 * 2. map nlmid to index i of rep.states
122 * 0 -> 0
123 * 1 -> 1
124 * 2 -> 2
125 * n-1 -> clconf_maximum_nodeid()
126 * 3. This 1-1 mapping is quite convenient and it avoids errors resulting
127 * from forgetting to subtract 1 from the index.
128 * 4. The reason we keep the 0th index is the following. A legitimate
129 * cluster configuration includes making a UFS file system NFS
130 * exportable. The code is structured so that if you're in a cluster
131 * you do one thing; otherwise, you do something else. The problem
132 * is what to do if you think you're in a cluster with PXFS loaded,
133 * but you're using UFS not PXFS? The upper two bytes of the sysid
134 * encode the node id of the node where NLM server runs; these bytes
135 * are zero for UFS. Since the nodeid is used to index into the
136 * registry, we can record the NLM server state information at index
137 * 0 using the same mechanism used for PXFS file locks!
138 */
139 static flk_nlm_status_t *nlm_reg_status = NULL; /* state array 0..N-1 */
140 static kmutex_t nlm_reg_lock; /* lock to protect array */
141 static uint_t nlm_status_size; /* size of state array */
142
143 /*
144 * Although we need a global lock dependency graph (and associated data
145 * structures), we also need a per-zone notion of whether the lock manager is
146 * running, and so whether to allow lock manager requests or not.
147 *
148 * Thus, on a per-zone basis we maintain a ``global'' variable
149 * (flk_lockmgr_status), protected by flock_lock, and set when the lock
150 * manager is determined to be changing state (starting or stopping).
151 *
152 * Each graph/zone pair also has a copy of this variable, which is protected by
153 * the graph's mutex.
154 *
155 * The per-graph copies are used to synchronize lock requests with shutdown
156 * requests. The global copy is used to initialize the per-graph field when a
157 * new graph is created.
158 */
159 struct flock_globals {
160 flk_lockmgr_status_t flk_lockmgr_status;
161 flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
162 };
163
164 zone_key_t flock_zone_key;
165
166 /*
167 * Support for the remote stale lock detection
168 *
169 * The sysid_to_host_translator_lock readers/writer lock protects
170 * sysid_to_host_translator_list.
171 *
172 * The sysid_to_host_translator_list is a list of sysid to host name translator
173 * functions. New translators are added using the public
174 * flk_add_sysid_to_host_translator() call.
175 *
176 * The stale_lock_timeout is in seconds and it determines how long a remote
177 * lock must block other requests before it is reported as stale. When set
178 * to 0, the remote stale lock checking is disabled.
179 */
180 struct sysid_to_host_translator_entry {
181 sysid_to_host_translator_t translator;
182 list_node_t node;
183 };
184 static krwlock_t sysid_to_host_translator_lock;
185 static list_t sysid_to_host_translator_list;
186 volatile int stale_lock_timeout = 3600; /* one hour, in seconds */
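
/*
 * Illustrative sketch of how a translator might be registered. The
 * my_sysid_to_host() name is hypothetical and its signature is assumed to
 * match the way translators are invoked from translate_sysid_to_host()
 * below: return non-zero on success after filling in the address and the
 * host type string.
 *
 *	static int
 *	my_sysid_to_host(zoneid_t zoneid, sysid_t sysid, struct sockaddr *sa,
 *	    const char **type)
 *	{
 *		... look up the remote client for sysid, fill in *sa and
 *		... *type, then:
 *		return (1);
 *	}
 *
 *	flk_add_sysid_to_host_translator(my_sysid_to_host);
 */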
187
188 static void create_flock(lock_descriptor_t *, flock64_t *);
189 static lock_descriptor_t *flk_get_lock(void);
190 static void flk_free_lock(lock_descriptor_t *lock);
191 static void flk_get_first_blocking_lock(lock_descriptor_t *);
192 static int flk_process_request(lock_descriptor_t *);
193 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
194 static edge_t *flk_get_edge(void);
195 static int flk_wait_execute_request(lock_descriptor_t *);
196 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
197 static void flk_insert_active_lock(lock_descriptor_t *);
198 static void flk_delete_active_lock(lock_descriptor_t *, int);
199 static void flk_insert_sleeping_lock(lock_descriptor_t *);
200 static void flk_graph_uncolor(graph_t *);
201 static void flk_wakeup(lock_descriptor_t *, int);
202 static void flk_free_edge(edge_t *);
203 static void flk_recompute_dependencies(lock_descriptor_t *,
204 lock_descriptor_t **, int, int);
205 static int flk_find_barriers(lock_descriptor_t *);
206 static void flk_update_barriers(lock_descriptor_t *);
207 static int flk_color_reachables(lock_descriptor_t *);
208 static int flk_canceled(lock_descriptor_t *);
209 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
210 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
211 static void wait_for_lock(lock_descriptor_t *);
212 static void unlock_lockmgr_granted(struct flock_globals *);
213 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
214
215 /* Clustering hooks */
216 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
217 static void cl_flk_wakeup_sleeping_nlm_locks(int);
218 static void cl_flk_unlock_nlm_granted(int);
219
220 #ifdef DEBUG
221 static int check_lock_transition(int, int);
222 static void check_sleeping_locks(graph_t *);
223 static void check_active_locks(graph_t *);
224 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
225 static void path(lock_descriptor_t *, lock_descriptor_t *);
226 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
227 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
228 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
229 #endif
230
231 /* proc_graph function definitions */
232 static int flk_check_deadlock(lock_descriptor_t *);
233 static void flk_proc_graph_uncolor(void);
234 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
235 static proc_edge_t *flk_get_proc_edge(void);
236 static void flk_proc_release(proc_vertex_t *);
237 static void flk_free_proc_edge(proc_edge_t *);
238 static void flk_update_proc_graph(edge_t *, int);
239
240 /* Non-blocking mandatory locking */
241 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
242 u_offset_t);
243
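/*
 * Return the flock globals for the zone of the current process.
 */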
244 static struct flock_globals *
245 flk_get_globals(void)
246 {
247 /*
248 * The KLM module had better be loaded if we're attempting to handle
249 * lockmgr requests.
250 */
251 ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
252 return (zone_getspecific(flock_zone_key, curproc->p_zone));
253 }
254
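/*
 * Return the lock manager status for the current zone, or FLK_LOCKMGR_DOWN
 * if the KLM module has never been loaded. Called with flock_lock held.
 */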
255 static flk_lockmgr_status_t
256 flk_get_lockmgr_status(void)
257 {
258 struct flock_globals *fg;
259
260 ASSERT(MUTEX_HELD(&flock_lock));
261
262 if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
263 /*
264 * KLM module not loaded; lock manager definitely not running.
265 */
266 return (FLK_LOCKMGR_DOWN);
267 }
268 fg = flk_get_globals();
269 return (fg->flk_lockmgr_status);
270 }
271
272 /*
273 * This implements Open File Description (not descriptor) style record locking.
274 * These locks can also be thought of as pid-less since they are not tied to a
275 * specific process, thus they're preserved across fork.
276 *
277 * Called directly from fcntl.
278 *
279 * See reclock() for the implementation of the traditional POSIX style record
280 * locking scheme (pid-ful). This function is derived from reclock() but
281 * simplified and modified to work for OFD style locking.
282 *
283 * The two primary advantages of OFD style of locking are:
284 * 1) It is per-file description, so closing a file descriptor that refers to a
285 * different file description for the same file will not drop the lock (i.e.
286 * two opens of the same file get different descriptions but a dup or fork
287 * will refer to the same description).
288 * 2) Locks are preserved across fork(2).
289 *
290 * Because these locks are per-description a lock ptr lives at the f_filocks
291 * member of the file_t and the lock_descriptor includes a file_t pointer
292 * to enable unique lock identification and management.
293 *
294 * Since these locks are pid-less we cannot do deadlock detection with the
295 * current process-oriented implementation. This is consistent with OFD locking
296 * behavior on other operating systems such as Linux. Since we don't do
297 * deadlock detection we never interact with the process graph that is
298 * maintained for deadlock detection on the traditional POSIX-style locks.
299 *
300 * Future Work:
301 *
302 * The current implementation does not support record locks. That is,
303 * currently the single lock must cover the entire file. This is validated in
304 * fcntl. To support record locks the f_filock pointer in the file_t needs to
305 * be changed to a list of pointers to the locks. That list needs to be
306 * managed independently of the lock list on the vnode itself and it needs to
307 * be maintained as record locks are created, split, coalesced and deleted.
308 *
309 * The current implementation does not support remote file systems (e.g.
310 * NFS or CIFS). This is handled in fs_frlock(). The design of how OFD locks
311 * interact with the NLM is not clear since the NLM protocol/implementation
312 * appears to be oriented around locks associated with a process. A further
313 * problem is that a design is needed for what nlm_send_siglost() should do and
314 * where it will send SIGLOST. More recent versions of Linux apparently try to
315 * emulate OFD locks on NFS by converting them to traditional POSIX style locks
316 * that work with the NLM. It is not clear that this provides the correct
317 * semantics in all cases.
318 */
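/*
 * Illustrative userland sketch (not part of this file; fd and the error
 * handling are placeholders): taking a whole-file OFD write lock with
 * fcntl(2). The zero l_whence/l_start/l_len values reflect the whole-file
 * restriction verified below.
 *
 *	flock64_t fl;
 *
 *	bzero(&fl, sizeof (fl));
 *	fl.l_type = F_WRLCK;
 *	if (fcntl(fd, F_OFD_SETLK, &fl) == -1)
 *		... handle EAGAIN (lock busy), EBADF (bad access mode), etc.
 */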
319 int
320 ofdlock(file_t *fp, int fcmd, flock64_t *lckdat, int flag, u_offset_t offset)
321 {
322 int cmd = 0;
323 vnode_t *vp;
324 lock_descriptor_t stack_lock_request;
325 lock_descriptor_t *lock_request;
326 int error = 0;
327 graph_t *gp;
328 int serialize = 0;
329
330 if (fcmd != F_OFD_GETLK)
331 cmd = SETFLCK;
332
333 if (fcmd == F_OFD_SETLKW || fcmd == F_FLOCKW)
334 cmd |= SLPFLCK;
335
336 /* see block comment: OFD locks must currently cover the whole file */
337 VERIFY(lckdat->l_whence == 0);
338 VERIFY(lckdat->l_start == 0);
339 VERIFY(lckdat->l_len == 0);
340
341 vp = fp->f_vnode;
342
343 /*
344 * For reclock fs_frlock() would normally have set these in a few
345 * places but for us it's cleaner to centralize it here. Note that
346 * IGN_PID is -1. We use 0 for our pid-less locks.
347 */
348 lckdat->l_pid = 0;
349 lckdat->l_sysid = 0;
350
351 /*
352 * Check access permissions
353 */
354 if ((fcmd == F_OFD_SETLK || fcmd == F_OFD_SETLKW) &&
355 ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
356 (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
357 return (EBADF);
358
359 /*
360 * for query and unlock we use the stack_lock_request
361 */
362 if (lckdat->l_type == F_UNLCK || !(cmd & SETFLCK)) {
363 lock_request = &stack_lock_request;
364 (void) bzero((caddr_t)lock_request,
365 sizeof (lock_descriptor_t));
366
367 /*
368 * following is added to make the assertions in
369 * flk_execute_request() pass
370 */
371 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
372 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
373 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
374 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
375 lock_request->l_status = FLK_INITIAL_STATE;
376 } else {
377 lock_request = flk_get_lock();
378 fp->f_filock = (struct filock *)lock_request;
379 }
380 lock_request->l_state = 0;
381 lock_request->l_vnode = vp;
382 lock_request->l_zoneid = getzoneid();
383 lock_request->l_ofd = fp;
384
385 /*
386 * Convert the request range into the canonical start and end
387 * values then check the validity of the lock range.
388 */
389 error = flk_convert_lock_data(vp, lckdat, &lock_request->l_start,
390 &lock_request->l_end, offset);
391 if (error)
392 goto done;
393
394 error = flk_check_lock_data(lock_request->l_start, lock_request->l_end,
395 MAXEND);
396 if (error)
397 goto done;
398
399 ASSERT(lock_request->l_end >= lock_request->l_start);
400
401 lock_request->l_type = lckdat->l_type;
402 if (cmd & SLPFLCK)
403 lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
404
405 if (!(cmd & SETFLCK)) {
406 if (lock_request->l_type == F_RDLCK ||
407 lock_request->l_type == F_WRLCK)
408 lock_request->l_state |= QUERY_LOCK;
409 }
410 lock_request->l_flock = (*lckdat);
411
412 /*
413 * We are ready for processing the request
414 */
415
416 if (fcmd != F_OFD_GETLK && lock_request->l_type != F_UNLCK &&
417 nbl_need_check(vp)) {
418 nbl_start_crit(vp, RW_WRITER);
419 serialize = 1;
420 }
421
422 /* Get the lock graph for a particular vnode */
423 gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
424
425 mutex_enter(&gp->gp_mutex);
426
427 lock_request->l_state |= REFERENCED_LOCK;
428 lock_request->l_graph = gp;
429
430 switch (lock_request->l_type) {
431 case F_RDLCK:
432 case F_WRLCK:
433 if (IS_QUERY_LOCK(lock_request)) {
434 flk_get_first_blocking_lock(lock_request);
435 if (lock_request->l_ofd != NULL)
436 lock_request->l_flock.l_pid = -1;
437 (*lckdat) = lock_request->l_flock;
438 } else {
439 /* process the request now */
440 error = flk_process_request(lock_request);
441 }
442 break;
443
444 case F_UNLCK:
445 /* unlock request will not block so execute it immediately */
446 error = flk_execute_request(lock_request);
447 break;
448
449 default:
450 error = EINVAL;
451 break;
452 }
453
454 if (lock_request == &stack_lock_request) {
455 flk_set_state(lock_request, FLK_DEAD_STATE);
456 } else {
457 lock_request->l_state &= ~REFERENCED_LOCK;
458 if ((error != 0) || IS_DELETED(lock_request)) {
459 flk_set_state(lock_request, FLK_DEAD_STATE);
460 flk_free_lock(lock_request);
461 }
462 }
463
464 mutex_exit(&gp->gp_mutex);
465 if (serialize)
466 nbl_end_crit(vp);
467
468 return (error);
469
470 done:
471 flk_set_state(lock_request, FLK_DEAD_STATE);
472 if (lock_request != &stack_lock_request)
473 flk_free_lock(lock_request);
474 return (error);
475 }
476
477 /*
478 * Remove any lock on the vnode belonging to the given file_t.
479 * Called from closef on last close, file_t is locked.
480 *
481 * This is modeled on the cleanlocks() function but only removes the single
482 * lock associated with fp.
483 */
484 void
485 ofdcleanlock(file_t *fp)
486 {
487 lock_descriptor_t *fplock, *lock, *nlock;
488 vnode_t *vp;
489 graph_t *gp;
490
491 ASSERT(MUTEX_HELD(&fp->f_tlock));
492
493 if ((fplock = (lock_descriptor_t *)fp->f_filock) == NULL)
494 return;
495
496 fp->f_filock = NULL;
497 vp = fp->f_vnode;
498
499 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
500
501 if (gp == NULL)
502 return;
503 mutex_enter(&gp->gp_mutex);
504
505 CHECK_SLEEPING_LOCKS(gp);
506 CHECK_ACTIVE_LOCKS(gp);
507
508 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
509
510 if (lock) {
511 do {
512 nlock = lock->l_next;
513 if (fplock == lock) {
514 CANCEL_WAKEUP(lock);
515 break;
516 }
517 lock = nlock;
518 } while (lock->l_vnode == vp);
519 }
520
521 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
522
523 if (lock) {
524 do {
525 nlock = lock->l_next;
526 if (fplock == lock) {
527 flk_delete_active_lock(lock, 0);
528 flk_wakeup(lock, 1);
529 flk_free_lock(lock);
530 break;
531 }
532 lock = nlock;
533 } while (lock->l_vnode == vp);
534 }
535
536 CHECK_SLEEPING_LOCKS(gp);
537 CHECK_ACTIVE_LOCKS(gp);
538 mutex_exit(&gp->gp_mutex);
539 }
540
541 /*
542 * Routine called from fs_frlock in fs/fs_subr.c
543 *
544 * This implements traditional POSIX style record locking. The two primary
545 * drawbacks to this style of locking are:
546 * 1) It is per-process, so any close of a file descriptor that refers to the
547 * file will drop the lock (e.g. lock /etc/passwd, call a library function
548 * which opens /etc/passwd to read the file; when the library closes its
549 * file descriptor the application loses its lock and does not know).
550 * 2) Locks are not preserved across fork(2).
551 *
552 * Because these locks are only associated with a PID, they are per-process.
553 * This is why any close will drop the lock and is also why, once the process
554 * forks, the lock is no longer related to the new process. These locks can
555 * be considered as PID-ful.
556 *
557 * See ofdlock() for the implementation of a similar but improved locking
558 * scheme.
559 */
560 int
561 reclock(vnode_t *vp, flock64_t *lckdat, int cmd, int flag, u_offset_t offset,
562 flk_callback_t *flk_cbp)
563 {
564 lock_descriptor_t stack_lock_request;
565 lock_descriptor_t *lock_request;
566 int error = 0;
567 graph_t *gp;
568 int nlmid;
569
570 /*
571 * Check access permissions
572 */
573 if ((cmd & SETFLCK) &&
574 ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
575 (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
576 return (EBADF);
577
578 /*
579 * for query and unlock we use the stack_lock_request
580 */
581
582 if ((lckdat->l_type == F_UNLCK) ||
583 !((cmd & INOFLCK) || (cmd & SETFLCK))) {
584 lock_request = &stack_lock_request;
585 bzero(lock_request, sizeof (lock_descriptor_t));
586
587 /*
588 * following is added to make the assertions in
589 * flk_execute_request() pass
590 */
591
592 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
593 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
594 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
595 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
596 lock_request->l_status = FLK_INITIAL_STATE;
597 } else {
598 lock_request = flk_get_lock();
599 }
600 lock_request->l_state = 0;
601 lock_request->l_vnode = vp;
602 lock_request->l_zoneid = getzoneid();
603
604 /*
605 * Convert the request range into the canonical start and end
606 * values. The NLM protocol supports locking over the entire
607 * 32-bit range, so there's no range checking for remote requests,
608 * but we still need to verify that local requests obey the rules.
609 */
610 /* Clustering */
611 if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
612 ASSERT(lckdat->l_whence == 0);
613 lock_request->l_start = lckdat->l_start;
614 lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
615 lckdat->l_start + (lckdat->l_len - 1);
616 } else {
617 /* check the validity of the lock range */
618 error = flk_convert_lock_data(vp, lckdat,
619 &lock_request->l_start, &lock_request->l_end,
620 offset);
621 if (error) {
622 goto done;
623 }
624 error = flk_check_lock_data(lock_request->l_start,
625 lock_request->l_end, MAXEND);
626 if (error) {
627 goto done;
628 }
629 }
630
631 ASSERT(lock_request->l_end >= lock_request->l_start);
632
633 lock_request->l_type = lckdat->l_type;
634 if (cmd & INOFLCK)
635 lock_request->l_state |= IO_LOCK;
636 if (cmd & SLPFLCK)
637 lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
638 if (cmd & RCMDLCK)
639 lock_request->l_state |= LOCKMGR_LOCK;
640 if (cmd & NBMLCK)
641 lock_request->l_state |= NBMAND_LOCK;
642 /*
643 * Clustering: set flag for PXFS locks
644 * We do not _only_ check for the PCMDLCK flag because PXFS locks could
645 * also be of type 'RCMDLCK'.
646 * We do not _only_ check the GETPXFSID() macro because local PXFS
647 * clients use a pxfsid of zero to permit deadlock detection in the LLM.
648 */
649
650 if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
651 lock_request->l_state |= PXFS_LOCK;
652 }
653 if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
654 if (lock_request->l_type == F_RDLCK ||
655 lock_request->l_type == F_WRLCK)
656 lock_request->l_state |= QUERY_LOCK;
657 }
658 lock_request->l_flock = (*lckdat);
659 lock_request->l_callbacks = flk_cbp;
660
661 /*
662 * We are ready for processing the request
663 */
664 if (IS_LOCKMGR(lock_request)) {
665 /*
666 * If the lock request is an NLM server request ....
667 */
668 if (nlm_status_size == 0) { /* not booted as cluster */
669 mutex_enter(&flock_lock);
670 /*
671 * Bail out if this is a lock manager request and the
672 * lock manager is not supposed to be running.
673 */
674 if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
675 mutex_exit(&flock_lock);
676 error = ENOLCK;
677 goto done;
678 }
679 mutex_exit(&flock_lock);
680 } else { /* booted as a cluster */
681 nlmid = GETNLMID(lock_request->l_flock.l_sysid);
682 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
683
684 mutex_enter(&nlm_reg_lock);
685 /*
686 * If the NLM registry does not know about this
687 * NLM server making the request, add its nlmid
688 * to the registry.
689 */
690 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
691 nlmid)) {
692 FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
693 } else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
694 nlmid)) {
695 /*
696 * If the NLM server is already known (has made
697 * previous lock requests) and its state is
698 * not NLM_UP (means that NLM server is
699 * shutting down), then bail out with an
700 * error to deny the lock request.
701 */
702 mutex_exit(&nlm_reg_lock);
703 error = ENOLCK;
704 goto done;
705 }
706 mutex_exit(&nlm_reg_lock);
707 }
708 }
709
710 /* Now get the lock graph for a particular vnode */
711 gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
712
713 /*
714 * We drop rwlock here otherwise this might end up causing a
715 * deadlock if this IOLOCK sleeps. (bugid # 1183392).
716 */
717
718 if (IS_IO_LOCK(lock_request)) {
719 VOP_RWUNLOCK(vp,
720 (lock_request->l_type == F_RDLCK) ?
721 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
722 }
723 mutex_enter(&gp->gp_mutex);
724
725 lock_request->l_state |= REFERENCED_LOCK;
726 lock_request->l_graph = gp;
727
728 switch (lock_request->l_type) {
729 case F_RDLCK:
730 case F_WRLCK:
731 if (IS_QUERY_LOCK(lock_request)) {
732 flk_get_first_blocking_lock(lock_request);
733 if (lock_request->l_ofd != NULL)
734 lock_request->l_flock.l_pid = -1;
735 (*lckdat) = lock_request->l_flock;
736 break;
737 }
738
739 /* process the request now */
740
741 error = flk_process_request(lock_request);
742 break;
743
744 case F_UNLCK:
745 /* unlock request will not block so execute it immediately */
746
747 if (IS_LOCKMGR(lock_request) &&
748 flk_canceled(lock_request)) {
749 error = 0;
750 } else {
751 error = flk_execute_request(lock_request);
752 }
753 break;
754
755 case F_UNLKSYS:
756 /*
757 * Recovery mechanism to release lock manager locks when an
758 * NFS client crashes and restarts. The NFS server will clear
759 * the old locks and grant new locks.
760 */
761
762 if (lock_request->l_flock.l_sysid == 0) {
763 mutex_exit(&gp->gp_mutex);
764 return (EINVAL);
765 }
766 if (secpolicy_nfs(CRED()) != 0) {
767 mutex_exit(&gp->gp_mutex);
768 return (EPERM);
769 }
770 flk_delete_locks_by_sysid(lock_request);
771 lock_request->l_state &= ~REFERENCED_LOCK;
772 flk_set_state(lock_request, FLK_DEAD_STATE);
773 flk_free_lock(lock_request);
774 mutex_exit(&gp->gp_mutex);
775 return (0);
776
777 default:
778 error = EINVAL;
779 break;
780 }
781
782 /* Clustering: For blocked PXFS locks, return */
783 if (error == PXFS_LOCK_BLOCKED) {
784 lock_request->l_state &= ~REFERENCED_LOCK;
785 mutex_exit(&gp->gp_mutex);
786 return (error);
787 }
788
789 /*
790 * Now that we have seen the status of locks in the system for
791 * this vnode we acquire the rwlock if it is an IO_LOCK.
792 */
793
794 if (IS_IO_LOCK(lock_request)) {
795 (void) VOP_RWLOCK(vp,
796 (lock_request->l_type == F_RDLCK) ?
797 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
798 if (!error) {
799 lckdat->l_type = F_UNLCK;
800
801 /*
802 * This wakeup is needed because otherwise,
803 * if the IO_LOCK has slept, its dependents
804 * would not be woken up at all. (bugid # 1185482).
805 */
806
807 flk_wakeup(lock_request, 1);
808 flk_set_state(lock_request, FLK_DEAD_STATE);
809 flk_free_lock(lock_request);
810 }
811 /*
812 * Otherwise, if an error occurred, either flk_process_request()
813 * returned EDEADLK, in which case there are no dependents for
814 * this lock, or flk_wait_execute_request() returned EINTR (or
815 * EBADF), in which case flk_cancel_sleeping_lock() has already
816 * been done.
817 */
818 }
819
820 if (lock_request == &stack_lock_request) {
821 flk_set_state(lock_request, FLK_DEAD_STATE);
822 } else {
823 lock_request->l_state &= ~REFERENCED_LOCK;
824 if ((error != 0) || IS_DELETED(lock_request)) {
825 flk_set_state(lock_request, FLK_DEAD_STATE);
826 flk_free_lock(lock_request);
827 }
828 }
829
830 mutex_exit(&gp->gp_mutex);
831 return (error);
832
833 done:
834 flk_set_state(lock_request, FLK_DEAD_STATE);
835 if (lock_request != &stack_lock_request)
836 flk_free_lock(lock_request);
837 return (error);
838 }
839
840 /*
841 * Invoke the callbacks in the given list. If before sleeping, invoke in
842 * list order. If after sleeping, invoke in reverse order.
843 *
844 * CPR (suspend/resume) support: if one of the callbacks returns a
845 * callb_cpr_t, return it. This will be used to make the thread CPR-safe
846 * while it is sleeping. There should be at most one callb_cpr_t for the
847 * thread.
848 * XXX This is unnecessarily complicated. The CPR information should just
849 * get passed in directly through VOP_FRLOCK and reclock, rather than
850 * sneaking it in via a callback.
851 */
852
853 callb_cpr_t *
854 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
855 {
856 callb_cpr_t *cpr_callbackp = NULL;
857 callb_cpr_t *one_result;
858 flk_callback_t *cb;
859
860 if (cblist == NULL)
861 return (NULL);
862
863 if (when == FLK_BEFORE_SLEEP) {
864 cb = cblist;
865 do {
866 one_result = (*cb->cb_callback)(when, cb->cb_data);
867 if (one_result != NULL) {
868 ASSERT(cpr_callbackp == NULL);
869 cpr_callbackp = one_result;
870 }
871 cb = cb->cb_next;
872 } while (cb != cblist);
873 } else {
874 cb = cblist->cb_prev;
875 do {
876 one_result = (*cb->cb_callback)(when, cb->cb_data);
877 if (one_result != NULL) {
878 cpr_callbackp = one_result;
879 }
880 cb = cb->cb_prev;
881 } while (cb != cblist->cb_prev);
882 }
883
884 return (cpr_callbackp);
885 }
886
887 /*
888 * Initialize a flk_callback_t to hold the given callback.
889 */
890
891 void
892 flk_init_callback(flk_callback_t *flk_cb,
893 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
894 {
895 flk_cb->cb_next = flk_cb;
896 flk_cb->cb_prev = flk_cb;
897 flk_cb->cb_callback = cb_fcn;
898 flk_cb->cb_data = cbdata;
899 }
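
/*
 * Illustrative sketch (my_cb_fcn and my_cb_data are hypothetical names): a
 * caller with a single callback can initialize a stack flk_callback_t and
 * pass it as a one-element list:
 *
 *	flk_callback_t flk_cb;
 *
 *	flk_init_callback(&flk_cb, my_cb_fcn, my_cb_data);
 *	error = reclock(vp, lckdat, cmd, flag, offset, &flk_cb);
 */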
900
901 /*
902 * Initialize an flk_callback_t and then link it into the head of an
903 * existing list (which may be NULL).
904 */
905
906 void
907 flk_add_callback(flk_callback_t *newcb,
908 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
909 void *cbdata, flk_callback_t *cblist)
910 {
911 flk_init_callback(newcb, cb_fcn, cbdata);
912
913 if (cblist == NULL)
914 return;
915
916 newcb->cb_prev = cblist->cb_prev;
917 newcb->cb_next = cblist;
918 cblist->cb_prev->cb_next = newcb;
919 cblist->cb_prev = newcb;
920 }
921
922 /*
923 * Remove the callback from a list.
924 */
925
926 void
927 flk_del_callback(flk_callback_t *flk_cb)
928 {
929 flk_cb->cb_next->cb_prev = flk_cb->cb_prev;
930 flk_cb->cb_prev->cb_next = flk_cb->cb_next;
931
932 flk_cb->cb_prev = flk_cb;
933 flk_cb->cb_next = flk_cb;
934 }
935
936 /*
937 * Initialize the flk_edge_cache data structure and create the
938 * nlm_reg_status array.
939 */
940
941 void
942 flk_init(void)
943 {
944 uint_t i;
945
946 flk_edge_cache = kmem_cache_create("flk_edges",
947 sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
948 if (flk_edge_cache == NULL) {
949 cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
950 }
951 /*
952 * Create the NLM registry object.
953 */
954
955 if (cluster_bootflags & CLUSTER_BOOTED) {
956 /*
957 * This routine tells you the maximum node id that will be used
958 * in the cluster. This number, plus 1, will be the size of the
959 * nlm registry status array. We add 1 because we will be using
960 * all entries indexed from 0 to maxnodeid; e.g., from 0
961 * to 64, for a total of 65 entries.
962 */
963 nlm_status_size = clconf_maximum_nodeid() + 1;
964 } else {
965 nlm_status_size = 0;
966 }
967
968 if (nlm_status_size != 0) { /* booted as a cluster */
969 nlm_reg_status = (flk_nlm_status_t *)
970 kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
971 KM_SLEEP);
972
973 /* initialize all NLM states in array to NLM_UNKNOWN */
974 for (i = 0; i < nlm_status_size; i++) {
975 nlm_reg_status[i] = FLK_NLM_UNKNOWN;
976 }
977 }
978
979 mutex_init(&flock_lock, NULL, MUTEX_DEFAULT, NULL);
980 mutex_init(&nlm_reg_lock, NULL, MUTEX_DEFAULT, NULL);
981
982 rw_init(&sysid_to_host_translator_lock, NULL, RW_DEFAULT, NULL);
983 list_create(&sysid_to_host_translator_list,
984 sizeof (struct sysid_to_host_translator_entry),
985 offsetof(struct sysid_to_host_translator_entry, node));
986 }
987
988 /*
989 * Zone constructor/destructor callbacks to be executed when a zone is
990 * created/destroyed.
991 */
992 /* ARGSUSED */
993 void *
994 flk_zone_init(zoneid_t zoneid)
995 {
996 struct flock_globals *fg;
997 uint_t i;
998
999 fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
1000 fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
1001 for (i = 0; i < HASH_SIZE; i++)
1002 fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
1003 return (fg);
1004 }
1005
1006 /* ARGSUSED */
1007 void
1008 flk_zone_fini(zoneid_t zoneid, void *data)
1009 {
1010 struct flock_globals *fg = data;
1011
1012 kmem_free(fg, sizeof (*fg));
1013 }
1014
1015 /*
1016 * Get a lock_descriptor structure with initialization of edge lists.
1017 */
1018
1019 static lock_descriptor_t *
1020 flk_get_lock(void)
1021 {
1022 lock_descriptor_t *l;
1023
1024 l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
1025
1026 cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
1027 l->l_edge.edge_in_next = &l->l_edge;
1028 l->l_edge.edge_in_prev = &l->l_edge;
1029 l->l_edge.edge_adj_next = &l->l_edge;
1030 l->l_edge.edge_adj_prev = &l->l_edge;
1031 l->pvertex = -1;
1032 l->l_status = FLK_INITIAL_STATE;
1033 flk_lock_allocs++;
1034 return (l);
1035 }
1036
1037 /*
1038 * Free a lock_descriptor structure. If some thread still has a reference
1039 * to it (as in reclock()), just set the DELETED_LOCK flag instead.
1040 */
1041
1042 void
1043 flk_free_lock(lock_descriptor_t *lock)
1044 {
1045 file_t *fp;
1046
1047 ASSERT(lock->l_blocker >= 0);
1048 ASSERT(IS_DEAD(lock));
1049
1050 if ((fp = lock->l_ofd) != NULL && fp->f_filock == (struct filock *)lock)
1051 fp->f_filock = NULL;
1052
1053 if (IS_REFERENCED(lock)) {
1054 lock->l_state |= DELETED_LOCK;
1055 return;
1056 }
1057 flk_lock_frees++;
1058 kmem_free(lock, sizeof (lock_descriptor_t));
1059 }
1060
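/*
 * Set the status of a lock, subject to the wakeup ordering rules described
 * in the comment below.
 */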
1061 void
1062 flk_set_state(lock_descriptor_t *lock, int new_state)
1063 {
1064 /*
1065 * Locks in the sleeping list may be woken up in a number of ways,
1066 * and more than once. If a sleeping lock is signaled awake more
1067 * than once, then it may or may not change state depending on its
1068 * current state.
1069 * Also note that NLM locks that are sleeping could be moved to an
1070 * interrupted state more than once if the unlock request is
1071 * retransmitted by the NLM client - the second time around, this is
1072 * just a nop.
1073 * The ordering of being signaled awake is:
1074 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
1075 * The checks below implement this ordering.
1076 */
1077 if (IS_INTERRUPTED(lock)) {
1078 if ((new_state == FLK_CANCELLED_STATE) ||
1079 (new_state == FLK_GRANTED_STATE) ||
1080 (new_state == FLK_INTERRUPTED_STATE)) {
1081 return;
1082 }
1083 }
1084 if (IS_CANCELLED(lock)) {
1085 if ((new_state == FLK_GRANTED_STATE) ||
1086 (new_state == FLK_CANCELLED_STATE)) {
1087 return;
1088 }
1089 }
1090 CHECK_LOCK_TRANSITION(lock->l_status, new_state);
1091 if (IS_PXFS(lock)) {
1092 cl_flk_state_transition_notify(lock, lock->l_status, new_state);
1093 }
1094 lock->l_status = new_state;
1095 }
1096
1097 /*
1098 * Support for the remote stale lock detection
1099 */
1100
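/*
 * Register a new sysid to host name translator. Translators are consulted
 * by translate_sysid_to_host(), most recently added first.
 */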
1101 void
1102 flk_add_sysid_to_host_translator(sysid_to_host_translator_t tr)
1103 {
1104 struct sysid_to_host_translator_entry *te;
1105
1106 te = kmem_alloc(sizeof (struct sysid_to_host_translator_entry),
1107 KM_SLEEP);
1108
1109 te->translator = tr;
1110
1111 rw_enter(&sysid_to_host_translator_lock, RW_WRITER);
1112 list_insert_head(&sysid_to_host_translator_list, te);
1113 rw_exit(&sysid_to_host_translator_lock);
1114 }
1115
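/*
 * Translate a sysid to a printable host address and host type by asking the
 * registered translators. If no translator succeeds, both the host and the
 * type default to "?".
 */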
1116 static void
1117 translate_sysid_to_host(zoneid_t zoneid, sysid_t sysid, char *host, size_t hlen,
1118 const char **type)
1119 {
1120 struct sockaddr sa;
1121 struct sysid_to_host_translator_entry *te;
1122
1123 /* Some defaults in case the translation fails */
1124 *type = "?";
1125 (void) strlcpy(host, "?", hlen);
1126
1127 rw_enter(&sysid_to_host_translator_lock, RW_READER);
1128
1129 for (te = list_head(&sysid_to_host_translator_list); te != NULL;
1130 te = list_next(&sysid_to_host_translator_list, te)) {
1131
1132 if (te->translator(zoneid, sysid, &sa, type) != 0) {
1133 rw_exit(&sysid_to_host_translator_lock);
1134
1135 switch (sa.sa_family) {
1136 case AF_INET:
1137 (void) inet_ntop(AF_INET,
1138 &((struct sockaddr_in *)&sa)->sin_addr,
1139 host, hlen);
1140 break;
1141 case AF_INET6:
1142 (void) inet_ntop(AF_INET6,
1143 &((struct sockaddr_in6 *)&sa)->sin6_addr,
1144 host, hlen);
1145 break;
1146 default:
1147 break;
1148 }
1149
1150 return;
1151 }
1152 }
1153
1154 rw_exit(&sysid_to_host_translator_lock);
1155 }
1156
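/*
 * Return a copy of the vnode's cached v_path, or NULL if there is none.
 * The length is sampled under v_lock, the buffer is allocated, and the path
 * is then re-checked in case it changed in the meantime. The caller frees
 * the result with strfree().
 */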
1157 static char *
1158 get_vnode_path(vnode_t *vp)
1159 {
1160 size_t len;
1161 char *ret;
1162
1163 mutex_enter(&vp->v_lock);
1164 if (vp->v_path == NULL) {
1165 mutex_exit(&vp->v_lock);
1166 return (NULL);
1167 }
1168 len = strlen(vp->v_path) + 1;
1169 mutex_exit(&vp->v_lock);
1170
1171 ret = kmem_alloc(len, KM_SLEEP);
1172
1173 mutex_enter(&vp->v_lock);
1174 if (vp->v_path == NULL || strlen(vp->v_path) + 1 != len) {
1175 mutex_exit(&vp->v_lock);
1176 kmem_free(ret, len);
1177 return (NULL);
1178 }
1179 bcopy(vp->v_path, ret, len);
1180 mutex_exit(&vp->v_lock);
1181
1182 return (ret);
1183 }
1184
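/*
 * Check whether a remote lock that is blocking a request looks stale. The
 * first time the lock is seen blocking, the current time is recorded in
 * l_blocker. Once it has been blocking for at least stale_lock_timeout
 * seconds it is reported, and l_blocker is negated to mark it as reported.
 */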
1185 static void
1186 flk_stale_lock_check(lock_descriptor_t *lock)
1187 {
1188 char *path;
1189
1190 char host[INET6_ADDRSTRLEN]; /* host name */
1191 const char *type; /* host type */
1192
1193 /* temporary variables for the cmn_err() call */
1194 char *p, *t; /* path, lock type */
1195 pid_t pid; /* pid */
1196 void *v; /* vnode */
1197 u_offset_t s, e; /* start, end */
1198
1199 ASSERT(MUTEX_HELD(&lock->l_graph->gp_mutex));
1200
1201 /*
1202 * Either not a remote lock, or the stale lock checking is disabled, or
1203 * the lock has already been reported.
1204 */
1205 if (IS_LOCAL(lock) || stale_lock_timeout == 0 || lock->l_blocker < 0)
1206 return;
1207
1208 /* Seen first time? */
1209 if (lock->l_blocker == 0) {
1210 lock->l_blocker = gethrtime();
1211 return;
1212 }
1213
1214 /* Old enough? */
1215 if ((gethrtime() - lock->l_blocker) / NANOSEC < stale_lock_timeout)
1216 return;
1217
1218 translate_sysid_to_host(lock->l_zoneid, lock->l_flock.l_sysid, host,
1219 sizeof (host), &type);
1220 path = get_vnode_path(lock->l_vnode);
1221
1222 pid = lock->l_flock.l_pid;
1223 v = (void *)lock->l_vnode;
1224 p = path == NULL ? "?" : path;
1225 t = lock->l_type == F_WRLCK ? "WR" : "RD";
1226 s = lock->l_start;
1227 e = lock->l_end;
1228
1229 /* Report the blocker as stale */
1230 cmn_err(CE_NOTE, "!Stale lock (host: %s (%s), pid: %d, vnode: %p, "
1231 "path: %s, %sLCK: %llu:%llu)", host, type, pid, v, p, t, s, e);
1232
1233 if (path != NULL)
1234 strfree(path);
1235
1236 /* Mark this blocker as reported */
1237 lock->l_blocker = -lock->l_blocker;
1238 }
1239
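/*
 * Log a notice that the stale lock 'lock' was shrunk to the range described
 * by 'new'.
 */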
1240 static void
1241 flk_stale_lock_shrink(lock_descriptor_t *lock, lock_descriptor_t *new)
1242 {
1243 char *path;
1244
1245 char host[INET6_ADDRSTRLEN]; /* host name */
1246 const char *type; /* host type */
1247
1248 /* temporary variables for the cmn_err() call */
1249 char *p, *t; /* path, lock type */
1250 pid_t pid; /* pid */
1251 void *v; /* vnode */
1252 u_offset_t s, e; /* start, end */
1253 u_offset_t ns, ne; /* new start, new end */
1254
1255 ASSERT(MUTEX_HELD(&lock->l_graph->gp_mutex));
1256
1257 translate_sysid_to_host(lock->l_zoneid, lock->l_flock.l_sysid, host,
1258 sizeof (host), &type);
1259 path = get_vnode_path(lock->l_vnode);
1260
1261 pid = lock->l_flock.l_pid;
1262 v = (void *)lock->l_vnode;
1263 p = path == NULL ? "?" : path;
1264 t = lock->l_type == F_WRLCK ? "WR" : "RD";
1265 s = lock->l_start;
1266 e = lock->l_end;
1267 ns = new->l_start;
1268 ne = new->l_end;
1269
1270 cmn_err(CE_NOTE, "!Stale lock SHRINK (host: %s (%s), pid: %d, "
1271 "vnode: %p, path: %s, %sLCK: %llu:%llu -> %llu:%llu)", host, type,
1272 pid, v, p, t, s, e, ns, ne);
1273
1274 if (path != NULL)
1275 strfree(path);
1276 }
1277
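/*
 * Log a notice that the stale lock 'lock' was split into the two ranges
 * described by 'new1' and 'new2'.
 */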
1278 static void
1279 flk_stale_lock_split(lock_descriptor_t *lock, lock_descriptor_t *new1,
1280 lock_descriptor_t *new2)
1281 {
1282 char *path;
1283
1284 char host[INET6_ADDRSTRLEN]; /* host name */
1285 const char *type; /* host type */
1286
1287 /* temporary variables for the cmn_err() call */
1288 char *p, *t; /* path, lock type */
1289 pid_t pid; /* pid */
1290 void *v; /* vnode */
1291 u_offset_t s, e; /* start, end */
1292 u_offset_t n1s, n1e; /* new1 start, new1 end */
1293 u_offset_t n2s, n2e; /* new2 start, new2 end */
1294
1295 ASSERT(MUTEX_HELD(&lock->l_graph->gp_mutex));
1296
1297 translate_sysid_to_host(lock->l_zoneid, lock->l_flock.l_sysid, host,
1298 sizeof (host), &type);
1299 path = get_vnode_path(lock->l_vnode);
1300
1301 pid = lock->l_flock.l_pid;
1302 v = (void *)lock->l_vnode;
1303 p = path == NULL ? "?" : path;
1304 t = lock->l_type == F_WRLCK ? "WR" : "RD";
1305 s = lock->l_start;
1306 e = lock->l_end;
1307 n1s = new1->l_start;
1308 n1e = new1->l_end;
1309 n2s = new2->l_start;
1310 n2e = new2->l_end;
1311
1312 cmn_err(CE_NOTE, "!Stale lock SPLIT (host: %s (%s), pid: %d, "
1313 "vnode: %p, path: %s, %sLCK: %llu:%llu -> %llu:%llu and %llu:%llu)",
1314 host, type, pid, v, p, t, s, e, n1s, n1e, n2s, n2e);
1315
1316 if (path != NULL)
1317 strfree(path);
1318 }
1319
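/*
 * Log a notice that the stale lock 'lock' was released.
 */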
1320 static void
1321 flk_stale_lock_release(lock_descriptor_t *lock)
1322 {
1323 char *path;
1324
1325 char host[INET6_ADDRSTRLEN]; /* host name */
1326 const char *type; /* host type */
1327
1328 /* temporary variables for the cmn_err() call */
1329 char *p, *t; /* path, lock type */
1330 pid_t pid; /* pid */
1331 void *v; /* vnode */
1332 u_offset_t s, e; /* start, end */
1333
1334 ASSERT(MUTEX_HELD(&lock->l_graph->gp_mutex));
1335
1336 translate_sysid_to_host(lock->l_zoneid, lock->l_flock.l_sysid, host,
1337 sizeof (host), &type);
1338 path = get_vnode_path(lock->l_vnode);
1339
1340 pid = lock->l_flock.l_pid;
1341 v = (void *)lock->l_vnode;
1342 p = path == NULL ? "?" : path;
1343 t = lock->l_type == F_WRLCK ? "WR" : "RD";
1344 s = lock->l_start;
1345 e = lock->l_end;
1346
1347 cmn_err(CE_NOTE, "!Stale lock RELEASE (host: %s (%s), pid: %d, "
1348 "vnode: %p, path: %s, %sLCK: %llu:%llu)", host, type, pid, v, p, t,
1349 s, e);
1350
1351 if (path != NULL)
1352 strfree(path);
1353 }
1354
1355 /*
1356 * Routine that checks whether there are any blocking locks in the system.
1357 *
1358 * The policy followed is that if a write lock is sleeping, we don't allow
1359 * read locks before this write lock, even though there may not be any
1360 * active locks corresponding to the read locks' region.
1361 *
1362 * The flk_add_edge() function adds an edge between l1 and l2 iff there
1363 * is no path between l1 and l2. This is done to have a "minimum
1364 * storage representation" of the dependency graph.
1365 *
1366 * Another property of the graph is that, since only the new request adds
1367 * edges to the existing locks in the graph, the graph is always
1368 * topologically ordered.
1369 */
1370
1371 static int
1372 flk_process_request(lock_descriptor_t *request)
1373 {
1374 graph_t *gp = request->l_graph;
1375 lock_descriptor_t *lock;
1376 int request_blocked_by_active = 0;
1377 int request_blocked_by_granted = 0;
1378 int request_blocked_by_sleeping = 0;
1379 vnode_t *vp = request->l_vnode;
1380 int error = 0;
1381 int request_will_wait = 0;
1382 int found_covering_lock = 0;
1383 lock_descriptor_t *covered_by = NULL;
1384
1385 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1386 request_will_wait = IS_WILLING_TO_SLEEP(request);
1387
1388 /*
1389 * check active locks
1390 */
1391
1392 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1393
1394 if (lock) {
1395 do {
1396 if (BLOCKS(lock, request)) {
1397 if (!request_will_wait) {
1398 flk_stale_lock_check(lock);
1399 return (EAGAIN);
1400 }
1401 request_blocked_by_active = 1;
1402 break;
1403 }
1404 /*
1405 * Grant the request if its owner already holds an active
1406 * lock that covers the requested range.
1407 */
1408
1409 if (SAME_OWNER(lock, request) &&
1410 COVERS(lock, request) &&
1411 (request->l_type == F_RDLCK))
1412 return (flk_execute_request(request));
1413 lock = lock->l_next;
1414 } while (lock->l_vnode == vp);
1415 }
1416
1417 if (!request_blocked_by_active) {
1418 lock_descriptor_t *lk[1];
1419 lock_descriptor_t *first_glock = NULL;
1420
1421 /*
1422 * Shall we grant this?! NO!!
1423 * What about those locks that were just granted and are still
1424 * in the sleep queue? Their threads have been woken up, so the
1425 * locks are almost active.
1426 */
1427 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1428 if (lock) {
1429 do {
1430 if (BLOCKS(lock, request)) {
1431 if (IS_GRANTED(lock)) {
1432 request_blocked_by_granted = 1;
1433 } else {
1434 request_blocked_by_sleeping = 1;
1435 }
1436 }
1437
1438 lock = lock->l_next;
1439 } while ((lock->l_vnode == vp));
1440 first_glock = lock->l_prev;
1441 ASSERT(first_glock->l_vnode == vp);
1442 }
1443
1444 if (request_blocked_by_granted)
1445 goto block;
1446
1447 if (!request_blocked_by_sleeping) {
1448 /*
1449 * If the request isn't going to be blocked by a
1450 * sleeping request, we know that it isn't going to
1451 * be blocked; we can just execute the request --
1452 * without performing costly deadlock detection.
1453 */
1454 ASSERT(!request_blocked_by_active);
1455 return (flk_execute_request(request));
1456 } else if (request->l_type == F_RDLCK) {
1457 /*
1458 * If we have a sleeping writer in the requested
1459 * lock's range, block.
1460 */
1461 goto block;
1462 }
1463
1464 lk[0] = request;
1465 request->l_state |= RECOMPUTE_LOCK;
1466 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1467 if (lock) {
1468 do {
1469 flk_recompute_dependencies(lock, lk, 1, 0);
1470 lock = lock->l_next;
1471 } while (lock->l_vnode == vp);
1472 }
1473 lock = first_glock;
1474 if (lock) {
1475 do {
1476 if (IS_GRANTED(lock)) {
1477 flk_recompute_dependencies(lock, lk, 1,
1478 0);
1479 }
1480 lock = lock->l_prev;
1481 } while ((lock->l_vnode == vp));
1482 }
1483 request->l_state &= ~RECOMPUTE_LOCK;
1484 if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
1485 return (EDEADLK);
1486 return (flk_execute_request(request));
1487 }
1488
1489 block:
1490 if (request_will_wait)
1491 flk_graph_uncolor(gp);
1492
1493 /* check sleeping locks */
1494
1495 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1496
1497 /*
1498 * If we find a sleeping write lock that is a superset of the
1499 * region wanted by the request, we can be assured that by adding an
1500 * edge to this write lock we have paths to all locks in the
1501 * graph that block the request, except in one case, and that is why
1502 * there is another check for SAME_OWNER in the loop below. The
1503 * exception is when the process that owns the sleeping write lock
1504 * 'l1' has other locks l2, l3, l4 in the system that arrived
1505 * before l1. l1 does not have paths to these locks as they are from
1506 * the same process. We break when we find a second covering sleeping
1507 * lock l5 owned by a process different from that owning l1, because
1508 * none of l2, l3, l4, etc. can have arrived before l5; if any had,
1509 * l1 would have produced a deadlock already.
1510 */
1511
1512 if (lock) {
1513 do {
1514 if (BLOCKS(lock, request)) {
1515 if (!request_will_wait)
1516 return (EAGAIN);
1517 if (COVERS(lock, request) &&
1518 lock->l_type == F_WRLCK) {
1519 if (found_covering_lock &&
1520 !SAME_OWNER(lock, covered_by)) {
1521 found_covering_lock++;
1522 break;
1523 }
1524 found_covering_lock = 1;
1525 covered_by = lock;
1526 }
1527 if (found_covering_lock &&
1528 !SAME_OWNER(lock, covered_by)) {
1529 lock = lock->l_next;
1530 continue;
1531 }
1532 if ((error = flk_add_edge(request, lock,
1533 !found_covering_lock, 0)))
1534 return (error);
1535 }
1536 lock = lock->l_next;
1537 } while (lock->l_vnode == vp);
1538 }
1539
1540 /*
1541 * found_covering_lock == 2 iff at this point 'request' has paths to
1542 * all locks that block 'request'. found_covering_lock == 1 iff at
1543 * this point 'request' has paths to all locks that block 'request'
1544 * whose owners are not the same as the one that covers 'request'
1545 * (covered_by above), and we can have locks whose owner is the same
1546 * as covered_by in the active list.
1547 */
1548
1549 if (request_blocked_by_active && found_covering_lock != 2) {
1550 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1551 ASSERT(lock != NULL);
1552 do {
1553 if (BLOCKS(lock, request)) {
1554 if (found_covering_lock &&
1555 !SAME_OWNER(lock, covered_by)) {
1556 lock = lock->l_next;
1557 continue;
1558 }
1559 if ((error = flk_add_edge(request, lock,
1560 CHECK_CYCLE, 0)))
1561 return (error);
1562 }
1563 lock = lock->l_next;
1564 } while (lock->l_vnode == vp);
1565 }
1566
1567 if (NOT_BLOCKED(request)) {
1568 /*
1569 * request not dependent on any other locks
1570 * so execute this request
1571 */
1572 return (flk_execute_request(request));
1573 } else {
1574 /*
1575 * check for deadlock
1576 */
1577 if (flk_check_deadlock(request))
1578 return (EDEADLK);
1579 /*
1580 * this thread has to sleep
1581 */
1582 return (flk_wait_execute_request(request));
1583 }
1584 }
1585
1586 /*
1587 * The actual execution of the request in the simple case is only to
1588 * insert the 'request' in the list of active locks if it is not an
1589 * UNLOCK.
1590 * We have to consider the existing active locks' relation to
1591 * this 'request' if they are owned by the same process. flk_relation()
1592 * does this job and sees to it that the dependency graph information is
1593 * maintained properly.
1594 */
1595
1596 int
1597 flk_execute_request(lock_descriptor_t *request)
1598 {
1599 graph_t *gp = request->l_graph;
1600 vnode_t *vp = request->l_vnode;
1601 lock_descriptor_t *lock, *lock1;
1602 int done_searching = 0;
1603
1604 CHECK_SLEEPING_LOCKS(gp);
1605 CHECK_ACTIVE_LOCKS(gp);
1606
1607 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1608
1609 flk_set_state(request, FLK_START_STATE);
1610
1611 ASSERT(NOT_BLOCKED(request));
1612
1613 /* IO_LOCK requests are only to check status */
1614
1615 if (IS_IO_LOCK(request))
1616 return (0);
1617
1618 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1619
1620 if (lock != NULL) {
1621 /*
1622 * There are some active locks so check for relations
1623 */
1624 do {
1625 lock1 = lock->l_next;
1626 if (SAME_OWNER(request, lock)) {
1627 done_searching = flk_relation(lock, request);
1628 }
1629 lock = lock1;
1630 } while (lock->l_vnode == vp && !done_searching);
1631 }
1632
1633 /*
1634 * insert in active queue
1635 */
1636
1637 if (request->l_type != F_UNLCK)
1638 flk_insert_active_lock(request);
1639
1640 return (0);
1641 }
1642
1643 /*
1644 * 'request' is blocked by someone, therefore we put it into the sleep queue.
1645 */
1646 static int
1647 flk_wait_execute_request(lock_descriptor_t *request)
1648 {
1649 graph_t *gp = request->l_graph;
1650 callb_cpr_t *cprp; /* CPR info from callback */
1651 struct flock_globals *fg;
1652 int index;
1653
1654 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1655 ASSERT(IS_WILLING_TO_SLEEP(request));
1656
1657 flk_insert_sleeping_lock(request);
1658
1659 if (IS_LOCKMGR(request)) {
1660 index = HASH_INDEX(request->l_vnode);
1661 fg = flk_get_globals();
1662
1663 if (nlm_status_size == 0) { /* not booted as a cluster */
1664 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1665 flk_cancel_sleeping_lock(request, 1);
1666 return (ENOLCK);
1667 }
1668 } else { /* booted as a cluster */
1669 /*
1670 * If the request is an NLM server lock request,
1671 * and the NLM state of the lock request is not
1672 * NLM_UP (because the NLM server is shutting
1673 * down), then cancel the sleeping lock and
1674 * return error ENOLCK that will encourage the
1675 * client to retransmit.
1676 */
1677 if (!IS_NLM_UP(request)) {
1678 flk_cancel_sleeping_lock(request, 1);
1679 return (ENOLCK);
1680 }
1681 }
1682 }
1683
1684 /* Clustering: For blocking PXFS locks, return */
1685 if (IS_PXFS(request)) {
1686 /*
1687 * PXFS locks sleep on the client side.
1688 * The callback argument is used to wake up the sleeper
1689 * when the lock is granted.
1690 * We return -1 (rather than an errno value) to indicate
1691 * the client side should sleep
1692 */
1693 return (PXFS_LOCK_BLOCKED);
1694 }
1695
1696 if (request->l_callbacks != NULL) {
1697 /*
1698 * To make sure the shutdown code works correctly, either
1699 * the callback must happen after putting the lock on the
1700 * sleep list, or we must check the shutdown status after
1701 * returning from the callback (and before sleeping). At
1702 * least for now, we'll use the first option. If a
1703 * shutdown or signal or whatever happened while the graph
1704 * mutex was dropped, that will be detected by
1705 * wait_for_lock().
1706 */
1707 mutex_exit(&gp->gp_mutex);
1708
1709 cprp = flk_invoke_callbacks(request->l_callbacks,
1710 FLK_BEFORE_SLEEP);
1711
1712 mutex_enter(&gp->gp_mutex);
1713
1714 if (cprp == NULL) {
1715 wait_for_lock(request);
1716 } else {
1717 mutex_enter(cprp->cc_lockp);
1718 CALLB_CPR_SAFE_BEGIN(cprp);
1719 mutex_exit(cprp->cc_lockp);
1720 wait_for_lock(request);
1721 mutex_enter(cprp->cc_lockp);
1722 CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1723 mutex_exit(cprp->cc_lockp);
1724 }
1725
1726 mutex_exit(&gp->gp_mutex);
1727 (void) flk_invoke_callbacks(request->l_callbacks,
1728 FLK_AFTER_SLEEP);
1729 mutex_enter(&gp->gp_mutex);
1730 } else {
1731 wait_for_lock(request);
1732 }
1733
1734 if (IS_LOCKMGR(request)) {
1735 /*
1736 * If the lock manager is shutting down, return an
1737 * error that will encourage the client to retransmit.
1738 */
1739 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1740 !IS_GRANTED(request)) {
1741 flk_cancel_sleeping_lock(request, 1);
1742 return (ENOLCK);
1743 }
1744 }
1745
1746 if (IS_INTERRUPTED(request)) {
1747 /* we got a signal, or act like we did */
1748 flk_cancel_sleeping_lock(request, 1);
1749 return (EINTR);
1750 }
1751
1752 /* Cancelled if some other thread has closed the file */
1753
1754 if (IS_CANCELLED(request)) {
1755 flk_cancel_sleeping_lock(request, 1);
1756 return (EBADF);
1757 }
1758
1759 request->l_state &= ~GRANTED_LOCK;
1760 REMOVE_SLEEP_QUEUE(request);
1761 return (flk_execute_request(request));
1762 }
1763
1764 /*
1765 * This routine adds an edge between from and to because from depends
1766 * on to. If asked to check for deadlock, it checks whether any lock
1767 * reachable from "from_lock" is owned by the same process as
1768 * "from_lock".
1769 * NOTE: It is the caller's responsibility to make sure that the color
1770 * of the graph is consistent between the calls to flk_add_edge as done
1771 * in flk_process_request. This routine does not color and check for
1772 * deadlock explicitly.
1773 */
1774
1775 static int
1776 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1777 int check_cycle, int update_graph)
1778 {
1779 edge_t *edge;
1780 edge_t *ep;
1781 lock_descriptor_t *vertex;
1782 lock_descriptor_t *vertex_stack;
1783
1784 STACK_INIT(vertex_stack);
1785
1786 /*
1787 * If the to vertex already has the mark color, just return and
1788 * don't add an edge, as it was already reached from the from
1789 * vertex earlier.
1790 */
1791
1792 if (COLORED(to_lock))
1793 return (0);
1794
1795 edge = flk_get_edge();
1796
1797 /*
1798 * set the from and to vertex
1799 */
1800
1801 edge->from_vertex = from_lock;
1802 edge->to_vertex = to_lock;
1803
1804 /*
1805 * put in adjacency list of from vertex
1806 */
1807
1808 from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1809 edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1810 edge->edge_adj_prev = &from_lock->l_edge;
1811 from_lock->l_edge.edge_adj_next = edge;
1812
1813 /*
1814 * put in list of to vertex
1815 */
1816
1817 to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1818 edge->edge_in_next = to_lock->l_edge.edge_in_next;
1819 to_lock->l_edge.edge_in_next = edge;
1820 edge->edge_in_prev = &to_lock->l_edge;
1821
1822
1823 if (update_graph) {
1824 flk_update_proc_graph(edge, 0);
1825 return (0);
1826 }
1827 if (!check_cycle) {
1828 return (0);
1829 }
1830
1831 STACK_PUSH(vertex_stack, from_lock, l_stack);
1832
1833 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1834
1835 STACK_POP(vertex_stack, l_stack);
1836
1837 for (ep = FIRST_ADJ(vertex);
1838 ep != HEAD(vertex);
1839 ep = NEXT_ADJ(ep)) {
1840 if (COLORED(ep->to_vertex))
1841 continue;
1842 COLOR(ep->to_vertex);
1843 if (SAME_OWNER(ep->to_vertex, from_lock))
1844 goto dead_lock;
1845 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1846 }
1847 }
1848 return (0);
1849
1850 dead_lock:
1851
1852 /*
1853 * remove all edges
1854 */
1855
1856 ep = FIRST_ADJ(from_lock);
1857
1858 while (ep != HEAD(from_lock)) {
1859 IN_LIST_REMOVE(ep);
1860 from_lock->l_sedge = NEXT_ADJ(ep);
1861 ADJ_LIST_REMOVE(ep);
1862 flk_free_edge(ep);
1863 ep = from_lock->l_sedge;
1864 }
1865 return (EDEADLK);
1866 }
1867
1868 /*
1869 * Get an edge structure for representing the dependency between two locks.
1870 */
1871
1872 static edge_t *
1873 flk_get_edge()
1874 {
1875 edge_t *ep;
1876
1877 ASSERT(flk_edge_cache != NULL);
1878
1879 ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1880 edge_allocs++;
1881 return (ep);
1882 }
1883
1884 /*
1885 * Free the edge structure.
1886 */
1887
1888 static void
1889 flk_free_edge(edge_t *ep)
1890 {
1891 edge_frees++;
1892 kmem_cache_free(flk_edge_cache, (void *)ep);
1893 }
1894
1895 /*
 * Check the relationship of 'request' with 'lock' and perform the
 * recomputation of dependencies, break 'lock' if required, and return
 * 1 if 'request' cannot have any more relationship with the next
 * active locks.
 *
 * The 'lock' and 'request' are compared and in case of overlap we
 * delete the 'lock' and form new locks to represent the non-overlapping
 * portion of the original 'lock'. This function has side effects: 'lock'
 * will be freed and new locks may be added to the active list.
1905 */
1906
1907 static int
1908 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1909 {
1910 int lock_effect;
1911 lock_descriptor_t *topology[3];
1912 int nvertex = 0;
1913 int i;
1914 edge_t *ep;
1915 graph_t *gp = lock->l_graph;
1916 boolean_t mergeable;
1917
1918 ASSERT(request->l_blocker == 0);
1919
1920 CHECK_SLEEPING_LOCKS(gp);
1921 CHECK_ACTIVE_LOCKS(gp);
1922
1923 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1924
1925 topology[0] = topology[1] = topology[2] = NULL;
1926
1927 if (request->l_type == F_UNLCK)
1928 lock_effect = FLK_UNLOCK;
1929 else if (request->l_type == F_RDLCK &&
1930 lock->l_type == F_WRLCK)
1931 lock_effect = FLK_DOWNGRADE;
1932 else if (request->l_type == F_WRLCK &&
1933 lock->l_type == F_RDLCK)
1934 lock_effect = FLK_UPGRADE;
1935 else
1936 lock_effect = FLK_STAY_SAME;
1937
1938 /*
	 * The 'lock' and 'request' are merged only when the effect of both
	 * locks is the same (FLK_STAY_SAME) and their blocker status
	 * (l_blocker) is the same as well. We do not merge 'lock' and
	 * 'request' with different l_blocker values because such a merge
	 * might affect stale lock detection: it could cause either false
	 * positives or missed stale locks.
1945 */
1946 mergeable = lock_effect == FLK_STAY_SAME &&
1947 lock->l_blocker == request->l_blocker;
1948
1949 if (lock->l_end < request->l_start) {
1950 /* If the 'lock' is just next to 'request', try to merge them */
1951 if (lock->l_end == request->l_start - 1 && mergeable) {
1952 request->l_start = lock->l_start;
1953 goto recompute;
1954 }
1955
1956 /* Otherwise, they do not overlap, so return immediately */
1957 return (0);
1958 }
1959
1960 if (request->l_end < lock->l_start) {
1961 /* If the 'request' is just next to 'lock', try to merge them */
1962 if (request->l_end == lock->l_start - 1 && mergeable) {
1963 request->l_end = lock->l_end;
1964 goto recompute;
1965 }
1966
1967 /* Otherwise, they do not overlap, so return immediately */
1968 return (1);
1969 }
1970
1971 /*
1972 * Here we are sure the 'lock' and 'request' overlaps, so the 'request'
1973 * will replace the 'lock' (either fully, or at least partially).
1974 */
1975
1976 /*
1977 * If the 'request' does not fully cover the 'lock' at the start,
1978 * either move the start of the 'request' to cover the 'lock', or split
1979 * the 'lock'.
1980 */
1981 if (lock->l_start < request->l_start) {
1982 if (mergeable) {
1983 request->l_start = lock->l_start;
1984 } else {
1985 lock_descriptor_t *new_lock = flk_get_lock();
1986
1987 COPY(new_lock, lock);
1988 new_lock->l_end = request->l_start - 1;
1989
1990 topology[nvertex++] = new_lock;
1991 }
1992 }
1993
1994 /*
1995 * If the 'request' does not fully cover the 'lock' at the end, either
1996 * move the end of the 'request' to cover the 'lock', or split the
1997 * 'lock'.
1998 */
1999 if (request->l_end < lock->l_end) {
2000 if (mergeable) {
2001 request->l_end = lock->l_end;
2002 } else {
2003 lock_descriptor_t *new_lock = flk_get_lock();
2004
2005 COPY(new_lock, lock);
2006 new_lock->l_start = request->l_end + 1;
2007
2008 topology[nvertex++] = new_lock;
2009 }
2010 }
2011
2012 /*
2013 * Log the blocker change
2014 */
2015 if (nvertex > 0 && lock->l_blocker < 0) {
2016 if (nvertex == 1)
2017 flk_stale_lock_shrink(lock, topology[0]);
2018 if (nvertex == 2)
2019 flk_stale_lock_split(lock, topology[0], topology[1]);
2020
2021 lock->l_blocker = 0;
2022 }
2023
2024 recompute:
2025 /*
	 * For an unlock we don't include the 'request' in the recomputation
	 * of dependencies because no lock will add an edge to it.
2028 */
2029 if (lock_effect != FLK_UNLOCK)
2030 topology[nvertex++] = request;
2031
2032 for (i = 0; i < nvertex; i++) {
2033 topology[i]->l_state |= RECOMPUTE_LOCK;
2034 topology[i]->l_color = NO_COLOR;
2035 }
2036
2037 ASSERT(FIRST_ADJ(lock) == HEAD(lock));
2038
2039 /*
	 * Remove each in-edge of this vertex 'lock' from the adjacency list
	 * of its from vertex.
2042 */
2043 ep = FIRST_IN(lock);
2044 while (ep != HEAD(lock)) {
2045 ADJ_LIST_REMOVE(ep);
2046 ep = NEXT_IN(ep);
2047 }
2048
2049 flk_delete_active_lock(lock, 0);
2050
2051 /* We are ready for recomputing the dependencies now */
2052 flk_recompute_dependencies(lock, topology, nvertex, 1);
2053
2054 for (i = 0; i < nvertex; i++) {
2055 topology[i]->l_state &= ~RECOMPUTE_LOCK;
2056 topology[i]->l_color = NO_COLOR;
2057 }
2058
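	/*
	 * If 'request' was placed in topology[] above (i.e. this was not an
	 * unlock), it is the last entry and is deliberately not inserted
	 * into the active list by the loop below; only the new locks split
	 * off from 'lock' are. Bumping nvertex in the unlock case makes the
	 * loop insert every entry of topology[].
	 */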
2059 if (lock_effect == FLK_UNLOCK) {
2060 nvertex++;
2061 }
2062 for (i = 0; i < nvertex - 1; i++) {
2063 flk_insert_active_lock(topology[i]);
2064 }
2065
2066 if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
2067 flk_wakeup(lock, 0);
2068 } else {
2069 ep = FIRST_IN(lock);
2070 while (ep != HEAD(lock)) {
2071 lock->l_sedge = NEXT_IN(ep);
2072 IN_LIST_REMOVE(ep);
2073 flk_update_proc_graph(ep, 1);
2074 flk_free_edge(ep);
2075 ep = lock->l_sedge;
2076 }
2077 }
2078 flk_free_lock(lock);
2079
2080 CHECK_SLEEPING_LOCKS(gp);
2081 CHECK_ACTIVE_LOCKS(gp);
2082 return (0);
2083 }
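
/*
 * A worked example of the splitting done by flk_relation() (illustrative
 * values only): suppose an active write lock covers bytes [100, 199] and an
 * unlock request arrives for [120, 149] (an unlock can never merge, since
 * its effect is not FLK_STAY_SAME). Neither end of the lock is covered, so
 * two new locks are created from copies of the original: one for [100, 119]
 * and one for [150, 199]. Both go into topology[], the original lock is
 * deleted from the active list, dependencies are recomputed against the new
 * locks, the new locks are inserted as active, and flk_wakeup() lets any
 * waiters re-evaluate.
 */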
2084
2085 /*
2086 * Insert a lock into the active queue.
2087 */
2088
2089 static void
2090 flk_insert_active_lock(lock_descriptor_t *new_lock)
2091 {
2092 graph_t *gp = new_lock->l_graph;
2093 vnode_t *vp = new_lock->l_vnode;
2094 lock_descriptor_t *first_lock, *lock;
2095
2096 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2097
2098 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2099 first_lock = lock;
2100
2101 if (first_lock != NULL) {
2102 for (; (lock->l_vnode == vp &&
2103 lock->l_start < new_lock->l_start); lock = lock->l_next)
2104 ;
2105 } else {
2106 lock = ACTIVE_HEAD(gp);
2107 }
2108
2109 lock->l_prev->l_next = new_lock;
2110 new_lock->l_next = lock;
2111 new_lock->l_prev = lock->l_prev;
2112 lock->l_prev = new_lock;
2113
2114 if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
2115 vp->v_filocks = (struct filock *)new_lock;
2116 }
2117 flk_set_state(new_lock, FLK_ACTIVE_STATE);
2118 new_lock->l_state |= ACTIVE_LOCK;
2119
2120 CHECK_ACTIVE_LOCKS(gp);
2121 CHECK_SLEEPING_LOCKS(gp);
2122 }
2123
2124 /*
 * Delete the active lock: performs one of two functions depending on the
 * value of the second parameter. One is to remove the lock from the active
 * list only; the other is to both remove and free it.
2128 */
2129
2130 static void
2131 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
2132 {
2133 vnode_t *vp = lock->l_vnode;
2134 graph_t *gp = lock->l_graph;
2135
2136 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2137 if (free_lock)
2138 ASSERT(NO_DEPENDENTS(lock));
2139 ASSERT(NOT_BLOCKED(lock));
2140 ASSERT(IS_ACTIVE(lock));
2141
2142 ASSERT((vp->v_filocks != NULL));
2143
2144 if (lock->l_blocker < 0) {
2145 /* Log the blocker release */
2146 flk_stale_lock_release(lock);
2147 lock->l_blocker = 0;
2148 }
2149
2150 if (vp->v_filocks == (struct filock *)lock) {
2151 vp->v_filocks = (struct filock *)
2152 ((lock->l_next->l_vnode == vp) ? lock->l_next :
2153 NULL);
2154 }
2155 lock->l_next->l_prev = lock->l_prev;
2156 lock->l_prev->l_next = lock->l_next;
2157 lock->l_next = lock->l_prev = NULL;
2158 flk_set_state(lock, FLK_DEAD_STATE);
2159 lock->l_state &= ~ACTIVE_LOCK;
2160
2161 if (free_lock)
2162 flk_free_lock(lock);
2163 CHECK_ACTIVE_LOCKS(gp);
2164 CHECK_SLEEPING_LOCKS(gp);
2165 }
2166
2167 /*
2168 * Insert into the sleep queue.
2169 */
2170
2171 static void
2172 flk_insert_sleeping_lock(lock_descriptor_t *request)
2173 {
2174 graph_t *gp = request->l_graph;
2175 vnode_t *vp = request->l_vnode;
2176 lock_descriptor_t *lock;
2177
2178 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2179 ASSERT(IS_INITIAL(request));
2180
2181 for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
2182 lock->l_vnode < vp); lock = lock->l_next)
2183 ;
2184
2185 lock->l_prev->l_next = request;
2186 request->l_prev = lock->l_prev;
2187 lock->l_prev = request;
2188 request->l_next = lock;
2189 flk_set_state(request, FLK_SLEEPING_STATE);
2190 request->l_state |= SLEEPING_LOCK;
2191 }
2192
2193 /*
2194 * Cancelling a sleeping lock implies removing a vertex from the
2195 * dependency graph and therefore we should recompute the dependencies
2196 * of all vertices that have a path to this vertex, w.r.t. all
2197 * vertices reachable from this vertex.
2198 */
2199
2200 void
2201 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
2202 {
2203 graph_t *gp = request->l_graph;
2204 vnode_t *vp = request->l_vnode;
2205 lock_descriptor_t **topology = NULL;
2206 edge_t *ep;
2207 lock_descriptor_t *vertex, *lock;
2208 int nvertex = 0;
2209 int i;
2210 lock_descriptor_t *vertex_stack;
2211
2212 STACK_INIT(vertex_stack);
2213
2214 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2215 /*
	 * Count the number of vertex pointers that have to be allocated,
	 * i.e. all vertices that are reachable from request.
2218 */
2219
2220 STACK_PUSH(vertex_stack, request, l_stack);
2221
2222 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2223 STACK_POP(vertex_stack, l_stack);
2224 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
2225 ep = NEXT_ADJ(ep)) {
2226 if (IS_RECOMPUTE(ep->to_vertex))
2227 continue;
2228 ep->to_vertex->l_state |= RECOMPUTE_LOCK;
2229 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
2230 nvertex++;
2231 }
2232 }
2233
2234 /*
2235 * allocate memory for holding the vertex pointers
2236 */
2237
2238 if (nvertex) {
2239 topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
2240 KM_SLEEP);
2241 }
2242
2243 /*
	 * One more pass to actually store the vertices in the
	 * allocated array.
	 * We check sleeping locks first and then active locks
	 * so that the topology array will be in topological
	 * order.
2249 */
2250
2251 nvertex = 0;
2252 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2253
2254 if (lock) {
2255 do {
2256 if (IS_RECOMPUTE(lock)) {
2257 lock->l_index = nvertex;
2258 topology[nvertex++] = lock;
2259 }
2260 lock->l_color = NO_COLOR;
2261 lock = lock->l_next;
2262 } while (lock->l_vnode == vp);
2263 }
2264
2265 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2266
2267 if (lock) {
2268 do {
2269 if (IS_RECOMPUTE(lock)) {
2270 lock->l_index = nvertex;
2271 topology[nvertex++] = lock;
2272 }
2273 lock->l_color = NO_COLOR;
2274 lock = lock->l_next;
2275 } while (lock->l_vnode == vp);
2276 }
2277
2278 /*
	 * Remove the in and out edges of 'request'.
	 * They are freed after updating the proc graph below.
2281 */
2282
2283 for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
2284 ADJ_LIST_REMOVE(ep);
2285 }
2286
2287
2288 if (remove_from_queue)
2289 REMOVE_SLEEP_QUEUE(request);
2290
2291 /* we are ready to recompute */
2292
2293 flk_recompute_dependencies(request, topology, nvertex, 1);
2294
2295 ep = FIRST_ADJ(request);
2296 while (ep != HEAD(request)) {
2297 IN_LIST_REMOVE(ep);
2298 request->l_sedge = NEXT_ADJ(ep);
2299 ADJ_LIST_REMOVE(ep);
2300 flk_update_proc_graph(ep, 1);
2301 flk_free_edge(ep);
2302 ep = request->l_sedge;
2303 }
2304
2305
2306 /*
2307 * unset the RECOMPUTE flag in those vertices
2308 */
2309
2310 for (i = 0; i < nvertex; i++) {
2311 topology[i]->l_state &= ~RECOMPUTE_LOCK;
2312 }
2313
2314 /*
2315 * free the topology
2316 */
2317 if (nvertex)
2318 kmem_free(topology,
2319 (nvertex * sizeof (lock_descriptor_t *)));
2320 /*
	 * Some locks may be unblocked now.
2322 */
2323
2324 flk_wakeup(request, 0);
2325
2326 /*
2327 * we expect to have a correctly recomputed graph now.
2328 */
2329 flk_set_state(request, FLK_DEAD_STATE);
2330 flk_free_lock(request);
2331 CHECK_SLEEPING_LOCKS(gp);
2332 CHECK_ACTIVE_LOCKS(gp);
2333
2334 }
2335
2336 /*
 * Uncoloring the graph simply increments the mark value of the graph.
 * Only when wrap-around takes place do we explicitly reset the color of
 * all vertices in the graph.
2340 */
2341
2342 static void
2343 flk_graph_uncolor(graph_t *gp)
2344 {
2345 lock_descriptor_t *lock;
2346
2347 if (gp->mark == UINT_MAX) {
2348 gp->mark = 1;
2349 for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
2350 lock = lock->l_next)
2351 lock->l_color = 0;
2352
2353 for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp);
2354 lock = lock->l_next)
2355 lock->l_color = 0;
2356 } else {
2357 gp->mark++;
2358 }
2359 }
2360
2361 /*
2362 * Wake up locks that are blocked on the given lock.
2363 */
2364
2365 static void
2366 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
2367 {
2368 edge_t *ep;
2369 graph_t *gp = lock->l_graph;
2370 lock_descriptor_t *lck;
2371
2372 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2373 if (NO_DEPENDENTS(lock))
2374 return;
2375 ep = FIRST_IN(lock);
2376 do {
2377 /*
2378 * delete the edge from the adjacency list
2379 * of from vertex. if no more adjacent edges
2380 * for this vertex wake this process.
2381 */
2382 lck = ep->from_vertex;
2383 if (adj_list_remove)
2384 ADJ_LIST_REMOVE(ep);
2385 flk_update_proc_graph(ep, 1);
2386 if (NOT_BLOCKED(lck)) {
2387 GRANT_WAKEUP(lck);
2388 }
2389 lock->l_sedge = NEXT_IN(ep);
2390 IN_LIST_REMOVE(ep);
2391 flk_free_edge(ep);
2392 ep = lock->l_sedge;
2393 } while (ep != HEAD(lock));
2394 ASSERT(NO_DEPENDENTS(lock));
2395 }
2396
2397 /*
 * The dependents of 'request' are checked for their dependency against the
 * locks in topology (called topology because the array is, and should be, in
 * topological order for this algorithm; if it is not in topological order
 * the inner loop below might add more edges than necessary. Topological
 * ordering of the vertices satisfies the property that all edges go from
 * left to right, i.e., topology[i] can have an edge to topology[j] only if
 * i < j). If lock l1 in the dependent set of request is dependent on
 * (blocked by) lock l2 in topology but does not have a path to it, we add
 * an edge in the inner loop below.
 *
 * We don't want to add an edge between l1 and l2 if there already exists
 * a path from l1 to l2, so care has to be taken for those vertices
 * that have two paths to 'request'. These vertices are referred to here
 * as barrier locks.
 *
 * The barriers have to be found (those vertices that originally had two or
 * more paths to request) because otherwise we may end up adding edges
 * unnecessarily to vertices in topology: a barrier vertex could end up with
 * both an edge to a vertex in topology and a path to it.
2417 */
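
/*
 * Barrier example (illustrative): suppose sleeping locks A and B both block
 * on 'request', and a third sleeping lock C blocks on both A and B. Then C
 * has two paths to 'request', so flk_find_barriers() marks it as a barrier
 * with a barrier count (l_index) of 1. During the recomputation below, C's
 * dependencies are not recomputed until all of its paths have been walked
 * and the barrier count has dropped to zero; otherwise C could be given an
 * edge to a topology vertex it already reaches through A or B.
 */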
2418
2419 static void
2420 flk_recompute_dependencies(lock_descriptor_t *request,
2421 lock_descriptor_t **topology, int nvertex, int update_graph)
2422 {
2423 lock_descriptor_t *vertex, *lock;
2424 graph_t *gp = request->l_graph;
2425 int i, count;
2426 int barrier_found = 0;
2427 edge_t *ep;
2428 lock_descriptor_t *vertex_stack;
2429
2430 STACK_INIT(vertex_stack);
2431
2432 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2433 if (nvertex == 0)
2434 return;
2435 flk_graph_uncolor(request->l_graph);
2436 barrier_found = flk_find_barriers(request);
2437 request->l_state |= RECOMPUTE_DONE;
2438
2439 STACK_PUSH(vertex_stack, request, l_stack);
2440 request->l_sedge = FIRST_IN(request);
2441
2442
2443 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2444 if (vertex->l_state & RECOMPUTE_DONE) {
2445 count = 0;
2446 goto next_in_edge;
2447 }
2448 if (IS_BARRIER(vertex)) {
2449 /* decrement the barrier count */
2450 if (vertex->l_index) {
2451 vertex->l_index--;
2452 /* this guy will be pushed again anyway ? */
2453 STACK_POP(vertex_stack, l_stack);
2454 if (vertex->l_index == 0) {
2455 /*
2456 * barrier is over we can recompute
2457 * dependencies for this lock in the
2458 * next stack pop
2459 */
2460 vertex->l_state &= ~BARRIER_LOCK;
2461 }
2462 continue;
2463 }
2464 }
2465 vertex->l_state |= RECOMPUTE_DONE;
2466 flk_graph_uncolor(gp);
2467 count = flk_color_reachables(vertex);
2468 for (i = 0; i < nvertex; i++) {
2469 lock = topology[i];
2470 if (COLORED(lock))
2471 continue;
2472 if (BLOCKS(lock, vertex)) {
2473 (void) flk_add_edge(vertex, lock,
2474 NO_CHECK_CYCLE, update_graph);
2475 COLOR(lock);
2476 count++;
2477 count += flk_color_reachables(lock);
2478 }
2479
2480 }
2481
2482 next_in_edge:
2483 if (count == nvertex ||
2484 vertex->l_sedge == HEAD(vertex)) {
2485 /* prune the tree below this */
2486 STACK_POP(vertex_stack, l_stack);
2487 vertex->l_state &= ~RECOMPUTE_DONE;
2488 /* update the barrier locks below this! */
2489 if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
2490 flk_graph_uncolor(gp);
2491 flk_update_barriers(vertex);
2492 }
2493 continue;
2494 }
2495
2496 ep = vertex->l_sedge;
2497 lock = ep->from_vertex;
2498 STACK_PUSH(vertex_stack, lock, l_stack);
2499 lock->l_sedge = FIRST_IN(lock);
2500 vertex->l_sedge = NEXT_IN(ep);
2501 }
2502
2503 }
2504
2505 /*
 * Color all vertices reachable from 'vertex' that are not yet colored, and
 * return the count of those that belong to the topology (here, those that
 * have RECOMPUTE_LOCK set in their state).
2508 *
2509 * Note: we need to use a different stack_link l_stack1 because this is
2510 * called from flk_recompute_dependencies() that already uses a stack with
2511 * l_stack as stack_link.
2512 */
2513
2514 static int
2515 flk_color_reachables(lock_descriptor_t *vertex)
2516 {
2517 lock_descriptor_t *ver, *lock;
2518 int count;
2519 edge_t *ep;
2520 lock_descriptor_t *vertex_stack;
2521
2522 STACK_INIT(vertex_stack);
2523
2524 STACK_PUSH(vertex_stack, vertex, l_stack1);
2525 count = 0;
2526 while ((ver = STACK_TOP(vertex_stack)) != NULL) {
2527
2528 STACK_POP(vertex_stack, l_stack1);
2529 for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
2530 ep = NEXT_ADJ(ep)) {
2531 lock = ep->to_vertex;
2532 if (COLORED(lock))
2533 continue;
2534 COLOR(lock);
2535 if (IS_RECOMPUTE(lock))
2536 count++;
2537 STACK_PUSH(vertex_stack, lock, l_stack1);
2538 }
2539
2540 }
2541 return (count);
2542 }
2543
2544 /*
2545 * Called from flk_recompute_dependencies() this routine decrements
2546 * the barrier count of barrier vertices that are reachable from lock.
2547 */
2548
2549 static void
2550 flk_update_barriers(lock_descriptor_t *lock)
2551 {
2552 lock_descriptor_t *vertex, *lck;
2553 edge_t *ep;
2554 lock_descriptor_t *vertex_stack;
2555
2556 STACK_INIT(vertex_stack);
2557
2558 STACK_PUSH(vertex_stack, lock, l_stack1);
2559
2560 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2561 STACK_POP(vertex_stack, l_stack1);
2562 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2563 ep = NEXT_IN(ep)) {
2564 lck = ep->from_vertex;
2565 if (COLORED(lck)) {
2566 if (IS_BARRIER(lck)) {
2567 ASSERT(lck->l_index > 0);
2568 lck->l_index--;
2569 if (lck->l_index == 0)
2570 lck->l_state &= ~BARRIER_LOCK;
2571 }
2572 continue;
2573 }
2574 COLOR(lck);
2575 if (IS_BARRIER(lck)) {
2576 ASSERT(lck->l_index > 0);
2577 lck->l_index--;
2578 if (lck->l_index == 0)
2579 lck->l_state &= ~BARRIER_LOCK;
2580 }
2581 STACK_PUSH(vertex_stack, lck, l_stack1);
2582 }
2583 }
2584 }
2585
2586 /*
 * Find all vertices that are reachable from 'lock' by more than one path,
 * mark them as barrier vertices, and increment their barrier count.
 * The barrier count is the total number of paths from 'lock' to that
 * vertex, minus one. Returns non-zero if at least one barrier was found.
2591 */
2592
2593 static int
2594 flk_find_barriers(lock_descriptor_t *lock)
2595 {
2596 lock_descriptor_t *vertex, *lck;
2597 int found = 0;
2598 edge_t *ep;
2599 lock_descriptor_t *vertex_stack;
2600
2601 STACK_INIT(vertex_stack);
2602
2603 STACK_PUSH(vertex_stack, lock, l_stack1);
2604
2605 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2606 STACK_POP(vertex_stack, l_stack1);
2607 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2608 ep = NEXT_IN(ep)) {
2609 lck = ep->from_vertex;
2610 if (COLORED(lck)) {
2611 /* this is a barrier */
2612 lck->l_state |= BARRIER_LOCK;
2613 /* index will have barrier count */
2614 lck->l_index++;
2615 if (!found)
2616 found = 1;
2617 continue;
2618 }
2619 COLOR(lck);
2620 lck->l_index = 0;
2621 STACK_PUSH(vertex_stack, lck, l_stack1);
2622 }
2623 }
2624 return (found);
2625 }
2626
2627 /*
2628 * Finds the first lock that is mainly responsible for blocking this
2629 * request. If there is no such lock, request->l_flock.l_type is set to
2630 * F_UNLCK. Otherwise, request->l_flock is filled in with the particulars
2631 * of the blocking lock.
2632 *
2633 * Note: It is possible a request is blocked by a sleeping lock because
2634 * of the fairness policy used in flk_process_request() to construct the
2635 * dependencies. (see comments before flk_process_request()).
2636 */
2637
2638 static void
2639 flk_get_first_blocking_lock(lock_descriptor_t *request)
2640 {
2641 graph_t *gp = request->l_graph;
2642 vnode_t *vp = request->l_vnode;
2643 lock_descriptor_t *lock, *blocker;
2644
2645 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2646 blocker = NULL;
2647 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2648
2649 if (lock) {
2650 do {
2651 if (BLOCKS(lock, request)) {
2652 blocker = lock;
2653 break;
2654 }
2655 lock = lock->l_next;
2656 } while (lock->l_vnode == vp);
2657 }
2658
2659 if (blocker == NULL && request->l_flock.l_type == F_RDLCK) {
2660 /*
2661 * No active lock is blocking this request, but if a read
2662 * lock is requested, it may also get blocked by a waiting
2663 * writer. So search all sleeping locks and see if there is
2664 * a writer waiting.
2665 */
2666 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2667 if (lock) {
2668 do {
2669 if (BLOCKS(lock, request)) {
2670 blocker = lock;
2671 break;
2672 }
2673 lock = lock->l_next;
2674 } while (lock->l_vnode == vp);
2675 }
2676 }
2677
2678 if (blocker) {
2679 report_blocker(blocker, request);
2680 } else
2681 request->l_flock.l_type = F_UNLCK;
2682 }
2683
2684 /*
2685 * Get the graph_t structure associated with a vnode.
2686 * If 'initialize' is non-zero, and the graph_t structure for this vnode has
2687 * not yet been initialized, then a new element is allocated and returned.
2688 */
2689 graph_t *
2690 flk_get_lock_graph(vnode_t *vp, int initialize)
2691 {
2692 graph_t *gp;
2693 graph_t *gp_alloc = NULL;
2694 int index = HASH_INDEX(vp);
2695
2696 if (initialize == FLK_USE_GRAPH) {
2697 mutex_enter(&flock_lock);
2698 gp = lock_graph[index];
2699 mutex_exit(&flock_lock);
2700 return (gp);
2701 }
2702
2703 ASSERT(initialize == FLK_INIT_GRAPH);
2704
2705 if (lock_graph[index] == NULL) {
2706
2707 gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2708
2709 /* Initialize the graph */
2710
2711 gp_alloc->active_locks.l_next =
2712 gp_alloc->active_locks.l_prev =
2713 (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2714 gp_alloc->sleeping_locks.l_next =
2715 gp_alloc->sleeping_locks.l_prev =
2716 (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2717 gp_alloc->index = index;
2718 mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2719 }
2720
2721 mutex_enter(&flock_lock);
2722
2723 gp = lock_graph[index];
2724
2725 /* Recheck the value within flock_lock */
2726 if (gp == NULL) {
2727 struct flock_globals *fg;
2728
2729 /* We must have previously allocated the graph_t structure */
2730 ASSERT(gp_alloc != NULL);
2731 lock_graph[index] = gp = gp_alloc;
2732 /*
2733 * The lockmgr status is only needed if KLM is loaded.
2734 */
2735 if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2736 fg = flk_get_globals();
2737 fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2738 }
2739 }
2740
2741 mutex_exit(&flock_lock);
2742
2743 if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2744 /* There was a race to allocate the graph_t and we lost */
2745 mutex_destroy(&gp_alloc->gp_mutex);
2746 kmem_free(gp_alloc, sizeof (graph_t));
2747 }
2748
2749 return (gp);
2750 }
2751
2752 /*
2753 * PSARC case 1997/292
2754 */
2755 int
2756 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2757 {
2758 lock_descriptor_t *lock;
2759 int result = 0;
2760 graph_t *gp;
2761 int lock_nlmid;
2762
2763 /*
2764 * Check to see if node is booted as a cluster. If not, return.
2765 */
2766 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2767 return (0);
2768 }
2769
2770 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2771 if (gp == NULL) {
2772 return (0);
2773 }
2774
2775 mutex_enter(&gp->gp_mutex);
2776
2777 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2778
2779 if (lock) {
2780 while (lock->l_vnode == vp) {
2781 /* get NLM id from sysid */
2782 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2783
2784 /*
2785 * If NLM server request _and_ nlmid of lock matches
2786 * nlmid of argument, then we've found a remote lock.
2787 */
2788 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2789 result = 1;
2790 goto done;
2791 }
2792 lock = lock->l_next;
2793 }
2794 }
2795
2796 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2797
2798 if (lock) {
2799 while (lock->l_vnode == vp) {
2800 /* get NLM id from sysid */
2801 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2802
2803 /*
2804 * If NLM server request _and_ nlmid of lock matches
2805 * nlmid of argument, then we've found a remote lock.
2806 */
2807 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2808 result = 1;
2809 goto done;
2810 }
2811 lock = lock->l_next;
2812 }
2813 }
2814
2815 done:
2816 mutex_exit(&gp->gp_mutex);
2817 return (result);
2818 }
2819
2820 /*
2821 * Determine whether there are any locks for the given vnode with a remote
2822 * sysid. Returns zero if not, non-zero if there are.
2823 *
2824 * Note that the return value from this function is potentially invalid
2825 * once it has been returned. The caller is responsible for providing its
2826 * own synchronization mechanism to ensure that the return value is useful
2827 * (e.g., see nfs_lockcompletion()).
2828 */
2829 int
2830 flk_has_remote_locks(vnode_t *vp)
2831 {
2832 lock_descriptor_t *lock;
2833 int result = 0;
2834 graph_t *gp;
2835
2836 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2837 if (gp == NULL) {
2838 return (0);
2839 }
2840
2841 mutex_enter(&gp->gp_mutex);
2842
2843 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2844
2845 if (lock) {
2846 while (lock->l_vnode == vp) {
2847 if (IS_REMOTE(lock)) {
2848 result = 1;
2849 goto done;
2850 }
2851 lock = lock->l_next;
2852 }
2853 }
2854
2855 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2856
2857 if (lock) {
2858 while (lock->l_vnode == vp) {
2859 if (IS_REMOTE(lock)) {
2860 result = 1;
2861 goto done;
2862 }
2863 lock = lock->l_next;
2864 }
2865 }
2866
2867 done:
2868 mutex_exit(&gp->gp_mutex);
2869 return (result);
2870 }
2871
2872 /*
2873 * Determine whether there are any locks for the given vnode with a remote
2874 * sysid matching given sysid.
2875 * Used by the new (open source) NFS Lock Manager (NLM)
2876 */
2877 int
2878 flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid)
2879 {
2880 lock_descriptor_t *lock;
2881 int result = 0;
2882 graph_t *gp;
2883
2884 if (sysid == 0)
2885 return (0);
2886
2887 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2888 if (gp == NULL) {
2889 return (0);
2890 }
2891
2892 mutex_enter(&gp->gp_mutex);
2893
2894 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2895
2896 if (lock) {
2897 while (lock->l_vnode == vp) {
2898 if (lock->l_flock.l_sysid == sysid) {
2899 result = 1;
2900 goto done;
2901 }
2902 lock = lock->l_next;
2903 }
2904 }
2905
2906 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2907
2908 if (lock) {
2909 while (lock->l_vnode == vp) {
2910 if (lock->l_flock.l_sysid == sysid) {
2911 result = 1;
2912 goto done;
2913 }
2914 lock = lock->l_next;
2915 }
2916 }
2917
2918 done:
2919 mutex_exit(&gp->gp_mutex);
2920 return (result);
2921 }
2922
2923 /*
2924 * Determine if there are any locks owned by the given sysid.
2925 * Returns zero if not, non-zero if there are. Note that this return code
2926 * could be derived from flk_get_{sleeping,active}_locks, but this routine
2927 * avoids all the memory allocations of those routines.
2928 *
2929 * This routine has the same synchronization issues as
2930 * flk_has_remote_locks.
2931 */
2932
2933 int
2934 flk_sysid_has_locks(int sysid, int lck_type)
2935 {
2936 int has_locks = 0;
2937 lock_descriptor_t *lock;
2938 graph_t *gp;
2939 int i;
2940
2941 for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2942 mutex_enter(&flock_lock);
2943 gp = lock_graph[i];
2944 mutex_exit(&flock_lock);
2945 if (gp == NULL) {
2946 continue;
2947 }
2948
2949 mutex_enter(&gp->gp_mutex);
2950
2951 if (lck_type & FLK_QUERY_ACTIVE) {
2952 for (lock = ACTIVE_HEAD(gp)->l_next;
2953 lock != ACTIVE_HEAD(gp) && !has_locks;
2954 lock = lock->l_next) {
2955 if (lock->l_flock.l_sysid == sysid)
2956 has_locks = 1;
2957 }
2958 }
2959
2960 if (lck_type & FLK_QUERY_SLEEPING) {
2961 for (lock = SLEEPING_HEAD(gp)->l_next;
2962 lock != SLEEPING_HEAD(gp) && !has_locks;
2963 lock = lock->l_next) {
2964 if (lock->l_flock.l_sysid == sysid)
2965 has_locks = 1;
2966 }
2967 }
2968 mutex_exit(&gp->gp_mutex);
2969 }
2970
2971 return (has_locks);
2972 }
2973
2974
2975 /*
2976 * PSARC case 1997/292
2977 *
 * Requires: "sysid" is a pair [nlmid, sysid]. The lower half is a 16-bit
2979 * quantity, the real sysid generated by the NLM server; the upper half
2980 * identifies the node of the cluster where the NLM server ran.
2981 * This routine is only called by an NLM server running in a cluster.
2982 * Effects: Remove all locks held on behalf of the client identified
2983 * by "sysid."
2984 */
2985 void
2986 cl_flk_remove_locks_by_sysid(int sysid)
2987 {
2988 graph_t *gp;
2989 int i;
2990 lock_descriptor_t *lock, *nlock;
2991
2992 /*
2993 * Check to see if node is booted as a cluster. If not, return.
2994 */
2995 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2996 return;
2997 }
2998
2999 ASSERT(sysid != 0);
3000 for (i = 0; i < HASH_SIZE; i++) {
3001 mutex_enter(&flock_lock);
3002 gp = lock_graph[i];
3003 mutex_exit(&flock_lock);
3004
3005 if (gp == NULL)
3006 continue;
3007
3008 mutex_enter(&gp->gp_mutex); /* get mutex on lock graph */
3009
3010 /* signal sleeping requests so that they bail out */
3011 lock = SLEEPING_HEAD(gp)->l_next;
3012 while (lock != SLEEPING_HEAD(gp)) {
3013 nlock = lock->l_next;
3014 if (lock->l_flock.l_sysid == sysid) {
3015 INTERRUPT_WAKEUP(lock);
3016 }
3017 lock = nlock;
3018 }
3019
3020 /* delete active locks */
3021 lock = ACTIVE_HEAD(gp)->l_next;
3022 while (lock != ACTIVE_HEAD(gp)) {
3023 nlock = lock->l_next;
3024 if (lock->l_flock.l_sysid == sysid) {
3025 flk_delete_active_lock(lock, 0);
3026 flk_wakeup(lock, 1);
3027 flk_free_lock(lock);
3028 }
3029 lock = nlock;
3030 }
3031 mutex_exit(&gp->gp_mutex); /* release mutex on lock graph */
3032 }
3033 }
3034
3035 /*
 * Delete all locks in the system that belong to the sysid of the request.
3037 */
3038
3039 static void
3040 flk_delete_locks_by_sysid(lock_descriptor_t *request)
3041 {
3042 int sysid = request->l_flock.l_sysid;
3043 lock_descriptor_t *lock, *nlock;
3044 graph_t *gp;
3045 int i;
3046
3047 ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
3048 ASSERT(sysid != 0);
3049
3050 mutex_exit(&request->l_graph->gp_mutex);
3051
3052 for (i = 0; i < HASH_SIZE; i++) {
3053 mutex_enter(&flock_lock);
3054 gp = lock_graph[i];
3055 mutex_exit(&flock_lock);
3056
3057 if (gp == NULL)
3058 continue;
3059
3060 mutex_enter(&gp->gp_mutex);
3061
3062 /* signal sleeping requests so that they bail out */
3063 lock = SLEEPING_HEAD(gp)->l_next;
3064 while (lock != SLEEPING_HEAD(gp)) {
3065 nlock = lock->l_next;
3066 if (lock->l_flock.l_sysid == sysid) {
3067 INTERRUPT_WAKEUP(lock);
3068 }
3069 lock = nlock;
3070 }
3071
3072 /* delete active locks */
3073 lock = ACTIVE_HEAD(gp)->l_next;
3074 while (lock != ACTIVE_HEAD(gp)) {
3075 nlock = lock->l_next;
3076 if (lock->l_flock.l_sysid == sysid) {
3077 flk_delete_active_lock(lock, 0);
3078 flk_wakeup(lock, 1);
3079 flk_free_lock(lock);
3080 }
3081 lock = nlock;
3082 }
3083 mutex_exit(&gp->gp_mutex);
3084 }
3085
3086 mutex_enter(&request->l_graph->gp_mutex);
3087 }
3088
3089 /*
3090 * Clustering: Deletes PXFS locks
3091 * Effects: Delete all locks on files in the given file system and with the
3092 * given PXFS id.
3093 */
3094 void
3095 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
3096 {
3097 lock_descriptor_t *lock, *nlock;
3098 graph_t *gp;
3099 int i;
3100
3101 for (i = 0; i < HASH_SIZE; i++) {
3102 mutex_enter(&flock_lock);
3103 gp = lock_graph[i];
3104 mutex_exit(&flock_lock);
3105
3106 if (gp == NULL)
3107 continue;
3108
3109 mutex_enter(&gp->gp_mutex);
3110
3111 /* signal sleeping requests so that they bail out */
3112 lock = SLEEPING_HEAD(gp)->l_next;
3113 while (lock != SLEEPING_HEAD(gp)) {
3114 nlock = lock->l_next;
3115 if (lock->l_vnode->v_vfsp == vfsp) {
3116 ASSERT(IS_PXFS(lock));
3117 if (GETPXFSID(lock->l_flock.l_sysid) ==
3118 pxfsid) {
3119 flk_set_state(lock,
3120 FLK_CANCELLED_STATE);
3121 flk_cancel_sleeping_lock(lock, 1);
3122 }
3123 }
3124 lock = nlock;
3125 }
3126
3127 /* delete active locks */
3128 lock = ACTIVE_HEAD(gp)->l_next;
3129 while (lock != ACTIVE_HEAD(gp)) {
3130 nlock = lock->l_next;
3131 if (lock->l_vnode->v_vfsp == vfsp) {
3132 ASSERT(IS_PXFS(lock));
3133 if (GETPXFSID(lock->l_flock.l_sysid) ==
3134 pxfsid) {
3135 flk_delete_active_lock(lock, 0);
3136 flk_wakeup(lock, 1);
3137 flk_free_lock(lock);
3138 }
3139 }
3140 lock = nlock;
3141 }
3142 mutex_exit(&gp->gp_mutex);
3143 }
3144 }
3145
3146 /*
3147 * Search for a sleeping lock manager lock which matches exactly this lock
3148 * request; if one is found, fake a signal to cancel it.
3149 *
3150 * Return 1 if a matching lock was found, 0 otherwise.
3151 */
3152
3153 static int
3154 flk_canceled(lock_descriptor_t *request)
3155 {
3156 lock_descriptor_t *lock, *nlock;
3157 graph_t *gp = request->l_graph;
3158 vnode_t *vp = request->l_vnode;
3159
3160 ASSERT(MUTEX_HELD(&gp->gp_mutex));
3161 ASSERT(IS_LOCKMGR(request));
3162 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
3163
3164 if (lock) {
3165 while (lock->l_vnode == vp) {
3166 nlock = lock->l_next;
3167 if (SAME_OWNER(lock, request) &&
3168 lock->l_start == request->l_start &&
3169 lock->l_end == request->l_end) {
3170 INTERRUPT_WAKEUP(lock);
3171 return (1);
3172 }
3173 lock = nlock;
3174 }
3175 }
3176 return (0);
3177 }
3178
3179 /*
3180 * Remove all non-OFD locks for the vnode belonging to the given pid and sysid.
3181 * That is, since OFD locks are pid-less we'll never match on the incoming
3182 * pid. OFD locks are removed earlier in the close() path via closef() and
3183 * ofdcleanlock().
3184 */
3185 void
3186 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
3187 {
3188 graph_t *gp;
3189 lock_descriptor_t *lock, *nlock;
3190 lock_descriptor_t *link_stack;
3191
3192 STACK_INIT(link_stack);
3193
3194 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
3195
3196 if (gp == NULL)
3197 return;
3198 mutex_enter(&gp->gp_mutex);
3199
3200 CHECK_SLEEPING_LOCKS(gp);
3201 CHECK_ACTIVE_LOCKS(gp);
3202
3203 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
3204
3205 if (lock) {
3206 do {
3207 nlock = lock->l_next;
3208 if ((lock->l_flock.l_pid == pid ||
3209 pid == IGN_PID) &&
3210 lock->l_flock.l_sysid == sysid) {
3211 CANCEL_WAKEUP(lock);
3212 }
3213 lock = nlock;
3214 } while (lock->l_vnode == vp);
3215 }
3216
3217 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
3218
3219 if (lock) {
3220 do {
3221 nlock = lock->l_next;
3222 if ((lock->l_flock.l_pid == pid ||
3223 pid == IGN_PID) &&
3224 lock->l_flock.l_sysid == sysid) {
3225 flk_delete_active_lock(lock, 0);
3226 STACK_PUSH(link_stack, lock, l_stack);
3227 }
3228 lock = nlock;
3229 } while (lock->l_vnode == vp);
3230 }
3231
3232 while ((lock = STACK_TOP(link_stack)) != NULL) {
3233 STACK_POP(link_stack, l_stack);
3234 flk_wakeup(lock, 1);
3235 flk_free_lock(lock);
3236 }
3237
3238 CHECK_SLEEPING_LOCKS(gp);
3239 CHECK_ACTIVE_LOCKS(gp);
3240 CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
3241 mutex_exit(&gp->gp_mutex);
3242 }
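
/*
 * Example (hypothetical caller sketch): the generic close path typically
 * discards a process's remaining record locks on a vnode roughly as
 * follows, with a sysid of 0 denoting locally owned locks:
 *
 *	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
 */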
3243
3244
3245 /*
3246 * Called from 'fs' read and write routines for files that have mandatory
3247 * locking enabled.
3248 */
3249
3250 int
3251 chklock(struct vnode *vp, int iomode, u_offset_t offset, ssize_t len, int fmode,
3252 caller_context_t *ct)
3253 {
3254 int i;
3255 struct flock64 bf;
3256 int error = 0;
3257
3258 bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
3259 bf.l_whence = 0;
3260 bf.l_start = offset;
3261 bf.l_len = len;
3262 if (ct == NULL) {
3263 bf.l_pid = curproc->p_pid;
3264 bf.l_sysid = 0;
3265 } else {
3266 bf.l_pid = ct->cc_pid;
3267 bf.l_sysid = ct->cc_sysid;
3268 }
3269 i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
3270 if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
3271 bf.l_type != F_UNLCK)
3272 error = i ? i : EAGAIN;
3273 return (error);
3274 }
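
/*
 * Example (hypothetical caller sketch): a file system read routine for a
 * file with mandatory locking enabled would typically check the I/O range
 * before doing the transfer, e.g.:
 *
 *	if (MANDLOCK(vp, mode)) {
 *		error = chklock(vp, FREAD, uiop->uio_loffset,
 *		    uiop->uio_resid, uiop->uio_fmode, ct);
 *		if (error != 0)
 *			return (error);
 *	}
 *
 * 'mode', 'uiop' and 'ct' are assumed to come from the caller's
 * environment; they are not defined in this file.
 */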
3275
3276 /*
3277 * convoff - converts the given data (start, whence) to the
3278 * given whence.
3279 */
3280 int
3281 convoff(struct vnode *vp, struct flock64 *lckdat, int whence, offset_t offset)
3282 {
3283 int error;
3284 struct vattr vattr;
3285
3286 if ((lckdat->l_whence == 2) || (whence == 2)) {
3287 vattr.va_mask = AT_SIZE;
3288 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
3289 return (error);
3290 }
3291
3292 switch (lckdat->l_whence) {
3293 case 1:
3294 lckdat->l_start += offset;
3295 break;
3296 case 2:
3297 lckdat->l_start += vattr.va_size;
3298 /* FALLTHRU */
3299 case 0:
3300 break;
3301 default:
3302 return (EINVAL);
3303 }
3304
3305 if (lckdat->l_start < 0)
3306 return (EINVAL);
3307
3308 switch (whence) {
3309 case 1:
3310 lckdat->l_start -= offset;
3311 break;
3312 case 2:
3313 lckdat->l_start -= vattr.va_size;
3314 /* FALLTHRU */
3315 case 0:
3316 break;
3317 default:
3318 return (EINVAL);
3319 }
3320
3321 lckdat->l_whence = (short)whence;
3322 return (0);
3323 }
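
/*
 * Example (hypothetical caller sketch): before handing a lock request to
 * reclock(), a caller normally rewrites an l_whence of 1 (SEEK_CUR) or
 * 2 (SEEK_END) into an absolute offset, i.e. whence 0:
 *
 *	struct flock64 bf;
 *
 *	... bf filled in from the user's request; 'offset' is the current
 *	... file offset
 *	if ((error = convoff(vp, &bf, 0, offset)) != 0)
 *		return (error);
 *	... bf.l_whence is now 0 and bf.l_start is absolute
 */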
3324
3325
3326 /* proc_graph function definitions */
3327
3328 /*
3329 * Function checks for deadlock due to the new 'lock'. If deadlock found
3330 * edges of this lock are freed and returned.
3331 */
3332
3333 static int
3334 flk_check_deadlock(lock_descriptor_t *lock)
3335 {
3336 proc_vertex_t *start_vertex, *pvertex;
3337 proc_vertex_t *dvertex;
3338 proc_edge_t *pep, *ppep;
3339 edge_t *ep, *nep;
3340 proc_vertex_t *process_stack;
3341
3342 /*
3343 * OFD style locks are not associated with any process so there is
3344 * no proc graph for these. Thus we cannot, and do not, do deadlock
3345 * detection.
3346 */
3347 if (lock->l_ofd != NULL)
3348 return (0);
3349
3350 STACK_INIT(process_stack);
3351
3352 mutex_enter(&flock_lock);
3353 start_vertex = flk_get_proc_vertex(lock);
3354 ASSERT(start_vertex != NULL);
3355
3356 /* construct the edges from this process to other processes */
3357
3358 ep = FIRST_ADJ(lock);
3359 while (ep != HEAD(lock)) {
3360 proc_vertex_t *adj_proc;
3361
3362 adj_proc = flk_get_proc_vertex(ep->to_vertex);
3363 for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
3364 if (pep->to_proc == adj_proc) {
3365 ASSERT(pep->refcount);
3366 pep->refcount++;
3367 break;
3368 }
3369 }
3370 if (pep == NULL) {
3371 pep = flk_get_proc_edge();
3372 pep->to_proc = adj_proc;
3373 pep->refcount = 1;
3374 adj_proc->incount++;
3375 pep->next = start_vertex->edge;
3376 start_vertex->edge = pep;
3377 }
3378 ep = NEXT_ADJ(ep);
3379 }
3380
3381 ep = FIRST_IN(lock);
3382
3383 while (ep != HEAD(lock)) {
3384 proc_vertex_t *in_proc;
3385
3386 in_proc = flk_get_proc_vertex(ep->from_vertex);
3387
3388 for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
3389 if (pep->to_proc == start_vertex) {
3390 ASSERT(pep->refcount);
3391 pep->refcount++;
3392 break;
3393 }
3394 }
3395 if (pep == NULL) {
3396 pep = flk_get_proc_edge();
3397 pep->to_proc = start_vertex;
3398 pep->refcount = 1;
3399 start_vertex->incount++;
3400 pep->next = in_proc->edge;
3401 in_proc->edge = pep;
3402 }
3403 ep = NEXT_IN(ep);
3404 }
3405
3406 if (start_vertex->incount == 0) {
3407 mutex_exit(&flock_lock);
3408 return (0);
3409 }
3410
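	/*
	 * Iterative depth-first search over the process graph starting at
	 * start_vertex. A vertex that has been arrived at but not yet
	 * departed is still on the current DFS path, so reaching such a
	 * vertex again means the new edges close a cycle, i.e. a deadlock.
	 */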
3411 flk_proc_graph_uncolor();
3412
3413 start_vertex->p_sedge = start_vertex->edge;
3414
3415 STACK_PUSH(process_stack, start_vertex, p_stack);
3416
3417 while ((pvertex = STACK_TOP(process_stack)) != NULL) {
3418 for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
3419 dvertex = pep->to_proc;
3420 if (!PROC_ARRIVED(dvertex)) {
3421 STACK_PUSH(process_stack, dvertex, p_stack);
3422 dvertex->p_sedge = dvertex->edge;
3423 PROC_ARRIVE(pvertex);
3424 pvertex->p_sedge = pep->next;
3425 break;
3426 }
3427 if (!PROC_DEPARTED(dvertex))
3428 goto deadlock;
3429 }
3430 if (pep == NULL) {
3431 PROC_DEPART(pvertex);
3432 STACK_POP(process_stack, p_stack);
3433 }
3434 }
3435 mutex_exit(&flock_lock);
3436 return (0);
3437
3438 deadlock:
3439
3440 /* we remove all lock edges and proc edges */
3441
3442 ep = FIRST_ADJ(lock);
3443 while (ep != HEAD(lock)) {
3444 proc_vertex_t *adj_proc;
3445 adj_proc = flk_get_proc_vertex(ep->to_vertex);
3446 nep = NEXT_ADJ(ep);
3447 IN_LIST_REMOVE(ep);
3448 ADJ_LIST_REMOVE(ep);
3449 flk_free_edge(ep);
3450 ppep = start_vertex->edge;
3451 for (pep = start_vertex->edge; pep != NULL; ppep = pep,
3452 pep = ppep->next) {
3453 if (pep->to_proc == adj_proc) {
3454 pep->refcount--;
3455 if (pep->refcount == 0) {
3456 if (pep == ppep) {
3457 start_vertex->edge = pep->next;
3458 } else {
3459 ppep->next = pep->next;
3460 }
3461 adj_proc->incount--;
3462 flk_proc_release(adj_proc);
3463 flk_free_proc_edge(pep);
3464 }
3465 break;
3466 }
3467 }
3468 ep = nep;
3469 }
3470 ep = FIRST_IN(lock);
3471 while (ep != HEAD(lock)) {
3472 proc_vertex_t *in_proc;
3473 in_proc = flk_get_proc_vertex(ep->from_vertex);
3474 nep = NEXT_IN(ep);
3475 IN_LIST_REMOVE(ep);
3476 ADJ_LIST_REMOVE(ep);
3477 flk_free_edge(ep);
3478 ppep = in_proc->edge;
3479 for (pep = in_proc->edge; pep != NULL; ppep = pep,
3480 pep = ppep->next) {
3481 if (pep->to_proc == start_vertex) {
3482 pep->refcount--;
3483 if (pep->refcount == 0) {
3484 if (pep == ppep) {
3485 in_proc->edge = pep->next;
3486 } else {
3487 ppep->next = pep->next;
3488 }
3489 start_vertex->incount--;
3490 flk_proc_release(in_proc);
3491 flk_free_proc_edge(pep);
3492 }
3493 break;
3494 }
3495 }
3496 ep = nep;
3497 }
3498 flk_proc_release(start_vertex);
3499 mutex_exit(&flock_lock);
3500 return (1);
3501 }
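
/*
 * A classic scenario caught by flk_check_deadlock() (illustrative values
 * only): process P1 holds a lock on bytes [0, 9] and process P2 holds a
 * lock on [10, 19] of the same file. P1 then blocks requesting [10, 19],
 * creating the process edge P1 -> P2. If P2 now requests [0, 9], the new
 * request would add the edge P2 -> P1; the DFS above finds the cycle
 * P1 -> P2 -> P1 and returns 1, so the caller can fail the request with
 * EDEADLK instead of letting it sleep.
 */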
3502
3503 /*
 * Get a proc vertex. If the lock's pvertex value refers to the correct proc
 * vertex in the list, we return that; otherwise we allocate one. If
 * necessary, we also grow the list of vertices.
3507 */
3508
3509 static proc_vertex_t *
3510 flk_get_proc_vertex(lock_descriptor_t *lock)
3511 {
3512 int i;
3513 proc_vertex_t *pv;
3514 proc_vertex_t **palloc;
3515
3516 ASSERT(MUTEX_HELD(&flock_lock));
3517 if (lock->pvertex != -1) {
3518 ASSERT(lock->pvertex >= 0);
3519 pv = pgraph.proc[lock->pvertex];
3520 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
3521 return (pv);
3522 }
3523 }
3524 for (i = 0; i < pgraph.gcount; i++) {
3525 pv = pgraph.proc[i];
3526 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
3527 lock->pvertex = pv->index = i;
3528 return (pv);
3529 }
3530 }
3531 pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
3532 pv->pid = lock->l_flock.l_pid;
3533 pv->sysid = lock->l_flock.l_sysid;
3534 flk_proc_vertex_allocs++;
3535 if (pgraph.free != 0) {
3536 for (i = 0; i < pgraph.gcount; i++) {
3537 if (pgraph.proc[i] == NULL) {
3538 pgraph.proc[i] = pv;
3539 lock->pvertex = pv->index = i;
3540 pgraph.free--;
3541 return (pv);
3542 }
3543 }
3544 }
3545 palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
3546 sizeof (proc_vertex_t *), KM_SLEEP);
3547
3548 if (pgraph.proc) {
3549 bcopy(pgraph.proc, palloc,
3550 pgraph.gcount * sizeof (proc_vertex_t *));
3551
3552 kmem_free(pgraph.proc,
3553 pgraph.gcount * sizeof (proc_vertex_t *));
3554 }
3555 pgraph.proc = palloc;
3556 pgraph.free += (PROC_CHUNK - 1);
3557 pv->index = lock->pvertex = pgraph.gcount;
3558 pgraph.gcount += PROC_CHUNK;
3559 pgraph.proc[pv->index] = pv;
3560 return (pv);
3561 }
3562
3563 /*
3564 * Allocate a proc edge.
3565 */
3566
3567 static proc_edge_t *
3568 flk_get_proc_edge()
3569 {
3570 proc_edge_t *pep;
3571
3572 pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
3573 flk_proc_edge_allocs++;
3574 return (pep);
3575 }
3576
3577 /*
3578 * Free the proc edge. Called whenever its reference count goes to zero.
3579 */
3580
3581 static void
3582 flk_free_proc_edge(proc_edge_t *pep)
3583 {
3584 ASSERT(pep->refcount == 0);
3585 kmem_free(pep, sizeof (proc_edge_t));
3586 flk_proc_edge_frees++;
3587 }
3588
3589 /*
 * Uncolor the process graph. Explicitly resetting every vertex is done only
 * when the mark value hits its maximum value.
3591 */
3592
3593 static void
3594 flk_proc_graph_uncolor()
3595 {
3596 int i;
3597
3598 if (pgraph.mark == UINT_MAX) {
3599 for (i = 0; i < pgraph.gcount; i++)
3600 if (pgraph.proc[i] != NULL) {
3601 pgraph.proc[i]->atime = 0;
3602 pgraph.proc[i]->dtime = 0;
3603 }
3604 pgraph.mark = 1;
3605 } else {
3606 pgraph.mark++;
3607 }
3608 }
3609
3610 /*
 * Release the proc vertex iff it has neither in edges nor out edges.
3612 */
3613
3614 static void
3615 flk_proc_release(proc_vertex_t *proc)
3616 {
3617 ASSERT(MUTEX_HELD(&flock_lock));
3618 if (proc->edge == NULL && proc->incount == 0) {
3619 pgraph.proc[proc->index] = NULL;
3620 pgraph.free++;
3621 kmem_free(proc, sizeof (proc_vertex_t));
3622 flk_proc_vertex_frees++;
3623 }
3624 }
3625
3626 /*
 * Updates the process graph to reflect a change in a lock graph.
 * Note: We should call this function only after we have a correctly
 * recomputed lock graph. Otherwise we might miss a deadlock detection.
 * E.g., in flk_relation() we call this function after
 * flk_recompute_dependencies(); otherwise, if a process tries to lock a
 * vnode hashed into another graph, it might sleep forever.
3633 */
3634
3635 static void
3636 flk_update_proc_graph(edge_t *ep, int delete)
3637 {
3638 proc_vertex_t *toproc, *fromproc;
3639 proc_edge_t *pep, *prevpep;
3640
3641 mutex_enter(&flock_lock);
3642
3643 /*
3644 * OFD style locks are not associated with any process so there is
3645 * no proc graph for these.
3646 */
3647 if (ep->from_vertex->l_ofd != NULL) {
3648 mutex_exit(&flock_lock);
3649 return;
3650 }
3651
3652 toproc = flk_get_proc_vertex(ep->to_vertex);
3653 fromproc = flk_get_proc_vertex(ep->from_vertex);
3654
3655 if (!delete)
3656 goto add;
3657 pep = prevpep = fromproc->edge;
3658
3659 ASSERT(pep != NULL);
3660 while (pep != NULL) {
3661 if (pep->to_proc == toproc) {
3662 ASSERT(pep->refcount > 0);
3663 pep->refcount--;
3664 if (pep->refcount == 0) {
3665 if (pep == prevpep) {
3666 fromproc->edge = pep->next;
3667 } else {
3668 prevpep->next = pep->next;
3669 }
3670 toproc->incount--;
3671 flk_proc_release(toproc);
3672 flk_free_proc_edge(pep);
3673 }
3674 break;
3675 }
3676 prevpep = pep;
3677 pep = pep->next;
3678 }
3679 flk_proc_release(fromproc);
3680 mutex_exit(&flock_lock);
3681 return;
3682 add:
3683
3684 pep = fromproc->edge;
3685
3686 while (pep != NULL) {
3687 if (pep->to_proc == toproc) {
3688 ASSERT(pep->refcount > 0);
3689 pep->refcount++;
3690 break;
3691 }
3692 pep = pep->next;
3693 }
3694 if (pep == NULL) {
3695 pep = flk_get_proc_edge();
3696 pep->to_proc = toproc;
3697 pep->refcount = 1;
3698 toproc->incount++;
3699 pep->next = fromproc->edge;
3700 fromproc->edge = pep;
3701 }
3702 mutex_exit(&flock_lock);
3703 }
3704
3705 /*
3706 * Set the control status for lock manager requests.
 */
3709
3710 /*
3711 * PSARC case 1997/292
3712 *
3713 * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3714 * Effects: Set the state of the NLM server identified by "nlmid"
3715 * in the NLM registry to state "nlm_state."
3716 * Raises exception no_such_nlm if "nlmid" doesn't identify a known
3717 * NLM server to this LLM.
3718 * Note that when this routine is called with NLM_SHUTTING_DOWN there
 * may be lock requests that have gotten started but not finished. In
3720 * particular, there may be blocking requests that are in the callback code
3721 * before sleeping (so they're not holding the lock for the graph). If
3722 * such a thread reacquires the graph's lock (to go to sleep) after
3723 * NLM state in the NLM registry is set to a non-up value,
3724 * it will notice the status and bail out. If the request gets
3725 * granted before the thread can check the NLM registry, let it
3726 * continue normally. It will get flushed when we are called with NLM_DOWN.
3727 *
3728 * Modifies: nlm_reg_obj (global)
3729 * Arguments:
3730 * nlmid (IN): id uniquely identifying an NLM server
3731 * nlm_state (IN): NLM server state to change "nlmid" to
3732 */
3733 void
3734 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3735 {
3736 /*
3737 * Check to see if node is booted as a cluster. If not, return.
3738 */
3739 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3740 return;
3741 }
3742
3743 /*
3744 * Check for development/debugging. It is possible to boot a node
3745 * in non-cluster mode, and then run a special script, currently
3746 * available only to developers, to bring up the node as part of a
3747 * cluster. The problem is that running such a script does not
3748 * result in the routine flk_init() being called and hence global array
3749 * nlm_reg_status is NULL. The NLM thinks it's in cluster mode,
3750 * but the LLM needs to do an additional check to see if the global
3751 * array has been created or not. If nlm_reg_status is NULL, then
3752 * return, else continue.
3753 */
3754 if (nlm_reg_status == NULL) {
3755 return;
3756 }
3757
3758 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3759 mutex_enter(&nlm_reg_lock);
3760
3761 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3762 /*
3763 * If the NLM server "nlmid" is unknown in the NLM registry,
3764 * add it to the registry in the nlm shutting down state.
3765 */
3766 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3767 FLK_NLM_SHUTTING_DOWN);
3768 } else {
3769 /*
3770 * Change the state of the NLM server identified by "nlmid"
3771 * in the NLM registry to the argument "nlm_state."
3772 */
3773 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3774 nlm_state);
3775 }
3776
3777 /*
3778 * The reason we must register the NLM server that is shutting down
3779 * with an LLM that doesn't already know about it (never sent a lock
3780 * request) is to handle correctly a race between shutdown and a new
3781 * lock request. Suppose that a shutdown request from the NLM server
3782 * invokes this routine at the LLM, and a thread is spawned to
3783 * service the request. Now suppose a new lock request is in
3784 * progress and has already passed the first line of defense in
 * reclock(), which denies new lock requests from NLM servers
3786 * that are not in the NLM_UP state. After the current routine
3787 * is invoked for both phases of shutdown, the routine will return,
3788 * having done nothing, and the lock request will proceed and
3789 * probably be granted. The problem is that the shutdown was ignored
3790 * by the lock request because there was no record of that NLM server
3791 * shutting down. We will be in the peculiar position of thinking
 * that we've shut down the NLM server and all locks at all LLMs have
3793 * been discarded, but in fact there's still one lock held.
3794 * The solution is to record the existence of NLM server and change
3795 * its state immediately to NLM_SHUTTING_DOWN. The lock request in
3796 * progress may proceed because the next phase NLM_DOWN will catch
3797 * this lock and discard it.
3798 */
3799 mutex_exit(&nlm_reg_lock);
3800
3801 switch (nlm_state) {
3802 case FLK_NLM_UP:
3803 /*
3804 * Change the NLM state of all locks still held on behalf of
3805 * the NLM server identified by "nlmid" to NLM_UP.
3806 */
3807 cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3808 break;
3809
3810 case FLK_NLM_SHUTTING_DOWN:
3811 /*
3812 * Wake up all sleeping locks for the NLM server identified
3813 * by "nlmid." Note that eventually all woken threads will
3814 * have their lock requests cancelled and descriptors
3815 * removed from the sleeping lock list. Note that the NLM
3816 * server state associated with each lock descriptor is
3817 * changed to FLK_NLM_SHUTTING_DOWN.
3818 */
3819 cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3820 break;
3821
3822 case FLK_NLM_DOWN:
3823 /*
3824 * Discard all active, granted locks for this NLM server
3825 * identified by "nlmid."
3826 */
3827 cl_flk_unlock_nlm_granted(nlmid);
3828 break;
3829
3830 default:
3831 panic("cl_set_nlm_status: bad status (%d)", nlm_state);
3832 }
3833 }
3834
3835 /*
3836 * Set the control status for lock manager requests.
3837 *
3838 * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3839 * may be locks requests that have gotten started but not finished. In
 * may be lock requests that have gotten started but not finished. In
3841 * before sleeping (so they're not holding the lock for the graph). If
3842 * such a thread reacquires the graph's lock (to go to sleep) after
3843 * flk_lockmgr_status is set to a non-up value, it will notice the status
3844 * and bail out. If the request gets granted before the thread can check
3845 * flk_lockmgr_status, let it continue normally. It will get flushed when
3846 * we are called with FLK_LOCKMGR_DOWN.
3847 */
3848
3849 void
3850 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3851 {
3852 int i;
3853 graph_t *gp;
3854 struct flock_globals *fg;
3855
3856 fg = flk_get_globals();
3857 ASSERT(fg != NULL);
3858
3859 mutex_enter(&flock_lock);
3860 fg->flk_lockmgr_status = status;
3861 mutex_exit(&flock_lock);
3862
3863 /*
3864 * If the lock manager is coming back up, all that's needed is to
3865 * propagate this information to the graphs. If the lock manager
3866 * is going down, additional action is required, and each graph's
3867 * copy of the state is updated atomically with this other action.
3868 */
3869 switch (status) {
3870 case FLK_LOCKMGR_UP:
3871 for (i = 0; i < HASH_SIZE; i++) {
3872 mutex_enter(&flock_lock);
3873 gp = lock_graph[i];
3874 mutex_exit(&flock_lock);
3875 if (gp == NULL)
3876 continue;
3877 mutex_enter(&gp->gp_mutex);
3878 fg->lockmgr_status[i] = status;
3879 mutex_exit(&gp->gp_mutex);
3880 }
3881 break;
3882 case FLK_WAKEUP_SLEEPERS:
3883 wakeup_sleeping_lockmgr_locks(fg);
3884 break;
3885 case FLK_LOCKMGR_DOWN:
3886 unlock_lockmgr_granted(fg);
3887 break;
3888 default:
3889 panic("flk_set_lockmgr_status: bad status (%d)", status);
3890 break;
3891 }
3892 }
3893
3894 /*
3895 * This routine returns all the locks that are active or sleeping and are
3896 * associated with a particular set of identifiers. If lock_state != 0, then
3897 * only locks that match the lock_state are returned. If lock_state == 0, then
3898 * all locks are returned. If pid == NOPID, the pid is ignored. If
3899 * use_sysid is FALSE, then the sysid is ignored. If vp is NULL, then the
3900 * vnode pointer is ignored.
3901 *
3902 * A list containing the vnode pointer and an flock structure
3903 * describing the lock is returned. Each element in the list is
3904 * dynamically allocated and must be freed by the caller. The
3905 * last item in the list is denoted by a NULL value in the ll_next
3906 * field.
3907 *
3908 * The vnode pointers returned are held. The caller is responsible
3909 * for releasing these. Note that the returned list is only a snapshot of
3910 * the current lock information, and that it is a snapshot of a moving
3911 * target (only one graph is locked at a time).
3912 */
3913
3914 locklist_t *
3915 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3916 pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3917 {
3918 lock_descriptor_t *lock;
3919 lock_descriptor_t *graph_head;
3920 locklist_t listhead;
3921 locklist_t *llheadp;
3922 locklist_t *llp;
3923 locklist_t *lltp;
3924 graph_t *gp;
3925 int i;
3926 int first_index; /* graph index */
3927 int num_indexes; /* graph index */
3928
3929 ASSERT((list_type == FLK_ACTIVE_STATE) ||
3930 (list_type == FLK_SLEEPING_STATE));
3931
3932 /*
3933 * Get a pointer to something to use as a list head while building
3934 * the rest of the list.
3935 */
3936 llheadp = &listhead;
3937 lltp = llheadp;
3938 llheadp->ll_next = (locklist_t *)NULL;
3939
3940 /* Figure out which graphs we want to look at. */
3941 if (vp == NULL) {
3942 first_index = 0;
3943 num_indexes = HASH_SIZE;
3944 } else {
3945 first_index = HASH_INDEX(vp);
3946 num_indexes = 1;
3947 }
3948
3949 for (i = first_index; i < first_index + num_indexes; i++) {
3950 mutex_enter(&flock_lock);
3951 gp = lock_graph[i];
3952 mutex_exit(&flock_lock);
3953 if (gp == NULL) {
3954 continue;
3955 }
3956
3957 mutex_enter(&gp->gp_mutex);
3958 graph_head = (list_type == FLK_ACTIVE_STATE) ?
3959 ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3960 for (lock = graph_head->l_next;
3961 lock != graph_head;
3962 lock = lock->l_next) {
3963 if (use_sysid && lock->l_flock.l_sysid != sysid)
3964 continue;
3965 if (pid != NOPID && lock->l_flock.l_pid != pid)
3966 continue;
3967 if (vp != NULL && lock->l_vnode != vp)
3968 continue;
3969 if (lock_state && !(lock_state & lock->l_state))
3970 continue;
3971 if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3972 continue;
3973 /*
3974 * A matching lock was found. Allocate
3975 * space for a new locklist entry and fill
3976 * it in.
3977 */
3978 llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3979 lltp->ll_next = llp;
3980 VN_HOLD(lock->l_vnode);
3981 llp->ll_vp = lock->l_vnode;
3982 create_flock(lock, &(llp->ll_flock));
3983 llp->ll_next = (locklist_t *)NULL;
3984 lltp = llp;
3985 }
3986 mutex_exit(&gp->gp_mutex);
3987 }
3988
3989 llp = llheadp->ll_next;
3990 return (llp);
3991 }
3992
3993 /*
3994 * These two functions are simply interfaces to get_lock_list. They return
3995 * a list of sleeping or active locks for the given sysid and pid. See
3996 * get_lock_list for details.
3997 *
3998 * In either case we don't particularly care to specify the zone of interest;
3999 * the sysid-space is global across zones, so the sysid will map to exactly one
4000 * zone, and we'll return information for that zone.
4001 */
4002
4003 locklist_t *
4004 flk_get_sleeping_locks(int sysid, pid_t pid)
4005 {
4006 return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
4007 ALL_ZONES));
4008 }
4009
4010 locklist_t *
4011 flk_get_active_locks(int sysid, pid_t pid)
4012 {
4013 return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
4014 ALL_ZONES));
4015 }
4016
4017 /*
4018 * Another interface to get_lock_list. This one returns all the active
4019 * locks for a given vnode. Again, see get_lock_list for details.
4020 *
4021 * We don't need to specify which zone's locks we're interested in. The matter
4022 * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
4023 * be used by multiple zones, so the list of locks will all be from the right
4024 * zone.
4025 */
4026
4027 locklist_t *
4028 flk_active_locks_for_vp(const vnode_t *vp)
4029 {
4030 return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
4031 ALL_ZONES));
4032 }
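
/*
 * Usage sketch (hypothetical caller, not part of this file): the returned
 * snapshot can serve as an "are there active locks on this vnode?" test,
 * but it must still be freed, which also releases the vnode holds taken
 * by get_lock_list().  flk_free_locklist() accepts an empty (NULL) list.
 *
 *	locklist_t *llp = flk_active_locks_for_vp(vp);
 *	int has_locks = (llp != NULL);
 *
 *	flk_free_locklist(llp);
 */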
4033
4034 /*
4035 * Another interface to get_lock_list. This one returns all the active
4036 * nbmand locks for a given vnode. Again, see get_lock_list for details.
4037 *
4038 * See the comment for flk_active_locks_for_vp() for why we don't care to
4039 * specify the particular zone of interest.
4040 */
4041 locklist_t *
4042 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
4043 {
4044 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
4045 NOPID, vp, ALL_ZONES));
4046 }
4047
4048 /*
4049 * Another interface to get_lock_list. This one returns all the active
4050 * nbmand locks for a given pid. Again, see get_lock_list for details.
4051 *
4052 * The zone doesn't need to be specified here; the locks held by a
4053 * particular process will either be local (ie, non-NFS) or from the zone
4054 * the process is executing in. This is because other parts of the system
4055 * ensure that an NFS vnode can't be used in a zone other than that in
4056 * which it was opened.
4057 */
4058 locklist_t *
4059 flk_active_nbmand_locks(pid_t pid)
4060 {
4061 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
4062 pid, NULL, ALL_ZONES));
4063 }
4064
4065 /*
4066 * Free up all entries in the locklist.
4067 */
4068 void
4069 flk_free_locklist(locklist_t *llp)
4070 {
4071 locklist_t *next_llp;
4072
4073 while (llp) {
4074 next_llp = llp->ll_next;
4075 VN_RELE(llp->ll_vp);
4076 kmem_free(llp, sizeof (*llp));
4077 llp = next_llp;
4078 }
4079 }
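
/*
 * Usage sketch (hypothetical caller, not part of this file): pair one of
 * the flk_get_*_locks() snapshot routines above with flk_free_locklist().
 * "my_sysid" and "my_pid" are placeholders for the identifiers of
 * interest.
 *
 *	locklist_t *head, *llp;
 *
 *	head = flk_get_active_locks(my_sysid, my_pid);
 *	for (llp = head; llp != NULL; llp = llp->ll_next) {
 *		cmn_err(CE_CONT, "lock on vp %p: start %lld len %lld\n",
 *		    (void *)llp->ll_vp, (longlong_t)llp->ll_flock.l_start,
 *		    (longlong_t)llp->ll_flock.l_len);
 *	}
 *	flk_free_locklist(head);	(releases the vnode holds as well)
 */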
4080
4081 static void
4082 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
4083 {
4084 /*
4085 * For each graph "lg" in the hash table lock_graph do
4086 * a. Get the list of sleeping locks
4087 * b. For each lock descriptor in the list do
4088 * i. If the requested lock is an NLM server request AND
4089 * the nlmid is the same as the routine argument then
4090 * change the lock descriptor's state field to
4091 * "nlm_state."
4092 * c. Get the list of active locks
4093 * d. For each lock descriptor in the list do
4094 * i. If the requested lock is an NLM server request AND
4095 * the nlmid is the same as the routine argument then
4096 * change the lock descriptor's state field to
4097 * "nlm_state."
4098 */
4099
4100 int i;
4101 graph_t *gp; /* lock graph */
4102 lock_descriptor_t *lock; /* lock */
4103 lock_descriptor_t *nlock = NULL; /* next lock */
4104 int lock_nlmid;
4105
4106 for (i = 0; i < HASH_SIZE; i++) {
4107 mutex_enter(&flock_lock);
4108 gp = lock_graph[i];
4109 mutex_exit(&flock_lock);
4110 if (gp == NULL) {
4111 continue;
4112 }
4113
4114 /* Get list of sleeping locks in current lock graph. */
4115 mutex_enter(&gp->gp_mutex);
4116 for (lock = SLEEPING_HEAD(gp)->l_next;
4117 lock != SLEEPING_HEAD(gp);
4118 lock = nlock) {
4119 nlock = lock->l_next;
4120 /* get NLM id */
4121 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
4122
4123 /*
4124 * If NLM server request AND nlmid of lock matches
4125 * nlmid of argument, then set the NLM state of the
4126 * lock to "nlm_state."
4127 */
4128 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
4129 SET_NLM_STATE(lock, nlm_state);
4130 }
4131 }
4132
4133 /* Get list of active locks in current lock graph. */
4134 for (lock = ACTIVE_HEAD(gp)->l_next;
4135 lock != ACTIVE_HEAD(gp);
4136 lock = nlock) {
4137 nlock = lock->l_next;
4138 /* get NLM id */
4139 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
4140
4141 /*
4142 * If NLM server request AND nlmid of lock matches
4143 * nlmid of argument, then set the NLM state of the
4144 * lock to "nlm_state."
4145 */
4146 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
4147 ASSERT(IS_ACTIVE(lock));
4148 SET_NLM_STATE(lock, nlm_state);
4149 }
4150 }
4151 mutex_exit(&gp->gp_mutex);
4152 }
4153 }
4154
4155 /*
4156 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
4157 * Effects: Find all sleeping lock manager requests _only_ for the NLM server
4158 * identified by "nlmid." Poke those lock requests.
4159 */
4160 static void
4161 cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
4162 {
4163 lock_descriptor_t *lock;
4164 lock_descriptor_t *nlock = NULL; /* next lock */
4165 int i;
4166 graph_t *gp;
4167 int lock_nlmid;
4168
4169 for (i = 0; i < HASH_SIZE; i++) {
4170 mutex_enter(&flock_lock);
4171 gp = lock_graph[i];
4172 mutex_exit(&flock_lock);
4173 if (gp == NULL) {
4174 continue;
4175 }
4176
4177 mutex_enter(&gp->gp_mutex);
4178 for (lock = SLEEPING_HEAD(gp)->l_next;
4179 lock != SLEEPING_HEAD(gp);
4180 lock = nlock) {
4181 nlock = lock->l_next;
4182 /*
4183 * If NLM server request _and_ nlmid of lock matches
4184 * nlmid of argument, then set the NLM state of the
4185 			 * lock to FLK_NLM_SHUTTING_DOWN, and wake up the
4186 			 * sleeping request.
4187 */
4188 if (IS_LOCKMGR(lock)) {
4189 /* get NLM id */
4190 lock_nlmid =
4191 GETNLMID(lock->l_flock.l_sysid);
4192 if (nlmid == lock_nlmid) {
4193 SET_NLM_STATE(lock,
4194 FLK_NLM_SHUTTING_DOWN);
4195 INTERRUPT_WAKEUP(lock);
4196 }
4197 }
4198 }
4199 mutex_exit(&gp->gp_mutex);
4200 }
4201 }
4202
4203 /*
4204 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
4205 * Effects: Find all active (granted) lock manager locks _only_ for the
4206 * NLM server identified by "nlmid" and release them.
4207 */
4208 static void
4209 cl_flk_unlock_nlm_granted(int nlmid)
4210 {
4211 lock_descriptor_t *lock;
4212 lock_descriptor_t *nlock = NULL; /* next lock */
4213 int i;
4214 graph_t *gp;
4215 int lock_nlmid;
4216
4217 for (i = 0; i < HASH_SIZE; i++) {
4218 mutex_enter(&flock_lock);
4219 gp = lock_graph[i];
4220 mutex_exit(&flock_lock);
4221 if (gp == NULL) {
4222 continue;
4223 }
4224
4225 mutex_enter(&gp->gp_mutex);
4226 for (lock = ACTIVE_HEAD(gp)->l_next;
4227 lock != ACTIVE_HEAD(gp);
4228 lock = nlock) {
4229 nlock = lock->l_next;
4230 ASSERT(IS_ACTIVE(lock));
4231
4232 /*
4233 * If it's an NLM server request _and_ nlmid of
4234 * the lock matches nlmid of argument, then
4235 			 * remove the active lock from the list, wake up
4236 			 * blocked threads, and free the storage for the lock.
4237 * Note that there's no need to mark the NLM state
4238 * of this lock to NLM_DOWN because the lock will
4239 * be deleted anyway and its storage freed.
4240 */
4241 if (IS_LOCKMGR(lock)) {
4242 /* get NLM id */
4243 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
4244 if (nlmid == lock_nlmid) {
4245 flk_delete_active_lock(lock, 0);
4246 flk_wakeup(lock, 1);
4247 flk_free_lock(lock);
4248 }
4249 }
4250 }
4251 mutex_exit(&gp->gp_mutex);
4252 }
4253 }
4254
4255 /*
4256 * Find all sleeping lock manager requests and poke them.
4257 */
4258 static void
4259 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
4260 {
4261 lock_descriptor_t *lock;
4262 lock_descriptor_t *nlock = NULL; /* next lock */
4263 int i;
4264 graph_t *gp;
4265 zoneid_t zoneid = getzoneid();
4266
4267 for (i = 0; i < HASH_SIZE; i++) {
4268 mutex_enter(&flock_lock);
4269 gp = lock_graph[i];
4270 mutex_exit(&flock_lock);
4271 if (gp == NULL) {
4272 continue;
4273 }
4274
4275 mutex_enter(&gp->gp_mutex);
4276 fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
4277 for (lock = SLEEPING_HEAD(gp)->l_next;
4278 lock != SLEEPING_HEAD(gp);
4279 lock = nlock) {
4280 nlock = lock->l_next;
4281 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
4282 INTERRUPT_WAKEUP(lock);
4283 }
4284 }
4285 mutex_exit(&gp->gp_mutex);
4286 }
4287 }
4288
4289
4290 /*
4291 * Find all active (granted) lock manager locks and release them.
4292 */
4293 static void
4294 unlock_lockmgr_granted(struct flock_globals *fg)
4295 {
4296 lock_descriptor_t *lock;
4297 lock_descriptor_t *nlock = NULL; /* next lock */
4298 int i;
4299 graph_t *gp;
4300 zoneid_t zoneid = getzoneid();
4301
4302 for (i = 0; i < HASH_SIZE; i++) {
4303 mutex_enter(&flock_lock);
4304 gp = lock_graph[i];
4305 mutex_exit(&flock_lock);
4306 if (gp == NULL) {
4307 continue;
4308 }
4309
4310 mutex_enter(&gp->gp_mutex);
4311 fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
4312 for (lock = ACTIVE_HEAD(gp)->l_next;
4313 lock != ACTIVE_HEAD(gp);
4314 lock = nlock) {
4315 nlock = lock->l_next;
4316 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
4317 ASSERT(IS_ACTIVE(lock));
4318 flk_delete_active_lock(lock, 0);
4319 flk_wakeup(lock, 1);
4320 flk_free_lock(lock);
4321 }
4322 }
4323 mutex_exit(&gp->gp_mutex);
4324 }
4325 }
4326
4327 /*
4328 * Wait until a lock is granted, cancelled, or interrupted.
4329 */
4330
4331 static void
4332 wait_for_lock(lock_descriptor_t *request)
4333 {
4334 graph_t *gp = request->l_graph;
4335 vnode_t *vp = request->l_vnode;
4336
4337 ASSERT(MUTEX_HELD(&gp->gp_mutex));
4338
4339 while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
4340 !(IS_INTERRUPTED(request))) {
4341 lock_descriptor_t *lock;
4342
4343 if (stale_lock_timeout == 0) {
4344 /* The stale lock detection is disabled */
4345 if (cv_wait_sig(&request->l_cv, &gp->gp_mutex) == 0) {
4346 flk_set_state(request, FLK_INTERRUPTED_STATE);
4347 request->l_state |= INTERRUPTED_LOCK;
4348 }
4349
4350 continue;
4351 }
4352
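		/*
		 * Stale lock detection is enabled: find the first active
		 * lock on this vnode that blocks the request and run
		 * flk_stale_lock_check() on it, then sleep with a timeout
		 * of stale_lock_timeout seconds so the check is repeated
		 * for as long as the request remains blocked.
		 */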
4353 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4354
4355 if (lock != NULL) {
4356 do {
4357 if (BLOCKS(lock, request)) {
4358 flk_stale_lock_check(lock);
4359 break;
4360 }
4361 lock = lock->l_next;
4362 } while (lock->l_vnode == vp);
4363 }
4364
4365 if (cv_timedwait_sig(&request->l_cv, &gp->gp_mutex,
4366 ddi_get_lbolt() + SEC_TO_TICK(stale_lock_timeout)) == 0) {
4367 flk_set_state(request, FLK_INTERRUPTED_STATE);
4368 request->l_state |= INTERRUPTED_LOCK;
4369 }
4370 }
4371 }
4372
4373 /*
4374 * Create an flock structure from the existing lock information
4375 *
4376 * This routine is used to create flock structures for the lock manager
4377  * to use in a reclaim request. Since the lock originated on this
4378  * host, it must conform to UNIX semantics, so no checking is
4379 * done to make sure it falls within the lower half of the 32-bit range.
4380 */
4381
4382 static void
4383 create_flock(lock_descriptor_t *lp, flock64_t *flp)
4384 {
4385 ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
4386 ASSERT(lp->l_end >= lp->l_start);
4387
4388 flp->l_type = lp->l_type;
4389 flp->l_whence = 0;
4390 flp->l_start = lp->l_start;
4391 flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
4392 (lp->l_end - lp->l_start + 1);
4393 flp->l_sysid = lp->l_flock.l_sysid;
4394 flp->l_pid = lp->l_flock.l_pid;
4395 }
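
/*
 * For example, following the conversion above: a lock descriptor with
 * l_start == 100 and l_end == 499 yields l_start == 100, l_len == 400,
 * while one with l_end == MAX_U_OFFSET_T ("to EOF") yields l_len == 0.
 */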
4396
4397 /*
4398  * Convert flock64_t data describing a lock range into u_offset_t starting
4399  * and ending points, which the caller places in its lock request.
4400  * Returns 0 or an errno value.
4401 */
4402
4403 int
4404 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
4405 u_offset_t *start, u_offset_t *end, offset_t offset)
4406 {
4407 struct vattr vattr;
4408 int error;
4409
4410 /*
4411 * Determine the starting point of the request
4412 */
4413 switch (flp->l_whence) {
4414 case 0: /* SEEK_SET */
4415 *start = (u_offset_t)flp->l_start;
4416 break;
4417 case 1: /* SEEK_CUR */
4418 *start = (u_offset_t)(flp->l_start + offset);
4419 break;
4420 case 2: /* SEEK_END */
4421 vattr.va_mask = AT_SIZE;
4422 if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
4423 return (error);
4424 *start = (u_offset_t)(flp->l_start + vattr.va_size);
4425 break;
4426 default:
4427 return (EINVAL);
4428 }
4429
4430 /*
4431 * Determine the range covered by the request.
4432 */
4433 if (flp->l_len == 0)
4434 *end = MAX_U_OFFSET_T;
4435 else if ((offset_t)flp->l_len > 0) {
4436 *end = (u_offset_t)(*start + (flp->l_len - 1));
4437 } else {
4438 /*
4439 		 * Negative length; why do we even allow this?
4440 * Because this allows easy specification of
4441 * the last n bytes of the file.
4442 */
4443 *end = *start;
4444 *start += (u_offset_t)flp->l_len;
4445 (*start)++;
4446 }
4447 return (0);
4448 }
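
/*
 * Worked examples of the arithmetic above (the numbers are illustrative
 * only):
 *
 *	l_whence == 1 (SEEK_CUR), l_start == -10, l_len == 5, offset == 100
 *		=> *start == 90, *end == 94
 *	l_whence == 0 (SEEK_SET), l_start == 0, l_len == 0
 *		=> *start == 0, *end == MAX_U_OFFSET_T (to EOF)
 *	l_whence == 0 (SEEK_SET), l_start == 100, l_len == -10
 *		=> *end == 100, *start == 100 - 10 + 1 == 91
 *		   (the ten bytes ending at offset 100)
 */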
4449
4450 /*
4451  * Check the validity of lock data. This can be used by the NFS
4452 * frlock routines to check data before contacting the server. The
4453 * server must support semantics that aren't as restrictive as
4454 * the UNIX API, so the NFS client is required to check.
4455 * The maximum is passed in by the caller.
4456 */
4457
4458 int
4459 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
4460 {
4461 /*
4462 * The end (length) for local locking should never be greater
4463 * than max. However, the representation for
4464 * the entire file is MAX_U_OFFSET_T.
4465 */
4466 if ((start > max) ||
4467 ((end > max) && (end != MAX_U_OFFSET_T))) {
4468 return (EINVAL);
4469 }
4470 if (start > end) {
4471 return (EINVAL);
4472 }
4473 return (0);
4474 }
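
/*
 * For example, with max == MAXEND: (start == 0, end == MAXEND) and
 * (start == 0, end == MAX_U_OFFSET_T) are accepted, while start > max,
 * end > max with end != MAX_U_OFFSET_T, and start > end all return
 * EINVAL.
 */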
4475
4476 /*
4477 * Fill in request->l_flock with information about the lock blocking the
4478 * request. The complexity here is that lock manager requests are allowed
4479 * to see into the upper part of the 32-bit address range, whereas local
4480 * requests are only allowed to see signed values.
4481 *
4482 * What should be done when "blocker" is a lock manager lock that uses the
4483 * upper portion of the 32-bit range, but "request" is local? Since the
4484 * request has already been determined to have been blocked by the blocker,
4485 * at least some portion of "blocker" must be in the range of the request,
4486 * or the request extends to the end of file. For the first case, the
4487 * portion in the lower range is returned with the indication that it goes
4488 * "to EOF." For the second case, the last byte of the lower range is
4489 * returned with the indication that it goes "to EOF."
4490 */
4491
4492 static void
4493 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
4494 {
4495 flock64_t *flrp; /* l_flock portion of request */
4496
4497 ASSERT(blocker != NULL);
4498
4499 flrp = &request->l_flock;
4500 flrp->l_whence = 0;
4501 flrp->l_type = blocker->l_type;
4502 flrp->l_pid = blocker->l_flock.l_pid;
4503 flrp->l_sysid = blocker->l_flock.l_sysid;
4504 request->l_ofd = blocker->l_ofd;
4505
4506 if (IS_LOCKMGR(request)) {
4507 flrp->l_start = blocker->l_start;
4508 if (blocker->l_end == MAX_U_OFFSET_T)
4509 flrp->l_len = 0;
4510 else
4511 flrp->l_len = blocker->l_end - blocker->l_start + 1;
4512 } else {
4513 if (blocker->l_start > MAXEND) {
4514 flrp->l_start = MAXEND;
4515 flrp->l_len = 0;
4516 } else {
4517 flrp->l_start = blocker->l_start;
4518 if (blocker->l_end == MAX_U_OFFSET_T)
4519 flrp->l_len = 0;
4520 else
4521 flrp->l_len = blocker->l_end -
4522 blocker->l_start + 1;
4523 }
4524 }
4525 }
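
/*
 * For example, if "request" is a local request and "blocker" starts
 * beyond MAXEND, the blocker is reported as l_start == MAXEND with
 * l_len == 0, i.e. the last representable byte "to EOF"; a lock
 * manager request would instead see the blocker's actual l_start.
 */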
4526
4527 /*
4528 * PSARC case 1997/292
4529 */
4530 /*
4531 * This is the public routine exported by flock.h.
4532 */
4533 void
4534 cl_flk_change_nlm_state_to_unknown(int nlmid)
4535 {
4536 /*
4537 * Check to see if node is booted as a cluster. If not, return.
4538 */
4539 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
4540 return;
4541 }
4542
4543 /*
4544 * See comment in cl_flk_set_nlm_status().
4545 */
4546 if (nlm_reg_status == NULL) {
4547 return;
4548 }
4549
4550 /*
4551 * protect NLM registry state with a mutex.
4552 */
4553 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
4554 mutex_enter(&nlm_reg_lock);
4555 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
4556 mutex_exit(&nlm_reg_lock);
4557 }
4558
4559 /*
4560 * Return non-zero if the given I/O request conflicts with an active NBMAND
4561 * lock.
4562 * If svmand is non-zero, it means look at all active locks, not just NBMAND
4563 * locks.
4564 */
4565
4566 int
4567 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
4568 ssize_t length, int svmand, caller_context_t *ct)
4569 {
4570 int conflict = 0;
4571 graph_t *gp;
4572 lock_descriptor_t *lock;
4573 pid_t pid;
4574 int sysid;
4575
4576 if (ct == NULL) {
4577 pid = curproc->p_pid;
4578 sysid = 0;
4579 } else {
4580 pid = ct->cc_pid;
4581 sysid = ct->cc_sysid;
4582 }
4583
4584 mutex_enter(&flock_lock);
4585 gp = lock_graph[HASH_INDEX(vp)];
4586 mutex_exit(&flock_lock);
4587 if (gp == NULL)
4588 return (0);
4589
4590 mutex_enter(&gp->gp_mutex);
4591 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4592
4593 for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
4594 if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
4595 (lock->l_flock.l_sysid != sysid ||
4596 lock->l_flock.l_pid != pid) &&
4597 lock_blocks_io(op, offset, length,
4598 lock->l_type, lock->l_start, lock->l_end)) {
4599 conflict = 1;
4600 break;
4601 }
4602 }
4603 mutex_exit(&gp->gp_mutex);
4604
4605 return (conflict);
4606 }
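
/*
 * Callers normally reach this routine through the nbl_*() interfaces
 * declared in <sys/nbmlock.h> rather than by calling it directly.  The
 * fragment below is only a rough sketch of that pattern for a write;
 * error handling is elided and "uiop", "cr" and "ct" are placeholders
 * supplied by the caller.
 *
 *	int svmand;
 *
 *	if (nbl_need_check(vp)) {
 *		nbl_start_crit(vp, RW_READER);
 *		if (nbl_svmand(vp, cr, &svmand) == 0 &&
 *		    nbl_conflict(vp, NBL_WRITE, uiop->uio_loffset,
 *		    uiop->uio_resid, svmand, ct))
 *			error = EACCES;
 *		nbl_end_crit(vp);
 *	}
 */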
4607
4608 /*
4609 * Return non-zero if the given I/O request conflicts with the given lock.
4610 */
4611
4612 static int
4613 lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
4614 int lock_type, u_offset_t lock_start, u_offset_t lock_end)
4615 {
4616 ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
4617 ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
4618
4619 if (op == NBL_READ && lock_type == F_RDLCK)
4620 return (0);
4621
4622 if (offset <= lock_start && lock_start < offset + length)
4623 return (1);
4624 if (lock_start <= offset && offset <= lock_end)
4625 return (1);
4626
4627 return (0);
4628 }
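
/*
 * For example, a write of 50 bytes at offset 100 (covering [100, 149])
 * conflicts with a lock over [120, 200] but not with one over [0, 99];
 * a read never conflicts with an F_RDLCK lock.
 */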
4629
4630 #ifdef DEBUG
4631 static void
4632 check_active_locks(graph_t *gp)
4633 {
4634 lock_descriptor_t *lock, *lock1;
4635 edge_t *ep;
4636
4637 for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
4638 lock = lock->l_next) {
4639 ASSERT(IS_ACTIVE(lock));
4640 ASSERT(NOT_BLOCKED(lock));
4641 ASSERT(!IS_BARRIER(lock));
4642
4643 ep = FIRST_IN(lock);
4644
4645 while (ep != HEAD(lock)) {
4646 ASSERT(IS_SLEEPING(ep->from_vertex));
4647 ASSERT(!NOT_BLOCKED(ep->from_vertex));
4648 ep = NEXT_IN(ep);
4649 }
4650
4651 for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
4652 lock1 = lock1->l_next) {
4653 if (lock1->l_vnode == lock->l_vnode) {
4654 if (BLOCKS(lock1, lock)) {
4655 cmn_err(CE_PANIC,
4656 "active lock %p blocks %p",
4657 (void *)lock1, (void *)lock);
4658 } else if (BLOCKS(lock, lock1)) {
4659 cmn_err(CE_PANIC,
4660 "active lock %p blocks %p",
4661 (void *)lock, (void *)lock1);
4662 }
4663 }
4664 }
4665 }
4666 }
4667
4668 /*
4669  * Effect: This function checks whether the transition from 'old_state' to
4670 * 'new_state' is a valid one. It returns 0 if the transition is valid
4671 * and 1 if it is not.
4672 * For a map of valid transitions, see sys/flock_impl.h
4673 */
4674 static int
4675 check_lock_transition(int old_state, int new_state)
4676 {
4677 switch (old_state) {
4678 case FLK_INITIAL_STATE:
4679 if ((new_state == FLK_START_STATE) ||
4680 (new_state == FLK_SLEEPING_STATE) ||
4681 (new_state == FLK_ACTIVE_STATE) ||
4682 (new_state == FLK_DEAD_STATE)) {
4683 return (0);
4684 } else {
4685 return (1);
4686 }
4687 case FLK_START_STATE:
4688 if ((new_state == FLK_ACTIVE_STATE) ||
4689 (new_state == FLK_DEAD_STATE)) {
4690 return (0);
4691 } else {
4692 return (1);
4693 }
4694 case FLK_ACTIVE_STATE:
4695 if (new_state == FLK_DEAD_STATE) {
4696 return (0);
4697 } else {
4698 return (1);
4699 }
4700 case FLK_SLEEPING_STATE:
4701 if ((new_state == FLK_GRANTED_STATE) ||
4702 (new_state == FLK_INTERRUPTED_STATE) ||
4703 (new_state == FLK_CANCELLED_STATE)) {
4704 return (0);
4705 } else {
4706 return (1);
4707 }
4708 case FLK_GRANTED_STATE:
4709 if ((new_state == FLK_START_STATE) ||
4710 (new_state == FLK_INTERRUPTED_STATE) ||
4711 (new_state == FLK_CANCELLED_STATE)) {
4712 return (0);
4713 } else {
4714 return (1);
4715 }
4716 case FLK_CANCELLED_STATE:
4717 if ((new_state == FLK_INTERRUPTED_STATE) ||
4718 (new_state == FLK_DEAD_STATE)) {
4719 return (0);
4720 } else {
4721 return (1);
4722 }
4723 case FLK_INTERRUPTED_STATE:
4724 if (new_state == FLK_DEAD_STATE) {
4725 return (0);
4726 } else {
4727 return (1);
4728 }
4729 case FLK_DEAD_STATE:
4730 /* May be set more than once */
4731 if (new_state == FLK_DEAD_STATE) {
4732 return (0);
4733 } else {
4734 return (1);
4735 }
4736 default:
4737 return (1);
4738 }
4739 }
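
/*
 * For example, FLK_SLEEPING_STATE -> FLK_GRANTED_STATE is valid (returns
 * 0), while FLK_SLEEPING_STATE -> FLK_DEAD_STATE is not (returns 1): a
 * sleeping lock must first be granted, cancelled or interrupted.
 */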
4740
4741 static void
4742 check_sleeping_locks(graph_t *gp)
4743 {
4744 lock_descriptor_t *lock1, *lock2;
4745 edge_t *ep;
4746 for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
4747 lock1 = lock1->l_next) {
4748 ASSERT(!IS_BARRIER(lock1));
4749 for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
4750 lock2 = lock2->l_next) {
4751 if (lock1->l_vnode == lock2->l_vnode) {
4752 if (BLOCKS(lock2, lock1)) {
4753 ASSERT(!IS_GRANTED(lock1));
4754 ASSERT(!NOT_BLOCKED(lock1));
4755 path(lock1, lock2);
4756 }
4757 }
4758 }
4759
4760 for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
4761 lock2 = lock2->l_next) {
4762 ASSERT(!IS_BARRIER(lock1));
4763 if (lock1->l_vnode == lock2->l_vnode) {
4764 if (BLOCKS(lock2, lock1)) {
4765 ASSERT(!IS_GRANTED(lock1));
4766 ASSERT(!NOT_BLOCKED(lock1));
4767 path(lock1, lock2);
4768 }
4769 }
4770 }
4771 ep = FIRST_ADJ(lock1);
4772 while (ep != HEAD(lock1)) {
4773 ASSERT(BLOCKS(ep->to_vertex, lock1));
4774 ep = NEXT_ADJ(ep);
4775 }
4776 }
4777 }
4778
4779 static int
4780 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
4781 {
4782 edge_t *ep;
4783 lock_descriptor_t *vertex;
4784 lock_descriptor_t *vertex_stack;
4785
4786 STACK_INIT(vertex_stack);
4787
4788 flk_graph_uncolor(lock1->l_graph);
4789 ep = FIRST_ADJ(lock1);
4790 ASSERT(ep != HEAD(lock1));
4791 while (ep != HEAD(lock1)) {
4792 if (no_path)
4793 ASSERT(ep->to_vertex != lock2);
4794 STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4795 COLOR(ep->to_vertex);
4796 ep = NEXT_ADJ(ep);
4797 }
4798
4799 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
4800 STACK_POP(vertex_stack, l_dstack);
4801 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
4802 ep = NEXT_ADJ(ep)) {
4803 if (COLORED(ep->to_vertex))
4804 continue;
4805 COLOR(ep->to_vertex);
4806 if (ep->to_vertex == lock2)
4807 return (1);
4808
4809 STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4810 }
4811 }
4812 return (0);
4813 }
4814
4815 static void
4816 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
4817 {
4818 lock_descriptor_t *lock;
4819
4820 /* Ignore OFD style locks since they're not process-wide. */
4821 if (pid == 0)
4822 return;
4823
4824 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4825
4826 if (lock) {
4827 while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
4828 if (lock->l_flock.l_pid == pid &&
4829 lock->l_flock.l_sysid == sysid)
4830 cmn_err(CE_PANIC,
4831 "owner pid %d's lock %p in active queue",
4832 pid, (void *)lock);
4833 lock = lock->l_next;
4834 }
4835 }
4836 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
4837
4838 if (lock) {
4839 while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
4840 if (lock->l_flock.l_pid == pid &&
4841 lock->l_flock.l_sysid == sysid)
4842 cmn_err(CE_PANIC,
4843 "owner pid %d's lock %p in sleep queue",
4844 pid, (void *)lock);
4845 lock = lock->l_next;
4846 }
4847 }
4848 }
4849
4850 static int
4851 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4852 {
4853 edge_t *ep = FIRST_ADJ(lock1);
4854
4855 while (ep != HEAD(lock1)) {
4856 if (ep->to_vertex == lock2)
4857 return (1);
4858 else
4859 ep = NEXT_ADJ(ep);
4860 }
4861 return (0);
4862 }
4863
4864 static int
4865 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4866 {
4867 return (!level_two_path(lock1, lock2, 1));
4868 }
4869
4870 static void
4871 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4872 {
4873 if (level_one_path(lock1, lock2)) {
4874 if (level_two_path(lock1, lock2, 0) != 0) {
4875 cmn_err(CE_WARN,
4876 "one edge one path from lock1 %p lock2 %p",
4877 (void *)lock1, (void *)lock2);
4878 }
4879 } else if (no_path(lock1, lock2)) {
4880 cmn_err(CE_PANIC,
4881 "No path from lock1 %p to lock2 %p",
4882 (void *)lock1, (void *)lock2);
4883 }
4884 }
4885 #endif /* DEBUG */