1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
25 */
26
27 /*
28 * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
29 * Use is subject to license terms.
30 * Copyright 2015 Joyent, Inc.
31 */
32
33 #ifndef _SYS_FLOCK_IMPL_H
34 #define _SYS_FLOCK_IMPL_H
35
36 #include <sys/types.h>
37 #include <sys/fcntl.h> /* flock definition */
38 #include <sys/file.h> /* FREAD etc */
39 #include <sys/flock.h> /* RCMD etc */
40 #include <sys/kmem.h>
41 #include <sys/user.h>
42 #include <sys/thread.h>
43 #include <sys/proc.h>
44 #include <sys/cred.h>
45 #include <sys/debug.h>
46 #include <sys/cmn_err.h>
47 #include <sys/errno.h>
48 #include <sys/systm.h>
49 #include <sys/vnode.h>
50 #include <sys/share.h> /* just to get GETSYSID def */
51 #include <sys/time.h>
52
53 #ifdef __cplusplus
54 extern "C" {
55 #endif
56
/*
 * An edge between two lock_descriptor vertices.  Each edge is threaded onto
 * two doubly-linked lists at the same time: an adjacency list (edge_adj_*)
 * and an incoming-edges list (edge_in_*).
 */
typedef struct edge {
	struct edge		*edge_adj_next;	/* adjacency list: next */
	struct edge		*edge_adj_prev;	/* adjacency list: previous */
	struct edge		*edge_in_next;	/* incoming list: next */
	struct edge		*edge_in_prev;	/* incoming list: previous */
	struct lock_descriptor	*from_vertex;	/* lock the edge leaves */
	struct lock_descriptor	*to_vertex;	/* lock the edge points to */
} edge_t;
67
68 struct lock_descriptor {
69 struct lock_descriptor *l_next; /* next active/sleep lock */
70 struct lock_descriptor *l_prev; /* previous active/sleep lock */
71 struct edge l_edge; /* edge for adj and in lists */
72 struct lock_descriptor *l_stack; /* for stack operations */
73 struct lock_descriptor *l_stack1; /* for stack operations */
74 struct lock_descriptor *l_dstack; /* stack for debug functions */
75 struct edge *l_sedge; /* start edge for graph alg. */
76 int l_index; /* used for barrier count */
77 struct graph *l_graph; /* graph this belongs to */
78 vnode_t *l_vnode; /* vnode being locked */
79 int l_type; /* type of lock */
80 int l_state; /* state described below */
81 u_offset_t l_start; /* start offset */
82 u_offset_t l_end; /* end offset */
83 flock64_t l_flock; /* original flock request */
84 int l_color; /* color used for graph alg */
85 kcondvar_t l_cv; /* wait condition for lock */
86 int pvertex; /* index to proc vertex */
87 int l_status; /* status described below */
88 flk_nlm_status_t l_nlm_state; /* state of NLM server */
89 flk_callback_t *l_callbacks; /* callbacks, or NULL */
90 zoneid_t l_zoneid; /* zone of request */
91 hrtime_t l_blocker; /* time when this lock */
92 /* started to prevent other */
93 /* locks from being set */
94 file_t *l_ofd; /* OFD-style reference */
95 };
96
97 typedef struct lock_descriptor lock_descriptor_t;
98
99 /*
100 * Each graph holds locking information for some number of vnodes. The
101 * active and sleeping lists are circular, with a dummy head element.
102 */
103
104 struct graph {
105 kmutex_t gp_mutex; /* mutex for this graph */
106 struct lock_descriptor active_locks;
107 struct lock_descriptor sleeping_locks;
108 int index; /* index of this graph into the hash table */
109 int mark; /* used for coloring the graph */
110 };
111
112 typedef struct graph graph_t;
113
114 /*
115 * The possible states a lock can be in. These states are stored in the
116 * 'l_status' member of the 'lock_descriptor_t' structure. All locks start
117 * life in the INITIAL state, and end up in the DEAD state. Possible state
118 * transitions are :
119 *
 * INITIAL--> START    --> ACTIVE    --> DEAD
 *
 *                     --> DEAD
 *
 *        --> ACTIVE   --> DEAD          (new locks from flk_relation)
 *
 *        --> SLEEPING --> GRANTED   --> START     --> ACTIVE --> DEAD
 *
 *                                   --> INTR      --> DEAD
 *
 *                                   --> CANCELLED --> DEAD
 *
 *                                                 --> INTR   --> DEAD
 *
 *                     --> INTR      --> DEAD
 *
 *                     --> CANCELLED --> DEAD
 *
 *                                   --> INTR      --> DEAD
139 *
140 * Lock transitions are done in the following functions:
141 * --> INITIAL flk_get_lock(), reclock()
142 * --> START flk_execute_request()
143 * --> ACTIVE flk_insert_active_lock()
144 * --> SLEEPING flk_insert_sleeping_lock()
145 * --> GRANTED GRANT_WAKEUP
146 * --> INTERRUPTED INTERRUPT_WAKEUP
147 * --> CANCELLED CANCEL_WAKEUP
148 * --> DEAD reclock(), flk_delete_active_lock(), and
149 * flk_cancel_sleeping_lock()
150 */
151
/* Values held in the l_status member of lock_descriptor_t. */
#define	FLK_INITIAL_STATE	1	/* state every request starts in */
#define	FLK_START_STATE		2	/* execution of request has begun */
#define	FLK_ACTIVE_STATE	3	/* lock is in the active queue */
#define	FLK_SLEEPING_STATE	4	/* request is blocked */
#define	FLK_GRANTED_STATE	5	/* request has been granted */
#define	FLK_INTERRUPTED_STATE	6	/* request was interrupted */
#define	FLK_CANCELLED_STATE	7	/* request was cancelled */
#define	FLK_DEAD_STATE		8	/* request is done - will be deleted */
160
161 /* flags defining state of locks */
162
163 /*
164 * The LLM design has been modified so that lock states are now stored
165 * in the l_status field of lock_descriptor_t. The l_state field is
166 * currently preserved for binary compatibility, but may be modified or
167 * removed in a minor release of Solaris. Note that both of these
168 * fields (and the rest of the lock_descriptor_t structure) are private
169 * to the implementation of the lock manager and should not be used
170 * externally.
171 */
172
/* Bit flags kept in the l_state member of lock_descriptor_t. */
#define	ACTIVE_LOCK		0x0001	/* lock is in the active queue */
#define	SLEEPING_LOCK		0x0002	/* lock is in the sleep queue */
#define	IO_LOCK			0x0004	/* lock is an IO lock */
#define	REFERENCED_LOCK		0x0008	/* lock is referenced somewhere */
#define	QUERY_LOCK		0x0010	/* request only queries a lock */
#define	WILLING_TO_SLEEP_LOCK	0x0020	/* may be put in the sleep queue */
#define	RECOMPUTE_LOCK		0x0040	/* for recomputing dependencies */
#define	RECOMPUTE_DONE		0x0080	/* for recomputing dependencies */
#define	BARRIER_LOCK		0x0100	/* for recomputing dependencies */
#define	GRANTED_LOCK		0x0200	/* granted, still in sleep queue */
#define	CANCELLED_LOCK		0x0400	/* cancelled, will be thrown out */
#define	DELETED_LOCK		0x0800	/* deleted - free at earliest */
#define	INTERRUPTED_LOCK	0x1000	/* pretend signal */
#define	LOCKMGR_LOCK		0x2000	/* remote lock (server-side) */
/* Clustering: flag for PXFS locks */
#define	PXFS_LOCK		0x4000	/* created by PXFS file system */
#define	NBMAND_LOCK		0x8000	/* non-blocking mandatory locking */
190
/*
 * lock_graph[] has HASH_SIZE entries.  Despite its name, HASH_SHIFT is the
 * bit mask (HASH_SIZE - 1) applied by HASH_INDEX(), so HASH_SIZE has to stay
 * a power of two.  HASH_INDEX() converts the pointer `vp' to an integer,
 * drops its low 7 bits and masks the result into [0, HASH_SIZE).
 *
 * `vp' is parenthesized under the cast so that an expression argument such
 * as `base + n' is converted as a whole instead of only its first operand.
 */
#define	HASH_SIZE	32
#define	HASH_SHIFT	(HASH_SIZE - 1)
#define	HASH_INDEX(vp)	(((uintptr_t)(vp) >> 7) & HASH_SHIFT)
194
195 /* extern definitions */
196
197 extern struct graph *lock_graph[HASH_SIZE];
198 extern struct kmem_cache *flk_edge_cache;
199
200 /* Clustering: functions called by PXFS */
201 int flk_execute_request(lock_descriptor_t *);
202 void flk_cancel_sleeping_lock(lock_descriptor_t *, int);
203 void flk_set_state(lock_descriptor_t *, int);
204 graph_t *flk_get_lock_graph(vnode_t *, int);
205
206 /* flags used for readability in flock.c */
207
#define	FLK_USE_GRAPH	0	/* do not initialize the lock_graph */
#define	FLK_INIT_GRAPH	1	/* do initialize the lock graph */
#define	NO_COLOR	0	/* vertex has no color */
#define	NO_CHECK_CYCLE	0	/* flk_add_edge: don't mark vertices */
#define	CHECK_CYCLE	1	/* flk_add_edge: mark vertices */
213
/*
 * Non-zero when both locks carry the same pid and sysid in their flock
 * request and the same l_ofd reference.
 */
#define	SAME_OWNER(lock1, lock2)					\
	((lock1)->l_flock.l_pid == (lock2)->l_flock.l_pid &&		\
	(lock1)->l_flock.l_sysid == (lock2)->l_flock.l_sysid &&	\
	(lock1)->l_ofd == (lock2)->l_ofd)
218
/*
 * A vertex counts as colored when its l_color equals the current mark of
 * the graph it belongs to; COLOR() stores that mark into the vertex.
 */
#define	COLORED(vertex)	((vertex)->l_graph->mark == (vertex)->l_color)
#define	COLOR(vertex)	((vertex)->l_color = (vertex)->l_graph->mark)
221
/*
 * Intrusive stack data structure and operations.
 *
 * `stack' is an lvalue pointer to the top element and `stack_link' names the
 * structure member that chains the elements (lock_descriptor_t has l_stack,
 * l_stack1 and l_dstack for this).
 *
 * STACK_PUSH() and STACK_POP() are fully parenthesized so that they behave
 * as one expression wherever they are used; each yields the new top of the
 * stack.  STACK_POP() does not check for an empty stack.  The `stack' and
 * `ptr' arguments are evaluated more than once and must not have side
 * effects.
 */
#define	STACK_INIT(stack)	((stack) = NULL)
#define	STACK_PUSH(stack, ptr, stack_link)	\
	((ptr)->stack_link = (stack), (stack) = (ptr))
#define	STACK_POP(stack, stack_link)	((stack) = (stack)->stack_link)
#define	STACK_TOP(stack)	(stack)
#define	STACK_EMPTY(stack)	((stack) == NULL)
232
233
/* Addresses of the dummy head elements of a graph's two lock lists. */
#define	ACTIVE_HEAD(gp)		(&((gp)->active_locks))
#define	SLEEPING_HEAD(gp)	(&((gp)->sleeping_locks))
237
/*
 * Set `lock' to the value of vp->v_filocks, converted to a
 * lock_descriptor_t pointer.  The `gp' argument is not used.
 *
 * `vp' is parenthesized so that expression arguments (for example `&vnode')
 * bind correctly to the `->' operator.
 */
#define	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp)	\
{	\
	(lock) = (lock_descriptor_t *)(vp)->v_filocks;	\
}
242
/*
 * Walk the sleeping list of graph `gp' from its head and leave `lock'
 * pointing at the first entry whose l_vnode equals `vp', or NULL when the
 * walk returns to the list head without finding one.
 */
#define	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp)	\
{	\
	(lock) = SLEEPING_HEAD((gp))->l_next;	\
	while ((lock) != SLEEPING_HEAD((gp)) &&	\
	    (lock)->l_vnode != (vp))	\
		(lock) = (lock)->l_next;	\
	if ((lock) == SLEEPING_HEAD((gp)))	\
		(lock) = NULL;	\
}
250
/*
 * Non-zero when the start of either lock lies within the inclusive
 * [l_start, l_end] range of the other lock.
 */
#define	OVERLAP(lock1, lock2)					\
	(((lock1)->l_start <= (lock2)->l_start &&		\
	    (lock2)->l_start <= (lock1)->l_end) ||		\
	((lock2)->l_start <= (lock1)->l_start &&		\
	    (lock1)->l_start <= (lock2)->l_end))
256
/* Predicates on l_status: one per FLK_*_STATE value. */
#define	IS_INITIAL(lock)	((lock)->l_status == FLK_INITIAL_STATE)
#define	IS_ACTIVE(lock)		((lock)->l_status == FLK_ACTIVE_STATE)
#define	IS_SLEEPING(lock)	((lock)->l_status == FLK_SLEEPING_STATE)
#define	IS_GRANTED(lock)	((lock)->l_status == FLK_GRANTED_STATE)
#define	IS_INTERRUPTED(lock)	\
	((lock)->l_status == FLK_INTERRUPTED_STATE)
#define	IS_CANCELLED(lock)	((lock)->l_status == FLK_CANCELLED_STATE)
#define	IS_DEAD(lock)		((lock)->l_status == FLK_DEAD_STATE)
264
/*
 * Predicates on the l_state flag bits.  Each one yields the masked bit
 * itself (not 0/1).  IS_NLM_UP() is the exception: it compares l_nlm_state
 * with FLK_NLM_UP.
 */
#define	IS_QUERY_LOCK(lock)	((lock)->l_state & QUERY_LOCK)
#define	IS_RECOMPUTE(lock)	((lock)->l_state & RECOMPUTE_LOCK)
#define	IS_BARRIER(lock)	((lock)->l_state & BARRIER_LOCK)
#define	IS_DELETED(lock)	((lock)->l_state & DELETED_LOCK)
#define	IS_REFERENCED(lock)	((lock)->l_state & REFERENCED_LOCK)
#define	IS_IO_LOCK(lock)	((lock)->l_state & IO_LOCK)
#define	IS_WILLING_TO_SLEEP(lock)	\
	((lock)->l_state & WILLING_TO_SLEEP_LOCK)
#define	IS_LOCKMGR(lock)	((lock)->l_state & LOCKMGR_LOCK)
#define	IS_NLM_UP(lock)		((lock)->l_nlm_state == FLK_NLM_UP)
/* Clustering: Macro for PXFS locks */
#define	IS_PXFS(lock)		((lock)->l_state & PXFS_LOCK)
277
278 /*
279 * "local" requests don't involve the NFS lock manager in any way.
280 * "remote" requests can be on the server (requests from a remote client),
281 * in which case they should be associated with a local vnode (UFS, tmpfs,
282 * etc.). These requests are flagged with LOCKMGR_LOCK and are made using
283 * kernel service threads. Remote requests can also be on an NFS client,
284 * because the NFS lock manager uses local locking for some of its
285 * bookkeeping. These requests are made by regular user processes.
286 */
/* A lock is local when GETSYSID() of its sysid is 0, remote otherwise. */
#define	IS_LOCAL(lock)	(GETSYSID((lock)->l_flock.l_sysid) == 0)
#define	IS_REMOTE(lock)	(!IS_LOCAL(lock))
289
290 /* Clustering: Return value for blocking PXFS locks */
291 /*
292 * For PXFS locks, reclock() will return this error code for requests that
293 * need to block
294 */
/* Parenthesized so the negative value stays one operand in any context. */
#define	PXFS_LOCK_BLOCKED	(-1)
296
297 /* Clustering: PXFS callback function */
298 /*
299 * This function is a callback from the LLM into the PXFS server module. It
300 * is initialized as a weak stub, and is functional when the pxfs server module
301 * is loaded.
302 */
303 extern void cl_flk_state_transition_notify(lock_descriptor_t *lock,
304 int old_state, int new_state);
305
/*
 * Two locks block each other when they have different owners, at least one
 * of them is a write lock (F_WRLCK) and their ranges overlap.
 */
#define	BLOCKS(lock1, lock2)						\
	(!SAME_OWNER((lock1), (lock2)) &&				\
	((lock1)->l_type == F_WRLCK || (lock2)->l_type == F_WRLCK) &&	\
	OVERLAP((lock1), (lock2)))
310
/* Non-zero when the range of lock1 contains the whole range of lock2. */
#define	COVERS(lock1, lock2)				\
	((lock1)->l_start <= (lock2)->l_start &&	\
	(lock1)->l_end >= (lock2)->l_end)
314
/*
 * Unlink edge `ep' from its incoming-edges list.  The neighbours are joined
 * to each other; ep's own link fields are left untouched.
 */
#define	IN_LIST_REMOVE(ep)	\
{	\
	(ep)->edge_in_next->edge_in_prev = (ep)->edge_in_prev;	\
	(ep)->edge_in_prev->edge_in_next = (ep)->edge_in_next;	\
}

/* Same operation as IN_LIST_REMOVE(), applied to the adjacency list. */
#define	ADJ_LIST_REMOVE(ep)	\
{	\
	(ep)->edge_adj_next->edge_adj_prev = (ep)->edge_adj_prev;	\
	(ep)->edge_adj_prev->edge_adj_next = (ep)->edge_adj_next;	\
}
326
/*
 * Non-zero when the adjacency list headed by the lock's l_edge points back
 * at itself and the lock is not in the GRANTED state.
 */
#define	NOT_BLOCKED(lock)	\
	((lock)->l_edge.edge_adj_next == &(lock)->l_edge &&	\
	!IS_GRANTED(lock))
329
/*
 * Move `lock' to FLK_GRANTED_STATE through flk_set_state(), set GRANTED_LOCK
 * in l_state, then signal the lock's condition variable.
 * Clustering: PXFS locks do not sleep in the LLM, so there is no need to
 * signal them.
 */
#define	GRANT_WAKEUP(lock)	\
{	\
	flk_set_state((lock), FLK_GRANTED_STATE);	\
	(lock)->l_state |= GRANTED_LOCK;	\
	if (!IS_PXFS(lock))	\
		cv_signal(&(lock)->l_cv);	\
}
342
/*
 * Move `lock' to FLK_CANCELLED_STATE through flk_set_state(), set
 * CANCELLED_LOCK in l_state, then signal the lock's condition variable.
 * Clustering: PXFS locks do not sleep in the LLM, so there is no need to
 * signal them.
 */
#define	CANCEL_WAKEUP(lock)	\
{	\
	flk_set_state((lock), FLK_CANCELLED_STATE);	\
	(lock)->l_state |= CANCELLED_LOCK;	\
	if (!IS_PXFS(lock))	\
		cv_signal(&(lock)->l_cv);	\
}
355
/*
 * Move `lock' to FLK_INTERRUPTED_STATE through flk_set_state(), set
 * INTERRUPTED_LOCK in l_state, then signal the lock's condition variable.
 * Clustering: PXFS locks do not sleep in the LLM, so there is no need to
 * signal them.
 */
#define	INTERRUPT_WAKEUP(lock)	\
{	\
	flk_set_state((lock), FLK_INTERRUPTED_STATE);	\
	(lock)->l_state |= INTERRUPTED_LOCK;	\
	if (!IS_PXFS(lock))	\
		cv_signal(&(lock)->l_cv);	\
}
368
/*
 * Take `lock' off the sleeping list: clear SLEEPING_LOCK in l_state, join
 * the list neighbours to each other and reset the lock's own l_next and
 * l_prev to NULL.  The lock must be SLEEPING, GRANTED, INTERRUPTED or
 * CANCELLED (asserted).
 */
#define	REMOVE_SLEEP_QUEUE(lock)	\
{	\
	ASSERT(IS_SLEEPING(lock) || IS_GRANTED(lock) ||	\
	    IS_INTERRUPTED(lock) || IS_CANCELLED(lock));	\
	(lock)->l_state &= ~SLEEPING_LOCK;	\
	(lock)->l_next->l_prev = (lock)->l_prev;	\
	(lock)->l_prev->l_next = (lock)->l_next;	\
	(lock)->l_prev = (lock_descriptor_t *)NULL;	\
	(lock)->l_next = (lock_descriptor_t *)NULL;	\
}
378
/* Non-zero when the lock's incoming-edges list points back at its head. */
#define	NO_DEPENDENTS(lock)	\
	(&(lock)->l_edge == (lock)->l_edge.edge_in_next)
381
/*
 * Mark `lock' granted without signalling anyone: set GRANTED_LOCK in
 * l_state first, then move the lock to FLK_GRANTED_STATE through
 * flk_set_state().  Note the order is the reverse of GRANT_WAKEUP().
 */
#define	GRANT(lock)	\
{	\
	(lock)->l_state |= GRANTED_LOCK;	\
	flk_set_state((lock), FLK_GRANTED_STATE);	\
}
387
/*
 * Edge list traversal helpers.  HEAD() is the address of the lock's own
 * l_edge; FIRST_*() give the edge following that head and NEXT_*() step
 * from one edge to the next on the respective list.
 */
#define	FIRST_IN(lock)	((lock)->l_edge.edge_in_next)
#define	FIRST_ADJ(lock)	((lock)->l_edge.edge_adj_next)
#define	HEAD(lock)	(&((lock)->l_edge))
#define	NEXT_ADJ(ep)	((ep)->edge_adj_next)
#define	NEXT_IN(ep)	((ep)->edge_in_next)
/*
 * Reset both edge lists of `lock' so that every link of its l_edge points
 * back at l_edge itself.
 */
#define	IN_ADJ_INIT(lock)	\
{	\
	(lock)->l_edge.edge_adj_prev = &(lock)->l_edge;	\
	(lock)->l_edge.edge_adj_next = &(lock)->l_edge;	\
	(lock)->l_edge.edge_in_prev = &(lock)->l_edge;	\
	(lock)->l_edge.edge_in_next = &(lock)->l_edge;	\
}
398
/*
 * Copy the fields listed below from lock2 into lock1.
 *
 * NOTE(review): list/edge links, l_status, l_color, l_index, l_nlm_state,
 * l_callbacks and l_ofd are not copied by this macro; confirm that callers
 * which need them set them separately.
 */
#define	COPY(lock1, lock2)	\
{	\
	(lock1)->l_graph = (lock2)->l_graph;	\
	(lock1)->l_vnode = (lock2)->l_vnode;	\
	(lock1)->l_zoneid = (lock2)->l_zoneid;	\
	(lock1)->pvertex = (lock2)->pvertex;	\
	(lock1)->l_type = (lock2)->l_type;	\
	(lock1)->l_state = (lock2)->l_state;	\
	(lock1)->l_start = (lock2)->l_start;	\
	(lock1)->l_end = (lock2)->l_end;	\
	(lock1)->l_flock = (lock2)->l_flock;	\
	(lock1)->l_blocker = (lock2)->l_blocker;	\
}
412
413 /*
414 * Clustering
415 */
416 /* Routines to set and get the NLM state in a lock request */
#define	SET_NLM_STATE(lock, nlm_state)	\
	((lock)->l_nlm_state = (nlm_state))
#define	GET_NLM_STATE(lock)	((lock)->l_nlm_state)
419 /*
420 * NLM registry abstraction:
421 * Abstraction overview:
422 * This registry keeps track of the NLM servers via their nlmids
423 * that have requested locks at the LLM this registry is associated
424 * with.
425 */
426 /* Routines to manipulate the NLM registry object state */
/* The registry `nlmreg' is indexed directly by `nlmid'. */
#define	FLK_REGISTRY_IS_NLM_UNKNOWN(nlmreg, nlmid)	\
	(FLK_NLM_UNKNOWN == (nlmreg)[nlmid])
#define	FLK_REGISTRY_IS_NLM_UP(nlmreg, nlmid)	\
	(FLK_NLM_UP == (nlmreg)[nlmid])
#define	FLK_REGISTRY_ADD_NLMID(nlmreg, nlmid)	\
	((nlmreg)[nlmid] = FLK_NLM_UP)
#define	FLK_REGISTRY_CHANGE_NLM_STATE(nlmreg, nlmid, state)	\
	((nlmreg)[nlmid] = (state))
435
/*
 * Effect that executing a request has on an existing lock.  These are
 * distinct values, not bit flags (FLK_UPGRADE is 0x3).
 */
#define	FLK_UNLOCK	0x1	/* existing lock is unlocked */
#define	FLK_DOWNGRADE	0x2	/* existing lock is downgraded */
#define	FLK_UPGRADE	0x3	/* existing lock is upgraded */
#define	FLK_STAY_SAME	0x4	/* request type equals existing lock's */
442
443
444 /* proc graph definitions */
445
446 /*
447 * Proc graph is the global process graph that maintains information
448 * about the dependencies between processes. An edge is added between two
449 * processes represented by proc_vertex's A and B, iff there exists l1
450 * owned by process A in any of the lock_graph's dependent on l2
451 * (thus having an edge to l2) owned by process B.
452 */
/* Vertex of the proc graph: one per (pid, sysid) process identity. */
typedef struct proc_vertex {
	pid_t			pid;	/* pid of the process */
	long			sysid;	/* sysid of the process */
	struct proc_edge	*edge;	/* adjacent edges of this process */
	int			incount; /* number of in-edges to this proc */
	struct proc_edge	*p_sedge; /* used by the stack algorithm */
	struct proc_vertex	*p_stack; /* used by the stack algorithm */
	int			atime;	/* for cycle detection algorithm */
	int			dtime;	/* for cycle detection algorithm */
	int			index;	/* slot in proc_graph vertex array */
} proc_vertex_t;
466
/* Edge of the proc graph, kept on a singly-linked adjacency list. */
typedef struct proc_edge {
	struct proc_edge	*next;		/* next edge in adjacency list */
	int			refcount;	/* references to this edge */
	struct proc_vertex	*to_proc;	/* process the edge points to */
} proc_edge_t;
474
475
/* NOTE(review): presumably the growth increment of the list; confirm. */
#define	PROC_CHUNK	100

typedef struct proc_graph {
	struct proc_vertex	**proc;	/* list of proc_vertexes */
	int			gcount;	/* size of the list */
	int			free;	/* free slots left in the list */
	int			mark;	/* used for graph coloring */
} proc_graph_t;

/* The single global process graph. */
extern struct proc_graph pgraph;
488
/* Non-zero when the lock's pid and sysid both match the proc vertex. */
#define	PROC_SAME_OWNER(lock, pvertex)				\
	((lock)->l_flock.l_pid == (pvertex)->pid &&		\
	(lock)->l_flock.l_sysid == (pvertex)->sysid)
492
/*
 * Arrival/departure stamps for the proc graph walk.  A vertex has arrived
 * (departed) when its atime (dtime) equals the current pgraph.mark.
 */
#define	PROC_ARRIVE(pvertex)	((pvertex)->atime = pgraph.mark)
#define	PROC_DEPART(pvertex)	((pvertex)->dtime = pgraph.mark)
#define	PROC_ARRIVED(pvertex)	(pgraph.mark == (pvertex)->atime)
#define	PROC_DEPARTED(pvertex)	(pgraph.mark == (pvertex)->dtime)
497
498 #ifdef __cplusplus
499 }
500 #endif
501
502 #endif /* _SYS_FLOCK_IMPL_H */