1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License, Version 1.0 only
   6  * (the "License").  You may not use this file except in compliance
   7  * with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 /*
  23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  * Copyright 2015 Joyent, Inc.
  26  */
  27 
  28 #ifndef _SYS_FLOCK_IMPL_H
  29 #define _SYS_FLOCK_IMPL_H
  30 
  31 #include <sys/types.h>
  32 #include <sys/fcntl.h>            /* flock definition */
  33 #include <sys/file.h>             /* FREAD etc */
  34 #include <sys/flock.h>            /* RCMD etc */
  35 #include <sys/kmem.h>
  36 #include <sys/user.h>
  37 #include <sys/thread.h>
  38 #include <sys/proc.h>
  39 #include <sys/cred.h>
  40 #include <sys/debug.h>
  41 #include <sys/cmn_err.h>
  42 #include <sys/errno.h>
  43 #include <sys/systm.h>
  44 #include <sys/vnode.h>
  45 #include <sys/share.h>            /* just to get GETSYSID def */
  46 
  47 #ifdef  __cplusplus
  48 extern "C" {
  49 #endif
  50 
  51 struct  edge {
  52         struct  edge    *edge_adj_next; /* adjacency list next */
  53         struct  edge    *edge_adj_prev; /* adjacency list prev */
  54         struct  edge    *edge_in_next;  /* incoming edges list next */
  55         struct  edge    *edge_in_prev;  /* incoming edges list prev */
  56         struct  lock_descriptor *from_vertex;   /* edge emanating from lock */
  57         struct  lock_descriptor *to_vertex;     /* edge pointing to lock */
  58 };
  59 
  60 typedef struct  edge    edge_t;
  61 
  62 struct lock_descriptor {
  63         struct  lock_descriptor *l_next;        /* next active/sleep lock */
  64         struct  lock_descriptor *l_prev;        /* previous active/sleep lock */
  65         struct  edge            l_edge;         /* edge for adj and in lists */
  66         struct  lock_descriptor *l_stack;       /* for stack operations */
  67         struct  lock_descriptor *l_stack1;      /* for stack operations */
  68         struct  lock_descriptor *l_dstack;      /* stack for debug functions */
  69         struct  edge            *l_sedge;       /* start edge for graph alg. */
  70                         int     l_index;        /* used for barrier count */
  71                 struct  graph   *l_graph;       /* graph this belongs to */
  72                 vnode_t         *l_vnode;       /* vnode being locked */
  73                         int     l_type;         /* type of lock */
  74                         int     l_state;        /* state described below */
  75                 u_offset_t      l_start;        /* start offset */
  76                 u_offset_t      l_end;          /* end offset */
  77                 flock64_t       l_flock;        /* original flock request */
  78                         int     l_color;        /* color used for graph alg */
  79                 kcondvar_t      l_cv;           /* wait condition for lock */
  80                 int             pvertex;        /* index to proc vertex */
  81                         int     l_status;       /* status described below */
  82                 flk_nlm_status_t l_nlm_state;   /* state of NLM server */
  83                 flk_callback_t  *l_callbacks;   /* callbacks, or NULL */
  84                 zoneid_t        l_zoneid;       /* zone of request */
  85                 file_t          *l_ofd;         /* OFD-style reference */
  86 };
  87 
  88 typedef struct  lock_descriptor lock_descriptor_t;
  89 
  90 /*
  91  * Each graph holds locking information for some number of vnodes.  The
  92  * active and sleeping lists are circular, with a dummy head element.
  93  */
  94 
  95 struct  graph {
  96         kmutex_t        gp_mutex;       /* mutex for this graph */
  97         struct  lock_descriptor active_locks;
  98         struct  lock_descriptor sleeping_locks;
  99         int index;      /* index of this graph into the hash table */
 100         int mark;       /* used for coloring the graph */
 101 };
 102 
 103 typedef struct  graph   graph_t;
 104 
 105 /*
 106  * The possible states a lock can be in.  These states are stored in the
 107  * 'l_status' member of the 'lock_descriptor_t' structure.  All locks start
 108  * life in the INITIAL state, and end up in the DEAD state.  Possible state
 109  * transitions are :
 110  *
 111  * INITIAL--> START    --> ACTIVE    --> DEAD
 112  *
 113  *                     --> DEAD
 114  *
 115  *        --> ACTIVE   --> DEAD          (new locks from flk_relation)
 116  *
 117  *        --> SLEEPING --> GRANTED   --> START     --> ACTIVE --> DEAD
 118  *
 119  *                                   --> INTR      --> DEAD
 120  *
 121  *                                   --> CANCELLED --> DEAD
 122  *
 123  *                                                 --> INTR   --> DEAD
 124  *
 125  *                     --> INTR      --> DEAD
 126  *
 127  *                     --> CANCELLED --> DEAD
 128  *
 129  *                                   --> INTR      --> DEAD
 130  *
 131  * Lock transitions are done in the following functions:
 132  * --> INITIAL               flk_get_lock(), reclock()
 133  * --> START         flk_execute_request()
 134  * --> ACTIVE                flk_insert_active_lock()
 135  * --> SLEEPING              flk_insert_sleeping_lock()
 136  * --> GRANTED               GRANT_WAKEUP
 137  * --> INTERRUPTED   INTERRUPT_WAKEUP
 138  * --> CANCELLED     CANCEL_WAKEUP
 139  * --> DEAD          reclock(), flk_delete_active_lock(), and
 140  *                          flk_cancel_sleeping_lock()
 141  */
 142 
 143 #define FLK_INITIAL_STATE       1       /* Initial state of all requests */
 144 #define FLK_START_STATE         2       /* Request has started execution */
 145 #define FLK_ACTIVE_STATE        3       /* In active queue */
 146 #define FLK_SLEEPING_STATE      4       /* Request is blocked */
 147 #define FLK_GRANTED_STATE       5       /* Request is granted */
 148 #define FLK_INTERRUPTED_STATE   6       /* Request is interrupted */
 149 #define FLK_CANCELLED_STATE     7       /* Request is cancelled */
 150 #define FLK_DEAD_STATE          8       /* Request is done - will be deleted */
 151 
 152 /* flags defining state of locks */
 153 
 154 /*
 155  * The LLM design has been modified so that lock states are now stored
 156  * in the l_status field of lock_descriptor_t.  The l_state field is
 157  * currently preserved for binary compatibility, but may be modified or
 158  * removed in a minor release of Solaris.  Note that both of these
 159  * fields (and the rest of the lock_descriptor_t structure) are private
 160  * to the implementation of the lock manager and should not be used
 161  * externally.
 162  */
 163 
 164 #define ACTIVE_LOCK             0x0001  /* in active queue */
 165 #define SLEEPING_LOCK           0x0002  /* in sleep queue */
 166 #define IO_LOCK                 0x0004  /* is an IO lock */
 167 #define REFERENCED_LOCK         0x0008  /* referenced some where */
 168 #define QUERY_LOCK              0x0010  /* querying about lock */
 169 #define WILLING_TO_SLEEP_LOCK   0x0020  /* lock can be put in sleep queue */
 170 #define RECOMPUTE_LOCK          0x0040  /* used for recomputing dependencies */
 171 #define RECOMPUTE_DONE          0x0080  /* used for recomputing dependencies */
 172 #define BARRIER_LOCK            0x0100  /* used for recomputing dependencies */
 173 #define GRANTED_LOCK            0x0200  /* granted but still in sleep queue */
 174 #define CANCELLED_LOCK          0x0400  /* cancelled will be thrown out */
 175 #define DELETED_LOCK            0x0800  /* deleted - free at earliest */
 176 #define INTERRUPTED_LOCK        0x1000  /* pretend signal */
 177 #define LOCKMGR_LOCK            0x2000  /* remote lock (server-side) */
 178 /* Clustering: flag for PXFS locks */
 179 #define PXFS_LOCK               0x4000  /* lock created by PXFS file system */
 180 #define NBMAND_LOCK             0x8000  /* non-blocking mandatory locking */
 181 
 182 #define HASH_SIZE       32
 183 #define HASH_SHIFT      (HASH_SIZE - 1)
 184 #define HASH_INDEX(vp)  (((uintptr_t)vp >> 7) & HASH_SHIFT)
 185 
 186 /* extern definitions */
 187 
 188 extern struct graph     *lock_graph[HASH_SIZE];
 189 extern struct kmem_cache *flk_edge_cache;
 190 
 191 /* Clustering: functions called by PXFS */
 192 int flk_execute_request(lock_descriptor_t *);
 193 void flk_cancel_sleeping_lock(lock_descriptor_t *, int);
 194 void flk_set_state(lock_descriptor_t *, int);
 195 graph_t *flk_get_lock_graph(vnode_t *, int);
 196 
 197 /* flags used for readability in flock.c */
 198 
 199 #define FLK_USE_GRAPH   0       /* don't initialize the lock_graph */
 200 #define FLK_INIT_GRAPH  1       /* initialize the lock graph */
 201 #define NO_COLOR        0       /* vertex is not colored */
 202 #define NO_CHECK_CYCLE  0       /* don't mark vertex's in flk_add_edge */
 203 #define CHECK_CYCLE     1       /* mark vertex's in flk_add_edge */
 204 
 205 #define SAME_OWNER(lock1, lock2)        \
 206         (((lock1)->l_flock.l_pid == (lock2)->l_flock.l_pid) && \
 207                 ((lock1)->l_flock.l_sysid == (lock2)->l_flock.l_sysid) && \
 208                 ((lock1)->l_ofd == (lock2)->l_ofd))
 209 
 210 #define COLORED(vertex)         ((vertex)->l_color == (vertex)->l_graph->mark)
 211 #define COLOR(vertex)           ((vertex)->l_color = (vertex)->l_graph->mark)
 212 
 213 /*
 214  * stack data structure and operations
 215  */
 216 
 217 #define STACK_INIT(stack)       ((stack) = NULL)
 218 #define STACK_PUSH(stack, ptr, stack_link)      (ptr)->stack_link = (stack),\
 219                                 (stack) = (ptr)
 220 #define STACK_POP(stack, stack_link)    (stack) = (stack)->stack_link
 221 #define STACK_TOP(stack)        (stack)
 222 #define STACK_EMPTY(stack)      ((stack) == NULL)
 223 
 224 
 225 #define ACTIVE_HEAD(gp) (&(gp)->active_locks)
 226 
 227 #define SLEEPING_HEAD(gp)       (&(gp)->sleeping_locks)
 228 
 229 #define SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp) \
 230 { \
 231         (lock) = (lock_descriptor_t *)vp->v_filocks; \
 232 }
 233 
 234 #define SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp) \
 235 { \
 236 for ((lock) = SLEEPING_HEAD((gp))->l_next; ((lock) != SLEEPING_HEAD((gp)) && \
 237                         (lock)->l_vnode != (vp)); (lock) = (lock)->l_next) \
 238                         ; \
 239 (lock) = ((lock) == SLEEPING_HEAD((gp))) ? NULL : (lock); \
 240 }
 241 
 242 #define OVERLAP(lock1, lock2) \
 243         (((lock1)->l_start <= (lock2)->l_start && \
 244                 (lock2)->l_start <= (lock1)->l_end) || \
 245         ((lock2)->l_start <= (lock1)->l_start && \
 246                 (lock1)->l_start <= (lock2)->l_end))
 247 
 248 #define IS_INITIAL(lock)        ((lock)->l_status == FLK_INITIAL_STATE)
 249 #define IS_ACTIVE(lock)         ((lock)->l_status == FLK_ACTIVE_STATE)
 250 #define IS_SLEEPING(lock)       ((lock)->l_status == FLK_SLEEPING_STATE)
 251 #define IS_GRANTED(lock)        ((lock)->l_status == FLK_GRANTED_STATE)
 252 #define IS_INTERRUPTED(lock)    ((lock)->l_status == FLK_INTERRUPTED_STATE)
 253 #define IS_CANCELLED(lock)      ((lock)->l_status == FLK_CANCELLED_STATE)
 254 #define IS_DEAD(lock)           ((lock)->l_status == FLK_DEAD_STATE)
 255 
 256 #define IS_QUERY_LOCK(lock)     ((lock)->l_state & QUERY_LOCK)
 257 #define IS_RECOMPUTE(lock)      ((lock)->l_state & RECOMPUTE_LOCK)
 258 #define IS_BARRIER(lock)        ((lock)->l_state & BARRIER_LOCK)
 259 #define IS_DELETED(lock)        ((lock)->l_state & DELETED_LOCK)
 260 #define IS_REFERENCED(lock)     ((lock)->l_state & REFERENCED_LOCK)
 261 #define IS_IO_LOCK(lock)        ((lock)->l_state & IO_LOCK)
 262 #define IS_WILLING_TO_SLEEP(lock)       \
 263                 ((lock)->l_state & WILLING_TO_SLEEP_LOCK)
 264 #define IS_LOCKMGR(lock)        ((lock)->l_state & LOCKMGR_LOCK)
 265 #define IS_NLM_UP(lock)         ((lock)->l_nlm_state == FLK_NLM_UP)
 266 /* Clustering: Macro for PXFS locks */
 267 #define IS_PXFS(lock)           ((lock)->l_state & PXFS_LOCK)
 268 
 269 /*
 270  * "local" requests don't involve the NFS lock manager in any way.
 271  * "remote" requests can be on the server (requests from a remote client),
 272  * in which case they should be associated with a local vnode (UFS, tmpfs,
 273  * etc.).  These requests are flagged with LOCKMGR_LOCK and are made using
 274  * kernel service threads.  Remote requests can also be on an NFS client,
 275  * because the NFS lock manager uses local locking for some of its
 276  * bookkeeping.  These requests are made by regular user processes.
 277  */
 278 #define IS_LOCAL(lock)  (GETSYSID((lock)->l_flock.l_sysid) == 0)
 279 #define IS_REMOTE(lock) (! IS_LOCAL(lock))
 280 
 281 /* Clustering: Return value for blocking PXFS locks */
 282 /*
 283  * For PXFS locks, reclock() will return this error code for requests that
 284  * need to block
 285  */
 286 #define PXFS_LOCK_BLOCKED -1
 287 
 288 /* Clustering: PXFS callback function */
 289 /*
 290  * This function is a callback from the LLM into the PXFS server module.  It
 291  * is initialized as a weak stub, and is functional when the pxfs server module
 292  * is loaded.
 293  */
 294 extern void cl_flk_state_transition_notify(lock_descriptor_t *lock,
 295     int old_state, int new_state);
 296 
 297 #define BLOCKS(lock1, lock2)    (!SAME_OWNER((lock1), (lock2)) && \
 298                                         (((lock1)->l_type == F_WRLCK) || \
 299                                         ((lock2)->l_type == F_WRLCK)) && \
 300                                         OVERLAP((lock1), (lock2)))
 301 
 302 #define COVERS(lock1, lock2)    \
 303                 (((lock1)->l_start <= (lock2)->l_start) && \
 304                         ((lock1)->l_end >= (lock2)->l_end))
 305 
 306 #define IN_LIST_REMOVE(ep)      \
 307         { \
 308         (ep)->edge_in_next->edge_in_prev = (ep)->edge_in_prev; \
 309         (ep)->edge_in_prev->edge_in_next = (ep)->edge_in_next; \
 310         }
 311 
 312 #define ADJ_LIST_REMOVE(ep)     \
 313         { \
 314         (ep)->edge_adj_next->edge_adj_prev = (ep)->edge_adj_prev; \
 315         (ep)->edge_adj_prev->edge_adj_next = (ep)->edge_adj_next; \
 316         }
 317 
 318 #define NOT_BLOCKED(lock)       \
 319         ((lock)->l_edge.edge_adj_next == &(lock)->l_edge && !IS_GRANTED(lock))
 320 
 321 #define GRANT_WAKEUP(lock)      \
 322         {       \
 323                 flk_set_state(lock, FLK_GRANTED_STATE); \
 324                 (lock)->l_state |= GRANTED_LOCK; \
 325                 /* \
 326                  * Clustering: PXFS locks do not sleep in the LLM, \
 327                  * so there is no need to signal them \
 328                  */ \
 329                 if (!IS_PXFS(lock)) { \
 330                         cv_signal(&(lock)->l_cv); \
 331                 } \
 332         }
 333 
 334 #define CANCEL_WAKEUP(lock)     \
 335         { \
 336                 flk_set_state(lock, FLK_CANCELLED_STATE); \
 337                 (lock)->l_state |= CANCELLED_LOCK; \
 338                 /* \
 339                  * Clustering: PXFS locks do not sleep in the LLM, \
 340                  * so there is no need to signal them \
 341                  */ \
 342                 if (!IS_PXFS(lock)) { \
 343                         cv_signal(&(lock)->l_cv); \
 344                 } \
 345         }
 346 
 347 #define INTERRUPT_WAKEUP(lock)  \
 348         { \
 349                 flk_set_state(lock, FLK_INTERRUPTED_STATE); \
 350                 (lock)->l_state |= INTERRUPTED_LOCK; \
 351                 /* \
 352                  * Clustering: PXFS locks do not sleep in the LLM, \
 353                  * so there is no need to signal them \
 354                  */ \
 355                 if (!IS_PXFS(lock)) { \
 356                         cv_signal(&(lock)->l_cv); \
 357                 } \
 358         }
 359 
 360 #define REMOVE_SLEEP_QUEUE(lock)        \
 361         { \
 362         ASSERT(IS_SLEEPING(lock) || IS_GRANTED(lock) || \
 363             IS_INTERRUPTED(lock) || IS_CANCELLED(lock)); \
 364         (lock)->l_state &= ~SLEEPING_LOCK; \
 365         (lock)->l_next->l_prev = (lock)->l_prev; \
 366         (lock)->l_prev->l_next = (lock)->l_next; \
 367         (lock)->l_next = (lock)->l_prev = (lock_descriptor_t *)NULL; \
 368         }
 369 
 370 #define NO_DEPENDENTS(lock)     \
 371         ((lock)->l_edge.edge_in_next == &(lock)->l_edge)
 372 
 373 #define GRANT(lock)     \
 374         { \
 375         (lock)->l_state |= GRANTED_LOCK; \
 376         flk_set_state(lock, FLK_GRANTED_STATE); \
 377         }
 378 
 379 #define FIRST_IN(lock)  ((lock)->l_edge.edge_in_next)
 380 #define FIRST_ADJ(lock) ((lock)->l_edge.edge_adj_next)
 381 #define HEAD(lock)      (&(lock)->l_edge)
 382 #define NEXT_ADJ(ep)    ((ep)->edge_adj_next)
 383 #define NEXT_IN(ep)     ((ep)->edge_in_next)
 384 #define IN_ADJ_INIT(lock)       \
 385 {       \
 386 (lock)->l_edge.edge_adj_next = (lock)->l_edge.edge_adj_prev = &(lock)->l_edge; \
 387 (lock)->l_edge.edge_in_next = (lock)->l_edge.edge_in_prev = &(lock)->l_edge; \
 388 }
 389 
 390 #define COPY(lock1, lock2)      \
 391 {       \
 392 (lock1)->l_graph = (lock2)->l_graph; \
 393 (lock1)->l_vnode = (lock2)->l_vnode; \
 394 (lock1)->l_type = (lock2)->l_type; \
 395 (lock1)->l_state = (lock2)->l_state; \
 396 (lock1)->l_start = (lock2)->l_start; \
 397 (lock1)->l_end = (lock2)->l_end; \
 398 (lock1)->l_flock = (lock2)->l_flock; \
 399 (lock1)->l_zoneid = (lock2)->l_zoneid; \
 400 (lock1)->pvertex = (lock2)->pvertex; \
 401 }
 402 
 403 /*
 404  * Clustering
 405  */
 406 /* Routines to set and get the NLM state in a lock request */
 407 #define SET_NLM_STATE(lock, nlm_state)  ((lock)->l_nlm_state = nlm_state)
 408 #define GET_NLM_STATE(lock)     ((lock)->l_nlm_state)
 409 /*
 410  * NLM registry abstraction:
 411  *   Abstraction overview:
 412  *   This registry keeps track of the NLM servers via their nlmids
 413  *   that have requested locks at the LLM this registry is associated
 414  *   with.
 415  */
 416 /* Routines to manipulate the NLM registry object state */
 417 #define FLK_REGISTRY_IS_NLM_UNKNOWN(nlmreg, nlmid) \
 418             ((nlmreg)[nlmid] == FLK_NLM_UNKNOWN)
 419 #define FLK_REGISTRY_IS_NLM_UP(nlmreg, nlmid) \
 420             ((nlmreg)[nlmid] == FLK_NLM_UP)
 421 #define FLK_REGISTRY_ADD_NLMID(nlmreg, nlmid) \
 422             ((nlmreg)[nlmid] = FLK_NLM_UP)
 423 #define FLK_REGISTRY_CHANGE_NLM_STATE(nlmreg, nlmid, state) \
 424             ((nlmreg)[nlmid] = state)
 425 
 426 /* Indicates the effect of executing a request on the existing locks */
 427 
 428 #define FLK_UNLOCK      0x1     /* request unlocks the existing lock */
 429 #define FLK_DOWNGRADE   0x2     /* request downgrades the existing lock */
 430 #define FLK_UPGRADE     0x3     /* request upgrades the existing lock */
 431 #define FLK_STAY_SAME   0x4     /* request type is same as existing lock */
 432 
 433 
 434 /*      proc graph definitions  */
 435 
 436 /*
 437  * Proc graph is the global process graph that maintains information
 438  * about the dependencies between processes. An edge is added between two
 439  * processes represented by proc_vertex's A and B, iff there exists l1
 440  * owned by process A in any of the lock_graph's dependent on l2
 441  * (thus having an edge to l2) owned by process B.
 442  */
 443 struct proc_vertex {
 444         pid_t   pid;    /* pid of the process */
 445         long    sysid;  /* sysid of the process */
 446         struct proc_edge        *edge;  /* adajcent edges of this process */
 447         int incount;            /* Number of inedges to this process */
 448         struct proc_edge *p_sedge;      /* used for implementing stack alg. */
 449         struct proc_vertex      *p_stack;       /* used for stack alg. */
 450         int atime;      /* used for cycle detection algorithm */
 451         int dtime;      /* used for cycle detection algorithm */
 452         int index;      /* index into the  array of proc_graph vertices */
 453 };
 454 
 455 typedef struct proc_vertex proc_vertex_t;
 456 
 457 struct proc_edge {
 458         struct proc_edge        *next;  /* next edge in adjacency list */
 459         int  refcount;                  /* reference count of this edge */
 460         struct proc_vertex      *to_proc;       /* process this points to */
 461 };
 462 
 463 typedef struct proc_edge proc_edge_t;
 464 
 465 
 466 #define PROC_CHUNK      100
 467 
 468 struct proc_graph {
 469         struct proc_vertex **proc;      /* list of proc_vertexes */
 470         int gcount;             /* list size */
 471         int free;               /* number of free slots in the list */
 472         int mark;               /* used for graph coloring */
 473 };
 474 
 475 typedef struct proc_graph proc_graph_t;
 476 
 477 extern  struct proc_graph       pgraph;
 478 
 479 #define PROC_SAME_OWNER(lock, pvertex)  \
 480         (((lock)->l_flock.l_pid == (pvertex)->pid) && \
 481                 ((lock)->l_flock.l_sysid == (pvertex)->sysid))
 482 
 483 #define PROC_ARRIVE(pvertex)    ((pvertex)->atime = pgraph.mark)
 484 #define PROC_DEPART(pvertex)    ((pvertex)->dtime = pgraph.mark)
 485 #define PROC_ARRIVED(pvertex)   ((pvertex)->atime == pgraph.mark)
 486 #define PROC_DEPARTED(pvertex)  ((pvertex)->dtime == pgraph.mark)
 487 
 488 #ifdef  __cplusplus
 489 }
 490 #endif
 491 
 492 #endif  /* _SYS_FLOCK_IMPL_H */