1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  28 /*      All Rights Reserved */
  29 
  30 /*
  31  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  32  * Copyright 2015 Joyent, Inc.
  33  */
  34 
  35 #include <sys/flock_impl.h>
  36 #include <sys/vfs.h>
  37 #include <sys/t_lock.h>           /* for <sys/callb.h> */
  38 #include <sys/callb.h>
  39 #include <sys/clconf.h>
  40 #include <sys/cladm.h>
  41 #include <sys/nbmlock.h>
  42 #include <sys/cred.h>
  43 #include <sys/policy.h>
  44 #include <sys/list.h>
  45 #include <sys/sysmacros.h>
  46 #include <sys/socket.h>
  47 #include <inet/ip.h>
  48 
  49 /*
  50  * The following variables are for statistics purposes and they are
  51  * not protected by locks. They may not be accurate but will at least be
  52  * close to the actual value.
  53  */
  54 
  55 int     flk_lock_allocs;
  56 int     flk_lock_frees;
  57 int     edge_allocs;
  58 int     edge_frees;
  59 int     flk_proc_vertex_allocs;
  60 int     flk_proc_edge_allocs;
  61 int     flk_proc_vertex_frees;
  62 int     flk_proc_edge_frees;
  63 
  64 static kmutex_t flock_lock;
  65 
  66 #ifdef DEBUG
  67 int check_debug = 0;
  68 #define CHECK_ACTIVE_LOCKS(gp)  if (check_debug) \
  69                                         check_active_locks(gp);
  70 #define CHECK_SLEEPING_LOCKS(gp)        if (check_debug) \
  71                                                 check_sleeping_locks(gp);
  72 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)   \
  73                 if (check_debug)        \
  74                         check_owner_locks(gp, pid, sysid, vp);
  75 #define CHECK_LOCK_TRANSITION(old_state, new_state) \
  76         { \
  77                 if (check_lock_transition(old_state, new_state)) { \
  78                         cmn_err(CE_PANIC, "Illegal lock transition \
  79                             from %d to %d", old_state, new_state); \
  80                 } \
  81         }
  82 #else
  83 
  84 #define CHECK_ACTIVE_LOCKS(gp)
  85 #define CHECK_SLEEPING_LOCKS(gp)
  86 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
  87 #define CHECK_LOCK_TRANSITION(old_state, new_state)
  88 
  89 #endif /* DEBUG */
  90 
  91 struct kmem_cache       *flk_edge_cache;
  92 
  93 graph_t         *lock_graph[HASH_SIZE];
  94 proc_graph_t    pgraph;
  95 
  96 /*
  97  * Clustering.
  98  *
  99  * NLM REGISTRY TYPE IMPLEMENTATION
 100  *
 101  * Assumptions:
 102  *  1.  Nodes in a cluster are numbered starting at 1; always non-negative
 103  *      integers; maximum node id is returned by clconf_maximum_nodeid().
 104  *  2.  We use this node id to identify the node an NLM server runs on.
 105  */
 106 
 107 /*
 108  * The NLM registry object keeps track of the NLM servers that have
 109  * requested locks at the LLM this registry is associated with.  The
 110  * servers are identified by their nlmids, i.e. the node ids of the
 111  * cluster nodes they run on.
 112  *
 113  * Representation of abstraction:
 114  *    rep = record[     states: array[nlm_state],
 115  *                      lock: mutex]
 116  *
 117  *    Representation invariants:
 118  *      1. index i of rep.states is between 0 and n - 1 where n is the
 119  *         number of elements in the array, which happens to be the maximum
 120  *         number of nodes in the cluster configuration + 1.
 121  *      2. map nlmid to index i of rep.states
 122  *              0   -> 0
 123  *              1   -> 1
 124  *              2   -> 2
 125  *              n-1 -> clconf_maximum_nodeid()+1
 126  *      3.  This 1-1 mapping is quite convenient and it avoids errors resulting
 127  *          from forgetting to subtract 1 from the index.
 128  *      4.  The reason we keep the 0th index is the following.  A legitimate
 129  *          cluster configuration includes making a UFS file system NFS
 130  *          exportable.  The code is structured so that if you're in a cluster
 131  *          you do one thing; otherwise, you do something else.  The problem
 132  *          is what to do if you think you're in a cluster with PXFS loaded,
 133  *          but you're using UFS not PXFS?  The upper two bytes of the sysid
 134  *          encode the node id of the node where NLM server runs; these bytes
 135  *          are zero for UFS.  Since the nodeid is used to index into the
 136  *          registry, we can record the NLM server state information at index
 137  *          0 using the same mechanism used for PXFS file locks!
 138  */
 139 static flk_nlm_status_t *nlm_reg_status = NULL; /* state array 0..N-1 */
 140 static kmutex_t nlm_reg_lock;                   /* lock to protect array */
 141 static uint_t nlm_status_size;                  /* size of state array */
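
/*
 * To make the identity mapping described above concrete: a lookup in the
 * registry is a direct array index by nlmid, with no offset arithmetic.
 * The helper below is only an illustrative sketch (the function name is
 * hypothetical and the code is kept under #if 0 so it is not compiled or
 * used anywhere in this file).
 */
#if 0
static flk_nlm_status_t
flk_nlm_registry_peek(int nlmid)
{
        flk_nlm_status_t state;

        ASSERT(nlmid >= 0 && (uint_t)nlmid < nlm_status_size);

        mutex_enter(&nlm_reg_lock);
        state = nlm_reg_status[nlmid];  /* index 0 covers the non-PXFS case */
        mutex_exit(&nlm_reg_lock);

        return (state);
}
#endif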
 142 
 143 /*
 144  * Although we need a global lock dependency graph (and associated data
 145  * structures), we also need a per-zone notion of whether the lock manager is
 146  * running, and so whether to allow lock manager requests or not.
 147  *
 148  * Thus, on a per-zone basis we maintain a ``global'' variable
 149  * (flk_lockmgr_status), protected by flock_lock, and set when the lock
 150  * manager is determined to be changing state (starting or stopping).
 151  *
 152  * Each graph/zone pair also has a copy of this variable, which is protected by
 153  * the graph's mutex.
 154  *
 155  * The per-graph copies are used to synchronize lock requests with shutdown
 156  * requests.  The global copy is used to initialize the per-graph field when a
 157  * new graph is created.
 158  */
 159 struct flock_globals {
 160         flk_lockmgr_status_t flk_lockmgr_status;
 161         flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
 162 };
 163 
 164 zone_key_t flock_zone_key;
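
/*
 * A minimal sketch of the relationship described above: when a new graph is
 * created for hash bucket 'index', its copy of the lock manager status is
 * seeded from the zone-wide value.  The helper name is hypothetical and the
 * code is kept under #if 0; it only illustrates the initialization step the
 * comment describes.
 */
#if 0
static void
flk_seed_graph_status(struct flock_globals *fg, int index)
{
        ASSERT(MUTEX_HELD(&flock_lock));
        ASSERT(index >= 0 && index < HASH_SIZE);

        fg->lockmgr_status[index] = fg->flk_lockmgr_status;
}
#endif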
 165 
 166 /*
 167  * Support for remote stale lock detection
 168  *
 169  * The sysid_to_host_translator_lock readers/writer lock protects
 170  * sysid_to_host_translator_list.
 171  *
 172  * The sysid_to_host_translator_list is a list of sysid to host name translator
 173  * functions.  The new translators are added using the public
 174  * flk_add_sysid_to_host_translator() call.
 175  *
 176  * The stale_lock_timeout is in seconds and it determines the interval for the
 177  * remote stale lock checking.  When set to 0, the remote stale lock checking
 178  * is disabled.
 179  */
 180 struct sysid_to_host_translator_entry {
 181         sysid_to_host_translator_t translator;
 182         list_node_t node;
 183 };
 184 static krwlock_t sysid_to_host_translator_lock;
 185 static list_t sysid_to_host_translator_list;
 186 volatile int stale_lock_timeout = 3600;         /* one hour, in seconds */
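
/*
 * To illustrate the registration interface described above, this is a
 * minimal sketch of a translator a file system module might register.  The
 * example_* names are hypothetical, the signature is assumed to match the
 * way translators are invoked in translate_sysid_to_host() below, and the
 * code is kept under #if 0 so it is not compiled into this file.  A
 * translator fills in the address and host type for a sysid it knows about
 * and returns non-zero; for an unknown sysid it returns 0 so the next
 * translator on the list is tried.  The module would then pass the function
 * to flk_add_sysid_to_host_translator(), typically from its _init() routine.
 */
#if 0
static int
example_sysid_to_host(zoneid_t zoneid, sysid_t sysid, struct sockaddr *addr,
    const char **type)
{
        /* example_lookup_sockaddr() is a hypothetical per-module lookup */
        if (!example_lookup_sockaddr(zoneid, sysid, addr))
                return (0);

        *type = "NLM";
        return (1);
}
#endif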
 187 
 188 static void create_flock(lock_descriptor_t *, flock64_t *);
 189 static lock_descriptor_t        *flk_get_lock(void);
 190 static void     flk_free_lock(lock_descriptor_t *lock);
 191 static void     flk_get_first_blocking_lock(lock_descriptor_t *);
 192 static int flk_process_request(lock_descriptor_t *);
 193 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
 194 static edge_t *flk_get_edge(void);
 195 static int flk_wait_execute_request(lock_descriptor_t *);
 196 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
 197 static void flk_insert_active_lock(lock_descriptor_t *);
 198 static void flk_delete_active_lock(lock_descriptor_t *, int);
 199 static void flk_insert_sleeping_lock(lock_descriptor_t *);
 200 static void flk_graph_uncolor(graph_t *);
 201 static void flk_wakeup(lock_descriptor_t *, int);
 202 static void flk_free_edge(edge_t *);
 203 static void flk_recompute_dependencies(lock_descriptor_t *,
 204                         lock_descriptor_t **,  int, int);
 205 static int flk_find_barriers(lock_descriptor_t *);
 206 static void flk_update_barriers(lock_descriptor_t *);
 207 static int flk_color_reachables(lock_descriptor_t *);
 208 static int flk_canceled(lock_descriptor_t *);
 209 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
 210 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
 211 static void wait_for_lock(lock_descriptor_t *);
 212 static void unlock_lockmgr_granted(struct flock_globals *);
 213 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
 214 
 215 /* Clustering hooks */
 216 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
 217 static void cl_flk_wakeup_sleeping_nlm_locks(int);
 218 static void cl_flk_unlock_nlm_granted(int);
 219 
 220 #ifdef DEBUG
 221 static int check_lock_transition(int, int);
 222 static void check_sleeping_locks(graph_t *);
 223 static void check_active_locks(graph_t *);
 224 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
 225 static void path(lock_descriptor_t *, lock_descriptor_t *);
 226 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
 227 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
 228 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
 229 #endif
 230 
 231 /*      proc_graph function definitions */
 232 static int flk_check_deadlock(lock_descriptor_t *);
 233 static void flk_proc_graph_uncolor(void);
 234 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
 235 static proc_edge_t *flk_get_proc_edge(void);
 236 static void flk_proc_release(proc_vertex_t *);
 237 static void flk_free_proc_edge(proc_edge_t *);
 238 static void flk_update_proc_graph(edge_t *, int);
 239 
 240 /* Non-blocking mandatory locking */
 241 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
 242                         u_offset_t);
 243 
 244 static struct flock_globals *
 245 flk_get_globals(void)
 246 {
 247         /*
 248          * The KLM module had better be loaded if we're attempting to handle
 249          * lockmgr requests.
 250          */
 251         ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
 252         return (zone_getspecific(flock_zone_key, curproc->p_zone));
 253 }
 254 
 255 static flk_lockmgr_status_t
 256 flk_get_lockmgr_status(void)
 257 {
 258         struct flock_globals *fg;
 259 
 260         ASSERT(MUTEX_HELD(&flock_lock));
 261 
 262         if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
 263                 /*
 264                  * KLM module not loaded; lock manager definitely not running.
 265                  */
 266                 return (FLK_LOCKMGR_DOWN);
 267         }
 268         fg = flk_get_globals();
 269         return (fg->flk_lockmgr_status);
 270 }
 271 
 272 /*
 273  * This implements Open File Description (not descriptor) style record locking.
 274  * These locks can also be thought of as pid-less since they are not tied to a
 275  * specific process, thus they're preserved across fork.
 276  *
 277  * Called directly from fcntl.
 278  *
 279  * See reclock() for the implementation of the traditional POSIX style record
 280  * locking scheme (pid-ful). This function is derived from reclock() but
 281  * simplified and modified to work for OFD style locking.
 282  *
 283  * The two primary advantages of OFD style of locking are:
 284  * 1) It is per-file description, so closing a file descriptor that refers to a
 285  *    different file description for the same file will not drop the lock (i.e.
 286  *    two opens of the same file get different descriptions but a dup or fork
 287  *    will refer to the same description).
 288  * 2) Locks are preserved across fork(2).
 289  *
 290  * Because these locks are per-description, a lock ptr lives at the f_filock
 291  * member of the file_t and the lock_descriptor includes a file_t pointer
 292  * to enable unique lock identification and management.
 293  *
 294  * Since these locks are pid-less we cannot do deadlock detection with the
 295  * current process-oriented implementation. This is consistent with OFD locking
 296  * behavior on other operating systems such as Linux. Since we don't do
 297  * deadlock detection we never interact with the process graph that is
 298  * maintained for deadlock detection on the traditional POSIX-style locks.
 299  *
 300  * Future Work:
 301  *
 302  * The current implementation does not support record locks. That is,
 303  * currently the single lock must cover the entire file. This is validated in
 304  * fcntl. To support record locks the f_filock pointer in the file_t needs to
 305  * be changed to a list of pointers to the locks. That list needs to be
 306  * managed independently of the lock list on the vnode itself and it needs to
 307  * be maintained as record locks are created, split, coalesced and deleted.
 308  *
 309  * The current implementation does not support remote file systems (e.g.
 310  * NFS or CIFS). This is handled in fs_frlock(). The design of how OFD locks
 311  * interact with the NLM is not clear since the NLM protocol/implementation
 312  * appears to be oriented around locks associated with a process. A further
 313  * problem is that a design is needed for what nlm_send_siglost() should do and
 314  * where it will send SIGLOST. More recent versions of Linux apparently try to
 315  * emulate OFD locks on NFS by converting them to traditional POSIX style locks
 316  * that work with the NLM. It is not clear that this provides the correct
 317  * semantics in all cases.
 318  */
 319 int
 320 ofdlock(file_t *fp, int fcmd, flock64_t *lckdat, int flag, u_offset_t offset)
 321 {
 322         int cmd = 0;
 323         vnode_t *vp;
 324         lock_descriptor_t       stack_lock_request;
 325         lock_descriptor_t       *lock_request;
 326         int error = 0;
 327         graph_t *gp;
 328         int serialize = 0;
 329 
 330         if (fcmd != F_OFD_GETLK)
 331                 cmd = SETFLCK;
 332 
 333         if (fcmd == F_OFD_SETLKW || fcmd == F_FLOCKW)
 334                 cmd |= SLPFLCK;
 335 
 336         /* see block comment */
 337         VERIFY(lckdat->l_whence == 0);
 338         VERIFY(lckdat->l_start == 0);
 339         VERIFY(lckdat->l_len == 0);
 340 
 341         vp = fp->f_vnode;
 342 
 343         /*
 344          * For reclock, fs_frlock() would normally have set these in a few
 345          * places, but for us it's cleaner to centralize them here. Note that
 346          * IGN_PID is -1. We use 0 for our pid-less locks.
 347          */
 348         lckdat->l_pid = 0;
 349         lckdat->l_sysid = 0;
 350 
 351         /*
 352          * Check access permissions
 353          */
 354         if ((fcmd == F_OFD_SETLK || fcmd == F_OFD_SETLKW) &&
 355             ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
 356             (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
 357                 return (EBADF);
 358 
 359         /*
 360          * for query and unlock we use the stack_lock_request
 361          */
 362         if (lckdat->l_type == F_UNLCK || !(cmd & SETFLCK)) {
 363                 lock_request = &stack_lock_request;
 364                 (void) bzero((caddr_t)lock_request,
 365                     sizeof (lock_descriptor_t));
 366 
 367                 /*
 368                  * following is added to make the assertions in
 369                  * flk_execute_request() pass
 370                  */
 371                 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
 372                 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
 373                 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
 374                 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
 375                 lock_request->l_status = FLK_INITIAL_STATE;
 376         } else {
 377                 lock_request = flk_get_lock();
 378                 fp->f_filock = (struct filock *)lock_request;
 379         }
 380         lock_request->l_state = 0;
 381         lock_request->l_vnode = vp;
 382         lock_request->l_zoneid = getzoneid();
 383         lock_request->l_ofd = fp;
 384 
 385         /*
 386          * Convert the request range into the canonical start and end
 387          * values then check the validity of the lock range.
 388          */
 389         error = flk_convert_lock_data(vp, lckdat, &lock_request->l_start,
 390             &lock_request->l_end, offset);
 391         if (error)
 392                 goto done;
 393 
 394         error = flk_check_lock_data(lock_request->l_start, lock_request->l_end,
 395             MAXEND);
 396         if (error)
 397                 goto done;
 398 
 399         ASSERT(lock_request->l_end >= lock_request->l_start);
 400 
 401         lock_request->l_type = lckdat->l_type;
 402         if (cmd & SLPFLCK)
 403                 lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
 404 
 405         if (!(cmd & SETFLCK)) {
 406                 if (lock_request->l_type == F_RDLCK ||
 407                     lock_request->l_type == F_WRLCK)
 408                         lock_request->l_state |= QUERY_LOCK;
 409         }
 410         lock_request->l_flock = (*lckdat);
 411 
 412         /*
 413          * We are ready to process the request
 414          */
 415 
 416         if (fcmd != F_OFD_GETLK && lock_request->l_type != F_UNLCK &&
 417             nbl_need_check(vp)) {
 418                 nbl_start_crit(vp, RW_WRITER);
 419                 serialize = 1;
 420         }
 421 
 422         /* Get the lock graph for a particular vnode */
 423         gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
 424 
 425         mutex_enter(&gp->gp_mutex);
 426 
 427         lock_request->l_state |= REFERENCED_LOCK;
 428         lock_request->l_graph = gp;
 429 
 430         switch (lock_request->l_type) {
 431         case F_RDLCK:
 432         case F_WRLCK:
 433                 if (IS_QUERY_LOCK(lock_request)) {
 434                         flk_get_first_blocking_lock(lock_request);
 435                         if (lock_request->l_ofd != NULL)
 436                                 lock_request->l_flock.l_pid = -1;
 437                         (*lckdat) = lock_request->l_flock;
 438                 } else {
 439                         /* process the request now */
 440                         error = flk_process_request(lock_request);
 441                 }
 442                 break;
 443 
 444         case F_UNLCK:
 445                 /* unlock request will not block so execute it immediately */
 446                 error = flk_execute_request(lock_request);
 447                 break;
 448 
 449         default:
 450                 error = EINVAL;
 451                 break;
 452         }
 453 
 454         if (lock_request == &stack_lock_request) {
 455                 flk_set_state(lock_request, FLK_DEAD_STATE);
 456         } else {
 457                 lock_request->l_state &= ~REFERENCED_LOCK;
 458                 if ((error != 0) || IS_DELETED(lock_request)) {
 459                         flk_set_state(lock_request, FLK_DEAD_STATE);
 460                         flk_free_lock(lock_request);
 461                 }
 462         }
 463 
 464         mutex_exit(&gp->gp_mutex);
 465         if (serialize)
 466                 nbl_end_crit(vp);
 467 
 468         return (error);
 469 
 470 done:
 471         flk_set_state(lock_request, FLK_DEAD_STATE);
 472         if (lock_request != &stack_lock_request)
 473                 flk_free_lock(lock_request);
 474         return (error);
 475 }
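
/*
 * A minimal userland sketch of the OFD semantics described in the comment
 * above ofdlock(): the lock is tied to the open file description, so it
 * survives a close of an unrelated descriptor for the same file and is
 * shared by dup()ed descriptors.  This is illustrative only (error handling
 * omitted), assumes the F_OFD_SETLK command is available to userland via
 * <fcntl.h>, and is kept under #if 0 since it is not part of this module.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

int
example_ofd_usage(const char *path)
{
        struct flock fl = { 0 };        /* whole-file lock: whence/start/len 0 */
        int fd1 = open(path, O_RDWR);   /* one file description ... */
        int fd2 = open(path, O_RDWR);   /* ... and an independent one */

        fl.l_type = F_WRLCK;
        if (fcntl(fd1, F_OFD_SETLK, &fl) != 0)
                return (-1);

        (void) close(fd2);              /* does NOT drop the lock held via fd1 */

        /* A dup()ed descriptor refers to the same description and lock. */
        (void) close(dup(fd1));         /* still does not drop the lock */

        (void) close(fd1);              /* last close: the lock is released */
        return (0);
}
#endif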
 476 
 477 /*
 478  * Remove any lock on the vnode belonging to the given file_t.
 479  * Called from closef on last close, file_t is locked.
 480  *
 481  * This is modeled on the cleanlocks() function but only removes the single
 482  * lock associated with fp.
 483  */
 484 void
 485 ofdcleanlock(file_t *fp)
 486 {
 487         lock_descriptor_t *fplock, *lock, *nlock;
 488         vnode_t *vp;
 489         graph_t *gp;
 490 
 491         ASSERT(MUTEX_HELD(&fp->f_tlock));
 492 
 493         if ((fplock = (lock_descriptor_t *)fp->f_filock) == NULL)
 494                 return;
 495 
 496         fp->f_filock = NULL;
 497         vp = fp->f_vnode;
 498 
 499         gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
 500 
 501         if (gp == NULL)
 502                 return;
 503         mutex_enter(&gp->gp_mutex);
 504 
 505         CHECK_SLEEPING_LOCKS(gp);
 506         CHECK_ACTIVE_LOCKS(gp);
 507 
 508         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
 509 
 510         if (lock) {
 511                 do {
 512                         nlock = lock->l_next;
 513                         if (fplock == lock) {
 514                                 CANCEL_WAKEUP(lock);
 515                                 break;
 516                         }
 517                         lock = nlock;
 518                 } while (lock->l_vnode == vp);
 519         }
 520 
 521         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
 522 
 523         if (lock) {
 524                 do {
 525                         nlock = lock->l_next;
 526                         if (fplock == lock) {
 527                                 flk_delete_active_lock(lock, 0);
 528                                 flk_wakeup(lock, 1);
 529                                 flk_free_lock(lock);
 530                                 break;
 531                         }
 532                         lock = nlock;
 533                 } while (lock->l_vnode == vp);
 534         }
 535 
 536         CHECK_SLEEPING_LOCKS(gp);
 537         CHECK_ACTIVE_LOCKS(gp);
 538         mutex_exit(&gp->gp_mutex);
 539 }
 540 
 541 /*
 542  * Routine called from fs_frlock in fs/fs_subr.c
 543  *
 544  * This implements traditional POSIX style record locking. The two primary
 545  * drawbacks to this style of locking are:
 546  * 1) It is per-process, so any close of a file descriptor that refers to the
 547  *    file will drop the lock (e.g. lock /etc/passwd, call a library function
 548  *    which opens /etc/passwd to read the file; when the library closes its
 549  *    file descriptor, the application silently loses its lock).
 550  * 2) Locks are not preserved across fork(2).
 551  *
 552  * Because these locks are only associated with a PID, they are per-process.
 553  * This is why any close will drop the lock and is also why, once the process
 554  * forks, the lock is no longer related to the new process. These locks can
 555  * be considered as PID-ful.
 556  *
 557  * See ofdlock() for the implementation of a similar but improved locking
 558  * scheme.
 559  */
 560 int
 561 reclock(vnode_t *vp, flock64_t *lckdat, int cmd, int flag, u_offset_t offset,
 562     flk_callback_t *flk_cbp)
 563 {
 564         lock_descriptor_t       stack_lock_request;
 565         lock_descriptor_t       *lock_request;
 566         int error = 0;
 567         graph_t *gp;
 568         int                     nlmid;
 569 
 570         /*
 571          * Check access permissions
 572          */
 573         if ((cmd & SETFLCK) &&
 574             ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
 575             (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
 576                         return (EBADF);
 577 
 578         /*
 579          * for query and unlock we use the stack_lock_request
 580          */
 581 
 582         if ((lckdat->l_type == F_UNLCK) ||
 583             !((cmd & INOFLCK) || (cmd & SETFLCK))) {
 584                 lock_request = &stack_lock_request;
 585                 bzero(lock_request, sizeof (lock_descriptor_t));
 586 
 587                 /*
 588                  * following is added to make the assertions in
 589                  * flk_execute_request() pass
 590                  */
 591 
 592                 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
 593                 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
 594                 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
 595                 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
 596                 lock_request->l_status = FLK_INITIAL_STATE;
 597         } else {
 598                 lock_request = flk_get_lock();
 599         }
 600         lock_request->l_state = 0;
 601         lock_request->l_vnode = vp;
 602         lock_request->l_zoneid = getzoneid();
 603 
 604         /*
 605          * Convert the request range into the canonical start and end
 606          * values.  The NLM protocol supports locking over the entire
 607          * 32-bit range, so there's no range checking for remote requests,
 608          * but we still need to verify that local requests obey the rules.
 609          */
 610         /* Clustering */
 611         if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
 612                 ASSERT(lckdat->l_whence == 0);
 613                 lock_request->l_start = lckdat->l_start;
 614                 lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
 615                     lckdat->l_start + (lckdat->l_len - 1);
 616         } else {
 617                 /* check the validity of the lock range */
 618                 error = flk_convert_lock_data(vp, lckdat,
 619                     &lock_request->l_start, &lock_request->l_end,
 620                     offset);
 621                 if (error) {
 622                         goto done;
 623                 }
 624                 error = flk_check_lock_data(lock_request->l_start,
 625                     lock_request->l_end, MAXEND);
 626                 if (error) {
 627                         goto done;
 628                 }
 629         }
 630 
 631         ASSERT(lock_request->l_end >= lock_request->l_start);
 632 
 633         lock_request->l_type = lckdat->l_type;
 634         if (cmd & INOFLCK)
 635                 lock_request->l_state |= IO_LOCK;
 636         if (cmd & SLPFLCK)
 637                 lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
 638         if (cmd & RCMDLCK)
 639                 lock_request->l_state |= LOCKMGR_LOCK;
 640         if (cmd & NBMLCK)
 641                 lock_request->l_state |= NBMAND_LOCK;
 642         /*
 643          * Clustering: set flag for PXFS locks
 644          * We do not _only_ check for the PCMDLCK flag because PXFS locks could
 645          * also be of type 'RCMDLCK'.
 646          * We do not _only_ check the GETPXFSID() macro because local PXFS
 647          * clients use a pxfsid of zero to permit deadlock detection in the LLM.
 648          */
 649 
 650         if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
 651                 lock_request->l_state |= PXFS_LOCK;
 652         }
 653         if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
 654                 if (lock_request->l_type == F_RDLCK ||
 655                     lock_request->l_type == F_WRLCK)
 656                         lock_request->l_state |= QUERY_LOCK;
 657         }
 658         lock_request->l_flock = (*lckdat);
 659         lock_request->l_callbacks = flk_cbp;
 660 
 661         /*
 662          * We are ready to process the request
 663          */
 664         if (IS_LOCKMGR(lock_request)) {
 665                 /*
 666                  * If the lock request is an NLM server request ....
 667                  */
 668                 if (nlm_status_size == 0) { /* not booted as cluster */
 669                         mutex_enter(&flock_lock);
 670                         /*
 671                          * Bail out if this is a lock manager request and the
 672                          * lock manager is not supposed to be running.
 673                          */
 674                         if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
 675                                 mutex_exit(&flock_lock);
 676                                 error = ENOLCK;
 677                                 goto done;
 678                         }
 679                         mutex_exit(&flock_lock);
 680                 } else {                        /* booted as a cluster */
 681                         nlmid = GETNLMID(lock_request->l_flock.l_sysid);
 682                         ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
 683 
 684                         mutex_enter(&nlm_reg_lock);
 685                         /*
 686                          * If the NLM registry does not know about this
 687                          * NLM server making the request, add its nlmid
 688                          * to the registry.
 689                          */
 690                         if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
 691                             nlmid)) {
 692                                 FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
 693                         } else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
 694                             nlmid)) {
 695                                 /*
 696                                  * If the NLM server is already known (has made
 697                                  * previous lock requests) and its state is
 698                                  * not NLM_UP (means that NLM server is
 699                                  * shutting down), then bail out with an
 700                                  * error to deny the lock request.
 701                                  */
 702                                 mutex_exit(&nlm_reg_lock);
 703                                 error = ENOLCK;
 704                                 goto done;
 705                         }
 706                         mutex_exit(&nlm_reg_lock);
 707                 }
 708         }
 709 
 710         /* Now get the lock graph for a particular vnode */
 711         gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
 712 
 713         /*
 714          * We drop the rwlock here, otherwise this might end up causing a
 715          * deadlock if this IO_LOCK sleeps. (bugid # 1183392).
 716          */
 717 
 718         if (IS_IO_LOCK(lock_request)) {
 719                 VOP_RWUNLOCK(vp,
 720                     (lock_request->l_type == F_RDLCK) ?
 721                     V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
 722         }
 723         mutex_enter(&gp->gp_mutex);
 724 
 725         lock_request->l_state |= REFERENCED_LOCK;
 726         lock_request->l_graph = gp;
 727 
 728         switch (lock_request->l_type) {
 729         case F_RDLCK:
 730         case F_WRLCK:
 731                 if (IS_QUERY_LOCK(lock_request)) {
 732                         flk_get_first_blocking_lock(lock_request);
 733                         if (lock_request->l_ofd != NULL)
 734                                 lock_request->l_flock.l_pid = -1;
 735                         (*lckdat) = lock_request->l_flock;
 736                         break;
 737                 }
 738 
 739                 /* process the request now */
 740 
 741                 error = flk_process_request(lock_request);
 742                 break;
 743 
 744         case F_UNLCK:
 745                 /* unlock request will not block so execute it immediately */
 746 
 747                 if (IS_LOCKMGR(lock_request) &&
 748                     flk_canceled(lock_request)) {
 749                         error = 0;
 750                 } else {
 751                         error = flk_execute_request(lock_request);
 752                 }
 753                 break;
 754 
 755         case F_UNLKSYS:
 756                 /*
 757                  * Recovery mechanism to release lock manager locks when an
 758                  * NFS client crashes and restarts. The NFS server will clear
 759                  * the old locks and grant new locks.
 760                  */
 761 
 762                 if (lock_request->l_flock.l_sysid == 0) {
 763                         mutex_exit(&gp->gp_mutex);
 764                         return (EINVAL);
 765                 }
 766                 if (secpolicy_nfs(CRED()) != 0) {
 767                         mutex_exit(&gp->gp_mutex);
 768                         return (EPERM);
 769                 }
 770                 flk_delete_locks_by_sysid(lock_request);
 771                 lock_request->l_state &= ~REFERENCED_LOCK;
 772                 flk_set_state(lock_request, FLK_DEAD_STATE);
 773                 flk_free_lock(lock_request);
 774                 mutex_exit(&gp->gp_mutex);
 775                 return (0);
 776 
 777         default:
 778                 error = EINVAL;
 779                 break;
 780         }
 781 
 782         /* Clustering: For blocked PXFS locks, return */
 783         if (error == PXFS_LOCK_BLOCKED) {
 784                 lock_request->l_state &= ~REFERENCED_LOCK;
 785                 mutex_exit(&gp->gp_mutex);
 786                 return (error);
 787         }
 788 
 789         /*
 790          * Now that we have seen the status of locks in the system for
 791          * this vnode we acquire the rwlock if it is an IO_LOCK.
 792          */
 793 
 794         if (IS_IO_LOCK(lock_request)) {
 795                 (void) VOP_RWLOCK(vp,
 796                     (lock_request->l_type == F_RDLCK) ?
 797                     V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
 798                 if (!error) {
 799                         lckdat->l_type = F_UNLCK;
 800 
 801                         /*
 802                          * This wakeup is needed, otherwise if the
 803                          * IO_LOCK has slept, the locks dependent on this
 804                          * one will not be woken up at all. (bugid # 1185482).
 805                          */
 806 
 807                         flk_wakeup(lock_request, 1);
 808                         flk_set_state(lock_request, FLK_DEAD_STATE);
 809                         flk_free_lock(lock_request);
 810                 }
 811                 /*
 812                  * Otherwise, if an error occurred: flk_process_request()
 813                  * returned EDEADLK, in which case there are no dependents
 814                  * for this lock, or flk_wait_execute_request() returned
 815                  * EINTR, in which case flk_cancel_sleeping_lock() has
 816                  * already been done. The same is true for EBADF.
 817                  */
 818         }
 819 
 820         if (lock_request == &stack_lock_request) {
 821                 flk_set_state(lock_request, FLK_DEAD_STATE);
 822         } else {
 823                 lock_request->l_state &= ~REFERENCED_LOCK;
 824                 if ((error != 0) || IS_DELETED(lock_request)) {
 825                         flk_set_state(lock_request, FLK_DEAD_STATE);
 826                         flk_free_lock(lock_request);
 827                 }
 828         }
 829 
 830         mutex_exit(&gp->gp_mutex);
 831         return (error);
 832 
 833 done:
 834         flk_set_state(lock_request, FLK_DEAD_STATE);
 835         if (lock_request != &stack_lock_request)
 836                 flk_free_lock(lock_request);
 837         return (error);
 838 }
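
/*
 * A minimal userland sketch of the first drawback described in the comment
 * above reclock(): with traditional POSIX (per-process) locks, any close of
 * any descriptor for the file drops the process's lock.  Illustrative only
 * (error handling omitted); kept under #if 0 since it is not part of this
 * kernel module.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

int
example_posix_pitfall(const char *path)
{
        struct flock fl = { 0 };        /* whole file */
        int fd = open(path, O_RDWR);
        int other;

        fl.l_type = F_WRLCK;
        if (fcntl(fd, F_SETLK, &fl) != 0)
                return (-1);

        /*
         * Any other code in this process (e.g. a library routine) that opens
         * and closes the same file silently drops the lock acquired above.
         */
        other = open(path, O_RDONLY);
        (void) close(other);            /* the lock on 'path' is now gone */

        return (0);
}
#endif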
 839 
 840 /*
 841  * Invoke the callbacks in the given list.  If before sleeping, invoke in
 842  * list order.  If after sleeping, invoke in reverse order.
 843  *
 844  * CPR (suspend/resume) support: if one of the callbacks returns a
 845  * callb_cpr_t, return it.   This will be used to make the thread CPR-safe
 846  * while it is sleeping.  There should be at most one callb_cpr_t for the
 847  * thread.
 848  * XXX This is unnecessarily complicated.  The CPR information should just
 849  * get passed in directly through VOP_FRLOCK and reclock, rather than
 850  * sneaking it in via a callback.
 851  */
 852 
 853 callb_cpr_t *
 854 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
 855 {
 856         callb_cpr_t *cpr_callbackp = NULL;
 857         callb_cpr_t *one_result;
 858         flk_callback_t *cb;
 859 
 860         if (cblist == NULL)
 861                 return (NULL);
 862 
 863         if (when == FLK_BEFORE_SLEEP) {
 864                 cb = cblist;
 865                 do {
 866                         one_result = (*cb->cb_callback)(when, cb->cb_data);
 867                         if (one_result != NULL) {
 868                                 ASSERT(cpr_callbackp == NULL);
 869                                 cpr_callbackp = one_result;
 870                         }
 871                         cb = cb->cb_next;
 872                 } while (cb != cblist);
 873         } else {
 874                 cb = cblist->cb_prev;
 875                 do {
 876                         one_result = (*cb->cb_callback)(when, cb->cb_data);
 877                         if (one_result != NULL) {
 878                                 cpr_callbackp = one_result;
 879                         }
 880                         cb = cb->cb_prev;
 881                 } while (cb != cblist->cb_prev);
 882         }
 883 
 884         return (cpr_callbackp);
 885 }
 886 
 887 /*
 888  * Initialize a flk_callback_t to hold the given callback.
 889  */
 890 
 891 void
 892 flk_init_callback(flk_callback_t *flk_cb,
 893     callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
 894 {
 895         flk_cb->cb_next = flk_cb;
 896         flk_cb->cb_prev = flk_cb;
 897         flk_cb->cb_callback = cb_fcn;
 898         flk_cb->cb_data = cbdata;
 899 }
 900 
 901 /*
 902  * Initialize an flk_callback_t and then link it into the head of an
 903  * existing list (which may be NULL).
 904  */
 905 
 906 void
 907 flk_add_callback(flk_callback_t *newcb,
 908     callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
 909     void *cbdata, flk_callback_t *cblist)
 910 {
 911         flk_init_callback(newcb, cb_fcn, cbdata);
 912 
 913         if (cblist == NULL)
 914                 return;
 915 
 916         newcb->cb_prev = cblist->cb_prev;
 917         newcb->cb_next = cblist;
 918         cblist->cb_prev->cb_next = newcb;
 919         cblist->cb_prev = newcb;
 920 }
 921 
 922 /*
 923  * Remove the callback from a list.
 924  */
 925 
 926 void
 927 flk_del_callback(flk_callback_t *flk_cb)
 928 {
 929         flk_cb->cb_next->cb_prev = flk_cb->cb_prev;
 930         flk_cb->cb_prev->cb_next = flk_cb->cb_next;
 931 
 932         flk_cb->cb_prev = flk_cb;
 933         flk_cb->cb_next = flk_cb;
 934 }
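
/*
 * A minimal sketch of how a caller might use the callback interface above,
 * in the spirit of fs_frlock(): wrap a reclock() call with a callback that
 * hands back a callb_cpr_t before sleeping so the sleep is CPR-safe.  The
 * example_* names are hypothetical and the code is kept under #if 0; it is
 * not compiled into this file.
 */
#if 0
static callb_cpr_t *
example_frlock_callback(flk_cb_when_t when, void *data)
{
        callb_cpr_t *cprp = data;

        if (when == FLK_BEFORE_SLEEP)
                return (cprp);          /* used to make the sleeper CPR-safe */

        return (NULL);
}

static int
example_frlock(vnode_t *vp, flock64_t *bfp, int cmd, int flag,
    u_offset_t offset, callb_cpr_t *cprp)
{
        flk_callback_t flk_cb;

        flk_init_callback(&flk_cb, example_frlock_callback, cprp);
        return (reclock(vp, bfp, cmd, flag, offset, &flk_cb));
}
#endif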
 935 
 936 /*
 937  * Initialize the flk_edge_cache data structure and create the
 938  * nlm_reg_status array.
 939  */
 940 
 941 void
 942 flk_init(void)
 943 {
 944         uint_t  i;
 945 
 946         flk_edge_cache = kmem_cache_create("flk_edges",
 947             sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
 948         if (flk_edge_cache == NULL) {
 949                 cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
 950         }
 951         /*
 952          * Create the NLM registry object.
 953          */
 954 
 955         if (cluster_bootflags & CLUSTER_BOOTED) {
 956                 /*
 957                  * This routine tells you the maximum node id that will be used
 958                  * in the cluster.  This number will be the size of the nlm
 959                  * registry status array.  We add 1 because we will be using
 960                  * all entries indexed from 0 to maxnodeid; e.g., from 0
 961                  * to 64, for a total of 65 entries.
 962                  */
 963                 nlm_status_size = clconf_maximum_nodeid() + 1;
 964         } else {
 965                 nlm_status_size = 0;
 966         }
 967 
 968         if (nlm_status_size != 0) {     /* booted as a cluster */
 969                 nlm_reg_status = (flk_nlm_status_t *)
 970                     kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
 971                     KM_SLEEP);
 972 
 973                 /* initialize all NLM states in array to NLM_UNKNOWN */
 974                 for (i = 0; i < nlm_status_size; i++) {
 975                         nlm_reg_status[i] = FLK_NLM_UNKNOWN;
 976                 }
 977         }
 978 
 979         mutex_init(&flock_lock, NULL, MUTEX_DEFAULT, NULL);
 980         mutex_init(&nlm_reg_lock, NULL, MUTEX_DEFAULT, NULL);
 981 
 982         rw_init(&sysid_to_host_translator_lock, NULL, RW_DEFAULT, NULL);
 983         list_create(&sysid_to_host_translator_list,
 984             sizeof (struct sysid_to_host_translator_entry),
 985             offsetof(struct sysid_to_host_translator_entry, node));
 986 }
 987 
 988 /*
 989  * Zone constructor/destructor callbacks to be executed when a zone is
 990  * created/destroyed.
 991  */
 992 /* ARGSUSED */
 993 void *
 994 flk_zone_init(zoneid_t zoneid)
 995 {
 996         struct flock_globals *fg;
 997         uint_t i;
 998 
 999         fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
1000         fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
1001         for (i = 0; i < HASH_SIZE; i++)
1002                 fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
1003         return (fg);
1004 }
1005 
1006 /* ARGSUSED */
1007 void
1008 flk_zone_fini(zoneid_t zoneid, void *data)
1009 {
1010         struct flock_globals *fg = data;
1011 
1012         kmem_free(fg, sizeof (*fg));
1013 }
1014 
1015 /*
1016  * Get a lock_descriptor structure with initialization of edge lists.
1017  */
1018 
1019 static lock_descriptor_t *
1020 flk_get_lock(void)
1021 {
1022         lock_descriptor_t       *l;
1023 
1024         l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
1025 
1026         cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
1027         l->l_edge.edge_in_next = &l->l_edge;
1028         l->l_edge.edge_in_prev = &l->l_edge;
1029         l->l_edge.edge_adj_next = &l->l_edge;
1030         l->l_edge.edge_adj_prev = &l->l_edge;
1031         l->pvertex = -1;
1032         l->l_status = FLK_INITIAL_STATE;
1033         flk_lock_allocs++;
1034         return (l);
1035 }
1036 
1037 /*
1038  * Free a lock_descriptor structure. If some thread still has a reference
1039  * to it (as in reclock()), just set the DELETED_LOCK flag instead.
1040  */
1041 
1042 void
1043 flk_free_lock(lock_descriptor_t *lock)
1044 {
1045         file_t *fp;
1046 
1047         ASSERT(lock->l_blocker >= 0);
1048         ASSERT(IS_DEAD(lock));
1049 
1050         if ((fp = lock->l_ofd) != NULL && fp->f_filock == (struct filock *)lock)
1051                 fp->f_filock = NULL;
1052 
1053         if (IS_REFERENCED(lock)) {
1054                 lock->l_state |= DELETED_LOCK;
1055                 return;
1056         }
1057         flk_lock_frees++;
1058         kmem_free(lock, sizeof (lock_descriptor_t));
1059 }
1060 
1061 void
1062 flk_set_state(lock_descriptor_t *lock, int new_state)
1063 {
1064         /*
1065          * Locks in the sleeping list may be woken up in a number of ways,
1066          * and more than once.  If a sleeping lock is signaled awake more
1067          * than once, then it may or may not change state depending on its
1068          * current state.
1069          * Also note that NLM locks that are sleeping could be moved to an
1070          * interrupted state more than once if the unlock request is
1071          * retransmitted by the NLM client - the second time around, this is
1072          * just a nop.
1073          * The ordering of being signaled awake is:
1074          * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
1075          * The checks below implement this ordering.
1076          */
1077         if (IS_INTERRUPTED(lock)) {
1078                 if ((new_state == FLK_CANCELLED_STATE) ||
1079                     (new_state == FLK_GRANTED_STATE) ||
1080                     (new_state == FLK_INTERRUPTED_STATE)) {
1081                         return;
1082                 }
1083         }
1084         if (IS_CANCELLED(lock)) {
1085                 if ((new_state == FLK_GRANTED_STATE) ||
1086                     (new_state == FLK_CANCELLED_STATE)) {
1087                         return;
1088                 }
1089         }
1090         CHECK_LOCK_TRANSITION(lock->l_status, new_state);
1091         if (IS_PXFS(lock)) {
1092                 cl_flk_state_transition_notify(lock, lock->l_status, new_state);
1093         }
1094         lock->l_status = new_state;
1095 }
1096 
1097 /*
1098  * Support for remote stale lock detection
1099  */
1100 
1101 void
1102 flk_add_sysid_to_host_translator(sysid_to_host_translator_t tr)
1103 {
1104         struct sysid_to_host_translator_entry *te;
1105 
1106         te = kmem_alloc(sizeof (struct sysid_to_host_translator_entry),
1107             KM_SLEEP);
1108 
1109         te->translator = tr;
1110 
1111         rw_enter(&sysid_to_host_translator_lock, RW_WRITER);
1112         list_insert_head(&sysid_to_host_translator_list, te);
1113         rw_exit(&sysid_to_host_translator_lock);
1114 }
1115 
1116 static void
1117 translate_sysid_to_host(zoneid_t zoneid, sysid_t sysid, char *host, size_t hlen,
1118     const char **type)
1119 {
1120         struct sockaddr sa;
1121         struct sysid_to_host_translator_entry *te;
1122 
1123         /* Set some defaults in case the translation fails */
1124         *type = "?";
1125         (void) strlcpy(host, "?", hlen);
1126 
1127         rw_enter(&sysid_to_host_translator_lock, RW_READER);
1128 
1129         for (te = list_head(&sysid_to_host_translator_list); te != NULL;
1130             te = list_next(&sysid_to_host_translator_list, te)) {
1131 
1132                 if (te->translator(zoneid, sysid, &sa, type) != 0) {
1133                         rw_exit(&sysid_to_host_translator_lock);
1134 
1135                         switch (sa.sa_family) {
1136                         case AF_INET:
1137                                 (void) inet_ntop(AF_INET,
1138                                     &((struct sockaddr_in *)&sa)->sin_addr,
1139                                     host, hlen);
1140                                 break;
1141                         case AF_INET6:
1142                                 (void) inet_ntop(AF_INET6,
1143                                     &((struct sockaddr_in6 *)&sa)->sin6_addr,
1144                                     host, hlen);
1145                                 break;
1146                         default:
1147                                 break;
1148                         }
1149 
1150                         return;
1151                 }
1152         }
1153 
1154         rw_exit(&sysid_to_host_translator_lock);
1155 }
1156 
1157 static char *
1158 get_vnode_path(vnode_t *vp)
1159 {
1160         size_t len;
1161         char *ret;
1162 
1163         mutex_enter(&vp->v_lock);
1164         if (vp->v_path == NULL) {
1165                 mutex_exit(&vp->v_lock);
1166                 return (NULL);
1167         }
1168         len = strlen(vp->v_path) + 1;
1169         mutex_exit(&vp->v_lock);
1170 
1171         ret = kmem_alloc(len, KM_SLEEP);
1172 
1173         mutex_enter(&vp->v_lock);
1174         if (vp->v_path == NULL || strlen(vp->v_path) + 1 != len) {
1175                 mutex_exit(&vp->v_lock);
1176                 kmem_free(ret, len);
1177                 return (NULL);
1178         }
1179         bcopy(vp->v_path, ret, len);
1180         mutex_exit(&vp->v_lock);
1181 
1182         return (ret);
1183 }
1184 
1185 static void
1186 flk_stale_lock_check(lock_descriptor_t *lock)
1187 {
1188         char *path;
1189 
1190         char host[INET6_ADDRSTRLEN];            /* host name */
1191         const char *type;                       /* host type */
1192 
1193         /* temporary variables for the cmn_err() call */
1194         char *p, *t;            /* path, lock type */
1195         pid_t pid;              /* pid */
1196         void *v;                /* vnode */
1197         u_offset_t s, e;        /* start, end */
1198 
1199         ASSERT(MUTEX_HELD(&lock->l_graph->gp_mutex));
1200 
1201         /*
1202          * Bail out if this is not a remote lock, if the stale lock checking
1203          * is disabled, or if the lock has already been reported.
1204          */
1205         if (IS_LOCAL(lock) || stale_lock_timeout == 0 || lock->l_blocker < 0)
1206                 return;
1207 
1208         /* Seen first time? */
1209         if (lock->l_blocker == 0) {
1210                 lock->l_blocker = gethrtime();
1211                 return;
1212         }
1213 
1214         /* Old enough? */
1215         if ((gethrtime() - lock->l_blocker) / NANOSEC < stale_lock_timeout)
1216                 return;
1217 
1218         translate_sysid_to_host(lock->l_zoneid, lock->l_flock.l_sysid, host,
1219             sizeof (host), &type);
1220         path = get_vnode_path(lock->l_vnode);
1221 
1222         pid = lock->l_flock.l_pid;
1223         v = (void *)lock->l_vnode;
1224         p = path == NULL ? "?" : path;
1225         t = lock->l_type == F_WRLCK ? "WR" : "RD";
1226         s = lock->l_start;
1227         e = lock->l_end;
1228 
1229         /* Report the blocker as stale */
1230         cmn_err(CE_NOTE, "!Stale lock (host: %s (%s), pid: %d, vnode: %p, "
1231             "path: %s, %sLCK: %llu:%llu)", host, type, pid, v, p, t, s, e);
1232 
1233         if (path != NULL)
1234                 strfree(path);
1235 
1236         /* Mark this blocker as reported */
1237         lock->l_blocker = -lock->l_blocker;
1238 }
1239 
1240 static void
1241 flk_stale_lock_shrink(lock_descriptor_t *lock, lock_descriptor_t *new)
1242 {
1243         char *path;
1244 
1245         char host[INET6_ADDRSTRLEN];            /* host name */
1246         const char *type;                       /* host type */
1247 
1248         /* temporary variables for the cmn_err() call */
1249         char *p, *t;            /* path, lock type */
1250         pid_t pid;              /* pid */
1251         void *v;                /* vnode */
1252         u_offset_t s, e;        /* start, end */
1253         u_offset_t ns, ne;      /* new start, new end */
1254 
1255         ASSERT(MUTEX_HELD(&lock->l_graph->gp_mutex));
1256 
1257         translate_sysid_to_host(lock->l_zoneid, lock->l_flock.l_sysid, host,
1258             sizeof (host), &type);
1259         path = get_vnode_path(lock->l_vnode);
1260 
1261         pid = lock->l_flock.l_pid;
1262         v = (void *)lock->l_vnode;
1263         p = path == NULL ? "?" : path;
1264         t = lock->l_type == F_WRLCK ? "WR" : "RD";
1265         s = lock->l_start;
1266         e = lock->l_end;
1267         ns = new->l_start;
1268         ne = new->l_end;
1269 
1270         cmn_err(CE_NOTE, "!Stale lock SHRINK (host: %s (%s), pid: %d, "
1271             "vnode: %p, path: %s, %sLCK: %llu:%llu -> %llu:%llu)", host, type,
1272             pid, v, p, t, s, e, ns, ne);
1273 
1274         if (path != NULL)
1275                 strfree(path);
1276 }
1277 
1278 static void
1279 flk_stale_lock_split(lock_descriptor_t *lock, lock_descriptor_t *new1,
1280     lock_descriptor_t *new2)
1281 {
1282         char *path;
1283 
1284         char host[INET6_ADDRSTRLEN];            /* host name */
1285         const char *type;                       /* host type */
1286 
1287         /* temporary variables for the cmn_err() call */
1288         char *p, *t;            /* path, lock type */
1289         pid_t pid;              /* pid */
1290         void *v;                /* vnode */
1291         u_offset_t s, e;        /* start, end */
1292         u_offset_t n1s, n1e;    /* new1 start, new1 end */
1293         u_offset_t n2s, n2e;    /* new2 start, new2 end */
1294 
1295         ASSERT(MUTEX_HELD(&lock->l_graph->gp_mutex));
1296 
1297         translate_sysid_to_host(lock->l_zoneid, lock->l_flock.l_sysid, host,
1298             sizeof (host), &type);
1299         path = get_vnode_path(lock->l_vnode);
1300 
1301         pid = lock->l_flock.l_pid;
1302         v = (void *)lock->l_vnode;
1303         p = path == NULL ? "?" : path;
1304         t = lock->l_type == F_WRLCK ? "WR" : "RD";
1305         s = lock->l_start;
1306         e = lock->l_end;
1307         n1s = new1->l_start;
1308         n1e = new1->l_end;
1309         n2s = new2->l_start;
1310         n2e = new2->l_end;
1311 
1312         cmn_err(CE_NOTE, "!Stale lock SPLIT (host: %s (%s), pid: %d, "
1313             "vnode: %p, path: %s, %sLCK: %llu:%llu -> %llu:%llu and %llu:%llu)",
1314             host, type, pid, v, p, t, s, e, n1s, n1e, n2s, n2e);
1315 
1316         if (path != NULL)
1317                 strfree(path);
1318 }
1319 
1320 static void
1321 flk_stale_lock_release(lock_descriptor_t *lock)
1322 {
1323         char *path;
1324 
1325         char host[INET6_ADDRSTRLEN];            /* host name */
1326         const char *type;                       /* host type */
1327 
1328         /* temporary variables for the cmn_err() call */
1329         char *p, *t;            /* path, lock type */
1330         pid_t pid;              /* pid */
1331         void *v;                /* vnode */
1332         u_offset_t s, e;        /* start, end */
1333 
1334         ASSERT(MUTEX_HELD(&lock->l_graph->gp_mutex));
1335 
1336         translate_sysid_to_host(lock->l_zoneid, lock->l_flock.l_sysid, host,
1337             sizeof (host), &type);
1338         path = get_vnode_path(lock->l_vnode);
1339 
1340         pid = lock->l_flock.l_pid;
1341         v = (void *)lock->l_vnode;
1342         p = path == NULL ? "?" : path;
1343         t = lock->l_type == F_WRLCK ? "WR" : "RD";
1344         s = lock->l_start;
1345         e = lock->l_end;
1346 
1347         cmn_err(CE_NOTE, "!Stale lock RELEASE (host: %s (%s), pid: %d, "
1348             "vnode: %p, path: %s, %sLCK: %llu:%llu)", host, type, pid, v, p, t,
1349             s, e);
1350 
1351         if (path != NULL)
1352                 strfree(path);
1353 }
1354 
1355 /*
1356  * Routine that checks whether there are any blocking locks in the system.
1357  *
1358  * The policy followed is that if a write lock is sleeping, we don't allow
1359  * read locks before this write lock, even though there may not be any
1360  * active locks corresponding to the read locks' region.
1361  *
1362  * The flk_add_edge() function adds an edge between l1 and l2 iff there
1363  * is no path between l1 and l2. This is done to have a "minimum
1364  * storage representation" of the dependency graph.
1365  *
1366  * Another property of the graph is that, since only the new request throws
1367  * edges to the existing locks in the graph, the graph is always topologically
1368  * ordered.
1369  */
1370 
1371 static int
1372 flk_process_request(lock_descriptor_t *request)
1373 {
1374         graph_t *gp = request->l_graph;
1375         lock_descriptor_t *lock;
1376         int request_blocked_by_active = 0;
1377         int request_blocked_by_granted = 0;
1378         int request_blocked_by_sleeping = 0;
1379         vnode_t *vp = request->l_vnode;
1380         int     error = 0;
1381         int request_will_wait = 0;
1382         int found_covering_lock = 0;
1383         lock_descriptor_t *covered_by = NULL;
1384 
1385         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1386         request_will_wait = IS_WILLING_TO_SLEEP(request);
1387 
1388         /*
1389          * check active locks
1390          */
1391 
1392         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1393 
1394         if (lock) {
1395                 do {
1396                         if (BLOCKS(lock, request)) {
1397                                 if (!request_will_wait) {
1398                                         flk_stale_lock_check(lock);
1399                                         return (EAGAIN);
1400                                 }
1401                                 request_blocked_by_active = 1;
1402                                 break;
1403                         }
1404                         /*
1405                          * Grant the lock if it is for the same owner holding an
1406                          * active lock that covers the request.
1407                          */
1408 
1409                         if (SAME_OWNER(lock, request) &&
1410                             COVERS(lock, request) &&
1411                             (request->l_type == F_RDLCK))
1412                                 return (flk_execute_request(request));
1413                         lock = lock->l_next;
1414                 } while (lock->l_vnode == vp);
1415         }
1416 
1417         if (!request_blocked_by_active) {
1418                 lock_descriptor_t *lk[1];
1419                 lock_descriptor_t *first_glock = NULL;
1420 
1421                 /*
1422                  * We cannot grant the request yet: locks that were just
1423                  * granted may still be on the sleep queue. Their threads
1424                  * have already been woken up, so those locks are
1425                  * effectively active.
1426                  */
1427                 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1428                 if (lock) {
1429                         do {
1430                                 if (BLOCKS(lock, request)) {
1431                                         if (IS_GRANTED(lock)) {
1432                                                 request_blocked_by_granted = 1;
1433                                         } else {
1434                                                 request_blocked_by_sleeping = 1;
1435                                         }
1436                                 }
1437 
1438                                 lock = lock->l_next;
1439                         } while ((lock->l_vnode == vp));
1440                         first_glock = lock->l_prev;
1441                         ASSERT(first_glock->l_vnode == vp);
1442                 }
1443 
1444                 if (request_blocked_by_granted)
1445                         goto block;
1446 
1447                 if (!request_blocked_by_sleeping) {
1448                         /*
1449                          * If the request isn't going to be blocked by a
1450                          * sleeping request, we know that it isn't going to
1451                          * be blocked; we can just execute the request --
1452                          * without performing costly deadlock detection.
1453                          */
1454                         ASSERT(!request_blocked_by_active);
1455                         return (flk_execute_request(request));
1456                 } else if (request->l_type == F_RDLCK) {
1457                         /*
1458                          * If we have a sleeping writer in the requested
1459                          * lock's range, block.
1460                          */
1461                         goto block;
1462                 }
1463 
1464                 lk[0] = request;
1465                 request->l_state |= RECOMPUTE_LOCK;
1466                 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1467                 if (lock) {
1468                         do {
1469                                 flk_recompute_dependencies(lock, lk, 1, 0);
1470                                 lock = lock->l_next;
1471                         } while (lock->l_vnode == vp);
1472                 }
1473                 lock = first_glock;
1474                 if (lock) {
1475                         do {
1476                                 if (IS_GRANTED(lock)) {
1477                                         flk_recompute_dependencies(lock, lk, 1,
1478                                             0);
1479                                 }
1480                                 lock = lock->l_prev;
1481                         } while ((lock->l_vnode == vp));
1482                 }
1483                 request->l_state &= ~RECOMPUTE_LOCK;
1484                 if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
1485                         return (EDEADLK);
1486                 return (flk_execute_request(request));
1487         }
1488 
1489 block:
1490         if (request_will_wait)
1491                 flk_graph_uncolor(gp);
1492 
1493         /* check sleeping locks */
1494 
1495         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1496 
1497         /*
1498          * If we find a sleeping write lock that is a superset of the
1499          * region wanted by the request, then by adding an edge to this
1500          * write lock we have paths to all locks in the graph that block
1501          * the request, except in one case; that is why there is another
1502          * SAME_OWNER check in the loop below. The exception is when the
1503          * process that owns the sleeping write lock 'l1' has other locks
1504          * l2, l3, l4 in the system that arrived before l1. l1 has no path
1505          * to these locks because they are from the same process. We break
1506          * when we find a second covering sleeping lock l5 owned by a
1507          * process different from the one owning l1, because none of l2,
1508          * l3, l4, etc. can have arrived before l5; if one had, l1 would
1509          * already have produced a deadlock.
1510          */
1511 
1512         if (lock) {
1513                 do {
1514                         if (BLOCKS(lock, request)) {
1515                                 if (!request_will_wait)
1516                                         return (EAGAIN);
1517                                 if (COVERS(lock, request) &&
1518                                     lock->l_type == F_WRLCK) {
1519                                         if (found_covering_lock &&
1520                                             !SAME_OWNER(lock, covered_by)) {
1521                                                 found_covering_lock++;
1522                                                 break;
1523                                         }
1524                                         found_covering_lock = 1;
1525                                         covered_by = lock;
1526                                 }
1527                                 if (found_covering_lock &&
1528                                     !SAME_OWNER(lock, covered_by)) {
1529                                         lock = lock->l_next;
1530                                         continue;
1531                                 }
1532                                 if ((error = flk_add_edge(request, lock,
1533                                     !found_covering_lock, 0)))
1534                                         return (error);
1535                         }
1536                         lock = lock->l_next;
1537                 } while (lock->l_vnode == vp);
1538         }
1539 
1540         /*
1541          * found_covering_lock == 2 iff at this point 'request' has paths to
1542          * all locks that block 'request'. found_covering_lock == 1 iff at
1543          * this point 'request' has paths to all locks that block 'request'
1544          * whose owners differ from the owner of the covering lock
1545          * (covered_by above); there may still be locks owned by the owner
1546          * of covered_by in the active list.
1547          */
1548 
1549         if (request_blocked_by_active && found_covering_lock != 2) {
1550                 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1551                 ASSERT(lock != NULL);
1552                 do {
1553                         if (BLOCKS(lock, request)) {
1554                                 if (found_covering_lock &&
1555                                     !SAME_OWNER(lock, covered_by)) {
1556                                         lock = lock->l_next;
1557                                         continue;
1558                                 }
1559                                 if ((error = flk_add_edge(request, lock,
1560                                     CHECK_CYCLE, 0)))
1561                                         return (error);
1562                         }
1563                         lock = lock->l_next;
1564                 } while (lock->l_vnode == vp);
1565         }
1566 
1567         if (NOT_BLOCKED(request)) {
1568                 /*
1569                  * request not dependent on any other locks
1570                  * so execute this request
1571                  */
1572                 return (flk_execute_request(request));
1573         } else {
1574                 /*
1575                  * check for deadlock
1576                  */
1577                 if (flk_check_deadlock(request))
1578                         return (EDEADLK);
1579                 /*
1580                  * this thread has to sleep
1581                  */
1582                 return (flk_wait_execute_request(request));
1583         }
1584 }
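
/*
 * An illustrative, non-compiled userland sketch of the fairness policy
 * described above flk_process_request(): with a read lock active and a
 * writer sleeping on the same region, a second (non-blocking) read request
 * from a different process is refused even though no active lock conflicts
 * with it.  The file name, region and error handling are illustrative only;
 * POSIX allows the failed F_SETLK to report either EAGAIN or EACCES.
 */
#if 0
#include <sys/types.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

static int
lock_region(int fd, short type, int cmd)
{
	struct flock fl;

	fl.l_type = type;		/* F_RDLCK, F_WRLCK or F_UNLCK */
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 100;			/* bytes 0..99 */
	return (fcntl(fd, cmd, &fl));
}

int
main(void)
{
	int fd = open("/tmp/flock_demo", O_RDWR | O_CREAT, 0644);

	if (fd == -1 || lock_region(fd, F_RDLCK, F_SETLK) == -1)
		exit(1);		/* reader 1 holds an active read lock */

	if (fork() == 0) {
		/* Writer: ends up sleeping in flk_wait_execute_request(). */
		int wfd = open("/tmp/flock_demo", O_RDWR);
		(void) lock_region(wfd, F_WRLCK, F_SETLKW);
		exit(0);
	}
	(void) sleep(1);		/* let the writer queue up */

	if (fork() == 0) {
		/*
		 * Reader 2 (a different process): only a read lock is
		 * active, but the sleeping writer blocks this request,
		 * so the non-blocking attempt is expected to fail.
		 */
		int rfd = open("/tmp/flock_demo", O_RDWR);
		if (lock_region(rfd, F_RDLCK, F_SETLK) == -1)
			(void) printf("reader 2 refused, errno %d\n", errno);
		exit(0);
	}
	(void) wait(NULL);		/* reap reader 2 */
	return (0);
}
#endif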
1585 
1586 /*
1587  * In the simple case, the actual execution of the request is only to
1588  * insert the 'request' into the list of active locks if it is not an
1589  * UNLOCK.
1590  * We also have to consider each existing active lock's relation to
1591  * this 'request' if it is owned by the same process. flk_relation() does
1592  * this job and ensures that the dependency graph information is
1593  * maintained properly.
1594  */
1595 
1596 int
1597 flk_execute_request(lock_descriptor_t *request)
1598 {
1599         graph_t *gp = request->l_graph;
1600         vnode_t *vp = request->l_vnode;
1601         lock_descriptor_t       *lock, *lock1;
1602         int done_searching = 0;
1603 
1604         CHECK_SLEEPING_LOCKS(gp);
1605         CHECK_ACTIVE_LOCKS(gp);
1606 
1607         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1608 
1609         flk_set_state(request, FLK_START_STATE);
1610 
1611         ASSERT(NOT_BLOCKED(request));
1612 
1613         /* IO_LOCK requests are only to check status */
1614 
1615         if (IS_IO_LOCK(request))
1616                 return (0);
1617 
1618         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1619 
1620         if (lock != NULL) {
1621                 /*
1622                  * There are some active locks so check for relations
1623                  */
1624                 do {
1625                         lock1 = lock->l_next;
1626                         if (SAME_OWNER(request, lock)) {
1627                                 done_searching = flk_relation(lock, request);
1628                         }
1629                         lock = lock1;
1630                 } while (lock->l_vnode == vp && !done_searching);
1631         }
1632 
1633         /*
1634          * insert in active queue
1635          */
1636 
1637         if (request->l_type != F_UNLCK)
1638                 flk_insert_active_lock(request);
1639 
1640         return (0);
1641 }
1642 
1643 /*
1644  * 'request' is blocked by some other lock, so we put it on the sleep queue.
1645  */
1646 static int
1647 flk_wait_execute_request(lock_descriptor_t *request)
1648 {
1649         graph_t *gp = request->l_graph;
1650         callb_cpr_t     *cprp;          /* CPR info from callback */
1651         struct flock_globals *fg;
1652         int index;
1653 
1654         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1655         ASSERT(IS_WILLING_TO_SLEEP(request));
1656 
1657         flk_insert_sleeping_lock(request);
1658 
1659         if (IS_LOCKMGR(request)) {
1660                 index = HASH_INDEX(request->l_vnode);
1661                 fg = flk_get_globals();
1662 
1663                 if (nlm_status_size == 0) {     /* not booted as a cluster */
1664                         if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1665                                 flk_cancel_sleeping_lock(request, 1);
1666                                 return (ENOLCK);
1667                         }
1668                 } else {                        /* booted as a cluster */
1669                         /*
1670                          * If the request is an NLM server lock request,
1671                          * and the NLM state of the lock request is not
1672                          * NLM_UP (because the NLM server is shutting
1673                          * down), then cancel the sleeping lock and
1674                          * return error ENOLCK that will encourage the
1675                          * client to retransmit.
1676                          */
1677                         if (!IS_NLM_UP(request)) {
1678                                 flk_cancel_sleeping_lock(request, 1);
1679                                 return (ENOLCK);
1680                         }
1681                 }
1682         }
1683 
1684         /* Clustering: For blocking PXFS locks, return */
1685         if (IS_PXFS(request)) {
1686                 /*
1687                  * PXFS locks sleep on the client side.
1688                  * The callback argument is used to wake up the sleeper
1689                  * when the lock is granted.
1690                  * We return PXFS_LOCK_BLOCKED (rather than an errno
1691                  * value) to indicate that the client side should sleep.
1692                  */
1693                 return (PXFS_LOCK_BLOCKED);
1694         }
1695 
1696         if (request->l_callbacks != NULL) {
1697                 /*
1698                  * To make sure the shutdown code works correctly, either
1699                  * the callback must happen after putting the lock on the
1700                  * sleep list, or we must check the shutdown status after
1701                  * returning from the callback (and before sleeping).  At
1702                  * least for now, we'll use the first option.  If a
1703                  * shutdown or signal or whatever happened while the graph
1704                  * mutex was dropped, that will be detected by
1705                  * wait_for_lock().
1706                  */
1707                 mutex_exit(&gp->gp_mutex);
1708 
1709                 cprp = flk_invoke_callbacks(request->l_callbacks,
1710                     FLK_BEFORE_SLEEP);
1711 
1712                 mutex_enter(&gp->gp_mutex);
1713 
1714                 if (cprp == NULL) {
1715                         wait_for_lock(request);
1716                 } else {
1717                         mutex_enter(cprp->cc_lockp);
1718                         CALLB_CPR_SAFE_BEGIN(cprp);
1719                         mutex_exit(cprp->cc_lockp);
1720                         wait_for_lock(request);
1721                         mutex_enter(cprp->cc_lockp);
1722                         CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1723                         mutex_exit(cprp->cc_lockp);
1724                 }
1725 
1726                 mutex_exit(&gp->gp_mutex);
1727                 (void) flk_invoke_callbacks(request->l_callbacks,
1728                     FLK_AFTER_SLEEP);
1729                 mutex_enter(&gp->gp_mutex);
1730         } else {
1731                 wait_for_lock(request);
1732         }
1733 
1734         if (IS_LOCKMGR(request)) {
1735                 /*
1736                  * If the lock manager is shutting down, return an
1737                  * error that will encourage the client to retransmit.
1738                  */
1739                 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1740                     !IS_GRANTED(request)) {
1741                         flk_cancel_sleeping_lock(request, 1);
1742                         return (ENOLCK);
1743                 }
1744         }
1745 
1746         if (IS_INTERRUPTED(request)) {
1747                 /* we got a signal, or act like we did */
1748                 flk_cancel_sleeping_lock(request, 1);
1749                 return (EINTR);
1750         }
1751 
1752         /* Cancelled if some other thread has closed the file */
1753 
1754         if (IS_CANCELLED(request)) {
1755                 flk_cancel_sleeping_lock(request, 1);
1756                 return (EBADF);
1757         }
1758 
1759         request->l_state &= ~GRANTED_LOCK;
1760         REMOVE_SLEEP_QUEUE(request);
1761         return (flk_execute_request(request));
1762 }
1763 
1764 /*
1765  * This routine adds an edge between 'from_lock' and 'to_lock' because
1766  * 'from_lock' depends on 'to_lock'. If asked to check for deadlock, it
1767  * checks whether any lock reachable from "from_lock" is owned by the
1768  * same process as "from_lock".
1769  * NOTE: It is the caller's responsibility to make sure that the color
1770  * of the graph is consistent between the calls to flk_add_edge, as done
1771  * in flk_process_request. This routine does not itself uncolor the
1772  * graph before checking for deadlock.
1773  */
1774 
1775 static int
1776 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1777     int check_cycle, int update_graph)
1778 {
1779         edge_t  *edge;
1780         edge_t  *ep;
1781         lock_descriptor_t       *vertex;
1782         lock_descriptor_t *vertex_stack;
1783 
1784         STACK_INIT(vertex_stack);
1785 
1786         /*
1787          * If the 'to' vertex is already colored, just return without
1788          * adding an edge: it is already reachable from the 'from'
1789          * vertex.
1790          */
1791 
1792         if (COLORED(to_lock))
1793                 return (0);
1794 
1795         edge = flk_get_edge();
1796 
1797         /*
1798          * set the from and to vertex
1799          */
1800 
1801         edge->from_vertex = from_lock;
1802         edge->to_vertex = to_lock;
1803 
1804         /*
1805          * put in adjacency list of from vertex
1806          */
1807 
1808         from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1809         edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1810         edge->edge_adj_prev = &from_lock->l_edge;
1811         from_lock->l_edge.edge_adj_next = edge;
1812 
1813         /*
1814          * put in list of to vertex
1815          */
1816 
1817         to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1818         edge->edge_in_next = to_lock->l_edge.edge_in_next;
1819         to_lock->l_edge.edge_in_next = edge;
1820         edge->edge_in_prev = &to_lock->l_edge;
1821 
1822 
1823         if (update_graph) {
1824                 flk_update_proc_graph(edge, 0);
1825                 return (0);
1826         }
1827         if (!check_cycle) {
1828                 return (0);
1829         }
1830 
1831         STACK_PUSH(vertex_stack, from_lock, l_stack);
1832 
1833         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1834 
1835                 STACK_POP(vertex_stack, l_stack);
1836 
1837                 for (ep = FIRST_ADJ(vertex);
1838                     ep != HEAD(vertex);
1839                     ep = NEXT_ADJ(ep)) {
1840                         if (COLORED(ep->to_vertex))
1841                                 continue;
1842                         COLOR(ep->to_vertex);
1843                         if (SAME_OWNER(ep->to_vertex, from_lock))
1844                                 goto dead_lock;
1845                         STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1846                 }
1847         }
1848         return (0);
1849 
1850 dead_lock:
1851 
1852         /*
1853          * remove all edges
1854          */
1855 
1856         ep = FIRST_ADJ(from_lock);
1857 
1858         while (ep != HEAD(from_lock)) {
1859                 IN_LIST_REMOVE(ep);
1860                 from_lock->l_sedge = NEXT_ADJ(ep);
1861                 ADJ_LIST_REMOVE(ep);
1862                 flk_free_edge(ep);
1863                 ep = from_lock->l_sedge;
1864         }
1865         return (EDEADLK);
1866 }
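
/*
 * An illustrative, non-compiled sketch of the coloring DFS performed by
 * flk_add_edge() when check_cycle is set: after adding the edge, walk the
 * adjacency ("blocked by") edges outward and report a potential deadlock as
 * soon as a reachable vertex has the same owner as the requester.  The
 * demo_* types, the fixed bounds and the integer owner are simplifications,
 * not the kernel's representation.
 */
#if 0
#define	DEMO_MAXV	64

struct demo_vertex {
	int			dv_owner;	/* stands in for (pid, sysid) */
	int			dv_color;	/* 0 == uncolored */
	int			dv_nadj;	/* number of outgoing edges */
	struct demo_vertex	*dv_adj[DEMO_MAXV];
};

/*
 * Return 1 if some vertex reachable from 'to' (the lock the requester
 * would now wait on) is owned by the owner of 'from' -- i.e. the new
 * edge would close a cycle of waiters.
 */
static int
demo_would_deadlock(struct demo_vertex *from, struct demo_vertex *to)
{
	struct demo_vertex *stack[DEMO_MAXV];
	int depth = 0;
	int i;

	stack[depth++] = to;
	while (depth > 0) {
		struct demo_vertex *v = stack[--depth];

		if (v->dv_color != 0)
			continue;		/* already visited */
		v->dv_color = 1;
		if (v->dv_owner == from->dv_owner)
			return (1);		/* cycle back to the owner */
		for (i = 0; i < v->dv_nadj && depth < DEMO_MAXV; i++)
			stack[depth++] = v->dv_adj[i];
	}
	return (0);
}
#endif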
1867 
1868 /*
1869  * Get an edge structure for representing the dependency between two locks.
1870  */
1871 
1872 static edge_t *
1873 flk_get_edge()
1874 {
1875         edge_t  *ep;
1876 
1877         ASSERT(flk_edge_cache != NULL);
1878 
1879         ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1880         edge_allocs++;
1881         return (ep);
1882 }
1883 
1884 /*
1885  * Free the edge structure.
1886  */
1887 
1888 static void
1889 flk_free_edge(edge_t *ep)
1890 {
1891         edge_frees++;
1892         kmem_cache_free(flk_edge_cache, (void *)ep);
1893 }
1894 
1895 /*
1896  * Check the relationship of 'request' with 'lock', perform the
1897  * recomputation of dependencies, break 'lock' if required, and return
1898  * 1 if 'request' cannot have any further relationship with the
1899  * remaining active locks.
1900  *
1901  * 'lock' and 'request' are compared, and if they overlap we delete
1902  * 'lock' and form new locks to represent the non-overlapping portions
1903  * of the original 'lock'. This function has side effects: 'lock' will
1904  * be freed, and new locks will be added to the active list.
1905  */
1906 
1907 static int
1908 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1909 {
1910         int lock_effect;
1911         lock_descriptor_t *topology[3];
1912         int nvertex = 0;
1913         int i;
1914         edge_t  *ep;
1915         graph_t *gp = lock->l_graph;
1916         boolean_t mergeable;
1917 
1918         ASSERT(request->l_blocker == 0);
1919 
1920         CHECK_SLEEPING_LOCKS(gp);
1921         CHECK_ACTIVE_LOCKS(gp);
1922 
1923         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1924 
1925         topology[0] = topology[1] = topology[2] = NULL;
1926 
1927         if (request->l_type == F_UNLCK)
1928                 lock_effect = FLK_UNLOCK;
1929         else if (request->l_type == F_RDLCK &&
1930             lock->l_type == F_WRLCK)
1931                 lock_effect = FLK_DOWNGRADE;
1932         else if (request->l_type == F_WRLCK &&
1933             lock->l_type == F_RDLCK)
1934                 lock_effect = FLK_UPGRADE;
1935         else
1936                 lock_effect = FLK_STAY_SAME;
1937 
1938         /*
1939          * 'lock' and 'request' are merged only when the effect of both
1940          * locks is the same (FLK_STAY_SAME) and their blocker status
1941          * (l_blocker) is the same as well.  We do not merge 'lock' and
1942          * 'request' with different l_blocker values because such a merge
1943          * might affect stale lock detection, causing either false
1944          * positives or missed stale locks.
1945          */
1946         mergeable = lock_effect == FLK_STAY_SAME &&
1947             lock->l_blocker == request->l_blocker;
1948 
1949         if (lock->l_end < request->l_start) {
1950                 /* If the 'lock' is just next to 'request', try to merge them */
1951                 if (lock->l_end == request->l_start - 1 && mergeable) {
1952                         request->l_start = lock->l_start;
1953                         goto recompute;
1954                 }
1955 
1956                 /* Otherwise, they do not overlap, so return immediately */
1957                 return (0);
1958         }
1959 
1960         if (request->l_end < lock->l_start) {
1961                 /* If the 'request' is just next to 'lock', try to merge them */
1962                 if (request->l_end == lock->l_start - 1 && mergeable) {
1963                         request->l_end = lock->l_end;
1964                         goto recompute;
1965                 }
1966 
1967                 /* Otherwise, they do not overlap, so return immediately */
1968                 return (1);
1969         }
1970 
1971         /*
1972          * Here we are sure the 'lock' and 'request' overlaps, so the 'request'
1973          * will replace the 'lock' (either fully, or at least partially).
1974          */
1975 
1976         /*
1977          * If the 'request' does not fully cover the 'lock' at the start,
1978          * either move the start of the 'request' to cover the 'lock', or split
1979          * the 'lock'.
1980          */
1981         if (lock->l_start < request->l_start) {
1982                 if (mergeable) {
1983                         request->l_start = lock->l_start;
1984                 } else {
1985                         lock_descriptor_t *new_lock = flk_get_lock();
1986 
1987                         COPY(new_lock, lock);
1988                         new_lock->l_end = request->l_start - 1;
1989 
1990                         topology[nvertex++] = new_lock;
1991                 }
1992         }
1993 
1994         /*
1995          * If the 'request' does not fully cover the 'lock' at the end, either
1996          * move the end of the 'request' to cover the 'lock', or split the
1997          * 'lock'.
1998          */
1999         if (request->l_end < lock->l_end) {
2000                 if (mergeable) {
2001                         request->l_end = lock->l_end;
2002                 } else {
2003                         lock_descriptor_t *new_lock = flk_get_lock();
2004 
2005                         COPY(new_lock, lock);
2006                         new_lock->l_start = request->l_end + 1;
2007 
2008                         topology[nvertex++] = new_lock;
2009                 }
2010         }
2011 
2012         /*
2013          * Log the blocker change
2014          */
2015         if (nvertex > 0 && lock->l_blocker < 0) {
2016                 if (nvertex == 1)
2017                         flk_stale_lock_shrink(lock, topology[0]);
2018                 if (nvertex == 2)
2019                         flk_stale_lock_split(lock, topology[0], topology[1]);
2020 
2021                 lock->l_blocker = 0;
2022         }
2023 
2024 recompute:
2025         /*
2026          * For an unlock we don't include 'request' in the recomputation
2027          * of dependencies because no lock will add an edge to it.
2028          */
2029         if (lock_effect != FLK_UNLOCK)
2030                 topology[nvertex++] = request;
2031 
2032         for (i = 0; i < nvertex; i++) {
2033                 topology[i]->l_state |= RECOMPUTE_LOCK;
2034                 topology[i]->l_color = NO_COLOR;
2035         }
2036 
2037         ASSERT(FIRST_ADJ(lock) == HEAD(lock));
2038 
2039         /*
2040          * Remove each incoming edge of this vertex 'lock' from the
2041          * adjacency list of its from vertex.
2042          */
2043         ep = FIRST_IN(lock);
2044         while (ep != HEAD(lock)) {
2045                 ADJ_LIST_REMOVE(ep);
2046                 ep = NEXT_IN(ep);
2047         }
2048 
2049         flk_delete_active_lock(lock, 0);
2050 
2051         /* We are ready for recomputing the dependencies now */
2052         flk_recompute_dependencies(lock, topology, nvertex, 1);
2053 
2054         for (i = 0; i < nvertex; i++) {
2055                 topology[i]->l_state &= ~RECOMPUTE_LOCK;
2056                 topology[i]->l_color = NO_COLOR;
2057         }
2058 
2059         if (lock_effect == FLK_UNLOCK) {
2060                 nvertex++;
2061         }
2062         for (i = 0; i < nvertex - 1; i++) {
2063                 flk_insert_active_lock(topology[i]);
2064         }
2065 
2066         if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
2067                 flk_wakeup(lock, 0);
2068         } else {
2069                 ep = FIRST_IN(lock);
2070                 while (ep != HEAD(lock)) {
2071                         lock->l_sedge = NEXT_IN(ep);
2072                         IN_LIST_REMOVE(ep);
2073                         flk_update_proc_graph(ep, 1);
2074                         flk_free_edge(ep);
2075                         ep = lock->l_sedge;
2076                 }
2077         }
2078         flk_free_lock(lock);
2079 
2080         CHECK_SLEEPING_LOCKS(gp);
2081         CHECK_ACTIVE_LOCKS(gp);
2082         return (0);
2083 }
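
/*
 * An illustrative, non-compiled sketch of just the range arithmetic used by
 * flk_relation() above: when an overlapping request from the same owner
 * replaces an existing lock, any uncovered leading and/or trailing portion
 * of the old range survives as up to two new locks (the "topology" pieces).
 * The demo_range type is invented for this sketch; offsets are inclusive,
 * as with l_start/l_end.
 */
#if 0
typedef unsigned long long demo_off_t;

struct demo_range {
	demo_off_t	dr_start;
	demo_off_t	dr_end;		/* inclusive */
};

/*
 * Store the portions of 'old' not covered by 'req' into pieces[] and
 * return how many there are (0, 1 or 2).
 * Example: old = [0, 99], req = [40, 59] leaves [0, 39] and [60, 99].
 */
static int
demo_split(const struct demo_range *old, const struct demo_range *req,
    struct demo_range pieces[2])
{
	int n = 0;

	if (old->dr_start < req->dr_start) {
		pieces[n].dr_start = old->dr_start;
		pieces[n].dr_end = req->dr_start - 1;
		n++;
	}
	if (req->dr_end < old->dr_end) {
		pieces[n].dr_start = req->dr_end + 1;
		pieces[n].dr_end = old->dr_end;
		n++;
	}
	return (n);
}
#endif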
2084 
2085 /*
2086  * Insert a lock into the active queue.
2087  */
2088 
2089 static void
2090 flk_insert_active_lock(lock_descriptor_t *new_lock)
2091 {
2092         graph_t *gp = new_lock->l_graph;
2093         vnode_t *vp = new_lock->l_vnode;
2094         lock_descriptor_t *first_lock, *lock;
2095 
2096         ASSERT(MUTEX_HELD(&gp->gp_mutex));
2097 
2098         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2099         first_lock = lock;
2100 
2101         if (first_lock != NULL) {
2102                 for (; (lock->l_vnode == vp &&
2103                     lock->l_start < new_lock->l_start); lock = lock->l_next)
2104                         ;
2105         } else {
2106                 lock = ACTIVE_HEAD(gp);
2107         }
2108 
2109         lock->l_prev->l_next = new_lock;
2110         new_lock->l_next = lock;
2111         new_lock->l_prev = lock->l_prev;
2112         lock->l_prev = new_lock;
2113 
2114         if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
2115                 vp->v_filocks = (struct filock *)new_lock;
2116         }
2117         flk_set_state(new_lock, FLK_ACTIVE_STATE);
2118         new_lock->l_state |= ACTIVE_LOCK;
2119 
2120         CHECK_ACTIVE_LOCKS(gp);
2121         CHECK_SLEEPING_LOCKS(gp);
2122 }
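
/*
 * An illustrative, non-compiled sketch of the list splice performed by
 * flk_insert_active_lock() above, reduced to a single sorted circular list
 * with a sentinel head (the kernel additionally confines the search to
 * locks of the same vnode and maintains v_filocks).  demo_node and its
 * fields are invented for the sketch.
 */
#if 0
struct demo_node {
	struct demo_node	*dn_next;
	struct demo_node	*dn_prev;
	unsigned long long	dn_start;
};

/*
 * Insert 'new' before the first node whose start offset is not less than
 * its own, keeping the circular list sorted by start offset.
 */
static void
demo_sorted_insert(struct demo_node *head, struct demo_node *new)
{
	struct demo_node *n;

	for (n = head->dn_next; n != head && n->dn_start < new->dn_start;
	    n = n->dn_next)
		;

	n->dn_prev->dn_next = new;
	new->dn_next = n;
	new->dn_prev = n->dn_prev;
	n->dn_prev = new;
}
#endif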
2123 
2124 /*
2125  * Delete the active lock: depending on the value of the second
2126  * parameter, either just remove the lock from the active list, or
2127  * both remove it and free it.
2128  */
2129 
2130 static void
2131 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
2132 {
2133         vnode_t *vp = lock->l_vnode;
2134         graph_t *gp = lock->l_graph;
2135 
2136         ASSERT(MUTEX_HELD(&gp->gp_mutex));
2137         if (free_lock)
2138                 ASSERT(NO_DEPENDENTS(lock));
2139         ASSERT(NOT_BLOCKED(lock));
2140         ASSERT(IS_ACTIVE(lock));
2141 
2142         ASSERT((vp->v_filocks != NULL));
2143 
2144         if (lock->l_blocker < 0) {
2145                 /* Log the blocker release */
2146                 flk_stale_lock_release(lock);
2147                 lock->l_blocker = 0;
2148         }
2149 
2150         if (vp->v_filocks == (struct filock *)lock) {
2151                 vp->v_filocks = (struct filock *)
2152                     ((lock->l_next->l_vnode == vp) ? lock->l_next :
2153                     NULL);
2154         }
2155         lock->l_next->l_prev = lock->l_prev;
2156         lock->l_prev->l_next = lock->l_next;
2157         lock->l_next = lock->l_prev = NULL;
2158         flk_set_state(lock, FLK_DEAD_STATE);
2159         lock->l_state &= ~ACTIVE_LOCK;
2160 
2161         if (free_lock)
2162                 flk_free_lock(lock);
2163         CHECK_ACTIVE_LOCKS(gp);
2164         CHECK_SLEEPING_LOCKS(gp);
2165 }
2166 
2167 /*
2168  * Insert into the sleep queue.
2169  */
2170 
2171 static void
2172 flk_insert_sleeping_lock(lock_descriptor_t *request)
2173 {
2174         graph_t *gp = request->l_graph;
2175         vnode_t *vp = request->l_vnode;
2176         lock_descriptor_t       *lock;
2177 
2178         ASSERT(MUTEX_HELD(&gp->gp_mutex));
2179         ASSERT(IS_INITIAL(request));
2180 
2181         for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
2182             lock->l_vnode < vp); lock = lock->l_next)
2183                 ;
2184 
2185         lock->l_prev->l_next = request;
2186         request->l_prev = lock->l_prev;
2187         lock->l_prev = request;
2188         request->l_next = lock;
2189         flk_set_state(request, FLK_SLEEPING_STATE);
2190         request->l_state |= SLEEPING_LOCK;
2191 }
2192 
2193 /*
2194  * Cancelling a sleeping lock implies removing a vertex from the
2195  * dependency graph and therefore we should recompute the dependencies
2196  * of all vertices that have a path to this vertex, w.r.t. all
2197  * vertices reachable from this vertex.
2198  */
2199 
2200 void
2201 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
2202 {
2203         graph_t *gp = request->l_graph;
2204         vnode_t *vp = request->l_vnode;
2205         lock_descriptor_t **topology = NULL;
2206         edge_t  *ep;
2207         lock_descriptor_t *vertex, *lock;
2208         int nvertex = 0;
2209         int i;
2210         lock_descriptor_t *vertex_stack;
2211 
2212         STACK_INIT(vertex_stack);
2213 
2214         ASSERT(MUTEX_HELD(&gp->gp_mutex));
2215         /*
2216          * Count the number of vertex pointers that have to be allocated:
2217          * all vertices that are reachable from 'request'.
2218          */
2219 
2220         STACK_PUSH(vertex_stack, request, l_stack);
2221 
2222         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2223                 STACK_POP(vertex_stack, l_stack);
2224                 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
2225                     ep = NEXT_ADJ(ep)) {
2226                         if (IS_RECOMPUTE(ep->to_vertex))
2227                                 continue;
2228                         ep->to_vertex->l_state |= RECOMPUTE_LOCK;
2229                         STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
2230                         nvertex++;
2231                 }
2232         }
2233 
2234         /*
2235          * allocate memory for holding the vertex pointers
2236          */
2237 
2238         if (nvertex) {
2239                 topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
2240                     KM_SLEEP);
2241         }
2242 
2243         /*
2244          * One more pass to actually store the vertices in the
2245          * allocated array.
2246          * We check sleeping locks first and then active locks
2247          * so that the topology array will be in topological
2248          * order.
2249          */
2250 
2251         nvertex = 0;
2252         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2253 
2254         if (lock) {
2255                 do {
2256                         if (IS_RECOMPUTE(lock)) {
2257                                 lock->l_index = nvertex;
2258                                 topology[nvertex++] = lock;
2259                         }
2260                         lock->l_color = NO_COLOR;
2261                         lock = lock->l_next;
2262                 } while (lock->l_vnode == vp);
2263         }
2264 
2265         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2266 
2267         if (lock) {
2268                 do {
2269                         if (IS_RECOMPUTE(lock)) {
2270                                 lock->l_index = nvertex;
2271                                 topology[nvertex++] = lock;
2272                         }
2273                         lock->l_color = NO_COLOR;
2274                         lock = lock->l_next;
2275                 } while (lock->l_vnode == vp);
2276         }
2277 
2278         /*
2279          * Remove the in and out edges of 'request'.
2280          * They are freed after updating proc_graph below.
2281          */
2282 
2283         for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
2284                 ADJ_LIST_REMOVE(ep);
2285         }
2286 
2287 
2288         if (remove_from_queue)
2289                 REMOVE_SLEEP_QUEUE(request);
2290 
2291         /* we are ready to recompute */
2292 
2293         flk_recompute_dependencies(request, topology, nvertex, 1);
2294 
2295         ep = FIRST_ADJ(request);
2296         while (ep != HEAD(request)) {
2297                 IN_LIST_REMOVE(ep);
2298                 request->l_sedge = NEXT_ADJ(ep);
2299                 ADJ_LIST_REMOVE(ep);
2300                 flk_update_proc_graph(ep, 1);
2301                 flk_free_edge(ep);
2302                 ep = request->l_sedge;
2303         }
2304 
2305 
2306         /*
2307          * unset the RECOMPUTE flag in those vertices
2308          */
2309 
2310         for (i = 0; i < nvertex; i++) {
2311                 topology[i]->l_state &= ~RECOMPUTE_LOCK;
2312         }
2313 
2314         /*
2315          * free the topology
2316          */
2317         if (nvertex)
2318                 kmem_free(topology,
2319                     (nvertex * sizeof (lock_descriptor_t *)));
2320         /*
2321          * Some locks may be unblocked now.
2322          */
2323 
2324         flk_wakeup(request, 0);
2325 
2326         /*
2327          * We expect to have a correctly recomputed graph now.
2328          */
2329         flk_set_state(request, FLK_DEAD_STATE);
2330         flk_free_lock(request);
2331         CHECK_SLEEPING_LOCKS(gp);
2332         CHECK_ACTIVE_LOCKS(gp);
2333 
2334 }
2335 
2336 /*
2337  * Uncoloring the graph simply increments the mark value of the graph.
2338  * Only when the mark wraps around do we explicitly reset the color of
2339  * all vertices in the graph.
2340  */
2341 
2342 static void
2343 flk_graph_uncolor(graph_t *gp)
2344 {
2345         lock_descriptor_t *lock;
2346 
2347         if (gp->mark == UINT_MAX) {
2348                 gp->mark = 1;
2349                 for (lock = ACTIVE_HEAD(gp)->l_next;
2350                     lock != ACTIVE_HEAD(gp); lock = lock->l_next)
2351                         lock->l_color = 0;
2352 
2353                 for (lock = SLEEPING_HEAD(gp)->l_next;
2354                     lock != SLEEPING_HEAD(gp); lock = lock->l_next)
2355                         lock->l_color = 0;
2356         } else {
2357                 gp->mark++;
2358         }
2359 }
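
/*
 * An illustrative, non-compiled sketch of the generation ("mark") trick
 * behind flk_graph_uncolor(): a vertex counts as colored only while its
 * color equals the graph's current mark, so uncoloring the whole graph is
 * normally a single increment.  The DEMO_* macros approximate what the
 * COLOR()/COLORED() macros in flock_impl.h are assumed to do; they are
 * not the kernel's definitions.
 */
#if 0
#include <limits.h>

struct demo_graph {
	unsigned int	dg_mark;	/* current generation */
};

struct demo_vertex {
	unsigned int	dv_color;	/* generation at which last colored */
};

#define	DEMO_COLORED(gp, v)	((v)->dv_color == (gp)->dg_mark)
#define	DEMO_COLOR(gp, v)	((v)->dv_color = (gp)->dg_mark)

static void
demo_uncolor(struct demo_graph *gp)
{
	if (gp->dg_mark == UINT_MAX) {
		/* Wraparound: reset every vertex's dv_color to 0 (omitted). */
		gp->dg_mark = 1;
	} else {
		gp->dg_mark++;
	}
}
#endif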
2360 
2361 /*
2362  * Wake up locks that are blocked on the given lock.
2363  */
2364 
2365 static void
2366 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
2367 {
2368         edge_t  *ep;
2369         graph_t *gp = lock->l_graph;
2370         lock_descriptor_t       *lck;
2371 
2372         ASSERT(MUTEX_HELD(&gp->gp_mutex));
2373         if (NO_DEPENDENTS(lock))
2374                 return;
2375         ep = FIRST_IN(lock);
2376         do {
2377                 /*
2378                  * Delete the edge from the adjacency list of the from
2379                  * vertex. If that vertex has no more adjacent edges,
2380                  * wake up its process.
2381                  */
2382                 lck = ep->from_vertex;
2383                 if (adj_list_remove)
2384                         ADJ_LIST_REMOVE(ep);
2385                 flk_update_proc_graph(ep, 1);
2386                 if (NOT_BLOCKED(lck)) {
2387                         GRANT_WAKEUP(lck);
2388                 }
2389                 lock->l_sedge = NEXT_IN(ep);
2390                 IN_LIST_REMOVE(ep);
2391                 flk_free_edge(ep);
2392                 ep = lock->l_sedge;
2393         } while (ep != HEAD(lock));
2394         ASSERT(NO_DEPENDENTS(lock));
2395 }
2396 
2397 /*
2398  * The dependents of 'request' are checked for dependencies against the
2399  * locks in topology (called topology because the array is, and should be,
2400  * in topological order for this algorithm; if it is not, the inner loop
2401  * below might add more edges than necessary. Topological ordering of the
2402  * vertices satisfies the property that all edges go from left to right,
2403  * i.e., topology[i] can have an edge to topology[j] only if i < j).
2404  * If lock l1 in the dependent set of 'request' is dependent on (blocked by)
2405  * lock l2 in topology but does not have a path to it, we add an edge
2406  * in the inner loop below.
2407  *
2408  * We don't want to add an edge between l1 and l2 if there already exists
2409  * a path from l1 to l2, so care has to be taken for those vertices
2410  * that have two paths to 'request'. These vertices are referred to here
2411  * as barrier locks.
2412  *
2413  * The barriers have to be found (those vertices that originally had two
2414  * paths to 'request') because otherwise we may end up adding edges to
2415  * vertices in topology unnecessarily: a barrier vertex could end up with
2416  * both an edge to a vertex in topology and a path to it.
2417  */
2418 
2419 static void
2420 flk_recompute_dependencies(lock_descriptor_t *request,
2421     lock_descriptor_t **topology, int nvertex, int update_graph)
2422 {
2423         lock_descriptor_t *vertex, *lock;
2424         graph_t *gp = request->l_graph;
2425         int i, count;
2426         int barrier_found = 0;
2427         edge_t  *ep;
2428         lock_descriptor_t *vertex_stack;
2429 
2430         STACK_INIT(vertex_stack);
2431 
2432         ASSERT(MUTEX_HELD(&gp->gp_mutex));
2433         if (nvertex == 0)
2434                 return;
2435         flk_graph_uncolor(request->l_graph);
2436         barrier_found = flk_find_barriers(request);
2437         request->l_state |= RECOMPUTE_DONE;
2438 
2439         STACK_PUSH(vertex_stack, request, l_stack);
2440         request->l_sedge = FIRST_IN(request);
2441 
2442 
2443         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2444                 if (vertex->l_state & RECOMPUTE_DONE) {
2445                         count = 0;
2446                         goto next_in_edge;
2447                 }
2448                 if (IS_BARRIER(vertex)) {
2449                         /* decrement the barrier count */
2450                         if (vertex->l_index) {
2451                                 vertex->l_index--;
2452                                 /* this guy will be pushed again anyway ? */
2453                                 STACK_POP(vertex_stack, l_stack);
2454                                 if (vertex->l_index == 0) {
2455                                         /*
2456                                          * barrier is over; we can
2457                                          * recompute dependencies for
2458                                          * this lock in the next stack pop
2459                                          */
2460                                         vertex->l_state &= ~BARRIER_LOCK;
2461                                 }
2462                                 continue;
2463                         }
2464                 }
2465                 vertex->l_state |= RECOMPUTE_DONE;
2466                 flk_graph_uncolor(gp);
2467                 count = flk_color_reachables(vertex);
2468                 for (i = 0; i < nvertex; i++) {
2469                         lock = topology[i];
2470                         if (COLORED(lock))
2471                                 continue;
2472                         if (BLOCKS(lock, vertex)) {
2473                                 (void) flk_add_edge(vertex, lock,
2474                                     NO_CHECK_CYCLE, update_graph);
2475                                 COLOR(lock);
2476                                 count++;
2477                                 count += flk_color_reachables(lock);
2478                         }
2479 
2480                 }
2481 
2482 next_in_edge:
2483                 if (count == nvertex ||
2484                     vertex->l_sedge == HEAD(vertex)) {
2485                         /* prune the tree below this */
2486                         STACK_POP(vertex_stack, l_stack);
2487                         vertex->l_state &= ~RECOMPUTE_DONE;
2488                         /* update the barrier locks below this! */
2489                         if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
2490                                 flk_graph_uncolor(gp);
2491                                 flk_update_barriers(vertex);
2492                         }
2493                         continue;
2494                 }
2495 
2496                 ep = vertex->l_sedge;
2497                 lock = ep->from_vertex;
2498                 STACK_PUSH(vertex_stack, lock, l_stack);
2499                 lock->l_sedge = FIRST_IN(lock);
2500                 vertex->l_sedge = NEXT_IN(ep);
2501         }
2502 
2503 }
2504 
2505 /*
2506  * Color all vertices reachable from 'vertex', counting those that belong to
2507  * the topology (those that have RECOMPUTE_LOCK set) and were not yet colored.
2508  *
2509  * Note: we need to use a different stack_link, l_stack1, because this is
2510  * called from flk_recompute_dependencies(), which already uses a stack with
2511  * l_stack as its stack_link.
2512  */
2513 
2514 static int
2515 flk_color_reachables(lock_descriptor_t *vertex)
2516 {
2517         lock_descriptor_t *ver, *lock;
2518         int count;
2519         edge_t  *ep;
2520         lock_descriptor_t *vertex_stack;
2521 
2522         STACK_INIT(vertex_stack);
2523 
2524         STACK_PUSH(vertex_stack, vertex, l_stack1);
2525         count = 0;
2526         while ((ver = STACK_TOP(vertex_stack)) != NULL) {
2527 
2528                 STACK_POP(vertex_stack, l_stack1);
2529                 for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
2530                     ep = NEXT_ADJ(ep)) {
2531                         lock = ep->to_vertex;
2532                         if (COLORED(lock))
2533                                 continue;
2534                         COLOR(lock);
2535                         if (IS_RECOMPUTE(lock))
2536                                 count++;
2537                         STACK_PUSH(vertex_stack, lock, l_stack1);
2538                 }
2539 
2540         }
2541         return (count);
2542 }
2543 
2544 /*
2545  * Called from flk_recompute_dependencies(), this routine decrements
2546  * the barrier count of barrier vertices that are reachable from 'lock'.
2547  */
2548 
2549 static void
2550 flk_update_barriers(lock_descriptor_t *lock)
2551 {
2552         lock_descriptor_t *vertex, *lck;
2553         edge_t  *ep;
2554         lock_descriptor_t *vertex_stack;
2555 
2556         STACK_INIT(vertex_stack);
2557 
2558         STACK_PUSH(vertex_stack, lock, l_stack1);
2559 
2560         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2561                 STACK_POP(vertex_stack, l_stack1);
2562                 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2563                     ep = NEXT_IN(ep)) {
2564                         lck = ep->from_vertex;
2565                         if (COLORED(lck)) {
2566                                 if (IS_BARRIER(lck)) {
2567                                         ASSERT(lck->l_index > 0);
2568                                         lck->l_index--;
2569                                         if (lck->l_index == 0)
2570                                                 lck->l_state &= ~BARRIER_LOCK;
2571                                 }
2572                                 continue;
2573                         }
2574                         COLOR(lck);
2575                         if (IS_BARRIER(lck)) {
2576                                 ASSERT(lck->l_index > 0);
2577                                 lck->l_index--;
2578                                 if (lck->l_index == 0)
2579                                         lck->l_state &= ~BARRIER_LOCK;
2580                         }
2581                         STACK_PUSH(vertex_stack, lck, l_stack1);
2582                 }
2583         }
2584 }
2585 
2586 /*
2587  * Find all vertices that are reachable from 'lock' more than once, mark
2588  * them as barrier vertices, and increment their barrier count.
2589  * The barrier count is one less than the total number of paths from
2590  * 'lock' to that vertex.
2591  */
2592 
2593 static int
2594 flk_find_barriers(lock_descriptor_t *lock)
2595 {
2596         lock_descriptor_t *vertex, *lck;
2597         int found = 0;
2598         edge_t  *ep;
2599         lock_descriptor_t *vertex_stack;
2600 
2601         STACK_INIT(vertex_stack);
2602 
2603         STACK_PUSH(vertex_stack, lock, l_stack1);
2604 
2605         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2606                 STACK_POP(vertex_stack, l_stack1);
2607                 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2608                     ep = NEXT_IN(ep)) {
2609                         lck = ep->from_vertex;
2610                         if (COLORED(lck)) {
2611                                 /* this is a barrier */
2612                                 lck->l_state |= BARRIER_LOCK;
2613                                 /* index will have barrier count */
2614                                 lck->l_index++;
2615                                 if (!found)
2616                                         found = 1;
2617                                 continue;
2618                         }
2619                         COLOR(lck);
2620                         lck->l_index = 0;
2621                         STACK_PUSH(vertex_stack, lck, l_stack1);
2622                 }
2623         }
2624         return (found);
2625 }
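
/*
 * A small worked example of the barrier count computed above.  Edges below
 * point in the direction walked by the in-edge traversal, i.e. from a lock
 * toward the locks that depend on it:
 *
 *	lock --> A --> B
 *	lock --> C --> B
 *
 * The DFS from 'lock' reaches B first through A and colors it; when B is
 * reached again through C it is already colored, so B is marked as a
 * barrier and its l_index becomes 1, i.e. (number of paths) - 1.
 * flk_recompute_dependencies() then keeps deferring B until its barrier
 * count drains back to zero, so B's dependencies are recomputed only after
 * all paths to it have been seen.
 */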
2626 
2627 /*
2628  * Finds the first lock that is mainly responsible for blocking this
2629  * request.  If there is no such lock, request->l_flock.l_type is set to
2630  * F_UNLCK.  Otherwise, request->l_flock is filled in with the particulars
2631  * of the blocking lock.
2632  *
2633  * Note: It is possible a request is blocked by a sleeping lock because
2634  * of the fairness policy used in flk_process_request() to construct the
2635  * dependencies. (see comments before flk_process_request()).
2636  */
2637 
2638 static void
2639 flk_get_first_blocking_lock(lock_descriptor_t *request)
2640 {
2641         graph_t *gp = request->l_graph;
2642         vnode_t *vp = request->l_vnode;
2643         lock_descriptor_t *lock, *blocker;
2644 
2645         ASSERT(MUTEX_HELD(&gp->gp_mutex));
2646         blocker = NULL;
2647         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2648 
2649         if (lock) {
2650                 do {
2651                         if (BLOCKS(lock, request)) {
2652                                 blocker = lock;
2653                                 break;
2654                         }
2655                         lock = lock->l_next;
2656                 } while (lock->l_vnode == vp);
2657         }
2658 
2659         if (blocker == NULL && request->l_flock.l_type == F_RDLCK) {
2660                 /*
2661                  * No active lock is blocking this request, but if a read
2662                  * lock is requested, it may also get blocked by a waiting
2663                  * writer. So search all sleeping locks and see if there is
2664                  * a writer waiting.
2665                  */
2666                 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2667                 if (lock) {
2668                         do {
2669                                 if (BLOCKS(lock, request)) {
2670                                         blocker = lock;
2671                                         break;
2672                                 }
2673                                 lock = lock->l_next;
2674                         } while (lock->l_vnode == vp);
2675                 }
2676         }
2677 
2678         if (blocker) {
2679                 report_blocker(blocker, request);
2680         } else
2681                 request->l_flock.l_type = F_UNLCK;
2682 }
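
/*
 * An illustrative, non-compiled userland view of this routine's consumer:
 * F_GETLK either rewrites l_type to F_UNLCK (nothing would block the
 * request) or fills in the particulars of one blocking lock.  The region
 * and output format below are arbitrary.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static void
demo_getlk(int fd)
{
	struct flock fl;

	fl.l_type = F_WRLCK;		/* would a write lock be blocked? */
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 100;

	if (fcntl(fd, F_GETLK, &fl) == 0) {
		if (fl.l_type == F_UNLCK)
			(void) printf("not blocked\n");
		else
			(void) printf("blocked by pid %ld\n", (long)fl.l_pid);
	}
}
#endif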
2683 
2684 /*
2685  * Get the graph_t structure associated with a vnode.
2686  * If 'initialize' is non-zero, and the graph_t structure for this vnode has
2687  * not yet been initialized, then a new element is allocated and returned.
2688  */
2689 graph_t *
2690 flk_get_lock_graph(vnode_t *vp, int initialize)
2691 {
2692         graph_t *gp;
2693         graph_t *gp_alloc = NULL;
2694         int index = HASH_INDEX(vp);
2695 
2696         if (initialize == FLK_USE_GRAPH) {
2697                 mutex_enter(&flock_lock);
2698                 gp = lock_graph[index];
2699                 mutex_exit(&flock_lock);
2700                 return (gp);
2701         }
2702 
2703         ASSERT(initialize == FLK_INIT_GRAPH);
2704 
2705         if (lock_graph[index] == NULL) {
2706 
2707                 gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2708 
2709                 /* Initialize the graph */
2710 
2711                 gp_alloc->active_locks.l_next =
2712                     gp_alloc->active_locks.l_prev =
2713                     (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2714                 gp_alloc->sleeping_locks.l_next =
2715                     gp_alloc->sleeping_locks.l_prev =
2716                     (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2717                 gp_alloc->index = index;
2718                 mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2719         }
2720 
2721         mutex_enter(&flock_lock);
2722 
2723         gp = lock_graph[index];
2724 
2725         /* Recheck the value within flock_lock */
2726         if (gp == NULL) {
2727                 struct flock_globals *fg;
2728 
2729                 /* We must have previously allocated the graph_t structure */
2730                 ASSERT(gp_alloc != NULL);
2731                 lock_graph[index] = gp = gp_alloc;
2732                 /*
2733                  * The lockmgr status is only needed if KLM is loaded.
2734                  */
2735                 if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2736                         fg = flk_get_globals();
2737                         fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2738                 }
2739         }
2740 
2741         mutex_exit(&flock_lock);
2742 
2743         if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2744                 /* There was a race to allocate the graph_t and we lost */
2745                 mutex_destroy(&gp_alloc->gp_mutex);
2746                 kmem_free(gp_alloc, sizeof (graph_t));
2747         }
2748 
2749         return (gp);
2750 }
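
/*
 * An illustrative, non-compiled userland sketch of the allocate-then-recheck
 * pattern used by flk_get_lock_graph() above: allocate optimistically
 * without holding the table lock (the kernel allocation may sleep), install
 * the new element only if the slot is still empty, and discard it if another
 * thread won the race.  The pthread mutex and the 64-byte allocation are
 * stand-ins, not the kernel's primitives.
 */
#if 0
#include <pthread.h>
#include <stdlib.h>

#define	DEMO_HASH_SIZE	32

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
static void *demo_table[DEMO_HASH_SIZE];

static void *
demo_get_slot(int index)
{
	void *new = NULL;
	void *cur;

	if (demo_table[index] == NULL)
		new = malloc(64);	/* optimistic, lock not held */

	(void) pthread_mutex_lock(&demo_lock);
	if (demo_table[index] == NULL && new != NULL)
		demo_table[index] = new;
	cur = demo_table[index];
	(void) pthread_mutex_unlock(&demo_lock);

	if (new != NULL && cur != new)
		free(new);		/* we lost the race */
	return (cur);
}
#endif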
2751 
2752 /*
2753  * PSARC case 1997/292
2754  */
2755 int
2756 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2757 {
2758         lock_descriptor_t *lock;
2759         int result = 0;
2760         graph_t *gp;
2761         int                     lock_nlmid;
2762 
2763         /*
2764          * Check to see if node is booted as a cluster. If not, return.
2765          */
2766         if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2767                 return (0);
2768         }
2769 
2770         gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2771         if (gp == NULL) {
2772                 return (0);
2773         }
2774 
2775         mutex_enter(&gp->gp_mutex);
2776 
2777         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2778 
2779         if (lock) {
2780                 while (lock->l_vnode == vp) {
2781                         /* get NLM id from sysid */
2782                         lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2783 
2784                         /*
2785                          * If NLM server request _and_ nlmid of lock matches
2786                          * nlmid of argument, then we've found a remote lock.
2787                          */
2788                         if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2789                                 result = 1;
2790                                 goto done;
2791                         }
2792                         lock = lock->l_next;
2793                 }
2794         }
2795 
2796         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2797 
2798         if (lock) {
2799                 while (lock->l_vnode == vp) {
2800                         /* get NLM id from sysid */
2801                         lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2802 
2803                         /*
2804                          * If NLM server request _and_ nlmid of lock matches
2805                          * nlmid of argument, then we've found a remote lock.
2806                          */
2807                         if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2808                                 result = 1;
2809                                 goto done;
2810                         }
2811                         lock = lock->l_next;
2812                 }
2813         }
2814 
2815 done:
2816         mutex_exit(&gp->gp_mutex);
2817         return (result);
2818 }
2819 
2820 /*
2821  * Determine whether there are any locks for the given vnode with a remote
2822  * sysid.  Returns zero if not, non-zero if there are.
2823  *
2824  * Note that the return value from this function is potentially invalid
2825  * once it has been returned.  The caller is responsible for providing its
2826  * own synchronization mechanism to ensure that the return value is useful
2827  * (e.g., see nfs_lockcompletion()).
2828  */
2829 int
2830 flk_has_remote_locks(vnode_t *vp)
2831 {
2832         lock_descriptor_t *lock;
2833         int result = 0;
2834         graph_t *gp;
2835 
2836         gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2837         if (gp == NULL) {
2838                 return (0);
2839         }
2840 
2841         mutex_enter(&gp->gp_mutex);
2842 
2843         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2844 
2845         if (lock) {
2846                 while (lock->l_vnode == vp) {
2847                         if (IS_REMOTE(lock)) {
2848                                 result = 1;
2849                                 goto done;
2850                         }
2851                         lock = lock->l_next;
2852                 }
2853         }
2854 
2855         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2856 
2857         if (lock) {
2858                 while (lock->l_vnode == vp) {
2859                         if (IS_REMOTE(lock)) {
2860                                 result = 1;
2861                                 goto done;
2862                         }
2863                         lock = lock->l_next;
2864                 }
2865         }
2866 
2867 done:
2868         mutex_exit(&gp->gp_mutex);
2869         return (result);
2870 }
2871 
2872 /*
2873  * Determine whether there are any locks for the given vnode with a remote
2874  * sysid matching the given sysid.
2875  * Used by the new (open source) NFS Lock Manager (NLM).
2876  */
2877 int
2878 flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid)
2879 {
2880         lock_descriptor_t *lock;
2881         int result = 0;
2882         graph_t *gp;
2883 
2884         if (sysid == 0)
2885                 return (0);
2886 
2887         gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2888         if (gp == NULL) {
2889                 return (0);
2890         }
2891 
2892         mutex_enter(&gp->gp_mutex);
2893 
2894         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2895 
2896         if (lock) {
2897                 while (lock->l_vnode == vp) {
2898                         if (lock->l_flock.l_sysid == sysid) {
2899                                 result = 1;
2900                                 goto done;
2901                         }
2902                         lock = lock->l_next;
2903                 }
2904         }
2905 
2906         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2907 
2908         if (lock) {
2909                 while (lock->l_vnode == vp) {
2910                         if (lock->l_flock.l_sysid == sysid) {
2911                                 result = 1;
2912                                 goto done;
2913                         }
2914                         lock = lock->l_next;
2915                 }
2916         }
2917 
2918 done:
2919         mutex_exit(&gp->gp_mutex);
2920         return (result);
2921 }
2922 
2923 /*
2924  * Determine if there are any locks owned by the given sysid.
2925  * Returns zero if not, non-zero if there are.  Note that this return code
2926  * could be derived from flk_get_{sleeping,active}_locks, but this routine
2927  * avoids all the memory allocations of those routines.
2928  *
2929  * This routine has the same synchronization issues as
2930  * flk_has_remote_locks.
2931  */
2932 
2933 int
2934 flk_sysid_has_locks(int sysid, int lck_type)
2935 {
2936         int             has_locks = 0;
2937         lock_descriptor_t       *lock;
2938         graph_t         *gp;
2939         int             i;
2940 
2941         for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2942                 mutex_enter(&flock_lock);
2943                 gp = lock_graph[i];
2944                 mutex_exit(&flock_lock);
2945                 if (gp == NULL) {
2946                         continue;
2947                 }
2948 
2949                 mutex_enter(&gp->gp_mutex);
2950 
2951                 if (lck_type & FLK_QUERY_ACTIVE) {
2952                         for (lock = ACTIVE_HEAD(gp)->l_next;
2953                             lock != ACTIVE_HEAD(gp) && !has_locks;
2954                             lock = lock->l_next) {
2955                                 if (lock->l_flock.l_sysid == sysid)
2956                                         has_locks = 1;
2957                         }
2958                 }
2959 
2960                 if (lck_type & FLK_QUERY_SLEEPING) {
2961                         for (lock = SLEEPING_HEAD(gp)->l_next;
2962                             lock != SLEEPING_HEAD(gp) && !has_locks;
2963                             lock = lock->l_next) {
2964                                 if (lock->l_flock.l_sysid == sysid)
2965                                         has_locks = 1;
2966                         }
2967                 }
2968                 mutex_exit(&gp->gp_mutex);
2969         }
2970 
2971         return (has_locks);
2972 }
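
     /*
      * Illustrative sketch only (not an existing caller): code that wants to
      * know whether a client identified by "sysid" still owns or is waiting
      * for any locks can query both lists at once by OR-ing the query flags:
      *
      *     if (flk_sysid_has_locks(sysid,
      *         FLK_QUERY_ACTIVE | FLK_QUERY_SLEEPING)) {
      *             ... the client still has lock state; defer cleanup ...
      *     }
      *
      * As noted above, the result is only a snapshot and may be stale by the
      * time the caller acts on it.
      */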
2973 
2974 
2975 /*
2976  * PSARC case 1997/292
2977  *
2978  * Requires: "sysid" is a pair [nlmid, sysid].  The lower half is 16-bit
2979  *  quantity, the real sysid generated by the NLM server; the upper half
2980  *  identifies the node of the cluster where the NLM server ran.
2981  *  This routine is only called by an NLM server running in a cluster.
2982  * Effects: Remove all locks held on behalf of the client identified
2983  *  by "sysid."
2984  */
2985 void
2986 cl_flk_remove_locks_by_sysid(int sysid)
2987 {
2988         graph_t *gp;
2989         int i;
2990         lock_descriptor_t *lock, *nlock;
2991 
2992         /*
2993          * Check to see if node is booted as a cluster. If not, return.
2994          */
2995         if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2996                 return;
2997         }
2998 
2999         ASSERT(sysid != 0);
3000         for (i = 0; i < HASH_SIZE; i++) {
3001                 mutex_enter(&flock_lock);
3002                 gp = lock_graph[i];
3003                 mutex_exit(&flock_lock);
3004 
3005                 if (gp == NULL)
3006                         continue;
3007 
3008                 mutex_enter(&gp->gp_mutex);      /*  get mutex on lock graph */
3009 
3010                 /* signal sleeping requests so that they bail out */
3011                 lock = SLEEPING_HEAD(gp)->l_next;
3012                 while (lock != SLEEPING_HEAD(gp)) {
3013                         nlock = lock->l_next;
3014                         if (lock->l_flock.l_sysid == sysid) {
3015                                 INTERRUPT_WAKEUP(lock);
3016                         }
3017                         lock = nlock;
3018                 }
3019 
3020                 /* delete active locks */
3021                 lock = ACTIVE_HEAD(gp)->l_next;
3022                 while (lock != ACTIVE_HEAD(gp)) {
3023                         nlock = lock->l_next;
3024                         if (lock->l_flock.l_sysid == sysid) {
3025                                 flk_delete_active_lock(lock, 0);
3026                                 flk_wakeup(lock, 1);
3027                                 flk_free_lock(lock);
3028                         }
3029                         lock = nlock;
3030                 }
3031                 mutex_exit(&gp->gp_mutex);    /* release mutex on lock graph */
3032         }
3033 }
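
     /*
      * Worked example of the [nlmid, sysid] pairing described above (values
      * are illustrative only): with the 16-bit split, a sysid of 0x00030007
      * decodes via GETNLMID() to nlmid 3, the NLM server's own sysid being
      * 0x0007.  Since this routine compares the full l_sysid against the
      * argument, locks held for the same client id but issued by an NLM
      * server on a different cluster node are left untouched.
      */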
3034 
3035 /*
3036  * Delete all locks in the system that belong to the sysid of the request.
3037  */
3038 
3039 static void
3040 flk_delete_locks_by_sysid(lock_descriptor_t *request)
3041 {
3042         int     sysid  = request->l_flock.l_sysid;
3043         lock_descriptor_t *lock, *nlock;
3044         graph_t *gp;
3045         int i;
3046 
3047         ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
3048         ASSERT(sysid != 0);
3049 
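             /*
              * Drop the caller's graph mutex before walking every graph in
              * the hash table below; each graph's own gp_mutex is taken in
              * turn, and the caller's mutex is re-acquired before returning.
              */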
3050         mutex_exit(&request->l_graph->gp_mutex);
3051 
3052         for (i = 0; i < HASH_SIZE; i++) {
3053                 mutex_enter(&flock_lock);
3054                 gp = lock_graph[i];
3055                 mutex_exit(&flock_lock);
3056 
3057                 if (gp == NULL)
3058                         continue;
3059 
3060                 mutex_enter(&gp->gp_mutex);
3061 
3062                 /* signal sleeping requests so that they bail out */
3063                 lock = SLEEPING_HEAD(gp)->l_next;
3064                 while (lock != SLEEPING_HEAD(gp)) {
3065                         nlock = lock->l_next;
3066                         if (lock->l_flock.l_sysid == sysid) {
3067                                 INTERRUPT_WAKEUP(lock);
3068                         }
3069                         lock = nlock;
3070                 }
3071 
3072                 /* delete active locks */
3073                 lock = ACTIVE_HEAD(gp)->l_next;
3074                 while (lock != ACTIVE_HEAD(gp)) {
3075                         nlock = lock->l_next;
3076                         if (lock->l_flock.l_sysid == sysid) {
3077                                 flk_delete_active_lock(lock, 0);
3078                                 flk_wakeup(lock, 1);
3079                                 flk_free_lock(lock);
3080                         }
3081                         lock = nlock;
3082                 }
3083                 mutex_exit(&gp->gp_mutex);
3084         }
3085 
3086         mutex_enter(&request->l_graph->gp_mutex);
3087 }
3088 
3089 /*
3090  * Clustering: Deletes PXFS locks
3091  * Effects: Delete all locks on files in the given file system and with the
3092  *  given PXFS id.
3093  */
3094 void
3095 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
3096 {
3097         lock_descriptor_t *lock, *nlock;
3098         graph_t *gp;
3099         int i;
3100 
3101         for (i = 0; i < HASH_SIZE; i++) {
3102                 mutex_enter(&flock_lock);
3103                 gp = lock_graph[i];
3104                 mutex_exit(&flock_lock);
3105 
3106                 if (gp == NULL)
3107                         continue;
3108 
3109                 mutex_enter(&gp->gp_mutex);
3110 
3111                 /* signal sleeping requests so that they bail out */
3112                 lock = SLEEPING_HEAD(gp)->l_next;
3113                 while (lock != SLEEPING_HEAD(gp)) {
3114                         nlock = lock->l_next;
3115                         if (lock->l_vnode->v_vfsp == vfsp) {
3116                                 ASSERT(IS_PXFS(lock));
3117                                 if (GETPXFSID(lock->l_flock.l_sysid) ==
3118                                     pxfsid) {
3119                                         flk_set_state(lock,
3120                                             FLK_CANCELLED_STATE);
3121                                         flk_cancel_sleeping_lock(lock, 1);
3122                                 }
3123                         }
3124                         lock = nlock;
3125                 }
3126 
3127                 /* delete active locks */
3128                 lock = ACTIVE_HEAD(gp)->l_next;
3129                 while (lock != ACTIVE_HEAD(gp)) {
3130                         nlock = lock->l_next;
3131                         if (lock->l_vnode->v_vfsp == vfsp) {
3132                                 ASSERT(IS_PXFS(lock));
3133                                 if (GETPXFSID(lock->l_flock.l_sysid) ==
3134                                     pxfsid) {
3135                                         flk_delete_active_lock(lock, 0);
3136                                         flk_wakeup(lock, 1);
3137                                         flk_free_lock(lock);
3138                                 }
3139                         }
3140                         lock = nlock;
3141                 }
3142                 mutex_exit(&gp->gp_mutex);
3143         }
3144 }
3145 
3146 /*
3147  * Search for a sleeping lock manager lock which matches exactly this lock
3148  * request; if one is found, fake a signal to cancel it.
3149  *
3150  * Return 1 if a matching lock was found, 0 otherwise.
3151  */
3152 
3153 static int
3154 flk_canceled(lock_descriptor_t *request)
3155 {
3156         lock_descriptor_t *lock, *nlock;
3157         graph_t *gp = request->l_graph;
3158         vnode_t *vp = request->l_vnode;
3159 
3160         ASSERT(MUTEX_HELD(&gp->gp_mutex));
3161         ASSERT(IS_LOCKMGR(request));
3162         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
3163 
3164         if (lock) {
3165                 while (lock->l_vnode == vp) {
3166                         nlock = lock->l_next;
3167                         if (SAME_OWNER(lock, request) &&
3168                             lock->l_start == request->l_start &&
3169                             lock->l_end == request->l_end) {
3170                                 INTERRUPT_WAKEUP(lock);
3171                                 return (1);
3172                         }
3173                         lock = nlock;
3174                 }
3175         }
3176         return (0);
3177 }
3178 
3179 /*
3180  * Remove all non-OFD locks on the vnode that belong to the given pid and
3181  * sysid.  Since OFD locks are pid-less, they will never match the incoming
3182  * pid here; they are removed earlier in the close() path via closef() and
3183  * ofdcleanlock().
3184  */
3185 void
3186 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
3187 {
3188         graph_t *gp;
3189         lock_descriptor_t *lock, *nlock;
3190         lock_descriptor_t *link_stack;
3191 
3192         STACK_INIT(link_stack);
3193 
3194         gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
3195 
3196         if (gp == NULL)
3197                 return;
3198         mutex_enter(&gp->gp_mutex);
3199 
3200         CHECK_SLEEPING_LOCKS(gp);
3201         CHECK_ACTIVE_LOCKS(gp);
3202 
3203         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
3204 
3205         if (lock) {
3206                 do {
3207                         nlock = lock->l_next;
3208                         if ((lock->l_flock.l_pid == pid ||
3209                             pid == IGN_PID) &&
3210                             lock->l_flock.l_sysid == sysid) {
3211                                 CANCEL_WAKEUP(lock);
3212                         }
3213                         lock = nlock;
3214                 } while (lock->l_vnode == vp);
3215         }
3216 
3217         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
3218 
3219         if (lock) {
3220                 do {
3221                         nlock = lock->l_next;
3222                         if ((lock->l_flock.l_pid == pid ||
3223                             pid == IGN_PID) &&
3224                             lock->l_flock.l_sysid == sysid) {
3225                                 flk_delete_active_lock(lock, 0);
3226                                 STACK_PUSH(link_stack, lock, l_stack);
3227                         }
3228                         lock = nlock;
3229                 } while (lock->l_vnode == vp);
3230         }
3231 
3232         while ((lock = STACK_TOP(link_stack)) != NULL) {
3233                 STACK_POP(link_stack, l_stack);
3234                 flk_wakeup(lock, 1);
3235                 flk_free_lock(lock);
3236         }
3237 
3238         CHECK_SLEEPING_LOCKS(gp);
3239         CHECK_ACTIVE_LOCKS(gp);
3240         CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
3241         mutex_exit(&gp->gp_mutex);
3242 }
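
     /*
      * Illustrative sketch only (hypothetical caller): the file-close path
      * typically discards whatever record locks the process still holds on
      * the vnode with something like
      *
      *     cleanlocks(vp, curproc->p_pid, 0);
      *
      * while passing IGN_PID instead of a real pid removes the locks of
      * every pid carrying the matching sysid, e.g. when a lock manager
      * discards all state for a remote client.
      */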
3243 
3244 
3245 /*
3246  * Called from 'fs' read and write routines for files that have mandatory
3247  * locking enabled.
3248  */
3249 
3250 int
3251 chklock(struct vnode *vp, int iomode, u_offset_t offset, ssize_t len, int fmode,
3252     caller_context_t *ct)
3253 {
3254         int             i;
3255         struct flock64  bf;
3256         int             error = 0;
3257 
3258         bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
3259         bf.l_whence = 0;
3260         bf.l_start = offset;
3261         bf.l_len = len;
3262         if (ct == NULL) {
3263                 bf.l_pid = curproc->p_pid;
3264                 bf.l_sysid = 0;
3265         } else {
3266                 bf.l_pid = ct->cc_pid;
3267                 bf.l_sysid = ct->cc_sysid;
3268         }
3269         i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
3270         if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
3271             bf.l_type != F_UNLCK)
3272                 error = i ? i : EAGAIN;
3273         return (error);
3274 }
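
     /*
      * Illustrative sketch only (names are hypothetical): a file system read
      * routine would typically guard its I/O with a check such as
      *
      *     if (MANDLOCK(vp, va.va_mode) &&
      *         (error = chklock(vp, FREAD, uiop->uio_loffset,
      *         uiop->uio_resid, uiop->uio_fmode, ct)) != 0)
      *             return (error);
      *
      * so that a conflicting mandatory record lock turns the read into an
      * EAGAIN error (or a blocking wait when FNDELAY/FNONBLOCK are clear).
      */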
3275 
3276 /*
3277  * convoff - converts the given lock data (l_start, l_whence) to be
3278  * relative to the given whence.
3279  */
3280 int
3281 convoff(struct vnode *vp, struct flock64 *lckdat, int whence, offset_t offset)
3282 {
3283         int             error;
3284         struct vattr    vattr;
3285 
3286         if ((lckdat->l_whence == 2) || (whence == 2)) {
3287                 vattr.va_mask = AT_SIZE;
3288                 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
3289                         return (error);
3290         }
3291 
3292         switch (lckdat->l_whence) {
3293         case 1:
3294                 lckdat->l_start += offset;
3295                 break;
3296         case 2:
3297                 lckdat->l_start += vattr.va_size;
3298                 /* FALLTHRU */
3299         case 0:
3300                 break;
3301         default:
3302                 return (EINVAL);
3303         }
3304 
3305         if (lckdat->l_start < 0)
3306                 return (EINVAL);
3307 
3308         switch (whence) {
3309         case 1:
3310                 lckdat->l_start -= offset;
3311                 break;
3312         case 2:
3313                 lckdat->l_start -= vattr.va_size;
3314                 /* FALLTHRU */
3315         case 0:
3316                 break;
3317         default:
3318                 return (EINVAL);
3319         }
3320 
3321         lckdat->l_whence = (short)whence;
3322         return (0);
3323 }
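
     /*
      * Worked example (values illustrative): given an flock64 with
      * l_whence == 1 (SEEK_CUR) and l_start == 100 on a file whose current
      * offset is 400,
      *
      *     error = convoff(vp, &bf, 0, 400);
      *
      * leaves l_whence == 0 (SEEK_SET) and l_start == 500.  Converting to or
      * from l_whence == 2 (SEEK_END) additionally needs the file size, which
      * is why VOP_GETATTR() is called above in that case.
      */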
3324 
3325 
3326 /*      proc_graph function definitions */
3327 
3328 /*
3329  * Check for a deadlock caused by the new 'lock'.  If a deadlock is
3330  * found, the edges of this lock are freed and 1 is returned; else 0.
3331  */
3332 
3333 static int
3334 flk_check_deadlock(lock_descriptor_t *lock)
3335 {
3336         proc_vertex_t   *start_vertex, *pvertex;
3337         proc_vertex_t *dvertex;
3338         proc_edge_t *pep, *ppep;
3339         edge_t  *ep, *nep;
3340         proc_vertex_t *process_stack;
3341 
3342         /*
3343          * OFD style locks are not associated with any process so there is
3344          * no proc graph for these. Thus we cannot, and do not, do deadlock
3345          * detection.
3346          */
3347         if (lock->l_ofd != NULL)
3348                 return (0);
3349 
3350         STACK_INIT(process_stack);
3351 
3352         mutex_enter(&flock_lock);
3353         start_vertex = flk_get_proc_vertex(lock);
3354         ASSERT(start_vertex != NULL);
3355 
3356         /* construct the edges from this process to other processes */
3357 
3358         ep = FIRST_ADJ(lock);
3359         while (ep != HEAD(lock)) {
3360                 proc_vertex_t *adj_proc;
3361 
3362                 adj_proc = flk_get_proc_vertex(ep->to_vertex);
3363                 for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
3364                         if (pep->to_proc == adj_proc) {
3365                                 ASSERT(pep->refcount);
3366                                 pep->refcount++;
3367                                 break;
3368                         }
3369                 }
3370                 if (pep == NULL) {
3371                         pep = flk_get_proc_edge();
3372                         pep->to_proc = adj_proc;
3373                         pep->refcount = 1;
3374                         adj_proc->incount++;
3375                         pep->next = start_vertex->edge;
3376                         start_vertex->edge = pep;
3377                 }
3378                 ep = NEXT_ADJ(ep);
3379         }
3380 
3381         ep = FIRST_IN(lock);
3382 
3383         while (ep != HEAD(lock)) {
3384                 proc_vertex_t *in_proc;
3385 
3386                 in_proc = flk_get_proc_vertex(ep->from_vertex);
3387 
3388                 for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
3389                         if (pep->to_proc == start_vertex) {
3390                                 ASSERT(pep->refcount);
3391                                 pep->refcount++;
3392                                 break;
3393                         }
3394                 }
3395                 if (pep == NULL) {
3396                         pep = flk_get_proc_edge();
3397                         pep->to_proc = start_vertex;
3398                         pep->refcount = 1;
3399                         start_vertex->incount++;
3400                         pep->next = in_proc->edge;
3401                         in_proc->edge = pep;
3402                 }
3403                 ep = NEXT_IN(ep);
3404         }
3405 
3406         if (start_vertex->incount == 0) {
3407                 mutex_exit(&flock_lock);
3408                 return (0);
3409         }
3410 
3411         flk_proc_graph_uncolor();
3412 
3413         start_vertex->p_sedge = start_vertex->edge;
3414 
3415         STACK_PUSH(process_stack, start_vertex, p_stack);
3416 
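             /*
              * Depth-first search of the process wait-for graph starting at
              * this process.  Reaching a vertex that has been arrived at but
              * not yet departed means it is still on the DFS stack, i.e. we
              * have found a cycle: a deadlock.
              */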
3417         while ((pvertex = STACK_TOP(process_stack)) != NULL) {
3418                 for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
3419                         dvertex = pep->to_proc;
3420                         if (!PROC_ARRIVED(dvertex)) {
3421                                 STACK_PUSH(process_stack, dvertex, p_stack);
3422                                 dvertex->p_sedge = dvertex->edge;
3423                                 PROC_ARRIVE(pvertex);
3424                                 pvertex->p_sedge = pep->next;
3425                                 break;
3426                         }
3427                         if (!PROC_DEPARTED(dvertex))
3428                                 goto deadlock;
3429                 }
3430                 if (pep == NULL) {
3431                         PROC_DEPART(pvertex);
3432                         STACK_POP(process_stack, p_stack);
3433                 }
3434         }
3435         mutex_exit(&flock_lock);
3436         return (0);
3437 
3438 deadlock:
3439 
3440         /* we remove all lock edges and proc edges */
3441 
3442         ep = FIRST_ADJ(lock);
3443         while (ep != HEAD(lock)) {
3444                 proc_vertex_t *adj_proc;
3445                 adj_proc = flk_get_proc_vertex(ep->to_vertex);
3446                 nep = NEXT_ADJ(ep);
3447                 IN_LIST_REMOVE(ep);
3448                 ADJ_LIST_REMOVE(ep);
3449                 flk_free_edge(ep);
3450                 ppep = start_vertex->edge;
3451                 for (pep = start_vertex->edge; pep != NULL; ppep = pep,
3452                     pep = ppep->next) {
3453                         if (pep->to_proc == adj_proc) {
3454                                 pep->refcount--;
3455                                 if (pep->refcount == 0) {
3456                                         if (pep == ppep) {
3457                                                 start_vertex->edge = pep->next;
3458                                         } else {
3459                                                 ppep->next = pep->next;
3460                                         }
3461                                         adj_proc->incount--;
3462                                         flk_proc_release(adj_proc);
3463                                         flk_free_proc_edge(pep);
3464                                 }
3465                                 break;
3466                         }
3467                 }
3468                 ep = nep;
3469         }
3470         ep = FIRST_IN(lock);
3471         while (ep != HEAD(lock)) {
3472                 proc_vertex_t *in_proc;
3473                 in_proc = flk_get_proc_vertex(ep->from_vertex);
3474                 nep = NEXT_IN(ep);
3475                 IN_LIST_REMOVE(ep);
3476                 ADJ_LIST_REMOVE(ep);
3477                 flk_free_edge(ep);
3478                 ppep = in_proc->edge;
3479                 for (pep = in_proc->edge; pep != NULL; ppep = pep,
3480                     pep = ppep->next) {
3481                         if (pep->to_proc == start_vertex) {
3482                                 pep->refcount--;
3483                                 if (pep->refcount == 0) {
3484                                         if (pep == ppep) {
3485                                                 in_proc->edge = pep->next;
3486                                         } else {
3487                                                 ppep->next = pep->next;
3488                                         }
3489                                         start_vertex->incount--;
3490                                         flk_proc_release(in_proc);
3491                                         flk_free_proc_edge(pep);
3492                                 }
3493                                 break;
3494                         }
3495                 }
3496                 ep = nep;
3497         }
3498         flk_proc_release(start_vertex);
3499         mutex_exit(&flock_lock);
3500         return (1);
3501 }
3502 
3503 /*
3504  * Get a proc vertex.  If the lock's pvertex value indexes a matching proc
3505  * vertex in the list, return that one; otherwise allocate a new vertex,
3506  * growing the list of vertices if necessary.
3507  */
3508 
3509 static proc_vertex_t *
3510 flk_get_proc_vertex(lock_descriptor_t *lock)
3511 {
3512         int i;
3513         proc_vertex_t   *pv;
3514         proc_vertex_t   **palloc;
3515 
3516         ASSERT(MUTEX_HELD(&flock_lock));
3517         if (lock->pvertex != -1) {
3518                 ASSERT(lock->pvertex >= 0);
3519                 pv = pgraph.proc[lock->pvertex];
3520                 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
3521                         return (pv);
3522                 }
3523         }
3524         for (i = 0; i < pgraph.gcount; i++) {
3525                 pv = pgraph.proc[i];
3526                 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
3527                         lock->pvertex = pv->index = i;
3528                         return (pv);
3529                 }
3530         }
3531         pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
3532         pv->pid = lock->l_flock.l_pid;
3533         pv->sysid = lock->l_flock.l_sysid;
3534         flk_proc_vertex_allocs++;
3535         if (pgraph.free != 0) {
3536                 for (i = 0; i < pgraph.gcount; i++) {
3537                         if (pgraph.proc[i] == NULL) {
3538                                 pgraph.proc[i] = pv;
3539                                 lock->pvertex = pv->index = i;
3540                                 pgraph.free--;
3541                                 return (pv);
3542                         }
3543                 }
3544         }
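             /*
              * No free slot was found: grow the vertex array by PROC_CHUNK
              * entries, copying the existing pointers into the new array.
              */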
3545         palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
3546             sizeof (proc_vertex_t *), KM_SLEEP);
3547 
3548         if (pgraph.proc) {
3549                 bcopy(pgraph.proc, palloc,
3550                     pgraph.gcount * sizeof (proc_vertex_t *));
3551 
3552                 kmem_free(pgraph.proc,
3553                     pgraph.gcount * sizeof (proc_vertex_t *));
3554         }
3555         pgraph.proc = palloc;
3556         pgraph.free += (PROC_CHUNK - 1);
3557         pv->index = lock->pvertex = pgraph.gcount;
3558         pgraph.gcount += PROC_CHUNK;
3559         pgraph.proc[pv->index] = pv;
3560         return (pv);
3561 }
3562 
3563 /*
3564  * Allocate a proc edge.
3565  */
3566 
3567 static proc_edge_t *
3568 flk_get_proc_edge()
3569 {
3570         proc_edge_t *pep;
3571 
3572         pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
3573         flk_proc_edge_allocs++;
3574         return (pep);
3575 }
3576 
3577 /*
3578  * Free the proc edge. Called whenever its reference count goes to zero.
3579  */
3580 
3581 static void
3582 flk_free_proc_edge(proc_edge_t *pep)
3583 {
3584         ASSERT(pep->refcount == 0);
3585         kmem_free(pep, sizeof (proc_edge_t));
3586         flk_proc_edge_frees++;
3587 }
3588 
3589 /*
3590  * Uncolor the graph; a full reset is done only when the mark hits UINT_MAX.
3591  */
3592 
3593 static void
3594 flk_proc_graph_uncolor()
3595 {
3596         int i;
3597 
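             /*
              * Bumping pgraph.mark effectively uncolors every vertex at once;
              * only when the mark would wrap at UINT_MAX do we walk the array,
              * reset each vertex's arrival/departure times, and restart the
              * mark at 1.
              */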
3598         if (pgraph.mark == UINT_MAX) {
3599                 for (i = 0; i < pgraph.gcount; i++)
3600                         if (pgraph.proc[i] != NULL) {
3601                                 pgraph.proc[i]->atime = 0;
3602                                 pgraph.proc[i]->dtime = 0;
3603                         }
3604                 pgraph.mark = 1;
3605         } else {
3606                 pgraph.mark++;
3607         }
3608 }
3609 
3610 /*
3611  * Release the proc vertex iff it has neither in edges nor out edges.
3612  */
3613 
3614 static void
3615 flk_proc_release(proc_vertex_t *proc)
3616 {
3617         ASSERT(MUTEX_HELD(&flock_lock));
3618         if (proc->edge == NULL && proc->incount == 0) {
3619                 pgraph.proc[proc->index] = NULL;
3620                 pgraph.free++;
3621                 kmem_free(proc, sizeof (proc_vertex_t));
3622                 flk_proc_vertex_frees++;
3623         }
3624 }
3625 
3626 /*
3627  * Update the process graph to reflect a change in a lock graph.
3628  * Note: this function should be called only after the lock graph has been
3629  * correctly recomputed; otherwise we might miss a deadlock.  For example,
3630  * flk_relation() calls this function after flk_recompute_dependencies();
3631  * otherwise a process trying to lock a vnode hashed into another graph
3632  * might sleep forever.
3633  */
3634 
3635 static void
3636 flk_update_proc_graph(edge_t *ep, int delete)
3637 {
3638         proc_vertex_t *toproc, *fromproc;
3639         proc_edge_t *pep, *prevpep;
3640 
3641         mutex_enter(&flock_lock);
3642 
3643         /*
3644          * OFD style locks are not associated with any process so there is
3645          * no proc graph for these.
3646          */
3647         if (ep->from_vertex->l_ofd != NULL) {
3648                 mutex_exit(&flock_lock);
3649                 return;
3650         }
3651 
3652         toproc = flk_get_proc_vertex(ep->to_vertex);
3653         fromproc = flk_get_proc_vertex(ep->from_vertex);
3654 
3655         if (!delete)
3656                 goto add;
3657         pep = prevpep = fromproc->edge;
3658 
3659         ASSERT(pep != NULL);
3660         while (pep != NULL) {
3661                 if (pep->to_proc == toproc) {
3662                         ASSERT(pep->refcount > 0);
3663                         pep->refcount--;
3664                         if (pep->refcount == 0) {
3665                                 if (pep == prevpep) {
3666                                         fromproc->edge = pep->next;
3667                                 } else {
3668                                         prevpep->next = pep->next;
3669                                 }
3670                                 toproc->incount--;
3671                                 flk_proc_release(toproc);
3672                                 flk_free_proc_edge(pep);
3673                         }
3674                         break;
3675                 }
3676                 prevpep = pep;
3677                 pep = pep->next;
3678         }
3679         flk_proc_release(fromproc);
3680         mutex_exit(&flock_lock);
3681         return;
3682 add:
3683 
3684         pep = fromproc->edge;
3685 
3686         while (pep != NULL) {
3687                 if (pep->to_proc == toproc) {
3688                         ASSERT(pep->refcount > 0);
3689                         pep->refcount++;
3690                         break;
3691                 }
3692                 pep = pep->next;
3693         }
3694         if (pep == NULL) {
3695                 pep = flk_get_proc_edge();
3696                 pep->to_proc = toproc;
3697                 pep->refcount = 1;
3698                 toproc->incount++;
3699                 pep->next = fromproc->edge;
3700                 fromproc->edge = pep;
3701         }
3702         mutex_exit(&flock_lock);
3703 }
3704 
3705 /*
3706  * Set the control status for lock manager requests.
3707  *
3708  */
3709 
3710 /*
3711  * PSARC case 1997/292
3712  *
3713  * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3714  * Effects: Set the state of the NLM server identified by "nlmid"
3715  *   in the NLM registry to state "nlm_state."
3716  *   Raises exception no_such_nlm if "nlmid" doesn't identify a known
3717  *   NLM server to this LLM.
3718  *   Note that when this routine is called with NLM_SHUTTING_DOWN there
3719  *   may be lock requests that have gotten started but not finished.  In
3720  *   particular, there may be blocking requests that are in the callback code
3721  *   before sleeping (so they're not holding the lock for the graph).  If
3722  *   such a thread reacquires the graph's lock (to go to sleep) after the
3723  *   NLM state in the NLM registry is set to a non-up value,
3724  *   it will notice the status and bail out.  If the request gets
3725  *   granted before the thread can check the NLM registry, let it
3726  *   continue normally.  It will get flushed when we are called with NLM_DOWN.
3727  *
3728  * Modifies: nlm_reg_obj (global)
3729  * Arguments:
3730  *    nlmid     (IN):    id uniquely identifying an NLM server
3731  *    nlm_state (IN):    NLM server state to change "nlmid" to
3732  */
3733 void
3734 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3735 {
3736         /*
3737          * Check to see if node is booted as a cluster. If not, return.
3738          */
3739         if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3740                 return;
3741         }
3742 
3743         /*
3744          * Check for development/debugging.  It is possible to boot a node
3745          * in non-cluster mode, and then run a special script, currently
3746          * available only to developers, to bring up the node as part of a
3747          * cluster.  The problem is that running such a script does not
3748  * result in the routine flk_init() being called, and hence the global
3749  * array nlm_reg_status is NULL.  The NLM thinks it's in cluster mode,
3750          * but the LLM needs to do an additional check to see if the global
3751          * array has been created or not. If nlm_reg_status is NULL, then
3752          * return, else continue.
3753          */
3754         if (nlm_reg_status == NULL) {
3755                 return;
3756         }
3757 
3758         ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3759         mutex_enter(&nlm_reg_lock);
3760 
3761         if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3762                 /*
3763                  * If the NLM server "nlmid" is unknown in the NLM registry,
3764                  * add it to the registry in the nlm shutting down state.
3765                  */
3766                 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3767                     FLK_NLM_SHUTTING_DOWN);
3768         } else {
3769                 /*
3770                  * Change the state of the NLM server identified by "nlmid"
3771                  * in the NLM registry to the argument "nlm_state."
3772                  */
3773                 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3774                     nlm_state);
3775         }
3776 
3777         /*
3778          *  The reason we must register the NLM server that is shutting down
3779          *  with an LLM that doesn't already know about it (never sent a lock
3780          *  request) is to handle correctly a race between shutdown and a new
3781          *  lock request.  Suppose that a shutdown request from the NLM server
3782          *  invokes this routine at the LLM, and a thread is spawned to
3783          *  service the request. Now suppose a new lock request is in
3784          *  progress and has already passed the first line of defense in
3785  *  reclock(), which denies new lock requests from NLM servers
3786  *  that are not in the NLM_UP state.  After the current routine
3787  *  is invoked for both phases of shutdown, the routine will return,
3788  *  having done nothing, and the lock request will proceed and
3789  *  probably be granted.  The problem is that the shutdown was ignored
3790  *  by the lock request because there was no record of that NLM server
3791  *  shutting down.  We will be in the peculiar position of thinking
3792  *  that we've shut down the NLM server and all locks at all LLMs have
3793          *  been discarded, but in fact there's still one lock held.
3794          *  The solution is to record the existence of NLM server and change
3795          *  its state immediately to NLM_SHUTTING_DOWN.  The lock request in
3796          *  progress may proceed because the next phase NLM_DOWN will catch
3797          *  this lock and discard it.
3798          */
3799         mutex_exit(&nlm_reg_lock);
3800 
3801         switch (nlm_state) {
3802         case FLK_NLM_UP:
3803                 /*
3804                  * Change the NLM state of all locks still held on behalf of
3805                  * the NLM server identified by "nlmid" to NLM_UP.
3806                  */
3807                 cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3808                 break;
3809 
3810         case FLK_NLM_SHUTTING_DOWN:
3811                 /*
3812                  * Wake up all sleeping locks for the NLM server identified
3813                  * by "nlmid." Note that eventually all woken threads will
3814                  * have their lock requests cancelled and descriptors
3815                  * removed from the sleeping lock list.  Note that the NLM
3816                  * server state associated with each lock descriptor is
3817                  * changed to FLK_NLM_SHUTTING_DOWN.
3818                  */
3819                 cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3820                 break;
3821 
3822         case FLK_NLM_DOWN:
3823                 /*
3824                  * Discard all active, granted locks for this NLM server
3825                  * identified by "nlmid."
3826                  */
3827                 cl_flk_unlock_nlm_granted(nlmid);
3828                 break;
3829 
3830         default:
3831                 panic("cl_set_nlm_status: bad status (%d)", nlm_state);
3832         }
3833 }
3834 
3835 /*
3836  * Set the control status for lock manager requests.
3837  *
3838  * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3839  * may be lock requests that have gotten started but not finished.  In
3840  * particular, there may be blocking requests that are in the callback code
3841  * before sleeping (so they're not holding the lock for the graph).  If
3842  * such a thread reacquires the graph's lock (to go to sleep) after
3843  * flk_lockmgr_status is set to a non-up value, it will notice the status
3844  * and bail out.  If the request gets granted before the thread can check
3845  * flk_lockmgr_status, let it continue normally.  It will get flushed when
3846  * we are called with FLK_LOCKMGR_DOWN.
3847  */
3848 
3849 void
3850 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3851 {
3852         int i;
3853         graph_t *gp;
3854         struct flock_globals *fg;
3855 
3856         fg = flk_get_globals();
3857         ASSERT(fg != NULL);
3858 
3859         mutex_enter(&flock_lock);
3860         fg->flk_lockmgr_status = status;
3861         mutex_exit(&flock_lock);
3862 
3863         /*
3864          * If the lock manager is coming back up, all that's needed is to
3865          * propagate this information to the graphs.  If the lock manager
3866          * is going down, additional action is required, and each graph's
3867          * copy of the state is updated atomically with this other action.
3868          */
3869         switch (status) {
3870         case FLK_LOCKMGR_UP:
3871                 for (i = 0; i < HASH_SIZE; i++) {
3872                         mutex_enter(&flock_lock);
3873                         gp = lock_graph[i];
3874                         mutex_exit(&flock_lock);
3875                         if (gp == NULL)
3876                                 continue;
3877                         mutex_enter(&gp->gp_mutex);
3878                         fg->lockmgr_status[i] = status;
3879                         mutex_exit(&gp->gp_mutex);
3880                 }
3881                 break;
3882         case FLK_WAKEUP_SLEEPERS:
3883                 wakeup_sleeping_lockmgr_locks(fg);
3884                 break;
3885         case FLK_LOCKMGR_DOWN:
3886                 unlock_lockmgr_granted(fg);
3887                 break;
3888         default:
3889                 panic("flk_set_lockmgr_status: bad status (%d)", status);
3890                 break;
3891         }
3892 }
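
     /*
      * Illustrative sketch only: a lock manager shutting down would normally
      * drive this routine through both phases, e.g.
      *
      *     flk_set_lockmgr_status(FLK_WAKEUP_SLEEPERS);
      *     ... allow the woken requests to unwind ...
      *     flk_set_lockmgr_status(FLK_LOCKMGR_DOWN);
      *
      * and, on restart, call it with FLK_LOCKMGR_UP so that each graph's
      * copy of the status is marked up again.
      */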
3893 
3894 /*
3895  * This routine returns all the locks that are active or sleeping and are
3896  * associated with a particular set of identifiers.  If lock_state != 0, then
3897  * only locks that match the lock_state are returned. If lock_state == 0, then
3898  * all locks are returned. If pid == NOPID, the pid is ignored.  If
3899  * use_sysid is FALSE, then the sysid is ignored.  If vp is NULL, then the
3900  * vnode pointer is ignored.
3901  *
3902  * A list containing the vnode pointer and an flock structure
3903  * describing the lock is returned.  Each element in the list is
3904  * dynamically allocated and must be freed by the caller.  The
3905  * last item in the list is denoted by a NULL value in the ll_next
3906  * field.
3907  *
3908  * The vnode pointers returned are held.  The caller is responsible
3909  * for releasing these.  Note that the returned list is only a snapshot of
3910  * the current lock information, and that it is a snapshot of a moving
3911  * target (only one graph is locked at a time).
3912  */
3913 
3914 locklist_t *
3915 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3916     pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3917 {
3918         lock_descriptor_t       *lock;
3919         lock_descriptor_t       *graph_head;
3920         locklist_t              listhead;
3921         locklist_t              *llheadp;
3922         locklist_t              *llp;
3923         locklist_t              *lltp;
3924         graph_t                 *gp;
3925         int                     i;
3926         int                     first_index; /* graph index */
3927         int                     num_indexes; /* graph index */
3928 
3929         ASSERT((list_type == FLK_ACTIVE_STATE) ||
3930             (list_type == FLK_SLEEPING_STATE));
3931 
3932         /*
3933          * Get a pointer to something to use as a list head while building
3934          * the rest of the list.
3935          */
3936         llheadp = &listhead;
3937         lltp = llheadp;
3938         llheadp->ll_next = (locklist_t *)NULL;
3939 
3940         /* Figure out which graphs we want to look at. */
3941         if (vp == NULL) {
3942                 first_index = 0;
3943                 num_indexes = HASH_SIZE;
3944         } else {
3945                 first_index = HASH_INDEX(vp);
3946                 num_indexes = 1;
3947         }
3948 
3949         for (i = first_index; i < first_index + num_indexes; i++) {
3950                 mutex_enter(&flock_lock);
3951                 gp = lock_graph[i];
3952                 mutex_exit(&flock_lock);
3953                 if (gp == NULL) {
3954                         continue;
3955                 }
3956 
3957                 mutex_enter(&gp->gp_mutex);
3958                 graph_head = (list_type == FLK_ACTIVE_STATE) ?
3959                     ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3960                 for (lock = graph_head->l_next;
3961                     lock != graph_head;
3962                     lock = lock->l_next) {
3963                         if (use_sysid && lock->l_flock.l_sysid != sysid)
3964                                 continue;
3965                         if (pid != NOPID && lock->l_flock.l_pid != pid)
3966                                 continue;
3967                         if (vp != NULL && lock->l_vnode != vp)
3968                                 continue;
3969                         if (lock_state && !(lock_state & lock->l_state))
3970                                 continue;
3971                         if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3972                                 continue;
3973                         /*
3974                          * A matching lock was found.  Allocate
3975                          * space for a new locklist entry and fill
3976                          * it in.
3977                          */
3978                         llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3979                         lltp->ll_next = llp;
3980                         VN_HOLD(lock->l_vnode);
3981                         llp->ll_vp = lock->l_vnode;
3982                         create_flock(lock, &(llp->ll_flock));
3983                         llp->ll_next = (locklist_t *)NULL;
3984                         lltp = llp;
3985                 }
3986                 mutex_exit(&gp->gp_mutex);
3987         }
3988 
3989         llp = llheadp->ll_next;
3990         return (llp);
3991 }
3992 
3993 /*
3994  * These two functions are simply interfaces to get_lock_list.  They return
3995  * a list of sleeping or active locks for the given sysid and pid.  See
3996  * get_lock_list for details.
3997  *
3998  * In either case we don't particularly care to specify the zone of interest;
3999  * the sysid-space is global across zones, so the sysid will map to exactly one
4000  * zone, and we'll return information for that zone.
4001  */
4002 
4003 locklist_t *
4004 flk_get_sleeping_locks(int sysid, pid_t pid)
4005 {
4006         return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
4007             ALL_ZONES));
4008 }
4009 
4010 locklist_t *
4011 flk_get_active_locks(int sysid, pid_t pid)
4012 {
4013         return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
4014             ALL_ZONES));
4015 }
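
     /*
      * Illustrative sketch only: a caller can snapshot a client's granted
      * locks, walk the list, and free it (which also releases the vnode
      * holds) in one call:
      *
      *     locklist_t *llp, *llhead = flk_get_active_locks(sysid, NOPID);
      *     for (llp = llhead; llp != NULL; llp = llp->ll_next)
      *             ... examine llp->ll_vp and llp->ll_flock ...
      *     flk_free_locklist(llhead);
      *
      * As described for get_lock_list(), the list is only a snapshot of a
      * moving target.
      */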
4016 
4017 /*
4018  * Another interface to get_lock_list.  This one returns all the active
4019  * locks for a given vnode.  Again, see get_lock_list for details.
4020  *
4021  * We don't need to specify which zone's locks we're interested in.  The matter
4022  * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
4023  * be used by multiple zones, so the list of locks will all be from the right
4024  * zone.
4025  */
4026 
4027 locklist_t *
4028 flk_active_locks_for_vp(const vnode_t *vp)
4029 {
4030         return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
4031             ALL_ZONES));
4032 }
4033 
4034 /*
4035  * Another interface to get_lock_list.  This one returns all the active
4036  * nbmand locks for a given vnode.  Again, see get_lock_list for details.
4037  *
4038  * See the comment for flk_active_locks_for_vp() for why we don't care to
4039  * specify the particular zone of interest.
4040  */
4041 locklist_t *
4042 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
4043 {
4044         return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
4045             NOPID, vp, ALL_ZONES));
4046 }
4047 
4048 /*
4049  * Another interface to get_lock_list.  This one returns all the active
4050  * nbmand locks for a given pid.  Again, see get_lock_list for details.
4051  *
4052  * The zone doesn't need to be specified here; the locks held by a
4053  * particular process will either be local (ie, non-NFS) or from the zone
4054  * the process is executing in.  This is because other parts of the system
4055  * ensure that an NFS vnode can't be used in a zone other than that in
4056  * which it was opened.
4057  */
4058 locklist_t *
4059 flk_active_nbmand_locks(pid_t pid)
4060 {
4061         return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
4062             pid, NULL, ALL_ZONES));
4063 }
4064 
4065 /*
4066  * Free up all entries in the locklist.
4067  */
4068 void
4069 flk_free_locklist(locklist_t *llp)
4070 {
4071         locklist_t *next_llp;
4072 
4073         while (llp) {
4074                 next_llp = llp->ll_next;
4075                 VN_RELE(llp->ll_vp);
4076                 kmem_free(llp, sizeof (*llp));
4077                 llp = next_llp;
4078         }
4079 }
4080 
4081 static void
4082 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
4083 {
4084         /*
4085          * For each graph "lg" in the hash table lock_graph do
4086          * a.  Get the list of sleeping locks
4087          * b.  For each lock descriptor in the list do
4088          *      i.   If the requested lock is an NLM server request AND
4089          *              the nlmid is the same as the routine argument then
4090          *              change the lock descriptor's state field to
4091          *              "nlm_state."
4092          * c.  Get the list of active locks
4093          * d.  For each lock descriptor in the list do
4094          *      i.   If the requested lock is an NLM server request AND
4095          *              the nlmid is the same as the routine argument then
4096          *              change the lock descriptor's state field to
4097          *              "nlm_state."
4098          */
4099 
4100         int                     i;
4101         graph_t                 *gp;                    /* lock graph */
4102         lock_descriptor_t       *lock;                  /* lock */
4103         lock_descriptor_t       *nlock = NULL;          /* next lock */
4104         int                     lock_nlmid;
4105 
4106         for (i = 0; i < HASH_SIZE; i++) {
4107                 mutex_enter(&flock_lock);
4108                 gp = lock_graph[i];
4109                 mutex_exit(&flock_lock);
4110                 if (gp == NULL) {
4111                         continue;
4112                 }
4113 
4114                 /* Get list of sleeping locks in current lock graph. */
4115                 mutex_enter(&gp->gp_mutex);
4116                 for (lock = SLEEPING_HEAD(gp)->l_next;
4117                     lock != SLEEPING_HEAD(gp);
4118                     lock = nlock) {
4119                         nlock = lock->l_next;
4120                         /* get NLM id */
4121                         lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
4122 
4123                         /*
4124                          * If NLM server request AND nlmid of lock matches
4125                          * nlmid of argument, then set the NLM state of the
4126                          * lock to "nlm_state."
4127                          */
4128                         if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
4129                                 SET_NLM_STATE(lock, nlm_state);
4130                         }
4131                 }
4132 
4133                 /* Get list of active locks in current lock graph. */
4134                 for (lock = ACTIVE_HEAD(gp)->l_next;
4135                     lock != ACTIVE_HEAD(gp);
4136                     lock = nlock) {
4137                         nlock = lock->l_next;
4138                         /* get NLM id */
4139                         lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
4140 
4141                         /*
4142                          * If NLM server request AND nlmid of lock matches
4143                          * nlmid of argument, then set the NLM state of the
4144                          * lock to "nlm_state."
4145                          */
4146                         if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
4147                                 ASSERT(IS_ACTIVE(lock));
4148                                 SET_NLM_STATE(lock, nlm_state);
4149                         }
4150                 }
4151                 mutex_exit(&gp->gp_mutex);
4152         }
4153 }
4154 
4155 /*
4156  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
4157  * Effects: Find all sleeping lock manager requests _only_ for the NLM server
4158  *   identified by "nlmid." Poke those lock requests.
4159  */
4160 static void
4161 cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
4162 {
4163         lock_descriptor_t *lock;
4164         lock_descriptor_t *nlock = NULL; /* next lock */
4165         int i;
4166         graph_t *gp;
4167         int     lock_nlmid;
4168 
4169         for (i = 0; i < HASH_SIZE; i++) {
4170                 mutex_enter(&flock_lock);
4171                 gp = lock_graph[i];
4172                 mutex_exit(&flock_lock);
4173                 if (gp == NULL) {
4174                         continue;
4175                 }
4176 
4177                 mutex_enter(&gp->gp_mutex);
4178                 for (lock = SLEEPING_HEAD(gp)->l_next;
4179                     lock != SLEEPING_HEAD(gp);
4180                     lock = nlock) {
4181                         nlock = lock->l_next;
4182                         /*
4183                          * If NLM server request _and_ nlmid of lock matches
4184                          * nlmid of argument, then set the NLM state of the
4185                          * lock to NLM_SHUTTING_DOWN, and wake up sleeping
4186                          * request.
4187                          */
4188                         if (IS_LOCKMGR(lock)) {
4189                                 /* get NLM id */
4190                                 lock_nlmid =
4191                                     GETNLMID(lock->l_flock.l_sysid);
4192                                 if (nlmid == lock_nlmid) {
4193                                         SET_NLM_STATE(lock,
4194                                             FLK_NLM_SHUTTING_DOWN);
4195                                         INTERRUPT_WAKEUP(lock);
4196                                 }
4197                         }
4198                 }
4199                 mutex_exit(&gp->gp_mutex);
4200         }
4201 }
4202 
4203 /*
4204  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
4205  * Effects:  Find all active (granted) lock manager locks _only_ for the
4206  *   NLM server identified by "nlmid" and release them.
4207  */
4208 static void
4209 cl_flk_unlock_nlm_granted(int nlmid)
4210 {
4211         lock_descriptor_t *lock;
4212         lock_descriptor_t *nlock = NULL; /* next lock */
4213         int i;
4214         graph_t *gp;
4215         int     lock_nlmid;
4216 
4217         for (i = 0; i < HASH_SIZE; i++) {
4218                 mutex_enter(&flock_lock);
4219                 gp = lock_graph[i];
4220                 mutex_exit(&flock_lock);
4221                 if (gp == NULL) {
4222                         continue;
4223                 }
4224 
4225                 mutex_enter(&gp->gp_mutex);
4226                 for (lock = ACTIVE_HEAD(gp)->l_next;
4227                     lock != ACTIVE_HEAD(gp);
4228                     lock = nlock) {
4229                         nlock = lock->l_next;
4230                         ASSERT(IS_ACTIVE(lock));
4231 
4232                         /*
4233                          * If it's an NLM server request _and_ the nlmid of
4234                          * the lock matches the nlmid of the argument, then
4235                          * remove the active lock from the list, wake up
4236                          * blocked threads, and free the storage for the lock.
4237                          * Note that there's no need to mark the NLM state
4238                          * of this lock to NLM_DOWN because the lock will
4239                          * be deleted anyway and its storage freed.
4240                          */
4241                         if (IS_LOCKMGR(lock)) {
4242                                 /* get NLM id */
4243                                 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
4244                                 if (nlmid == lock_nlmid) {
4245                                         flk_delete_active_lock(lock, 0);
4246                                         flk_wakeup(lock, 1);
4247                                         flk_free_lock(lock);
4248                                 }
4249                         }
4250                 }
4251                 mutex_exit(&gp->gp_mutex);
4252         }
4253 }
4254 
4255 /*
4256  * Find all sleeping lock manager requests and poke them.
4257  */
4258 static void
4259 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
4260 {
4261         lock_descriptor_t *lock;
4262         lock_descriptor_t *nlock = NULL; /* next lock */
4263         int i;
4264         graph_t *gp;
4265         zoneid_t zoneid = getzoneid();
4266 
4267         for (i = 0; i < HASH_SIZE; i++) {
4268                 mutex_enter(&flock_lock);
4269                 gp = lock_graph[i];
4270                 mutex_exit(&flock_lock);
4271                 if (gp == NULL) {
4272                         continue;
4273                 }
4274 
4275                 mutex_enter(&gp->gp_mutex);
4276                 fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
4277                 for (lock = SLEEPING_HEAD(gp)->l_next;
4278                     lock != SLEEPING_HEAD(gp);
4279                     lock = nlock) {
4280                         nlock = lock->l_next;
4281                         if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
4282                                 INTERRUPT_WAKEUP(lock);
4283                         }
4284                 }
4285                 mutex_exit(&gp->gp_mutex);
4286         }
4287 }
4288 
4289 
4290 /*
4291  * Find all active (granted) lock manager locks and release them.
4292  */
4293 static void
4294 unlock_lockmgr_granted(struct flock_globals *fg)
4295 {
4296         lock_descriptor_t *lock;
4297         lock_descriptor_t *nlock = NULL; /* next lock */
4298         int i;
4299         graph_t *gp;
4300         zoneid_t zoneid = getzoneid();
4301 
4302         for (i = 0; i < HASH_SIZE; i++) {
4303                 mutex_enter(&flock_lock);
4304                 gp = lock_graph[i];
4305                 mutex_exit(&flock_lock);
4306                 if (gp == NULL) {
4307                         continue;
4308                 }
4309 
4310                 mutex_enter(&gp->gp_mutex);
4311                 fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
4312                 for (lock = ACTIVE_HEAD(gp)->l_next;
4313                     lock != ACTIVE_HEAD(gp);
4314                     lock = nlock) {
4315                         nlock = lock->l_next;
4316                         if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
4317                                 ASSERT(IS_ACTIVE(lock));
4318                                 flk_delete_active_lock(lock, 0);
4319                                 flk_wakeup(lock, 1);
4320                                 flk_free_lock(lock);
4321                         }
4322                 }
4323                 mutex_exit(&gp->gp_mutex);
4324         }
4325 }
4326 
4327 /*
4328  * Wait until a lock is granted, cancelled, or interrupted.
4329  */
4330 
4331 static void
4332 wait_for_lock(lock_descriptor_t *request)
4333 {
4334         graph_t *gp = request->l_graph;
4335         vnode_t *vp = request->l_vnode;
4336 
4337         ASSERT(MUTEX_HELD(&gp->gp_mutex));
4338 
4339         while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
4340             !(IS_INTERRUPTED(request))) {
4341                 lock_descriptor_t *lock;
4342 
4343                 if (stale_lock_timeout == 0) {
4344                         /* The stale lock detection is disabled */
4345                         if (cv_wait_sig(&request->l_cv, &gp->gp_mutex) == 0) {
4346                                 flk_set_state(request, FLK_INTERRUPTED_STATE);
4347                                 request->l_state |= INTERRUPTED_LOCK;
4348                         }
4349 
4350                         continue;
4351                 }
4352 
4353                 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4354 
4355                 if (lock != NULL) {
4356                         do {
4357                                 if (BLOCKS(lock, request)) {
4358                                         flk_stale_lock_check(lock);
4359                                         break;
4360                                 }
4361                                 lock = lock->l_next;
4362                         } while (lock->l_vnode == vp);
4363                 }
4364 
4365                 if (cv_timedwait_sig(&request->l_cv, &gp->gp_mutex,
4366                     ddi_get_lbolt() + SEC_TO_TICK(stale_lock_timeout)) == 0) {
4367                         flk_set_state(request, FLK_INTERRUPTED_STATE);
4368                         request->l_state |= INTERRUPTED_LOCK;
4369                 }
4370         }
4371 }
4372 
4373 /*
4374  * Create an flock structure from the existing lock information
4375  *
4376  * This routine is used to create flock structures for the lock manager
4377  * to use in a reclaim request.  Since the lock originated on this
4378  * host, it must conform to UNIX semantics, so no checking is done
4379  * to make sure it falls within the lower half of the 32-bit range.
4380  */
4381 
4382 static void
4383 create_flock(lock_descriptor_t *lp, flock64_t *flp)
4384 {
4385         ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
4386         ASSERT(lp->l_end >= lp->l_start);
4387 
4388         flp->l_type = lp->l_type;
4389         flp->l_whence = 0;
4390         flp->l_start = lp->l_start;
4391         flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
4392             (lp->l_end - lp->l_start + 1);
4393         flp->l_sysid = lp->l_flock.l_sysid;
4394         flp->l_pid = lp->l_flock.l_pid;
4395 }
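
/*
 * A minimal sketch of the encoding performed by create_flock(): a lock
 * descriptor whose l_end is MAX_U_OFFSET_T comes back with l_len == 0
 * (the "to EOF" representation), and a bounded range comes back with its
 * byte count.  The FLOCK_EXAMPLES guard and the function below are
 * hypothetical illustrations only, not part of the normal build.
 */
#ifdef FLOCK_EXAMPLES
static void
create_flock_example(lock_descriptor_t *lp)
{
        flock64_t fl;

        lp->l_type = F_RDLCK;
        lp->l_start = 100;
        lp->l_end = MAX_U_OFFSET_T;             /* lock runs to end of file */
        create_flock(lp, &fl);
        ASSERT(fl.l_start == 100 && fl.l_len == 0);     /* 0 means "to EOF" */

        lp->l_end = 199;                        /* bounded range [100, 199] */
        create_flock(lp, &fl);
        ASSERT(fl.l_len == 100);                /* l_end - l_start + 1 */
}
#endif  /* FLOCK_EXAMPLES */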
4396 
4397 /*
4398  * Convert flock64_t data describing a lock range into u_offset_t starting
4399  * and ending points, which are returned through 'start' and 'end'.
4400  * Returns 0 or an errno value.
4401  */
4402 
4403 int
4404 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
4405     u_offset_t *start, u_offset_t *end, offset_t offset)
4406 {
4407         struct vattr    vattr;
4408         int     error;
4409 
4410         /*
4411          * Determine the starting point of the request
4412          */
4413         switch (flp->l_whence) {
4414         case 0:         /* SEEK_SET */
4415                 *start = (u_offset_t)flp->l_start;
4416                 break;
4417         case 1:         /* SEEK_CUR */
4418                 *start = (u_offset_t)(flp->l_start + offset);
4419                 break;
4420         case 2:         /* SEEK_END */
4421                 vattr.va_mask = AT_SIZE;
4422                 if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
4423                         return (error);
4424                 *start = (u_offset_t)(flp->l_start + vattr.va_size);
4425                 break;
4426         default:
4427                 return (EINVAL);
4428         }
4429 
4430         /*
4431          * Determine the range covered by the request.
4432          */
4433         if (flp->l_len == 0)
4434                 *end = MAX_U_OFFSET_T;
4435         else if ((offset_t)flp->l_len > 0) {
4436                 *end = (u_offset_t)(*start + (flp->l_len - 1));
4437         } else {
4438                 /*
4439                  * Negative length; why do we even allow this?
4440                  * Because it allows easy specification of
4441                  * the last n bytes of the file.
4442                  */
4443                 *end = *start;
4444                 *start += (u_offset_t)flp->l_len;
4445                 (*start)++;
4446         }
4447         return (0);
4448 }
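
/*
 * A minimal sketch of the range conversion done by flk_convert_lock_data(),
 * assuming l_whence is 0 (SEEK_SET) so that the vnode and offset arguments
 * are never consulted.  The FLOCK_EXAMPLES guard and the function below are
 * hypothetical illustrations only, not part of the normal build.
 */
#ifdef FLOCK_EXAMPLES
static void
flk_convert_lock_data_example(vnode_t *vp)
{
        flock64_t fl;
        u_offset_t start, end;

        fl.l_whence = 0;                        /* SEEK_SET */

        /* Positive length: bytes 100 through 149. */
        fl.l_start = 100;
        fl.l_len = 50;
        (void) flk_convert_lock_data(vp, &fl, &start, &end, 0);
        ASSERT(start == 100 && end == 149);

        /* Zero length: from offset 100 to end of file. */
        fl.l_len = 0;
        (void) flk_convert_lock_data(vp, &fl, &start, &end, 0);
        ASSERT(start == 100 && end == MAX_U_OFFSET_T);

        /* Negative length: the 10 bytes ending at offset 1000. */
        fl.l_start = 1000;
        fl.l_len = -10;
        (void) flk_convert_lock_data(vp, &fl, &start, &end, 0);
        ASSERT(start == 991 && end == 1000);
}
#endif  /* FLOCK_EXAMPLES */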
4449 
4450 /*
4451  * Check the validity of lock data.  This can be used by the NFS
4452  * frlock routines to check data before contacting the server.  The
4453  * server must support semantics that aren't as restrictive as
4454  * the UNIX API, so the NFS client is required to check.
4455  * The maximum is passed in by the caller.
4456  */
4457 
4458 int
4459 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
4460 {
4461         /*
4462          * The end of the lock range for local locking should never be
4463          * greater than max.  However, the representation for a lock on
4464          * the entire file is MAX_U_OFFSET_T.
4465          */
4466         if ((start > max) ||
4467             ((end > max) && (end != MAX_U_OFFSET_T))) {
4468                 return (EINVAL);
4469         }
4470         if (start > end) {
4471                 return (EINVAL);
4472         }
4473         return (0);
4474 }
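
/*
 * A minimal sketch of flk_check_lock_data(), assuming MAXEND as the
 * caller-supplied maximum: an in-range request and the whole-file
 * representation are accepted, while a start beyond the end is rejected.
 * The FLOCK_EXAMPLES guard and the function below are hypothetical
 * illustrations only, not part of the normal build.
 */
#ifdef FLOCK_EXAMPLES
static void
flk_check_lock_data_example(void)
{
        /* An ordinary in-range request is accepted. */
        ASSERT(flk_check_lock_data(0, 100, MAXEND) == 0);

        /* A lock to end of file is accepted even though it exceeds max. */
        ASSERT(flk_check_lock_data(0, MAX_U_OFFSET_T, MAXEND) == 0);

        /* A start beyond the end is rejected. */
        ASSERT(flk_check_lock_data(200, 100, MAXEND) == EINVAL);
}
#endif  /* FLOCK_EXAMPLES */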
4475 
4476 /*
4477  * Fill in request->l_flock with information about the lock blocking the
4478  * request.  The complexity here is that lock manager requests are allowed
4479  * to see into the upper part of the 32-bit address range, whereas local
4480  * requests are only allowed to see signed values.
4481  *
4482  * What should be done when "blocker" is a lock manager lock that uses the
4483  * upper portion of the 32-bit range, but "request" is local?  Since the
4484  * request has already been determined to have been blocked by the blocker,
4485  * at least some portion of "blocker" must be in the range of the request,
4486  * or the request extends to the end of file.  For the first case, the
4487  * portion in the lower range is returned with the indication that it goes
4488  * "to EOF."  For the second case, the last byte of the lower range is
4489  * returned with the indication that it goes "to EOF."
4490  */
4491 
4492 static void
4493 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
4494 {
4495         flock64_t *flrp;                        /* l_flock portion of request */
4496 
4497         ASSERT(blocker != NULL);
4498 
4499         flrp = &request->l_flock;
4500         flrp->l_whence = 0;
4501         flrp->l_type = blocker->l_type;
4502         flrp->l_pid = blocker->l_flock.l_pid;
4503         flrp->l_sysid = blocker->l_flock.l_sysid;
4504         request->l_ofd = blocker->l_ofd;
4505 
4506         if (IS_LOCKMGR(request)) {
4507                 flrp->l_start = blocker->l_start;
4508                 if (blocker->l_end == MAX_U_OFFSET_T)
4509                         flrp->l_len = 0;
4510                 else
4511                         flrp->l_len = blocker->l_end - blocker->l_start + 1;
4512         } else {
4513                 if (blocker->l_start > MAXEND) {
4514                         flrp->l_start = MAXEND;
4515                         flrp->l_len = 0;
4516                 } else {
4517                         flrp->l_start = blocker->l_start;
4518                         if (blocker->l_end == MAX_U_OFFSET_T)
4519                                 flrp->l_len = 0;
4520                         else
4521                                 flrp->l_len = blocker->l_end -
4522                                     blocker->l_start + 1;
4523                 }
4524         }
4525 }
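
/*
 * A minimal sketch of the clamping described above, assuming "request" is a
 * local (non-lockmgr) lock and "blocker" is a lock manager lock whose range
 * starts above MAXEND.  The FLOCK_EXAMPLES guard and the function below are
 * hypothetical illustrations only, not part of the normal build.
 */
#ifdef FLOCK_EXAMPLES
static void
report_blocker_example(lock_descriptor_t *blocker, lock_descriptor_t *request)
{
        blocker->l_start = (u_offset_t)MAXEND + 1;
        blocker->l_end = MAX_U_OFFSET_T;
        report_blocker(blocker, request);

        /* The local caller sees the last representable byte, "to EOF". */
        ASSERT(request->l_flock.l_start == MAXEND);
        ASSERT(request->l_flock.l_len == 0);
}
#endif  /* FLOCK_EXAMPLES */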
4526 
4527 /*
4528  * PSARC case 1997/292
4529  */
4530 /*
4531  * This is the public routine exported by flock.h.
4532  */
4533 void
4534 cl_flk_change_nlm_state_to_unknown(int nlmid)
4535 {
4536         /*
4537          * Check to see if the node is booted as a cluster. If not, return.
4538          */
4539         if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
4540                 return;
4541         }
4542 
4543         /*
4544          * See comment in cl_flk_set_nlm_status().
4545          */
4546         if (nlm_reg_status == NULL) {
4547                 return;
4548         }
4549 
4550         /*
4551          * Protect the NLM registry state with a mutex.
4552          */
4553         ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
4554         mutex_enter(&nlm_reg_lock);
4555         FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
4556         mutex_exit(&nlm_reg_lock);
4557 }
4558 
4559 /*
4560  * Return non-zero if the given I/O request conflicts with an active NBMAND
4561  * lock.
4562  * If svmand is non-zero, all active locks are considered, not just NBMAND
4563  * locks.
4564  */
4565 
4566 int
4567 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
4568     ssize_t length, int svmand, caller_context_t *ct)
4569 {
4570         int conflict = 0;
4571         graph_t                 *gp;
4572         lock_descriptor_t       *lock;
4573         pid_t pid;
4574         int sysid;
4575 
4576         if (ct == NULL) {
4577                 pid = curproc->p_pid;
4578                 sysid = 0;
4579         } else {
4580                 pid = ct->cc_pid;
4581                 sysid = ct->cc_sysid;
4582         }
4583 
4584         mutex_enter(&flock_lock);
4585         gp = lock_graph[HASH_INDEX(vp)];
4586         mutex_exit(&flock_lock);
4587         if (gp == NULL)
4588                 return (0);
4589 
4590         mutex_enter(&gp->gp_mutex);
4591         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4592 
4593         for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
4594                 if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
4595                     (lock->l_flock.l_sysid != sysid ||
4596                     lock->l_flock.l_pid != pid) &&
4597                     lock_blocks_io(op, offset, length,
4598                     lock->l_type, lock->l_start, lock->l_end)) {
4599                         conflict = 1;
4600                         break;
4601                 }
4602         }
4603         mutex_exit(&gp->gp_mutex);
4604 
4605         return (conflict);
4606 }
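
/*
 * A minimal sketch of how a caller might consult nbl_lock_conflict() before
 * performing a read.  The function name and the EACCES return value are
 * illustrative assumptions; real callers choose their own error handling and
 * typically also enter an nbmand critical section around the check and the
 * I/O.  Guarded by the hypothetical FLOCK_EXAMPLES define.
 */
#ifdef FLOCK_EXAMPLES
static int
nbl_read_check_example(vnode_t *vp, u_offset_t off, ssize_t len,
    caller_context_t *ct)
{
        /* svmand == 0: only NBMAND locks are considered. */
        if (nbl_lock_conflict(vp, NBL_READ, off, len, 0, ct))
                return (EACCES);
        return (0);
}
#endif  /* FLOCK_EXAMPLES */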
4607 
4608 /*
4609  * Return non-zero if the given I/O request conflicts with the given lock.
4610  */
4611 
4612 static int
4613 lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
4614     int lock_type, u_offset_t lock_start, u_offset_t lock_end)
4615 {
4616         ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
4617         ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
4618 
4619         if (op == NBL_READ && lock_type == F_RDLCK)
4620                 return (0);
4621 
4622         if (offset <= lock_start && lock_start < offset + length)
4623                 return (1);
4624         if (lock_start <= offset && offset <= lock_end)
4625                 return (1);
4626 
4627         return (0);
4628 }
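
/*
 * A minimal worked example of the overlap test in lock_blocks_io(): an I/O
 * of "length" bytes at "offset" conflicts with a lock on [lock_start,
 * lock_end] unless both sides are reads or the ranges are disjoint.
 * Guarded by the hypothetical FLOCK_EXAMPLES define; not part of the
 * normal build.
 */
#ifdef FLOCK_EXAMPLES
static void
lock_blocks_io_example(void)
{
        /* A write lock on [140, 200] overlaps a 50-byte I/O at offset 100. */
        ASSERT(lock_blocks_io(NBL_WRITE, 100, 50, F_WRLCK, 140, 200) == 1);

        /* Read against a read lock never conflicts, overlap or not. */
        ASSERT(lock_blocks_io(NBL_READ, 100, 50, F_RDLCK, 140, 200) == 0);

        /* Disjoint ranges never conflict. */
        ASSERT(lock_blocks_io(NBL_READ, 0, 10, F_WRLCK, 140, 200) == 0);
}
#endif  /* FLOCK_EXAMPLES */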
4629 
4630 #ifdef DEBUG
4631 static void
4632 check_active_locks(graph_t *gp)
4633 {
4634         lock_descriptor_t *lock, *lock1;
4635         edge_t  *ep;
4636 
4637         for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
4638             lock = lock->l_next) {
4639                 ASSERT(IS_ACTIVE(lock));
4640                 ASSERT(NOT_BLOCKED(lock));
4641                 ASSERT(!IS_BARRIER(lock));
4642 
4643                 ep = FIRST_IN(lock);
4644 
4645                 while (ep != HEAD(lock)) {
4646                         ASSERT(IS_SLEEPING(ep->from_vertex));
4647                         ASSERT(!NOT_BLOCKED(ep->from_vertex));
4648                         ep = NEXT_IN(ep);
4649                 }
4650 
4651                 for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
4652                     lock1 = lock1->l_next) {
4653                         if (lock1->l_vnode == lock->l_vnode) {
4654                                 if (BLOCKS(lock1, lock)) {
4655                                         cmn_err(CE_PANIC,
4656                                             "active lock %p blocks %p",
4657                                             (void *)lock1, (void *)lock);
4658                                 } else if (BLOCKS(lock, lock1)) {
4659                                         cmn_err(CE_PANIC,
4660                                             "active lock %p blocks %p",
4661                                             (void *)lock, (void *)lock1);
4662                                 }
4663                         }
4664                 }
4665         }
4666 }
4667 
4668 /*
4669  * Effect: This function checks to see if the transition from 'old_state' to
4670  *      'new_state' is a valid one.  It returns 0 if the transition is valid
4671  *      and 1 if it is not.
4672  *      For a map of valid transitions, see sys/flock_impl.h
4673  */
4674 static int
4675 check_lock_transition(int old_state, int new_state)
4676 {
4677         switch (old_state) {
4678         case FLK_INITIAL_STATE:
4679                 if ((new_state == FLK_START_STATE) ||
4680                     (new_state == FLK_SLEEPING_STATE) ||
4681                     (new_state == FLK_ACTIVE_STATE) ||
4682                     (new_state == FLK_DEAD_STATE)) {
4683                         return (0);
4684                 } else {
4685                         return (1);
4686                 }
4687         case FLK_START_STATE:
4688                 if ((new_state == FLK_ACTIVE_STATE) ||
4689                     (new_state == FLK_DEAD_STATE)) {
4690                         return (0);
4691                 } else {
4692                         return (1);
4693                 }
4694         case FLK_ACTIVE_STATE:
4695                 if (new_state == FLK_DEAD_STATE) {
4696                         return (0);
4697                 } else {
4698                         return (1);
4699                 }
4700         case FLK_SLEEPING_STATE:
4701                 if ((new_state == FLK_GRANTED_STATE) ||
4702                     (new_state == FLK_INTERRUPTED_STATE) ||
4703                     (new_state == FLK_CANCELLED_STATE)) {
4704                         return (0);
4705                 } else {
4706                         return (1);
4707                 }
4708         case FLK_GRANTED_STATE:
4709                 if ((new_state == FLK_START_STATE) ||
4710                     (new_state == FLK_INTERRUPTED_STATE) ||
4711                     (new_state == FLK_CANCELLED_STATE)) {
4712                         return (0);
4713                 } else {
4714                         return (1);
4715                 }
4716         case FLK_CANCELLED_STATE:
4717                 if ((new_state == FLK_INTERRUPTED_STATE) ||
4718                     (new_state == FLK_DEAD_STATE)) {
4719                         return (0);
4720                 } else {
4721                         return (1);
4722                 }
4723         case FLK_INTERRUPTED_STATE:
4724                 if (new_state == FLK_DEAD_STATE) {
4725                         return (0);
4726                 } else {
4727                         return (1);
4728                 }
4729         case FLK_DEAD_STATE:
4730                 /* May be set more than once */
4731                 if (new_state == FLK_DEAD_STATE) {
4732                         return (0);
4733                 } else {
4734                         return (1);
4735                 }
4736         default:
4737                 return (1);
4738         }
4739 }
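
/*
 * A minimal sketch of check_lock_transition(): 0 means the transition is
 * legal, 1 means it is not.  Guarded by the hypothetical FLOCK_EXAMPLES
 * define; not part of the normal build.
 */
#ifdef FLOCK_EXAMPLES
static void
check_lock_transition_example(void)
{
        /* A sleeping lock may be granted... */
        ASSERT(check_lock_transition(FLK_SLEEPING_STATE,
            FLK_GRANTED_STATE) == 0);

        /* ...but may not jump straight to the active state. */
        ASSERT(check_lock_transition(FLK_SLEEPING_STATE,
            FLK_ACTIVE_STATE) == 1);
}
#endif  /* FLOCK_EXAMPLES */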
4740 
4741 static void
4742 check_sleeping_locks(graph_t *gp)
4743 {
4744         lock_descriptor_t *lock1, *lock2;
4745         edge_t *ep;
4746         for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
4747             lock1 = lock1->l_next) {
4748                 ASSERT(!IS_BARRIER(lock1));
4749                 for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
4750                     lock2 = lock2->l_next) {
4751                         if (lock1->l_vnode == lock2->l_vnode) {
4752                                 if (BLOCKS(lock2, lock1)) {
4753                                         ASSERT(!IS_GRANTED(lock1));
4754                                         ASSERT(!NOT_BLOCKED(lock1));
4755                                         path(lock1, lock2);
4756                                 }
4757                         }
4758                 }
4759 
4760                 for (lock2 = ACTIVE_HEAD(gp)->l_next;
4761                     lock2 != ACTIVE_HEAD(gp); lock2 = lock2->l_next) {
4762                         ASSERT(!IS_BARRIER(lock1));
4763                         if (lock1->l_vnode == lock2->l_vnode) {
4764                                 if (BLOCKS(lock2, lock1)) {
4765                                         ASSERT(!IS_GRANTED(lock1));
4766                                         ASSERT(!NOT_BLOCKED(lock1));
4767                                         path(lock1, lock2);
4768                                 }
4769                         }
4770                 }
4771                 ep = FIRST_ADJ(lock1);
4772                 while (ep != HEAD(lock1)) {
4773                         ASSERT(BLOCKS(ep->to_vertex, lock1));
4774                         ep = NEXT_ADJ(ep);
4775                 }
4776         }
4777 }
4778 
4779 static int
4780 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
4781 {
4782         edge_t  *ep;
4783         lock_descriptor_t       *vertex;
4784         lock_descriptor_t *vertex_stack;
4785 
4786         STACK_INIT(vertex_stack);
4787 
4788         flk_graph_uncolor(lock1->l_graph);
4789         ep = FIRST_ADJ(lock1);
4790         ASSERT(ep != HEAD(lock1));
4791         while (ep != HEAD(lock1)) {
4792                 if (no_path)
4793                         ASSERT(ep->to_vertex != lock2);
4794                 STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4795                 COLOR(ep->to_vertex);
4796                 ep = NEXT_ADJ(ep);
4797         }
4798 
4799         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
4800                 STACK_POP(vertex_stack, l_dstack);
4801                 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
4802                     ep = NEXT_ADJ(ep)) {
4803                         if (COLORED(ep->to_vertex))
4804                                 continue;
4805                         COLOR(ep->to_vertex);
4806                         if (ep->to_vertex == lock2)
4807                                 return (1);
4808 
4809                         STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4810                 }
4811         }
4812         return (0);
4813 }
4814 
4815 static void
4816 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
4817 {
4818         lock_descriptor_t *lock;
4819 
4820         /* Ignore OFD style locks since they're not process-wide. */
4821         if (pid == 0)
4822                 return;
4823 
4824         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4825 
4826         if (lock) {
4827                 while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
4828                         if (lock->l_flock.l_pid == pid &&
4829                             lock->l_flock.l_sysid == sysid)
4830                                 cmn_err(CE_PANIC,
4831                                     "owner pid %d's lock %p in active queue",
4832                                     pid, (void *)lock);
4833                         lock = lock->l_next;
4834                 }
4835         }
4836         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
4837 
4838         if (lock) {
4839                 while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
4840                         if (lock->l_flock.l_pid == pid &&
4841                             lock->l_flock.l_sysid == sysid)
4842                                 cmn_err(CE_PANIC,
4843                                     "owner pid %d's lock %p in sleep queue",
4844                                     pid, (void *)lock);
4845                         lock = lock->l_next;
4846                 }
4847         }
4848 }
4849 
4850 static int
4851 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4852 {
4853         edge_t *ep = FIRST_ADJ(lock1);
4854 
4855         while (ep != HEAD(lock1)) {
4856                 if (ep->to_vertex == lock2)
4857                         return (1);
4858                 else
4859                         ep = NEXT_ADJ(ep);
4860         }
4861         return (0);
4862 }
4863 
4864 static int
4865 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4866 {
4867         return (!level_two_path(lock1, lock2, 1));
4868 }
4869 
4870 static void
4871 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4872 {
4873         if (level_one_path(lock1, lock2)) {
4874                 if (level_two_path(lock1, lock2, 0) != 0) {
4875                         cmn_err(CE_WARN,
4876                             "one edge one path from lock1 %p lock2 %p",
4877                             (void *)lock1, (void *)lock2);
4878                 }
4879         } else if (no_path(lock1, lock2)) {
4880                 cmn_err(CE_PANIC,
4881                     "No path from lock1 %p to lock2 %p",
4882                     (void *)lock1, (void *)lock2);
4883         }
4884 }
4885 #endif /* DEBUG */