--- old/usr/src/uts/common/os/flock.c
+++ new/usr/src/uts/common/os/flock.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  24   24   * Use is subject to license terms.
  25   25   */
  26   26  
  27   27  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  28   28  /*      All Rights Reserved */
  29   29  
  30   30  /*
  31   31   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  32   32   * Copyright 2015 Joyent, Inc.
  33   33   */
  34   34  
  35   35  #include <sys/flock_impl.h>
  36   36  #include <sys/vfs.h>
  37   37  #include <sys/t_lock.h>         /* for <sys/callb.h> */
  38   38  #include <sys/callb.h>
  39   39  #include <sys/clconf.h>
  40   40  #include <sys/cladm.h>
  41   41  #include <sys/nbmlock.h>
  42   42  #include <sys/cred.h>
  43   43  #include <sys/policy.h>
  44   44  
  45   45  /*
   46   46   * The following variables are for statistics purposes. They are not
   47   47   * protected by locks, so they may not be accurate, but they will at
   48   48   * least be close to the actual values.
  49   49   */
  50   50  
  51   51  int     flk_lock_allocs;
  52   52  int     flk_lock_frees;
  53   53  int     edge_allocs;
  54   54  int     edge_frees;
  55   55  int     flk_proc_vertex_allocs;
  56   56  int     flk_proc_edge_allocs;
  57   57  int     flk_proc_vertex_frees;
  58   58  int     flk_proc_edge_frees;
  59   59  
  60   60  static kmutex_t flock_lock;
  61   61  
  62   62  #ifdef DEBUG
  63   63  int check_debug = 0;
  64   64  #define CHECK_ACTIVE_LOCKS(gp)  if (check_debug) \
  65   65                                          check_active_locks(gp);
  66   66  #define CHECK_SLEEPING_LOCKS(gp)        if (check_debug) \
  67   67                                                  check_sleeping_locks(gp);
  68   68  #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)   \
  69   69                  if (check_debug)        \
  70   70                          check_owner_locks(gp, pid, sysid, vp);
  71   71  #define CHECK_LOCK_TRANSITION(old_state, new_state) \
  72   72          { \
  73   73                  if (check_lock_transition(old_state, new_state)) { \
  74   74                          cmn_err(CE_PANIC, "Illegal lock transition \
  75   75                              from %d to %d", old_state, new_state); \
  76   76                  } \
  77   77          }
  78   78  #else
  79   79  
  80   80  #define CHECK_ACTIVE_LOCKS(gp)
  81   81  #define CHECK_SLEEPING_LOCKS(gp)
  82   82  #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
  83   83  #define CHECK_LOCK_TRANSITION(old_state, new_state)
  84   84  
  85   85  #endif /* DEBUG */
  86   86  
  87   87  struct kmem_cache       *flk_edge_cache;
  88   88  
  89   89  graph_t         *lock_graph[HASH_SIZE];
  90   90  proc_graph_t    pgraph;
  91   91  
  92   92  /*
  93   93   * Clustering.
  94   94   *
  95   95   * NLM REGISTRY TYPE IMPLEMENTATION
  96   96   *
  97   97   * Assumptions:
  98   98   *  1.  Nodes in a cluster are numbered starting at 1; always non-negative
  99   99   *      integers; maximum node id is returned by clconf_maximum_nodeid().
 100  100   *  2.  We use this node id to identify the node an NLM server runs on.
 101  101   */
 102  102  
 103  103  /*
  104  104   * The NLM registry object keeps track of the NLM servers (identified
  105  105   * by their nlmids, which are the node ids of the cluster nodes they
  106  106   * run on) that have requested locks from the LLM with which this
  107  107   * registry is associated.
 108  108   *
 109  109   * Representation of abstraction:
 110  110   *    rep = record[     states: array[nlm_state],
 111  111   *                      lock: mutex]
 112  112   *
 113  113   *    Representation invariants:
  114  114   *      1. index i of rep.states is between 0 and n - 1, where n is the
  115  115   *         number of elements in the array, which happens to be the
  116  116   *         maximum number of nodes in the cluster configuration + 1.
  117  117   *      2. map nlmid to index i of rep.states
  118  118   *              0   -> 0
  119  119   *              1   -> 1
  120  120   *              2   -> 2
  121  121   *              n-1 -> clconf_maximum_nodeid()
  122  122   *      3.  This 1-1 mapping is quite convenient and avoids errors that
  123  123   *          would result from forgetting to subtract 1 from the index.
 124  124   *      4.  The reason we keep the 0th index is the following.  A legitimate
 125  125   *          cluster configuration includes making a UFS file system NFS
 126  126   *          exportable.  The code is structured so that if you're in a cluster
 127  127   *          you do one thing; otherwise, you do something else.  The problem
 128  128   *          is what to do if you think you're in a cluster with PXFS loaded,
 129  129   *          but you're using UFS not PXFS?  The upper two bytes of the sysid
 130  130   *          encode the node id of the node where NLM server runs; these bytes
 131  131   *          are zero for UFS.  Since the nodeid is used to index into the
 132  132   *          registry, we can record the NLM server state information at index
 133  133   *          0 using the same mechanism used for PXFS file locks!
 134  134   */
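
/*
 * Illustrative sketch of the scheme above (annotation, not part of the
 * original source; the exact macro definitions live in the flock/cladm
 * headers).  Since the upper two bytes of the sysid encode the node id:
 *
 *	sysid 0x00020007: GETNLMID(sysid) == 2, so index 2 of
 *	    nlm_reg_status records that NLM server's state;
 *	sysid 0x00000007 (local UFS, upper bytes zero): GETNLMID(sysid)
 *	    == 0, so index 0 is used, as described in point 4 above.
 */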
 135  135  static flk_nlm_status_t *nlm_reg_status = NULL; /* state array 0..N-1 */
  136  136  static kmutex_t nlm_reg_lock;                   /* lock to protect array */
 137  137  static uint_t nlm_status_size;                  /* size of state array */
 138  138  
 139  139  /*
 140  140   * Although we need a global lock dependency graph (and associated data
 141  141   * structures), we also need a per-zone notion of whether the lock manager is
 142  142   * running, and so whether to allow lock manager requests or not.
 143  143   *
 144  144   * Thus, on a per-zone basis we maintain a ``global'' variable
 145  145   * (flk_lockmgr_status), protected by flock_lock, and set when the lock
 146  146   * manager is determined to be changing state (starting or stopping).
 147  147   *
 148  148   * Each graph/zone pair also has a copy of this variable, which is protected by
 149  149   * the graph's mutex.
 150  150   *
 151  151   * The per-graph copies are used to synchronize lock requests with shutdown
 152  152   * requests.  The global copy is used to initialize the per-graph field when a
 153  153   * new graph is created.
 154  154   */
 155  155  struct flock_globals {
 156  156          flk_lockmgr_status_t flk_lockmgr_status;
 157  157          flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
 158  158  };
 159  159  
 160  160  zone_key_t flock_zone_key;
 161  161  
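/*
 * A minimal sketch of how flock_zone_key is presumably registered (the
 * actual zone_key_create() call lives outside this file, in the module
 * that loads the KLM; shown here only to tie flk_zone_init() and
 * flk_zone_fini() below to the zone framework):
 *
 *	zone_key_create(&flock_zone_key, flk_zone_init, NULL,
 *	    flk_zone_fini);
 */
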
 162  162  static void create_flock(lock_descriptor_t *, flock64_t *);
 163  163  static lock_descriptor_t        *flk_get_lock(void);
 164  164  static void     flk_free_lock(lock_descriptor_t *lock);
 165  165  static void     flk_get_first_blocking_lock(lock_descriptor_t *request);
 166  166  static int flk_process_request(lock_descriptor_t *);
 167  167  static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
 168  168  static edge_t *flk_get_edge(void);
 169  169  static int flk_wait_execute_request(lock_descriptor_t *);
 170  170  static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
 171  171  static void flk_insert_active_lock(lock_descriptor_t *);
 172  172  static void flk_delete_active_lock(lock_descriptor_t *, int);
 173  173  static void flk_insert_sleeping_lock(lock_descriptor_t *);
 174  174  static void flk_graph_uncolor(graph_t *);
 175  175  static void flk_wakeup(lock_descriptor_t *, int);
 176  176  static void flk_free_edge(edge_t *);
 177  177  static void flk_recompute_dependencies(lock_descriptor_t *,
 178  178                          lock_descriptor_t **,  int, int);
 179  179  static int flk_find_barriers(lock_descriptor_t *);
 180  180  static void flk_update_barriers(lock_descriptor_t *);
 181  181  static int flk_color_reachables(lock_descriptor_t *);
 182  182  static int flk_canceled(lock_descriptor_t *);
 183  183  static void flk_delete_locks_by_sysid(lock_descriptor_t *);
 184  184  static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
 185  185  static void wait_for_lock(lock_descriptor_t *);
 186  186  static void unlock_lockmgr_granted(struct flock_globals *);
 187  187  static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
 188  188  
 189  189  /* Clustering hooks */
 190  190  static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
 191  191  static void cl_flk_wakeup_sleeping_nlm_locks(int);
 192  192  static void cl_flk_unlock_nlm_granted(int);
 193  193  
 194  194  #ifdef DEBUG
 195  195  static int check_lock_transition(int, int);
 196  196  static void check_sleeping_locks(graph_t *);
 197  197  static void check_active_locks(graph_t *);
 198  198  static int no_path(lock_descriptor_t *, lock_descriptor_t *);
 199  199  static void path(lock_descriptor_t *, lock_descriptor_t *);
 200  200  static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
 201  201  static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
 202  202  static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
 203  203  #endif
 204  204  
 205  205  /*      proc_graph function definitions */
 206  206  static int flk_check_deadlock(lock_descriptor_t *);
 207  207  static void flk_proc_graph_uncolor(void);
 208  208  static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
 209  209  static proc_edge_t *flk_get_proc_edge(void);
 210  210  static void flk_proc_release(proc_vertex_t *);
 211  211  static void flk_free_proc_edge(proc_edge_t *);
 212  212  static void flk_update_proc_graph(edge_t *, int);
 213  213  
 214  214  /* Non-blocking mandatory locking */
 215  215  static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
 216  216                          u_offset_t);
 217  217  
 218  218  static struct flock_globals *
 219  219  flk_get_globals(void)
 220  220  {
 221  221          /*
 222  222           * The KLM module had better be loaded if we're attempting to handle
 223  223           * lockmgr requests.
 224  224           */
 225  225          ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
 226  226          return (zone_getspecific(flock_zone_key, curproc->p_zone));
 227  227  }
 228  228  
 229  229  static flk_lockmgr_status_t
 230  230  flk_get_lockmgr_status(void)
 231  231  {
 232  232          struct flock_globals *fg;
 233  233  
 234  234          ASSERT(MUTEX_HELD(&flock_lock));
 235  235  
 236  236          if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
 237  237                  /*
 238  238                   * KLM module not loaded; lock manager definitely not running.
 239  239                   */
 240  240                  return (FLK_LOCKMGR_DOWN);
 241  241          }
 242  242          fg = flk_get_globals();
 243  243          return (fg->flk_lockmgr_status);
 244  244  }
 245  245  
 246  246  /*
 247  247   * This implements Open File Description (not descriptor) style record locking.
 248  248   * These locks can also be thought of as pid-less since they are not tied to a
  249  249   * specific process; thus they are preserved across fork(2).
 250  250   *
 251  251   * Called directly from fcntl.
 252  252   *
 253  253   * See reclock() for the implementation of the traditional POSIX style record
 254  254   * locking scheme (pid-ful). This function is derived from reclock() but
 255  255   * simplified and modified to work for OFD style locking.
 256  256   *
 257  257   * The two primary advantages of OFD style of locking are:
 258  258   * 1) It is per-file description, so closing a file descriptor that refers to a
 259  259   *    different file description for the same file will not drop the lock (i.e.
  260  260   *    two opens of the same file get different descriptions, but a dup or fork
 261  261   *    will refer to the same description).
 262  262   * 2) Locks are preserved across fork(2).
 263  263   *
 264  264   * Because these locks are per-description a lock ptr lives at the f_filocks
 265  265   * member of the file_t and the lock_descriptor includes a file_t pointer
 266  266   * to enable unique lock identification and management.
 267  267   *
 268  268   * Since these locks are pid-less we cannot do deadlock detection with the
 269  269   * current process-oriented implementation. This is consistent with OFD locking
 270  270   * behavior on other operating systems such as Linux. Since we don't do
 271  271   * deadlock detection we never interact with the process graph that is
 272  272   * maintained for deadlock detection on the traditional POSIX-style locks.
 273  273   *
 274  274   * Future Work:
 275  275   *
 276  276   * The current implementation does not support record locks. That is,
 277  277   * currently the single lock must cover the entire file. This is validated in
 278  278   * fcntl. To support record locks the f_filock pointer in the file_t needs to
 279  279   * be changed to a list of pointers to the locks. That list needs to be
 280  280   * managed independently of the lock list on the vnode itself and it needs to
 281  281   * be maintained as record locks are created, split, coalesced and deleted.
 282  282   *
 283  283   * The current implementation does not support remote file systems (e.g.
 284  284   * NFS or CIFS). This is handled in fs_frlock(). The design of how OFD locks
 285  285   * interact with the NLM is not clear since the NLM protocol/implementation
 286  286   * appears to be oriented around locks associated with a process. A further
 287  287   * problem is that a design is needed for what nlm_send_siglost() should do and
 288  288   * where it will send SIGLOST. More recent versions of Linux apparently try to
 289  289   * emulate OFD locks on NFS by converting them to traditional POSIX style locks
 290  290   * that work with the NLM. It is not clear that this provides the correct
 291  291   * semantics in all cases.
 292  292   */
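
/*
 * Illustrative userland sketch (annotation, not part of the original
 * source), assuming the F_OFD_* commands are exposed through <fcntl.h>
 * and fd is already open(2)'d with FWRITE.  l_pid is forced to 0 by the
 * kernel for these pid-less locks, and since record locks are not yet
 * supported, l_whence/l_start/l_len must all be 0 so the lock covers
 * the whole file:
 *
 *	struct flock fl;
 *
 *	(void) memset(&fl, 0, sizeof (fl));
 *	fl.l_type = F_WRLCK;
 *	if (fcntl(fd, F_OFD_SETLK, &fl) == -1)
 *		perror("F_OFD_SETLK");
 *
 * A dup(2)'d descriptor shares the file description and therefore the
 * lock; a second open(2) of the same file gets a new description and
 * does not.
 */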
 293  293  int
 294  294  ofdlock(file_t *fp, int fcmd, flock64_t *lckdat, int flag, u_offset_t offset)
 295  295  {
 296  296          int cmd = 0;
 297  297          vnode_t *vp;
 298  298          lock_descriptor_t       stack_lock_request;
 299  299          lock_descriptor_t       *lock_request;
 300  300          int error = 0;
 301  301          graph_t *gp;
 302  302          int serialize = 0;
 303  303  
 304  304          if (fcmd != F_OFD_GETLK)
 305  305                  cmd = SETFLCK;
 306  306  
 307  307          if (fcmd == F_OFD_SETLKW || fcmd == F_FLOCKW)
 308  308                  cmd |= SLPFLCK;
 309  309  
 310  310          /* see block comment */
 311  311          VERIFY(lckdat->l_whence == 0);
 312  312          VERIFY(lckdat->l_start == 0);
 313  313          VERIFY(lckdat->l_len == 0);
 314  314  
 315  315          vp = fp->f_vnode;
 316  316  
 317  317          /*
  318  318           * For reclock, fs_frlock() would normally have set these in a few
  319  319           * places, but for us it's cleaner to centralize it here. Note that
  320  320           * IGN_PID is -1. We use 0 for our pid-less locks.
 321  321           */
 322  322          lckdat->l_pid = 0;
 323  323          lckdat->l_sysid = 0;
 324  324  
 325  325          /*
 326  326           * Check access permissions
 327  327           */
 328  328          if ((fcmd == F_OFD_SETLK || fcmd == F_OFD_SETLKW) &&
 329  329              ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
 330  330              (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
 331  331                  return (EBADF);
 332  332  
 333  333          /*
 334  334           * for query and unlock we use the stack_lock_request
 335  335           */
 336  336          if (lckdat->l_type == F_UNLCK || !(cmd & SETFLCK)) {
 337  337                  lock_request = &stack_lock_request;
 338  338                  (void) bzero((caddr_t)lock_request,
 339  339                      sizeof (lock_descriptor_t));
 340  340  
 341  341                  /*
  342  342                   * the following is added to make the assertions in
 343  343                   * flk_execute_request() pass
 344  344                   */
 345  345                  lock_request->l_edge.edge_in_next = &lock_request->l_edge;
 346  346                  lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
 347  347                  lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
 348  348                  lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
 349  349                  lock_request->l_status = FLK_INITIAL_STATE;
 350  350          } else {
 351  351                  lock_request = flk_get_lock();
 352  352                  fp->f_filock = (struct filock *)lock_request;
 353  353          }
 354  354          lock_request->l_state = 0;
 355  355          lock_request->l_vnode = vp;
 356  356          lock_request->l_zoneid = getzoneid();
 357  357          lock_request->l_ofd = fp;
 358  358  
 359  359          /*
 360  360           * Convert the request range into the canonical start and end
 361  361           * values then check the validity of the lock range.
 362  362           */
 363  363          error = flk_convert_lock_data(vp, lckdat, &lock_request->l_start,
 364  364              &lock_request->l_end, offset);
 365  365          if (error)
 366  366                  goto done;
 367  367  
 368  368          error = flk_check_lock_data(lock_request->l_start, lock_request->l_end,
 369  369              MAXEND);
 370  370          if (error)
 371  371                  goto done;
 372  372  
 373  373          ASSERT(lock_request->l_end >= lock_request->l_start);
 374  374  
 375  375          lock_request->l_type = lckdat->l_type;
 376  376          if (cmd & SLPFLCK)
 377  377                  lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
 378  378  
 379  379          if (!(cmd & SETFLCK)) {
 380  380                  if (lock_request->l_type == F_RDLCK ||
 381  381                      lock_request->l_type == F_WRLCK)
 382  382                          lock_request->l_state |= QUERY_LOCK;
 383  383          }
 384  384          lock_request->l_flock = (*lckdat);
 385  385  
 386  386          /*
 387  387           * We are ready for processing the request
 388  388           */
 389  389  
 390  390          if (fcmd != F_OFD_GETLK && lock_request->l_type != F_UNLCK &&
 391  391              nbl_need_check(vp)) {
 392  392                  nbl_start_crit(vp, RW_WRITER);
 393  393                  serialize = 1;
 394  394          }
 395  395  
 396  396          /* Get the lock graph for a particular vnode */
 397  397          gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
 398  398  
 399  399          mutex_enter(&gp->gp_mutex);
 400  400  
 401  401          lock_request->l_state |= REFERENCED_LOCK;
 402  402          lock_request->l_graph = gp;
 403  403  
 404  404          switch (lock_request->l_type) {
 405  405          case F_RDLCK:
 406  406          case F_WRLCK:
 407  407                  if (IS_QUERY_LOCK(lock_request)) {
 408  408                          flk_get_first_blocking_lock(lock_request);
 409  409                          if (lock_request->l_ofd != NULL)
 410  410                                  lock_request->l_flock.l_pid = -1;
 411  411                          (*lckdat) = lock_request->l_flock;
 412  412                  } else {
 413  413                          /* process the request now */
 414  414                          error = flk_process_request(lock_request);
 415  415                  }
 416  416                  break;
 417  417  
 418  418          case F_UNLCK:
 419  419                  /* unlock request will not block so execute it immediately */
 420  420                  error = flk_execute_request(lock_request);
 421  421                  break;
 422  422  
 423  423          default:
 424  424                  error = EINVAL;
 425  425                  break;
 426  426          }
 427  427  
 428  428          if (lock_request == &stack_lock_request) {
 429  429                  flk_set_state(lock_request, FLK_DEAD_STATE);
 430  430          } else {
 431  431                  lock_request->l_state &= ~REFERENCED_LOCK;
 432  432                  if ((error != 0) || IS_DELETED(lock_request)) {
 433  433                          flk_set_state(lock_request, FLK_DEAD_STATE);
 434  434                          flk_free_lock(lock_request);
 435  435                  }
 436  436          }
 437  437  
 438  438          mutex_exit(&gp->gp_mutex);
 439  439          if (serialize)
 440  440                  nbl_end_crit(vp);
 441  441  
 442  442          return (error);
 443  443  
 444  444  done:
 445  445          flk_set_state(lock_request, FLK_DEAD_STATE);
 446  446          if (lock_request != &stack_lock_request)
 447  447                  flk_free_lock(lock_request);
 448  448          return (error);
 449  449  }
 450  450  
 451  451  /*
 452  452   * Remove any lock on the vnode belonging to the given file_t.
  453  453   * Called from closef() on last close; the file_t is locked.
 454  454   *
 455  455   * This is modeled on the cleanlocks() function but only removes the single
 456  456   * lock associated with fp.
 457  457   */
 458  458  void
 459  459  ofdcleanlock(file_t *fp)
 460  460  {
 461  461          lock_descriptor_t *fplock, *lock, *nlock;
 462  462          vnode_t *vp;
 463  463          graph_t *gp;
 464  464  
 465  465          ASSERT(MUTEX_HELD(&fp->f_tlock));
 466  466  
 467  467          if ((fplock = (lock_descriptor_t *)fp->f_filock) == NULL)
 468  468                  return;
 469  469  
 470  470          fp->f_filock = NULL;
 471  471          vp = fp->f_vnode;
 472  472  
 473  473          gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
 474  474  
 475  475          if (gp == NULL)
 476  476                  return;
 477  477          mutex_enter(&gp->gp_mutex);
 478  478  
 479  479          CHECK_SLEEPING_LOCKS(gp);
 480  480          CHECK_ACTIVE_LOCKS(gp);
 481  481  
 482  482          SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
 483  483  
 484  484          if (lock) {
 485  485                  do {
 486  486                          nlock = lock->l_next;
 487  487                          if (fplock == lock) {
 488  488                                  CANCEL_WAKEUP(lock);
 489  489                                  break;
 490  490                          }
 491  491                          lock = nlock;
 492  492                  } while (lock->l_vnode == vp);
 493  493          }
 494  494  
 495  495          SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
 496  496  
 497  497          if (lock) {
 498  498                  do {
 499  499                          nlock = lock->l_next;
 500  500                          if (fplock == lock) {
 501  501                                  flk_delete_active_lock(lock, 0);
 502  502                                  flk_wakeup(lock, 1);
 503  503                                  flk_free_lock(lock);
 504  504                                  break;
 505  505                          }
 506  506                          lock = nlock;
 507  507                  } while (lock->l_vnode == vp);
 508  508          }
 509  509  
 510  510          CHECK_SLEEPING_LOCKS(gp);
 511  511          CHECK_ACTIVE_LOCKS(gp);
 512  512          mutex_exit(&gp->gp_mutex);
 513  513  }
 514  514  
 515  515  /*
 516  516   * Routine called from fs_frlock in fs/fs_subr.c
 517  517   *
 518  518   * This implements traditional POSIX style record locking. The two primary
 519  519   * drawbacks to this style of locking are:
 520  520   * 1) It is per-process, so any close of a file descriptor that refers to the
 521  521   *    file will drop the lock (e.g. lock /etc/passwd, call a library function
  522  522   *    which opens /etc/passwd to read the file; when the library closes its
  523  523   *    file descriptor, the application silently loses its lock).
 524  524   * 2) Locks are not preserved across fork(2).
 525  525   *
 526  526   * Because these locks are only associated with a PID, they are per-process.
 527  527   * This is why any close will drop the lock and is also why, once the process
  528  528   * forks, the lock is no longer related to the new process. These locks can
  529  529   * be considered PID-ful.
 530  530   *
 531  531   * See ofdlock() for the implementation of a similar but improved locking
 532  532   * scheme.
 533  533   */
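
/*
 * Illustrative userland sketch of drawback 1 above (annotation, not part
 * of the original source), assuming fd is already open(2)'d on the file
 * with FWRITE:
 *
 *	int other;
 *	struct flock fl;
 *
 *	(void) memset(&fl, 0, sizeof (fl));
 *	fl.l_type = F_WRLCK;
 *	(void) fcntl(fd, F_SETLK, &fl);		whole-file write lock
 *	other = open("/etc/passwd", O_RDONLY);	same file, new fd
 *	(void) close(other);	POSIX: this drops the process's lock too
 */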
 534  534  int
 535  535  reclock(vnode_t         *vp,
 536  536          flock64_t       *lckdat,
 537  537          int             cmd,
 538  538          int             flag,
 539  539          u_offset_t      offset,
 540  540          flk_callback_t  *flk_cbp)
 541  541  {
 542  542          lock_descriptor_t       stack_lock_request;
 543  543          lock_descriptor_t       *lock_request;
 544  544          int error = 0;
 545  545          graph_t *gp;
 546  546          int                     nlmid;
 547  547  
 548  548          /*
 549  549           * Check access permissions
 550  550           */
 551  551          if ((cmd & SETFLCK) &&
 552  552              ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
 553  553              (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
  554  554                  return (EBADF);
 555  555  
 556  556          /*
 557  557           * for query and unlock we use the stack_lock_request
 558  558           */
 559  559  
 560  560          if ((lckdat->l_type == F_UNLCK) ||
 561  561              !((cmd & INOFLCK) || (cmd & SETFLCK))) {
 562  562                  lock_request = &stack_lock_request;
 563  563                  (void) bzero((caddr_t)lock_request,
 564  564                      sizeof (lock_descriptor_t));
 565  565  
 566  566                  /*
  567  567                   * the following is added to make the assertions in
  568  568                   * flk_execute_request() pass
 569  569                   */
 570  570  
 571  571                  lock_request->l_edge.edge_in_next = &lock_request->l_edge;
 572  572                  lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
 573  573                  lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
 574  574                  lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
 575  575                  lock_request->l_status = FLK_INITIAL_STATE;
 576  576          } else {
 577  577                  lock_request = flk_get_lock();
 578  578          }
 579  579          lock_request->l_state = 0;
 580  580          lock_request->l_vnode = vp;
 581  581          lock_request->l_zoneid = getzoneid();
 582  582  
 583  583          /*
 584  584           * Convert the request range into the canonical start and end
 585  585           * values.  The NLM protocol supports locking over the entire
 586  586           * 32-bit range, so there's no range checking for remote requests,
 587  587           * but we still need to verify that local requests obey the rules.
 588  588           */
 589  589          /* Clustering */
 590  590          if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
 591  591                  ASSERT(lckdat->l_whence == 0);
 592  592                  lock_request->l_start = lckdat->l_start;
 593  593                  lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
 594  594                      lckdat->l_start + (lckdat->l_len - 1);
 595  595          } else {
 596  596                  /* check the validity of the lock range */
 597  597                  error = flk_convert_lock_data(vp, lckdat,
 598  598                      &lock_request->l_start, &lock_request->l_end,
 599  599                      offset);
 600  600                  if (error) {
 601  601                          goto done;
 602  602                  }
 603  603                  error = flk_check_lock_data(lock_request->l_start,
 604  604                      lock_request->l_end, MAXEND);
 605  605                  if (error) {
 606  606                          goto done;
 607  607                  }
 608  608          }
 609  609  
 610  610          ASSERT(lock_request->l_end >= lock_request->l_start);
 611  611  
 612  612          lock_request->l_type = lckdat->l_type;
 613  613          if (cmd & INOFLCK)
 614  614                  lock_request->l_state |= IO_LOCK;
 615  615          if (cmd & SLPFLCK)
 616  616                  lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
 617  617          if (cmd & RCMDLCK)
 618  618                  lock_request->l_state |= LOCKMGR_LOCK;
 619  619          if (cmd & NBMLCK)
 620  620                  lock_request->l_state |= NBMAND_LOCK;
 621  621          /*
 622  622           * Clustering: set flag for PXFS locks
 623  623           * We do not _only_ check for the PCMDLCK flag because PXFS locks could
 624  624           * also be of type 'RCMDLCK'.
 625  625           * We do not _only_ check the GETPXFSID() macro because local PXFS
 626  626           * clients use a pxfsid of zero to permit deadlock detection in the LLM.
 627  627           */
 628  628  
 629  629          if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
 630  630                  lock_request->l_state |= PXFS_LOCK;
 631  631          }
 632  632          if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
 633  633                  if (lock_request->l_type == F_RDLCK ||
 634  634                      lock_request->l_type == F_WRLCK)
 635  635                          lock_request->l_state |= QUERY_LOCK;
 636  636          }
 637  637          lock_request->l_flock = (*lckdat);
 638  638          lock_request->l_callbacks = flk_cbp;
 639  639  
 640  640          /*
 641  641           * We are ready for processing the request
 642  642           */
 643  643          if (IS_LOCKMGR(lock_request)) {
 644  644                  /*
 645  645                   * If the lock request is an NLM server request ....
 646  646                   */
 647  647                  if (nlm_status_size == 0) { /* not booted as cluster */
 648  648                          mutex_enter(&flock_lock);
 649  649                          /*
 650  650                           * Bail out if this is a lock manager request and the
 651  651                           * lock manager is not supposed to be running.
 652  652                           */
 653  653                          if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
 654  654                                  mutex_exit(&flock_lock);
 655  655                                  error = ENOLCK;
 656  656                                  goto done;
 657  657                          }
 658  658                          mutex_exit(&flock_lock);
 659  659                  } else {                        /* booted as a cluster */
 660  660                          nlmid = GETNLMID(lock_request->l_flock.l_sysid);
 661  661                          ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
 662  662  
 663  663                          mutex_enter(&nlm_reg_lock);
 664  664                          /*
 665  665                           * If the NLM registry does not know about this
 666  666                           * NLM server making the request, add its nlmid
 667  667                           * to the registry.
 668  668                           */
 669  669                          if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
 670  670                              nlmid)) {
 671  671                                  FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
 672  672                          } else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
 673  673                              nlmid)) {
 674  674                                  /*
 675  675                                   * If the NLM server is already known (has made
 676  676                                   * previous lock requests) and its state is
  677  677                                   * not NLM_UP (meaning that the NLM server is
 678  678                                   * shutting down), then bail out with an
 679  679                                   * error to deny the lock request.
 680  680                                   */
 681  681                                  mutex_exit(&nlm_reg_lock);
 682  682                                  error = ENOLCK;
 683  683                                  goto done;
 684  684                          }
 685  685                          mutex_exit(&nlm_reg_lock);
 686  686                  }
 687  687          }
 688  688  
 689  689          /* Now get the lock graph for a particular vnode */
 690  690          gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
 691  691  
 692  692          /*
 693  693           * We drop rwlock here otherwise this might end up causing a
 694  694           * deadlock if this IOLOCK sleeps. (bugid # 1183392).
 695  695           */
 696  696  
 697  697          if (IS_IO_LOCK(lock_request)) {
 698  698                  VOP_RWUNLOCK(vp,
 699  699                      (lock_request->l_type == F_RDLCK) ?
 700  700                      V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
 701  701          }
 702  702          mutex_enter(&gp->gp_mutex);
 703  703  
 704  704          lock_request->l_state |= REFERENCED_LOCK;
 705  705          lock_request->l_graph = gp;
 706  706  
 707  707          switch (lock_request->l_type) {
 708  708          case F_RDLCK:
 709  709          case F_WRLCK:
 710  710                  if (IS_QUERY_LOCK(lock_request)) {
 711  711                          flk_get_first_blocking_lock(lock_request);
 712  712                          if (lock_request->l_ofd != NULL)
 713  713                                  lock_request->l_flock.l_pid = -1;
 714  714                          (*lckdat) = lock_request->l_flock;
 715  715                          break;
 716  716                  }
 717  717  
 718  718                  /* process the request now */
 719  719  
 720  720                  error = flk_process_request(lock_request);
 721  721                  break;
 722  722  
 723  723          case F_UNLCK:
 724  724                  /* unlock request will not block so execute it immediately */
 725  725  
 726  726                  if (IS_LOCKMGR(lock_request) &&
 727  727                      flk_canceled(lock_request)) {
 728  728                          error = 0;
 729  729                  } else {
 730  730                          error = flk_execute_request(lock_request);
 731  731                  }
 732  732                  break;
 733  733  
 734  734          case F_UNLKSYS:
 735  735                  /*
 736  736                   * Recovery mechanism to release lock manager locks when
  737  737                   * an NFS client crashes and restarts. The NFS server will
  738  738                   * clear the old locks and grant new locks.
 739  739                   */
 740  740  
 741  741                  if (lock_request->l_flock.l_sysid == 0) {
 742  742                          mutex_exit(&gp->gp_mutex);
 743  743                          return (EINVAL);
 744  744                  }
 745  745                  if (secpolicy_nfs(CRED()) != 0) {
 746  746                          mutex_exit(&gp->gp_mutex);
 747  747                          return (EPERM);
 748  748                  }
 749  749                  flk_delete_locks_by_sysid(lock_request);
 750  750                  lock_request->l_state &= ~REFERENCED_LOCK;
 751  751                  flk_set_state(lock_request, FLK_DEAD_STATE);
 752  752                  flk_free_lock(lock_request);
 753  753                  mutex_exit(&gp->gp_mutex);
 754  754                  return (0);
 755  755  
 756  756          default:
 757  757                  error = EINVAL;
 758  758                  break;
 759  759          }
 760  760  
 761  761          /* Clustering: For blocked PXFS locks, return */
 762  762          if (error == PXFS_LOCK_BLOCKED) {
 763  763                  lock_request->l_state &= ~REFERENCED_LOCK;
 764  764                  mutex_exit(&gp->gp_mutex);
 765  765                  return (error);
 766  766          }
 767  767  
 768  768          /*
 769  769           * Now that we have seen the status of locks in the system for
 770  770           * this vnode we acquire the rwlock if it is an IO_LOCK.
 771  771           */
 772  772  
 773  773          if (IS_IO_LOCK(lock_request)) {
 774  774                  (void) VOP_RWLOCK(vp,
 775  775                      (lock_request->l_type == F_RDLCK) ?
 776  776                      V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
 777  777                  if (!error) {
 778  778                          lckdat->l_type = F_UNLCK;
 779  779  
 780  780                          /*
  781  781                           * This wakeup is needed; otherwise, if the
  782  782                           * IO_LOCK has slept, its dependents will not be
  783  783                           * woken up at all. (bugid # 1185482).
 784  784                           */
 785  785  
 786  786                          flk_wakeup(lock_request, 1);
 787  787                          flk_set_state(lock_request, FLK_DEAD_STATE);
 788  788                          flk_free_lock(lock_request);
 789  789                  }
 790  790                  /*
  791  791                   * Otherwise, an error occurred: either flk_process_request()
  792  792                   * returned EDEADLK, in which case there are no dependents
  793  793                   * for this lock, or flk_wait_execute_request() returned
  794  794                   * EINTR, in which case flk_cancel_sleeping_lock() has
  795  795                   * already been done. The same is true for EBADF.
 796  796                   */
 797  797          }
 798  798  
 799  799          if (lock_request == &stack_lock_request) {
 800  800                  flk_set_state(lock_request, FLK_DEAD_STATE);
 801  801          } else {
 802  802                  lock_request->l_state &= ~REFERENCED_LOCK;
 803  803                  if ((error != 0) || IS_DELETED(lock_request)) {
 804  804                          flk_set_state(lock_request, FLK_DEAD_STATE);
 805  805                          flk_free_lock(lock_request);
 806  806                  }
 807  807          }
 808  808  
 809  809          mutex_exit(&gp->gp_mutex);
 810  810          return (error);
 811  811  
 812  812  done:
 813  813          flk_set_state(lock_request, FLK_DEAD_STATE);
 814  814          if (lock_request != &stack_lock_request)
 815  815                  flk_free_lock(lock_request);
 816  816          return (error);
 817  817  }
 818  818  
 819  819  /*
 820  820   * Invoke the callbacks in the given list.  If before sleeping, invoke in
 821  821   * list order.  If after sleeping, invoke in reverse order.
 822  822   *
 823  823   * CPR (suspend/resume) support: if one of the callbacks returns a
 824  824   * callb_cpr_t, return it.   This will be used to make the thread CPR-safe
 825  825   * while it is sleeping.  There should be at most one callb_cpr_t for the
 826  826   * thread.
 827  827   * XXX This is unnecessarily complicated.  The CPR information should just
 828  828   * get passed in directly through VOP_FRLOCK and reclock, rather than
 829  829   * sneaking it in via a callback.
 830  830   */
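
/*
 * A minimal sketch (hypothetical callback, not from the original source)
 * with the expected signature: it hands back its callb_cpr_t only in the
 * FLK_BEFORE_SLEEP phase, so there is exactly one per sleeping thread:
 *
 *	static callb_cpr_t *
 *	my_frlock_cb(flk_cb_when_t when, void *data)
 *	{
 *		my_cb_state_t *sp = data;	hypothetical state struct
 *
 *		if (when == FLK_BEFORE_SLEEP)
 *			return (&sp->cpr_info);
 *		return (NULL);
 *	}
 */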
 831  831  
 832  832  callb_cpr_t *
 833  833  flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
 834  834  {
 835  835          callb_cpr_t *cpr_callbackp = NULL;
 836  836          callb_cpr_t *one_result;
 837  837          flk_callback_t *cb;
 838  838  
 839  839          if (cblist == NULL)
 840  840                  return (NULL);
 841  841  
 842  842          if (when == FLK_BEFORE_SLEEP) {
 843  843                  cb = cblist;
 844  844                  do {
 845  845                          one_result = (*cb->cb_callback)(when, cb->cb_data);
 846  846                          if (one_result != NULL) {
 847  847                                  ASSERT(cpr_callbackp == NULL);
 848  848                                  cpr_callbackp = one_result;
 849  849                          }
 850  850                          cb = cb->cb_next;
 851  851                  } while (cb != cblist);
 852  852          } else {
 853  853                  cb = cblist->cb_prev;
 854  854                  do {
 855  855                          one_result = (*cb->cb_callback)(when, cb->cb_data);
 856  856                          if (one_result != NULL) {
 857  857                                  cpr_callbackp = one_result;
 858  858                          }
 859  859                          cb = cb->cb_prev;
 860  860                  } while (cb != cblist->cb_prev);
 861  861          }
 862  862  
 863  863          return (cpr_callbackp);
 864  864  }
 865  865  
 866  866  /*
 867  867   * Initialize a flk_callback_t to hold the given callback.
 868  868   */
 869  869  
 870  870  void
 871  871  flk_init_callback(flk_callback_t *flk_cb,
 872  872          callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
 873  873  {
 874  874          flk_cb->cb_next = flk_cb;
 875  875          flk_cb->cb_prev = flk_cb;
 876  876          flk_cb->cb_callback = cb_fcn;
 877  877          flk_cb->cb_data = cbdata;
 878  878  }
 879  879  
 880  880  /*
 881  881   * Initialize an flk_callback_t and then link it into the head of an
 882  882   * existing list (which may be NULL).
 883  883   */
 884  884  
 885  885  void
 886  886  flk_add_callback(flk_callback_t *newcb,
 887  887                  callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
 888  888                  void *cbdata, flk_callback_t *cblist)
 889  889  {
 890  890          flk_init_callback(newcb, cb_fcn, cbdata);
 891  891  
 892  892          if (cblist == NULL)
 893  893                  return;
 894  894  
 895  895          newcb->cb_prev = cblist->cb_prev;
 896  896          newcb->cb_next = cblist;
 897  897          cblist->cb_prev->cb_next = newcb;
 898  898          cblist->cb_prev = newcb;
 899  899  }
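
/*
 * Illustrative usage sketch (hypothetical names, not from the original
 * source): because the list is circular and doubly linked, adding at the
 * "head" is really an insertion before cblist, so cb2 runs after cb1 in
 * the FLK_BEFORE_SLEEP pass and before it in the FLK_AFTER_SLEEP pass:
 *
 *	flk_callback_t cb1, cb2;
 *
 *	flk_init_callback(&cb1, my_frlock_cb, &state1);
 *	flk_add_callback(&cb2, my_other_cb, &state2, &cb1);
 *	error = reclock(vp, &flk, cmd, flag, offset, &cb1);
 */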
 900  900  
 901  901  /*
 902  902   * Initialize the flk_edge_cache data structure and create the
 903  903   * nlm_reg_status array.
 904  904   */
 905  905  
 906  906  void
 907  907  flk_init(void)
 908  908  {
 909  909          uint_t  i;
 910  910  
 911  911          flk_edge_cache = kmem_cache_create("flk_edges",
 912  912              sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
 913  913          if (flk_edge_cache == NULL) {
 914  914                  cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
 915  915          }
 916  916          /*
 917  917           * Create the NLM registry object.
 918  918           */
 919  919  
 920  920          if (cluster_bootflags & CLUSTER_BOOTED) {
 921  921                  /*
 922  922                   * This routine tells you the maximum node id that will be used
 923  923                   * in the cluster.  This number will be the size of the nlm
 924  924                   * registry status array.  We add 1 because we will be using
 925  925                   * all entries indexed from 0 to maxnodeid; e.g., from 0
 926  926                   * to 64, for a total of 65 entries.
 927  927                   */
 928  928                  nlm_status_size = clconf_maximum_nodeid() + 1;
 929  929          } else {
 930  930                  nlm_status_size = 0;
 931  931          }
 932  932  
 933  933          if (nlm_status_size != 0) {     /* booted as a cluster */
 934  934                  nlm_reg_status = (flk_nlm_status_t *)
 935  935                      kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
 936  936                      KM_SLEEP);
 937  937  
 938  938                  /* initialize all NLM states in array to NLM_UNKNOWN */
 939  939                  for (i = 0; i < nlm_status_size; i++) {
 940  940                          nlm_reg_status[i] = FLK_NLM_UNKNOWN;
 941  941                  }
 942  942          }
 943  943  }
 944  944  
 945  945  /*
 946  946   * Zone constructor/destructor callbacks to be executed when a zone is
 947  947   * created/destroyed.
 948  948   */
 949  949  /* ARGSUSED */
 950  950  void *
 951  951  flk_zone_init(zoneid_t zoneid)
 952  952  {
 953  953          struct flock_globals *fg;
 954  954          uint_t i;
 955  955  
 956  956          fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
 957  957          fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
 958  958          for (i = 0; i < HASH_SIZE; i++)
 959  959                  fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
 960  960          return (fg);
 961  961  }
 962  962  
 963  963  /* ARGSUSED */
 964  964  void
 965  965  flk_zone_fini(zoneid_t zoneid, void *data)
 966  966  {
 967  967          struct flock_globals *fg = data;
 968  968  
 969  969          kmem_free(fg, sizeof (*fg));
 970  970  }
 971  971  
 972  972  /*
 973  973   * Get a lock_descriptor structure with initialization of edge lists.
 974  974   */
 975  975  
 976  976  static lock_descriptor_t *
 977  977  flk_get_lock(void)
 978  978  {
 979  979          lock_descriptor_t       *l;
 980  980  
 981  981          l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
 982  982  
 983  983          cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
 984  984          l->l_edge.edge_in_next = &l->l_edge;
 985  985          l->l_edge.edge_in_prev = &l->l_edge;
 986  986          l->l_edge.edge_adj_next = &l->l_edge;
 987  987          l->l_edge.edge_adj_prev = &l->l_edge;
 988  988          l->pvertex = -1;
 989  989          l->l_status = FLK_INITIAL_STATE;
 990  990          flk_lock_allocs++;
 991  991          return (l);
 992  992  }
 993  993  
 994  994  /*
  995  995   * Free a lock_descriptor structure. If some thread still holds a
  996  996   * reference to it, as in reclock(), just set the DELETED_LOCK flag.
 997  997   */
 998  998  
 999  999  void
1000 1000  flk_free_lock(lock_descriptor_t *lock)
1001 1001  {
1002 1002          file_t *fp;
1003 1003  
1004 1004          ASSERT(IS_DEAD(lock));
1005 1005  
1006 1006          if ((fp = lock->l_ofd) != NULL && fp->f_filock == (struct filock *)lock)
1007 1007                  fp->f_filock = NULL;
1008 1008  
1009 1009          if (IS_REFERENCED(lock)) {
1010 1010                  lock->l_state |= DELETED_LOCK;
1011 1011                  return;
1012 1012          }
1013 1013          flk_lock_frees++;
1014 1014          kmem_free((void *)lock, sizeof (lock_descriptor_t));
1015 1015  }
1016 1016  
1017 1017  void
1018 1018  flk_set_state(lock_descriptor_t *lock, int new_state)
1019 1019  {
1020 1020          /*
1021 1021           * Locks in the sleeping list may be woken up in a number of ways,
1022 1022           * and more than once.  If a sleeping lock is signaled awake more
1023 1023           * than once, then it may or may not change state depending on its
1024 1024           * current state.
1025 1025           * Also note that NLM locks that are sleeping could be moved to an
1026 1026           * interrupted state more than once if the unlock request is
1027 1027           * retransmitted by the NLM client - the second time around, this is
1028 1028           * just a nop.
1029 1029           * The ordering of being signaled awake is:
1030 1030           * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
1031 1031           * The checks below implement this ordering.
1032 1032           */
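        /*
         * Example of the ordering (annotation, not in the original
         * source): a sleeping lock already moved to FLK_INTERRUPTED_STATE
         * that is later signaled with FLK_GRANTED_STATE stays interrupted;
         * the first check below simply ignores the new state.
         */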
1033 1033          if (IS_INTERRUPTED(lock)) {
1034 1034                  if ((new_state == FLK_CANCELLED_STATE) ||
1035 1035                      (new_state == FLK_GRANTED_STATE) ||
1036 1036                      (new_state == FLK_INTERRUPTED_STATE)) {
1037 1037                          return;
1038 1038                  }
1039 1039          }
1040 1040          if (IS_CANCELLED(lock)) {
1041 1041                  if ((new_state == FLK_GRANTED_STATE) ||
1042 1042                      (new_state == FLK_CANCELLED_STATE)) {
1043 1043                          return;
1044 1044                  }
1045 1045          }
1046 1046          CHECK_LOCK_TRANSITION(lock->l_status, new_state);
1047 1047          if (IS_PXFS(lock)) {
1048 1048                  cl_flk_state_transition_notify(lock, lock->l_status, new_state);
1049 1049          }
1050 1050          lock->l_status = new_state;
1051 1051  }
1052 1052  
1053 1053  /*
1054 1054   * Routine that checks whether there are any blocking locks in the system.
1055 1055   *
 1056 1056   * The policy followed is that if a write lock is sleeping, we do not
 1057 1057   * allow read locks ahead of it, even though there may not be any
 1058 1058   * active locks corresponding to the read locks' region.
 1059 1059   *
 1060 1060   * The flk_add_edge() function adds an edge between l1 and l2 iff there
 1061 1061   * is no path between l1 and l2. This is done to have a "minimum
 1062 1062   * storage representation" of the dependency graph.
 1063 1063   *
 1064 1064   * Another property of the graph is that, since only the new request
 1065 1065   * throws edges to the existing locks in the graph, the graph is always
 1066 1066   * topologically ordered.
1067 1067   */
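
/*
 * Worked example (annotation, not in the original source): suppose active
 * lock A covers bytes 0-9, a sleeping write lock W covers bytes 0-99, and
 * a new read request R for bytes 0-4 arrives.  R conflicts directly only
 * with A, but by the policy above R must also wait behind W, so R throws
 * edges to both.  Since edges always run from the newest request to older
 * locks, no edge can ever close a cycle among existing locks, which keeps
 * the graph topologically ordered.
 */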
1068 1068  
1069 1069  static int
1070 1070  flk_process_request(lock_descriptor_t *request)
1071 1071  {
1072 1072          graph_t *gp = request->l_graph;
1073 1073          lock_descriptor_t *lock;
1074 1074          int request_blocked_by_active = 0;
1075 1075          int request_blocked_by_granted = 0;
1076 1076          int request_blocked_by_sleeping = 0;
1077 1077          vnode_t *vp = request->l_vnode;
1078 1078          int     error = 0;
1079 1079          int request_will_wait = 0;
1080 1080          int found_covering_lock = 0;
1081 1081          lock_descriptor_t *covered_by = NULL;
1082 1082  
1083 1083          ASSERT(MUTEX_HELD(&gp->gp_mutex));
1084 1084          request_will_wait = IS_WILLING_TO_SLEEP(request);
1085 1085  
1086 1086          /*
1087 1087           * check active locks
1088 1088           */
1089 1089  
1090 1090          SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1091 1091  
1092 1092  
1093 1093          if (lock) {
1094 1094                  do {
1095 1095                          if (BLOCKS(lock, request)) {
1096 1096                                  if (!request_will_wait)
1097 1097                                          return (EAGAIN);
1098 1098                                  request_blocked_by_active = 1;
1099 1099                                  break;
1100 1100                          }
1101 1101                          /*
 1102 1102                           * Grant the lock if the request is from the same
 1103 1103                           * owner as an active lock that covers it.
1104 1104                           */
1105 1105  
1106 1106                          if (SAME_OWNER(lock, request) &&
1107 1107                              COVERS(lock, request) &&
1108 1108                              (request->l_type == F_RDLCK))
1109 1109                                  return (flk_execute_request(request));
1110 1110                          lock = lock->l_next;
1111 1111                  } while (lock->l_vnode == vp);
1112 1112          }
1113 1113  
1114 1114          if (!request_blocked_by_active) {
 1115 1115                  lock_descriptor_t *lk[1];
 1116 1116                  lock_descriptor_t *first_glock = NULL;
1117 1117                  /*
 1118 1118                   * Shall we grant this?! NO!!
 1119 1119                   * What about those locks that were just granted and are
 1120 1120                   * still in the sleep queue? Their threads have been woken
 1121 1121                   * up, so those locks are almost active.
1122 1122                   */
1123 1123                  SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1124 1124                  if (lock) {
1125 1125                          do {
1126 1126                                  if (BLOCKS(lock, request)) {
1127 1127                                          if (IS_GRANTED(lock)) {
1128 1128                                                  request_blocked_by_granted = 1;
1129 1129                                          } else {
1130 1130                                                  request_blocked_by_sleeping = 1;
1131 1131                                          }
1132 1132                                  }
1133 1133  
1134 1134                                  lock = lock->l_next;
1135 1135                          } while ((lock->l_vnode == vp));
1136 1136                          first_glock = lock->l_prev;
1137 1137                          ASSERT(first_glock->l_vnode == vp);
1138 1138                  }
1139 1139  
1140 1140                  if (request_blocked_by_granted)
1141 1141                          goto block;
1142 1142  
1143 1143                  if (!request_blocked_by_sleeping) {
1144 1144                          /*
1145 1145                           * If the request isn't going to be blocked by a
1146 1146                           * sleeping request, we know that it isn't going to
1147 1147                           * be blocked; we can just execute the request --
1148 1148                           * without performing costly deadlock detection.
1149 1149                           */
1150 1150                          ASSERT(!request_blocked_by_active);
1151 1151                          return (flk_execute_request(request));
1152 1152                  } else if (request->l_type == F_RDLCK) {
1153 1153                          /*
1154 1154                           * If we have a sleeping writer in the requested
1155 1155                           * lock's range, block.
1156 1156                           */
1157 1157                          goto block;
1158 1158                  }
1159 1159  
1160 1160                  lk[0] = request;
1161 1161                  request->l_state |= RECOMPUTE_LOCK;
1162 1162                  SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1163 1163                  if (lock) {
1164 1164                          do {
1165 1165                                  flk_recompute_dependencies(lock, lk, 1, 0);
1166 1166                                  lock = lock->l_next;
1167 1167                          } while (lock->l_vnode == vp);
1168 1168                  }
1169 1169                  lock = first_glock;
1170 1170                  if (lock) {
1171 1171                          do {
1172 1172                                  if (IS_GRANTED(lock)) {
 1173 1173                                          flk_recompute_dependencies(lock, lk, 1, 0);
1174 1174                                  }
1175 1175                                  lock = lock->l_prev;
1176 1176                          } while ((lock->l_vnode == vp));
1177 1177                  }
1178 1178                  request->l_state &= ~RECOMPUTE_LOCK;
1179 1179                  if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
1180 1180                          return (EDEADLK);
1181 1181                  return (flk_execute_request(request));
1182 1182          }
1183 1183  
1184 1184  block:
1185 1185          if (request_will_wait)
1186 1186                  flk_graph_uncolor(gp);
1187 1187  
1188 1188          /* check sleeping locks */
1189 1189  
1190 1190          SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1191 1191  
1192 1192          /*
 1193 1193           * If we find a sleeping write lock that is a superset of the
 1194 1194           * region wanted by the request, we can be assured that by adding
 1195 1195           * an edge to this write lock we have paths to all locks in the
 1196 1196           * graph that block the request, except in one case; that is why
 1197 1197           * there is another SAME_OWNER check in the loop below. The
 1198 1198           * exception is when the process that owns the sleeping write lock
 1199 1199           * 'l1' has other locks l2, l3, l4 in the system that arrived
 1200 1200           * before l1. l1 has no path to these locks since they come from
 1201 1201           * the same process. We break when we find a second covering
 1202 1202           * sleeping lock l5 owned by a process different from the one that
 1203 1203           * owns l1, because none of l2, l3, l4, etc. can have arrived
 1204 1204           * before l5; if any had, l1 would already have produced a deadlock.
1205 1205           */
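/*
 * Editor's illustration (not part of the original source), with assumed
 * byte ranges: request (owner D) wants [10, 20]; the sleep queue holds,
 * in arrival order, l2 = F_WRLCK [5, 15] (owner A), l1 = F_WRLCK
 * [0, 100] (owner A), l5 = F_WRLCK [0, 100] (owner B). l2 blocks but
 * does not cover, so it gets an explicit edge. l1 covers, so it becomes
 * covered_by; from then on blockers owned by anyone but A are skipped,
 * since request reaches them through l1, while A's earlier lock l2
 * already needed its own edge. Reaching the second covering lock l5,
 * whose owner B differs from A, drives found_covering_lock to 2 and
 * breaks out of the loop.
 */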
1206 1206  
1207 1207          if (lock) {
1208 1208                  do {
1209 1209                          if (BLOCKS(lock, request)) {
1210 1210                                  if (!request_will_wait)
1211 1211                                          return (EAGAIN);
1212 1212                                  if (COVERS(lock, request) &&
1213 1213                                      lock->l_type == F_WRLCK) {
1214 1214                                          if (found_covering_lock &&
1215 1215                                              !SAME_OWNER(lock, covered_by)) {
1216 1216                                                  found_covering_lock++;
1217 1217                                                  break;
1218 1218                                          }
1219 1219                                          found_covering_lock = 1;
1220 1220                                          covered_by = lock;
1221 1221                                  }
1222 1222                                  if (found_covering_lock &&
1223 1223                                      !SAME_OWNER(lock, covered_by)) {
1224 1224                                          lock = lock->l_next;
1225 1225                                          continue;
1226 1226                                  }
1227 1227                                  if ((error = flk_add_edge(request, lock,
1228 1228                                      !found_covering_lock, 0)))
1229 1229                                          return (error);
1230 1230                          }
1231 1231                          lock = lock->l_next;
1232 1232                  } while (lock->l_vnode == vp);
1233 1233          }
1234 1234  
1235 1235  /*
1236 1236   * found_covering_lock == 2 iff at this point 'request' has paths to
1237 1237   * all locks that block 'request'. found_covering_lock == 1 iff it
1238 1238   * has paths to all blocking locks whose owners differ from the owner
1239 1239   * of the covering lock (covered_by above); the active list may still
1240 1240   * hold blocking locks with the same owner as covered_by.
1241 1241   */
1242 1242  
1243 1243          if (request_blocked_by_active && found_covering_lock != 2) {
1244 1244                  SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1245 1245                  ASSERT(lock != NULL);
1246 1246                  do {
1247 1247                          if (BLOCKS(lock, request)) {
1248 1248                                  if (found_covering_lock &&
1249 1249                                      !SAME_OWNER(lock, covered_by)) {
1250 1250                                          lock = lock->l_next;
1251 1251                                          continue;
1252 1252                                  }
1253 1253                                  if ((error = flk_add_edge(request, lock,
1254 1254                                      CHECK_CYCLE, 0)))
1255 1255                                          return (error);
1256 1256                          }
1257 1257                          lock = lock->l_next;
1258 1258                  } while (lock->l_vnode == vp);
1259 1259          }
1260 1260  
1261 1261          if (NOT_BLOCKED(request)) {
1262 1262                  /*
1263 1263                   * request not dependent on any other locks
1264 1264                   * so execute this request
1265 1265                   */
1266 1266                  return (flk_execute_request(request));
1267 1267          } else {
1268 1268                  /*
1269 1269                   * check for deadlock
1270 1270                   */
1271 1271                  if (flk_check_deadlock(request))
1272 1272                          return (EDEADLK);
1273 1273                  /*
1274 1274                   * this thread has to sleep
1275 1275                   */
1276 1276                  return (flk_wait_execute_request(request));
1277 1277          }
1278 1278  }
1279 1279  
1280 1280  /*
1281 1281   * In the simple case, the actual execution of the request is only to
1282 1282   * insert 'request' into the list of active locks, if it is not an
1283 1283   * UNLOCK.
1284 1284   * We also have to consider each existing active lock's relation to
1285 1285   * this 'request' when the two are owned by the same process.
1286 1286   * flk_relation() does this job and ensures that the dependency graph
1287 1287   * information is maintained properly.
1288 1288   */
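/*
 * Editor's illustration (not part of the original source), with assumed
 * ranges: if a process holds an active F_RDLCK [0, 99] and executes an
 * F_WRLCK request for [40, 59], the same-owner scan below hands the
 * pair to flk_relation(), which splits the old lock into [0, 39] and
 * [60, 99] and inserts the new write lock between them, so one owner's
 * active locks never overlap.
 */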
1289 1289  
1290 1290  int
1291 1291  flk_execute_request(lock_descriptor_t *request)
1292 1292  {
1293 1293          graph_t *gp = request->l_graph;
1294 1294          vnode_t *vp = request->l_vnode;
1295 1295          lock_descriptor_t       *lock, *lock1;
1296 1296          int done_searching = 0;
1297 1297  
1298 1298          CHECK_SLEEPING_LOCKS(gp);
1299 1299          CHECK_ACTIVE_LOCKS(gp);
1300 1300  
1301 1301          ASSERT(MUTEX_HELD(&gp->gp_mutex));
1302 1302  
1303 1303          flk_set_state(request, FLK_START_STATE);
1304 1304  
1305 1305          ASSERT(NOT_BLOCKED(request));
1306 1306  
1307 1307          /* IO_LOCK requests are only to check status */
1308 1308  
1309 1309          if (IS_IO_LOCK(request))
1310 1310                  return (0);
1311 1311  
1312 1312          SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1313 1313  
1314 1314          if (lock == NULL && request->l_type == F_UNLCK)
1315 1315                  return (0);
1316 1316          if (lock == NULL) {
1317 1317                  flk_insert_active_lock(request);
1318 1318                  return (0);
1319 1319          }
1320 1320  
1321 1321          do {
1322 1322                  lock1 = lock->l_next;
1323 1323                  if (SAME_OWNER(request, lock)) {
1324 1324                          done_searching = flk_relation(lock, request);
1325 1325                  }
1326 1326                  lock = lock1;
1327 1327          } while (lock->l_vnode == vp && !done_searching);
1328 1328  
1329 1329          /*
1330 1330           * insert in active queue
1331 1331           */
1332 1332  
1333 1333          if (request->l_type != F_UNLCK)
1334 1334                  flk_insert_active_lock(request);
1335 1335  
1336 1336          return (0);
1337 1337  }
1338 1338  
1339 1339  /*
1340 1340   * 'request' is blocked by someone, so we put it on the sleep queue.
1341 1341   */
1342 1342  static int
1343 1343  flk_wait_execute_request(lock_descriptor_t *request)
1344 1344  {
1345 1345          graph_t *gp = request->l_graph;
1346 1346          callb_cpr_t     *cprp;          /* CPR info from callback */
1347 1347          struct flock_globals *fg;
1348 1348          int index;
1349 1349  
1350 1350          ASSERT(MUTEX_HELD(&gp->gp_mutex));
1351 1351          ASSERT(IS_WILLING_TO_SLEEP(request));
1352 1352  
1353 1353          flk_insert_sleeping_lock(request);
1354 1354  
1355 1355          if (IS_LOCKMGR(request)) {
1356 1356                  index = HASH_INDEX(request->l_vnode);
1357 1357                  fg = flk_get_globals();
1358 1358  
1359 1359                  if (nlm_status_size == 0) {     /* not booted as a cluster */
1360 1360                          if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1361 1361                                  flk_cancel_sleeping_lock(request, 1);
1362 1362                                  return (ENOLCK);
1363 1363                          }
1364 1364                  } else {                        /* booted as a cluster */
1365 1365                          /*
1366 1366                           * If the request is an NLM server lock request,
1367 1367                           * and the NLM state of the lock request is not
1368 1368                           * NLM_UP (because the NLM server is shutting
1369 1369                           * down), then cancel the sleeping lock and
1370 1370                           * return ENOLCK to encourage the client to
1371 1371                           * retransmit.
1372 1372                           */
1373 1373                          if (!IS_NLM_UP(request)) {
1374 1374                                  flk_cancel_sleeping_lock(request, 1);
1375 1375                                  return (ENOLCK);
1376 1376                          }
1377 1377                  }
1378 1378          }
1379 1379  
1380 1380          /* Clustering: For blocking PXFS locks, return */
1381 1381          if (IS_PXFS(request)) {
1382 1382                  /*
1383 1383                   * PXFS locks sleep on the client side.
1384 1384                   * The callback argument is used to wake up the sleeper
1385 1385                   * when the lock is granted.
1386 1386                   * We return PXFS_LOCK_BLOCKED (-1, rather than an errno
1387 1387                   * value) to indicate that the client side should sleep.
1388 1388                   */
1389 1389                  return (PXFS_LOCK_BLOCKED);
1390 1390          }
1391 1391  
1392 1392          if (request->l_callbacks != NULL) {
1393 1393                  /*
1394 1394                   * To make sure the shutdown code works correctly, either
1395 1395                   * the callback must happen after putting the lock on the
1396 1396                   * sleep list, or we must check the shutdown status after
1397 1397                   * returning from the callback (and before sleeping).  At
1398 1398                   * least for now, we'll use the first option.  If a
1399 1399                   * shutdown or signal or whatever happened while the graph
1400 1400                   * mutex was dropped, that will be detected by
1401 1401                   * wait_for_lock().
1402 1402                   */
1403 1403                  mutex_exit(&gp->gp_mutex);
1404 1404  
1405 1405                  cprp = flk_invoke_callbacks(request->l_callbacks,
1406 1406                      FLK_BEFORE_SLEEP);
1407 1407  
1408 1408                  mutex_enter(&gp->gp_mutex);
1409 1409  
1410 1410                  if (cprp == NULL) {
1411 1411                          wait_for_lock(request);
1412 1412                  } else {
1413 1413                          mutex_enter(cprp->cc_lockp);
1414 1414                          CALLB_CPR_SAFE_BEGIN(cprp);
1415 1415                          mutex_exit(cprp->cc_lockp);
1416 1416                          wait_for_lock(request);
1417 1417                          mutex_enter(cprp->cc_lockp);
1418 1418                          CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1419 1419                          mutex_exit(cprp->cc_lockp);
1420 1420                  }
1421 1421  
1422 1422                  mutex_exit(&gp->gp_mutex);
1423 1423                  (void) flk_invoke_callbacks(request->l_callbacks,
1424 1424                      FLK_AFTER_SLEEP);
1425 1425                  mutex_enter(&gp->gp_mutex);
1426 1426          } else {
1427 1427                  wait_for_lock(request);
1428 1428          }
1429 1429  
1430 1430          if (IS_LOCKMGR(request)) {
1431 1431                  /*
1432 1432                   * If the lock manager is shutting down, return an
1433 1433                   * error that will encourage the client to retransmit.
1434 1434                   */
1435 1435                  if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1436 1436                      !IS_GRANTED(request)) {
1437 1437                          flk_cancel_sleeping_lock(request, 1);
1438 1438                          return (ENOLCK);
1439 1439                  }
1440 1440          }
1441 1441  
1442 1442          if (IS_INTERRUPTED(request)) {
1443 1443                  /* we got a signal, or act like we did */
1444 1444                  flk_cancel_sleeping_lock(request, 1);
1445 1445                  return (EINTR);
1446 1446          }
1447 1447  
1448 1448          /* Cancelled if some other thread has closed the file */
1449 1449  
1450 1450          if (IS_CANCELLED(request)) {
1451 1451                  flk_cancel_sleeping_lock(request, 1);
1452 1452                  return (EBADF);
1453 1453          }
1454 1454  
1455 1455          request->l_state &= ~GRANTED_LOCK;
1456 1456          REMOVE_SLEEP_QUEUE(request);
1457 1457          return (flk_execute_request(request));
1458 1458  }
1459 1459  
1460 1460  /*
1461 1461   * This routine adds an edge between from and to because from depends
1462 1462   * on to. If asked to check for a deadlock, it checks whether any
1463 1463   * locks reachable from "from_lock" are owned by the same process
1464 1464   * as "from_lock".
1465 1465   * NOTE: It is the caller's responsibility to make sure that the color
1466 1466   * of the graph is consistent between the calls to flk_add_edge as done
1467 1467   * in flk_process_request. This routine does not color the graph or
1468 1468   * check for deadlock explicitly.
1469 1469   */
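/*
 * Editor's illustration (not part of the original source), with assumed
 * ranges: process P1 holds an active F_RDLCK A = [0, 10], and process
 * P2's F_WRLCK W = [0, 17] sleeps with an edge W -> A. If P1 now
 * requests F_RDLCK [15, 20], adding the edge request -> W triggers the
 * cycle check below: the DFS reaches W (owner P2) and then A, whose
 * owner matches the requester P1, so the edges just added are torn
 * down at dead_lock and EDEADLK is returned.
 */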
1470 1470  
1471 1471  static int
1472 1472  flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1473 1473                          int check_cycle, int update_graph)
1474 1474  {
1475 1475          edge_t  *edge;
1476 1476          edge_t  *ep;
1477 1477          lock_descriptor_t       *vertex;
1478 1478          lock_descriptor_t *vertex_stack;
1479 1479  
1480 1480          STACK_INIT(vertex_stack);
1481 1481  
1482 1482          /*
1483 1483           * If the to vertex already has the mark color, just return
1484 1484           * without adding an edge: it is already reachable from the
1485 1485           * from vertex.
1486 1486           */
1487 1487  
1488 1488          if (COLORED(to_lock))
1489 1489                  return (0);
1490 1490  
1491 1491          edge = flk_get_edge();
1492 1492  
1493 1493          /*
1494 1494           * set the from and to vertex
1495 1495           */
1496 1496  
1497 1497          edge->from_vertex = from_lock;
1498 1498          edge->to_vertex = to_lock;
1499 1499  
1500 1500          /*
1501 1501           * put in adjacency list of from vertex
1502 1502           */
1503 1503  
1504 1504          from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1505 1505          edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1506 1506          edge->edge_adj_prev = &from_lock->l_edge;
1507 1507          from_lock->l_edge.edge_adj_next = edge;
1508 1508  
1509 1509          /*
1510 1510           * put in the in-edge list of the to vertex
1511 1511           */
1512 1512  
1513 1513          to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1514 1514          edge->edge_in_next = to_lock->l_edge.edge_in_next;
1515 1515          to_lock->l_edge.edge_in_next = edge;
1516 1516          edge->edge_in_prev = &to_lock->l_edge;
1517 1517  
1518 1518  
1519 1519          if (update_graph) {
1520 1520                  flk_update_proc_graph(edge, 0);
1521 1521                  return (0);
1522 1522          }
1523 1523          if (!check_cycle) {
1524 1524                  return (0);
1525 1525          }
1526 1526  
1527 1527          STACK_PUSH(vertex_stack, from_lock, l_stack);
1528 1528  
1529 1529          while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1530 1530  
1531 1531                  STACK_POP(vertex_stack, l_stack);
1532 1532  
1533 1533                  for (ep = FIRST_ADJ(vertex);
1534 1534                      ep != HEAD(vertex);
1535 1535                      ep = NEXT_ADJ(ep)) {
1536 1536                          if (COLORED(ep->to_vertex))
1537 1537                                  continue;
1538 1538                          COLOR(ep->to_vertex);
1539 1539                          if (SAME_OWNER(ep->to_vertex, from_lock))
1540 1540                                  goto dead_lock;
1541 1541                          STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1542 1542                  }
1543 1543          }
1544 1544          return (0);
1545 1545  
1546 1546  dead_lock:
1547 1547  
1548 1548          /*
1549 1549           * remove all edges
1550 1550           */
1551 1551  
1552 1552          ep = FIRST_ADJ(from_lock);
1553 1553  
1554 1554          while (ep != HEAD(from_lock)) {
1555 1555                  IN_LIST_REMOVE(ep);
1556 1556                  from_lock->l_sedge = NEXT_ADJ(ep);
1557 1557                  ADJ_LIST_REMOVE(ep);
1558 1558                  flk_free_edge(ep);
1559 1559                  ep = from_lock->l_sedge;
1560 1560          }
1561 1561          return (EDEADLK);
1562 1562  }
1563 1563  
1564 1564  /*
1565 1565   * Get an edge structure for representing the dependency between two locks.
1566 1566   */
1567 1567  
1568 1568  static edge_t *
1569 1569  flk_get_edge()
1570 1570  {
1571 1571          edge_t  *ep;
1572 1572  
1573 1573          ASSERT(flk_edge_cache != NULL);
1574 1574  
1575 1575          ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1576 1576          edge_allocs++;
1577 1577          return (ep);
1578 1578  }
1579 1579  
1580 1580  /*
1581 1581   * Free the edge structure.
1582 1582   */
1583 1583  
1584 1584  static void
1585 1585  flk_free_edge(edge_t *ep)
1586 1586  {
1587 1587          edge_frees++;
1588 1588          kmem_cache_free(flk_edge_cache, (void *)ep);
1589 1589  }
1590 1590  
1591 1591  /*
1592 1592   * Check the relationship of request with lock and perform the
1593 1593   * recomputation of dependencies, break the lock if required, and
1594 1594   * return 1 if request can have no further relationship with the
1595 1595   * remaining active locks.
1596 1596   * 'lock' and 'request' are compared, and in case of overlap we delete
1597 1597   * 'lock' and form new locks to represent the non-overlapping portion
1598 1598   * of the original 'lock'. This function has side effects: 'lock' will
1599 1599   * be freed and new locks will be added to the active list.
1600 1600   */
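/*
 * Editor's illustration (not part of the original source), with assumed
 * ranges: unlocking [40, 59] out of an active lock [0, 99] hits the
 * "request inside lock" case below with FLK_UNLOCK: lock1 = [0, 39] and
 * lock2 = [60, 99] are created, topology[] holds {lock1, lock2,
 * request} with nvertex = 3, and at the recompute label the unlock
 * request itself is dropped from topology[] so that no lock ends up
 * depending on it.
 */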
1601 1601  
1602 1602  static int
1603 1603  flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1604 1604  {
1605 1605          int lock_effect;
1606 1606          lock_descriptor_t *lock1, *lock2;
1607 1607          lock_descriptor_t *topology[3];
1608 1608          int nvertex = 0;
1609 1609          int i;
1610 1610          edge_t  *ep;
1611 1611          graph_t *gp = (lock->l_graph);
1612 1612  
1613 1613  
1614 1614          CHECK_SLEEPING_LOCKS(gp);
1615 1615          CHECK_ACTIVE_LOCKS(gp);
1616 1616  
1617 1617          ASSERT(MUTEX_HELD(&gp->gp_mutex));
1618 1618  
1619 1619          topology[0] = topology[1] = topology[2] = NULL;
1620 1620  
1621 1621          if (request->l_type == F_UNLCK)
1622 1622                  lock_effect = FLK_UNLOCK;
1623 1623          else if (request->l_type == F_RDLCK &&
1624 1624              lock->l_type == F_WRLCK)
1625 1625                  lock_effect = FLK_DOWNGRADE;
1626 1626          else if (request->l_type == F_WRLCK &&
1627 1627              lock->l_type == F_RDLCK)
1628 1628                  lock_effect = FLK_UPGRADE;
1629 1629          else
1630 1630                  lock_effect = FLK_STAY_SAME;
1631 1631  
1632 1632          if (lock->l_end < request->l_start) {
1633 1633                  if (lock->l_end == request->l_start - 1 &&
1634 1634                      lock_effect == FLK_STAY_SAME) {
1635 1635                          topology[0] = request;
1636 1636                          request->l_start = lock->l_start;
1637 1637                          nvertex = 1;
1638 1638                          goto recompute;
1639 1639                  } else {
1640 1640                          return (0);
1641 1641                  }
1642 1642          }
1643 1643  
1644 1644          if (lock->l_start > request->l_end) {
1645 1645                  if (request->l_end == lock->l_start - 1 &&
1646 1646                      lock_effect == FLK_STAY_SAME) {
1647 1647                          topology[0] = request;
1648 1648                          request->l_end = lock->l_end;
1649 1649                          nvertex = 1;
1650 1650                          goto recompute;
1651 1651                  } else {
1652 1652                          return (1);
1653 1653                  }
1654 1654          }
1655 1655  
1656 1656          if (request->l_end < lock->l_end) {
1657 1657                  if (request->l_start > lock->l_start) {
1658 1658                          if (lock_effect == FLK_STAY_SAME) {
1659 1659                                  request->l_start = lock->l_start;
1660 1660                                  request->l_end = lock->l_end;
1661 1661                                  topology[0] = request;
1662 1662                                  nvertex = 1;
1663 1663                          } else {
1664 1664                                  lock1 = flk_get_lock();
1665 1665                                  lock2 = flk_get_lock();
1666 1666                                  COPY(lock1, lock);
1667 1667                                  COPY(lock2, lock);
1668 1668                                  lock1->l_start = lock->l_start;
1669 1669                                  lock1->l_end = request->l_start - 1;
1670 1670                                  lock2->l_start = request->l_end + 1;
1671 1671                                  lock2->l_end = lock->l_end;
1672 1672                                  topology[0] = lock1;
1673 1673                                  topology[1] = lock2;
1674 1674                                  topology[2] = request;
1675 1675                                  nvertex = 3;
1676 1676                          }
1677 1677                  } else if (request->l_start < lock->l_start) {
1678 1678                          if (lock_effect == FLK_STAY_SAME) {
1679 1679                                  request->l_end = lock->l_end;
1680 1680                                  topology[0] = request;
1681 1681                                  nvertex = 1;
1682 1682                          } else {
1683 1683                                  lock1 = flk_get_lock();
1684 1684                                  COPY(lock1, lock);
1685 1685                                  lock1->l_start = request->l_end + 1;
1686 1686                                  topology[0] = lock1;
1687 1687                                  topology[1] = request;
1688 1688                                  nvertex = 2;
1689 1689                          }
1690 1690                  } else  {
1691 1691                          if (lock_effect == FLK_STAY_SAME) {
1692 1692                                  request->l_start = lock->l_start;
1693 1693                                  request->l_end = lock->l_end;
1694 1694                                  topology[0] = request;
1695 1695                                  nvertex = 1;
1696 1696                          } else {
1697 1697                                  lock1 = flk_get_lock();
1698 1698                                  COPY(lock1, lock);
1699 1699                                  lock1->l_start = request->l_end + 1;
1700 1700                                  topology[0] = lock1;
1701 1701                                  topology[1] = request;
1702 1702                                  nvertex = 2;
1703 1703                          }
1704 1704                  }
1705 1705          } else if (request->l_end > lock->l_end) {
1706 1706                  if (request->l_start > lock->l_start)  {
1707 1707                          if (lock_effect == FLK_STAY_SAME) {
1708 1708                                  request->l_start = lock->l_start;
1709 1709                                  topology[0] = request;
1710 1710                                  nvertex = 1;
1711 1711                          } else {
1712 1712                                  lock1 = flk_get_lock();
1713 1713                                  COPY(lock1, lock);
1714 1714                                  lock1->l_end = request->l_start - 1;
1715 1715                                  topology[0] = lock1;
1716 1716                                  topology[1] = request;
1717 1717                                  nvertex = 2;
1718 1718                          }
1719 1719                  } else if (request->l_start < lock->l_start)  {
1720 1720                          topology[0] = request;
1721 1721                          nvertex = 1;
1722 1722                  } else {
1723 1723                          topology[0] = request;
1724 1724                          nvertex = 1;
1725 1725                  }
1726 1726          } else {
1727 1727                  if (request->l_start > lock->l_start) {
1728 1728                          if (lock_effect == FLK_STAY_SAME) {
1729 1729                                  request->l_start = lock->l_start;
1730 1730                                  topology[0] = request;
1731 1731                                  nvertex = 1;
1732 1732                          } else {
1733 1733                                  lock1 = flk_get_lock();
1734 1734                                  COPY(lock1, lock);
1735 1735                                  lock1->l_end = request->l_start - 1;
1736 1736                                  topology[0] = lock1;
1737 1737                                  topology[1] = request;
1738 1738                                  nvertex = 2;
1739 1739                          }
1740 1740                  } else if (request->l_start < lock->l_start) {
1741 1741                          topology[0] = request;
1742 1742                          nvertex = 1;
1743 1743                  } else {
1744 1744                          if (lock_effect !=  FLK_UNLOCK) {
1745 1745                                  topology[0] = request;
1746 1746                                  nvertex = 1;
1747 1747                          } else {
1748 1748                                  flk_delete_active_lock(lock, 0);
1749 1749                                  flk_wakeup(lock, 1);
1750 1750                                  flk_free_lock(lock);
1751 1751                                  CHECK_SLEEPING_LOCKS(gp);
1752 1752                                  CHECK_ACTIVE_LOCKS(gp);
1753 1753                                  return (1);
1754 1754                          }
1755 1755                  }
1756 1756          }
1757 1757  
1758 1758  recompute:
1759 1759  
1760 1760          /*
1761 1761           * For unlock we don't send the 'request' in for recomputing
1762 1762           * dependencies, because no lock will add an edge to it.
1763 1763           */
1764 1764  
1765 1765          if (lock_effect == FLK_UNLOCK) {
1766 1766                  topology[nvertex-1] = NULL;
1767 1767                  nvertex--;
1768 1768          }
1769 1769          for (i = 0; i < nvertex; i++) {
1770 1770                  topology[i]->l_state |= RECOMPUTE_LOCK;
1771 1771                  topology[i]->l_color = NO_COLOR;
1772 1772          }
1773 1773  
1774 1774          ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1775 1775  
1776 1776          /*
1777 1777           * We remove, from every other vertex's adjacency list, the
1778 1778           * edges that point to this vertex 'lock'.
1779 1779           */
1780 1780  
1781 1781          ep = FIRST_IN(lock);
1782 1782          while (ep != HEAD(lock)) {
1783 1783                  ADJ_LIST_REMOVE(ep);
1784 1784                  ep = NEXT_IN(ep);
1785 1785          }
1786 1786  
1787 1787          flk_delete_active_lock(lock, 0);
1788 1788  
1789 1789          /* We are ready for recomputing the dependencies now */
1790 1790  
1791 1791          flk_recompute_dependencies(lock, topology, nvertex, 1);
1792 1792  
1793 1793          for (i = 0; i < nvertex; i++) {
1794 1794                  topology[i]->l_state &= ~RECOMPUTE_LOCK;
1795 1795                  topology[i]->l_color = NO_COLOR;
1796 1796          }
1797 1797  
1798 1798  
1799 1799          if (lock_effect == FLK_UNLOCK) {
1800 1800                  nvertex++;
1801 1801          }
1802 1802          for (i = 0; i < nvertex - 1; i++) {
1803 1803                  flk_insert_active_lock(topology[i]);
1804 1804          }
1805 1805  
1806 1806  
1807 1807          if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1808 1808                  flk_wakeup(lock, 0);
1809 1809          } else {
1810 1810                  ep = FIRST_IN(lock);
1811 1811                  while (ep != HEAD(lock)) {
1812 1812                          lock->l_sedge = NEXT_IN(ep);
1813 1813                          IN_LIST_REMOVE(ep);
1814 1814                          flk_update_proc_graph(ep, 1);
1815 1815                          flk_free_edge(ep);
1816 1816                          ep = lock->l_sedge;
1817 1817                  }
1818 1818          }
1819 1819          flk_free_lock(lock);
1820 1820  
1821 1821          CHECK_SLEEPING_LOCKS(gp);
1822 1822          CHECK_ACTIVE_LOCKS(gp);
1823 1823          return (0);
1824 1824  }
1825 1825  
1826 1826  /*
1827 1827   * Insert a lock into the active queue.
1828 1828   */
1829 1829  
1830 1830  static void
1831 1831  flk_insert_active_lock(lock_descriptor_t *new_lock)
1832 1832  {
1833 1833          graph_t *gp = new_lock->l_graph;
1834 1834          vnode_t *vp = new_lock->l_vnode;
1835 1835          lock_descriptor_t *first_lock, *lock;
1836 1836  
1837 1837          ASSERT(MUTEX_HELD(&gp->gp_mutex));
1838 1838  
1839 1839          SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1840 1840          first_lock = lock;
1841 1841  
1842 1842          if (first_lock != NULL) {
1843 1843                  for (; (lock->l_vnode == vp &&
1844 1844                      lock->l_start < new_lock->l_start); lock = lock->l_next)
1845 1845                          ;
1846 1846          } else {
1847 1847                  lock = ACTIVE_HEAD(gp);
1848 1848          }
1849 1849  
1850 1850          lock->l_prev->l_next = new_lock;
1851 1851          new_lock->l_next = lock;
1852 1852          new_lock->l_prev = lock->l_prev;
1853 1853          lock->l_prev = new_lock;
1854 1854  
1855 1855          if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
1856 1856                  vp->v_filocks = (struct filock *)new_lock;
1857 1857          }
1858 1858          flk_set_state(new_lock, FLK_ACTIVE_STATE);
1859 1859          new_lock->l_state |= ACTIVE_LOCK;
1860 1860  
1861 1861          CHECK_ACTIVE_LOCKS(gp);
1862 1862          CHECK_SLEEPING_LOCKS(gp);
1863 1863  }
1864 1864  
1865 1865  /*
1866 1866   * Delete an active lock: depending on the value of the second
1867 1867   * parameter, either just remove the lock from the active list, or
1868 1868   * both remove and free it.
1869 1869   */
1870 1870  
1871 1871  static void
1872 1872  flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1873 1873  {
1874 1874          vnode_t *vp = lock->l_vnode;
1875 1875          graph_t *gp = lock->l_graph;
1876 1876  
1877 1877          ASSERT(MUTEX_HELD(&gp->gp_mutex));
1878 1878          if (free_lock)
1879 1879                  ASSERT(NO_DEPENDENTS(lock));
1880 1880          ASSERT(NOT_BLOCKED(lock));
1881 1881          ASSERT(IS_ACTIVE(lock));
1882 1882  
1883 1883          ASSERT((vp->v_filocks != NULL));
1884 1884  
1885 1885          if (vp->v_filocks == (struct filock *)lock) {
1886 1886                  vp->v_filocks = (struct filock *)
1887 1887                      ((lock->l_next->l_vnode == vp) ? lock->l_next :
1888 1888                      NULL);
1889 1889          }
1890 1890          lock->l_next->l_prev = lock->l_prev;
1891 1891          lock->l_prev->l_next = lock->l_next;
1892 1892          lock->l_next = lock->l_prev = NULL;
1893 1893          flk_set_state(lock, FLK_DEAD_STATE);
1894 1894          lock->l_state &= ~ACTIVE_LOCK;
1895 1895  
1896 1896          if (free_lock)
1897 1897                  flk_free_lock(lock);
1898 1898          CHECK_ACTIVE_LOCKS(gp);
1899 1899          CHECK_SLEEPING_LOCKS(gp);
1900 1900  }
1901 1901  
1902 1902  /*
1903 1903   * Insert into the sleep queue.
1904 1904   */
1905 1905  
1906 1906  static void
1907 1907  flk_insert_sleeping_lock(lock_descriptor_t *request)
1908 1908  {
1909 1909          graph_t *gp = request->l_graph;
1910 1910          vnode_t *vp = request->l_vnode;
1911 1911          lock_descriptor_t       *lock;
1912 1912  
1913 1913          ASSERT(MUTEX_HELD(&gp->gp_mutex));
1914 1914          ASSERT(IS_INITIAL(request));
1915 1915  
1916 1916          for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
1917 1917              lock->l_vnode < vp); lock = lock->l_next)
1918 1918                  ;
1919 1919  
1920 1920          lock->l_prev->l_next = request;
1921 1921          request->l_prev = lock->l_prev;
1922 1922          lock->l_prev = request;
1923 1923          request->l_next = lock;
1924 1924          flk_set_state(request, FLK_SLEEPING_STATE);
1925 1925          request->l_state |= SLEEPING_LOCK;
1926 1926  }
1927 1927  
1928 1928  /*
1929 1929   * Cancelling a sleeping lock implies removing a vertex from the
1930 1930   * dependency graph; therefore we must recompute the dependencies of
1931 1931   * all vertices that have a path to this vertex, with respect to all
1932 1932   * vertices reachable from this vertex.
1933 1933   */
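/*
 * Editor's illustration (not part of the original source): with edges
 * l3 -> request -> l1, cancelling request removes the vertex in the
 * middle, yet l3 may still be blocked by l1 directly. The passes below
 * therefore collect every vertex reachable from request (here l1) into
 * topology[] and recompute, for each vertex with a path to request
 * (here l3), which of those it must now depend on directly.
 */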
1934 1934  
1935 1935  void
1936 1936  flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
1937 1937  {
1938 1938          graph_t *gp = request->l_graph;
1939 1939          vnode_t *vp = request->l_vnode;
1940 1940          lock_descriptor_t **topology = NULL;
1941 1941          edge_t  *ep;
1942 1942          lock_descriptor_t *vertex, *lock;
1943 1943          int nvertex = 0;
1944 1944          int i;
1945 1945          lock_descriptor_t *vertex_stack;
1946 1946  
1947 1947          STACK_INIT(vertex_stack);
1948 1948  
1949 1949          ASSERT(MUTEX_HELD(&gp->gp_mutex));
1950 1950          /*
1951 1951           * Count the number of vertex pointers that have to be
1952 1952           * allocated: all vertices that are reachable from request.
1953 1953           */
1954 1954  
1955 1955          STACK_PUSH(vertex_stack, request, l_stack);
1956 1956  
1957 1957          while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1958 1958                  STACK_POP(vertex_stack, l_stack);
1959 1959                  for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
1960 1960                      ep = NEXT_ADJ(ep)) {
1961 1961                          if (IS_RECOMPUTE(ep->to_vertex))
1962 1962                                  continue;
1963 1963                          ep->to_vertex->l_state |= RECOMPUTE_LOCK;
1964 1964                          STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1965 1965                          nvertex++;
1966 1966                  }
1967 1967          }
1968 1968  
1969 1969          /*
1970 1970           * allocate memory for holding the vertex pointers
1971 1971           */
1972 1972  
1973 1973          if (nvertex) {
1974 1974                  topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
1975 1975                      KM_SLEEP);
1976 1976          }
1977 1977  
1978 1978          /*
1979 1979           * One more pass to actually store the vertices in the
1980 1980           * allocated array.
1981 1981           * We check sleeping locks first and then active locks, so
1982 1982           * that the topology array will be in topological
1983 1983           * order.
1984 1984           */
1985 1985  
1986 1986          nvertex = 0;
1987 1987          SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1988 1988  
1989 1989          if (lock) {
1990 1990                  do {
1991 1991                          if (IS_RECOMPUTE(lock)) {
1992 1992                                  lock->l_index = nvertex;
1993 1993                                  topology[nvertex++] = lock;
1994 1994                          }
1995 1995                          lock->l_color = NO_COLOR;
1996 1996                          lock = lock->l_next;
1997 1997                  } while (lock->l_vnode == vp);
1998 1998          }
1999 1999  
2000 2000          SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2001 2001  
2002 2002          if (lock) {
2003 2003                  do {
2004 2004                          if (IS_RECOMPUTE(lock)) {
2005 2005                                  lock->l_index = nvertex;
2006 2006                                  topology[nvertex++] = lock;
2007 2007                          }
2008 2008                          lock->l_color = NO_COLOR;
2009 2009                          lock = lock->l_next;
2010 2010                  } while (lock->l_vnode == vp);
2011 2011          }
2012 2012  
2013 2013          /*
2014 2014           * Remove the in and out edges of request. They are freed
2015 2015           * after updating the proc graph below.
2016 2016           */
2017 2017  
2018 2018          for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
2019 2019                  ADJ_LIST_REMOVE(ep);
2020 2020          }
2021 2021  
2022 2022  
2023 2023          if (remove_from_queue)
2024 2024                  REMOVE_SLEEP_QUEUE(request);
2025 2025  
2026 2026          /* we are ready to recompute */
2027 2027  
2028 2028          flk_recompute_dependencies(request, topology, nvertex, 1);
2029 2029  
2030 2030          ep = FIRST_ADJ(request);
2031 2031          while (ep != HEAD(request)) {
2032 2032                  IN_LIST_REMOVE(ep);
2033 2033                  request->l_sedge = NEXT_ADJ(ep);
2034 2034                  ADJ_LIST_REMOVE(ep);
2035 2035                  flk_update_proc_graph(ep, 1);
2036 2036                  flk_free_edge(ep);
2037 2037                  ep = request->l_sedge;
2038 2038          }
2039 2039  
2040 2040  
2041 2041          /*
2042 2042           * unset the RECOMPUTE flag in those vertices
2043 2043           */
2044 2044  
2045 2045          for (i = 0; i < nvertex; i++) {
2046 2046                  topology[i]->l_state &= ~RECOMPUTE_LOCK;
2047 2047          }
2048 2048  
2049 2049          /*
2050 2050           * free the topology
2051 2051           */
2052 2052          if (nvertex)
2053 2053                  kmem_free((void *)topology,
2054 2054                      (nvertex * sizeof (lock_descriptor_t *)));
2055 2055          /*
2056 2056           * Some locks may be unblocked now.
2057 2057           */
2058 2058  
2059 2059          flk_wakeup(request, 0);
2060 2060  
2061 2061          /*
2062 2062           * We expect to have a correctly recomputed graph now.
2063 2063           */
2064 2064          flk_set_state(request, FLK_DEAD_STATE);
2065 2065          flk_free_lock(request);
2066 2066          CHECK_SLEEPING_LOCKS(gp);
2067 2067          CHECK_ACTIVE_LOCKS(gp);
2068 2068  
2069 2069  }
2070 2070  
2071 2071  /*
2072 2072   * Uncoloring the graph simply increments the graph's mark value;
2073 2073   * only when the mark wraps around do we explicitly reset the color
2074 2074   * of every vertex in the graph.
2075 2075   */
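/*
 * Editor's note (not part of the original source): this is the classic
 * generation-counter trick. Assuming the COLOR/COLORED macros mark a
 * vertex by copying, and comparing against, the graph's current mark,
 * bumping gp->mark uncolors every vertex in O(1); only on wraparound
 * must the stored colors be reset, since a stale l_color could
 * otherwise compare equal to the restarted mark. A minimal sketch:
 */
#if 0
	COLOR(v);		/* v->l_color = gp->mark */
	(void) COLORED(v);	/* true: v->l_color == gp->mark */
	flk_graph_uncolor(gp);	/* gp->mark++; v is uncolored again */
#endif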
2076 2076  
2077 2077  static void
2078 2078  flk_graph_uncolor(graph_t *gp)
2079 2079  {
2080 2080          lock_descriptor_t *lock;
2081 2081  
2082 2082          if (gp->mark == UINT_MAX) {
2083 2083                  gp->mark = 1;
2084 2084                  for (lock = ACTIVE_HEAD(gp)->l_next;
2085 2085                      lock != ACTIVE_HEAD(gp); lock = lock->l_next)
2086 2086                          lock->l_color = 0;
2087 2087  
2088 2088                  for (lock = SLEEPING_HEAD(gp)->l_next;
2089 2089                      lock != SLEEPING_HEAD(gp); lock = lock->l_next)
2090 2090                          lock->l_color = 0;
2091 2091          } else {
2092 2092                  gp->mark++;
2093 2093          }
2094 2094  }
2095 2095  
2096 2096  /*
2097 2097   * Wake up locks that are blocked on the given lock.
2098 2098   */
2099 2099  
2100 2100  static void
2101 2101  flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
2102 2102  {
2103 2103          edge_t  *ep;
2104 2104          graph_t *gp = lock->l_graph;
2105 2105          lock_descriptor_t       *lck;
2106 2106  
2107 2107          ASSERT(MUTEX_HELD(&gp->gp_mutex));
2108 2108          if (NO_DEPENDENTS(lock))
2109 2109                  return;
2110 2110          ep = FIRST_IN(lock);
2111 2111          do {
2112 2112                  /*
2113 2113                   * Delete the edge from the adjacency list of the
2114 2114                   * from vertex. If no adjacent edges remain for that
2115 2115                   * vertex, wake up its process.
2116 2116                   */
2117 2117                  lck = ep->from_vertex;
2118 2118                  if (adj_list_remove)
2119 2119                          ADJ_LIST_REMOVE(ep);
2120 2120                  flk_update_proc_graph(ep, 1);
2121 2121                  if (NOT_BLOCKED(lck)) {
2122 2122                          GRANT_WAKEUP(lck);
2123 2123                  }
2124 2124                  lock->l_sedge = NEXT_IN(ep);
2125 2125                  IN_LIST_REMOVE(ep);
2126 2126                  flk_free_edge(ep);
2127 2127                  ep = lock->l_sedge;
2128 2128          } while (ep != HEAD(lock));
2129 2129          ASSERT(NO_DEPENDENTS(lock));
2130 2130  }
2131 2131  
2132 2132  /*
2133 2133   * Each dependent of request is checked for its dependency against the
2134 2134   * locks in topology (so called because the array is, and should be, in
2135 2135   * topological order for this algorithm; if it is not, the inner loop
2136 2136   * below might add more edges than necessary. A topological ordering
2137 2137   * satisfies the property that all edges go from left to right, i.e.,
2138 2138   * topology[i] can have an edge to topology[j] only if i < j).
2139 2139   * If lock l1 in the dependent set of request is dependent on (blocked
2140 2140   * by) lock l2 in topology but does not have a path to it, we add an
2141 2141   * edge in the inner loop below.
2142 2142   *
2143 2143   * We don't want to add an edge between l1 and l2 if there already
2144 2144   * exists a path from l1 to l2, so care has to be taken for those
2145 2145   * vertices that have two paths to 'request'. These vertices are
2146 2146   * referred to here as barrier locks.
2147 2147   *
2148 2148   * The barriers (those vertices that originally had two paths to
2149 2149   * request) have to be found, because otherwise we could end up adding
2150 2150   * edges unnecessarily to vertices in topology: a barrier vertex could
2151 2151   * gain an edge to a vertex in topology as well as a path to it.
2152 2152   */
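/*
 * Editor's illustration (not part of the original source): suppose
 * topology[] = {t0, t1} with an existing edge t0 -> t1, and some
 * dependent l1 of request blocks on both. The inner loop visits t0
 * first, adds l1 -> t0, and then colors everything reachable from t0,
 * including t1; t1 is therefore skipped, and the redundant edge
 * l1 -> t1 (already implied by l1 -> t0 -> t1) is never added. In the
 * reverse order {t1, t0}, both edges would be added, which is the
 * waste the topological ordering avoids.
 */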
2153 2153  
2154 2154  static void
2155 2155  flk_recompute_dependencies(lock_descriptor_t *request,
2156 2156                  lock_descriptor_t **topology,
2157 2157                          int nvertex, int update_graph)
2158 2158  {
2159 2159          lock_descriptor_t *vertex, *lock;
2160 2160          graph_t *gp = request->l_graph;
2161 2161          int i, count;
2162 2162          int barrier_found = 0;
2163 2163          edge_t  *ep;
2164 2164          lock_descriptor_t *vertex_stack;
2165 2165  
2166 2166          STACK_INIT(vertex_stack);
2167 2167  
2168 2168          ASSERT(MUTEX_HELD(&gp->gp_mutex));
2169 2169          if (nvertex == 0)
2170 2170                  return;
2171 2171          flk_graph_uncolor(request->l_graph);
2172 2172          barrier_found = flk_find_barriers(request);
2173 2173          request->l_state |= RECOMPUTE_DONE;
2174 2174  
2175 2175          STACK_PUSH(vertex_stack, request, l_stack);
2176 2176          request->l_sedge = FIRST_IN(request);
2177 2177  
2178 2178  
2179 2179          while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2180 2180                  if (vertex->l_state & RECOMPUTE_DONE) {
2181 2181                          count = 0;
2182 2182                          goto next_in_edge;
2183 2183                  }
2184 2184                  if (IS_BARRIER(vertex)) {
2185 2185                          /* decrement the barrier count */
2186 2186                          if (vertex->l_index) {
2187 2187                                  vertex->l_index--;
2188 2188                                  /* this vertex will be pushed again anyway? */
2189 2189                                  STACK_POP(vertex_stack, l_stack);
2190 2190                                  if (vertex->l_index == 0)  {
2191 2191                                  /*
2192 2192                                   * The barrier is over; we can
2193 2193                                   * recompute dependencies for this
2194 2194                                   * lock on the next stack pop.
2195 2195                                   */
2196 2196                                          vertex->l_state &= ~BARRIER_LOCK;
2197 2197                                  }
2198 2198                                  continue;
2199 2199                          }
2200 2200                  }
2201 2201                  vertex->l_state |= RECOMPUTE_DONE;
2202 2202                  flk_graph_uncolor(gp);
2203 2203                  count = flk_color_reachables(vertex);
2204 2204                  for (i = 0; i < nvertex; i++) {
2205 2205                          lock = topology[i];
2206 2206                          if (COLORED(lock))
2207 2207                                  continue;
2208 2208                          if (BLOCKS(lock, vertex)) {
2209 2209                                  (void) flk_add_edge(vertex, lock,
2210 2210                                      NO_CHECK_CYCLE, update_graph);
2211 2211                                  COLOR(lock);
2212 2212                                  count++;
2213 2213                                  count += flk_color_reachables(lock);
2214 2214                          }
2215 2215  
2216 2216                  }
2217 2217  
2218 2218  next_in_edge:
2219 2219                  if (count == nvertex ||
2220 2220                      vertex->l_sedge == HEAD(vertex)) {
2221 2221                          /* prune the tree below this */
2222 2222                          STACK_POP(vertex_stack, l_stack);
2223 2223                          vertex->l_state &= ~RECOMPUTE_DONE;
2224 2224                          /* update the barrier locks below this! */
2225 2225                          if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
2226 2226                                  flk_graph_uncolor(gp);
2227 2227                                  flk_update_barriers(vertex);
2228 2228                          }
2229 2229                          continue;
2230 2230                  }
2231 2231  
2232 2232                  ep = vertex->l_sedge;
2233 2233                  lock = ep->from_vertex;
2234 2234                  STACK_PUSH(vertex_stack, lock, l_stack);
2235 2235                  lock->l_sedge = FIRST_IN(lock);
2236 2236                  vertex->l_sedge = NEXT_IN(ep);
2237 2237          }
2238 2238  
2239 2239  }
2240 2240  
2241 2241  /*
2242 2242   * Color all vertices reachable from vertex that belong to topology
2243 2243   * (those with RECOMPUTE_LOCK set in their state) and are uncolored.
2244 2244   *
2245 2245   * Note: we need to use a different stack link, l_stack1, because this
2246 2246   * is called from flk_recompute_dependencies(), which already uses a
2247 2247   * stack with l_stack as the stack link.
2248 2248   */
2249 2249  
2250 2250  static int
2251 2251  flk_color_reachables(lock_descriptor_t *vertex)
2252 2252  {
2253 2253          lock_descriptor_t *ver, *lock;
2254 2254          int count;
2255 2255          edge_t  *ep;
2256 2256          lock_descriptor_t *vertex_stack;
2257 2257  
2258 2258          STACK_INIT(vertex_stack);
2259 2259  
2260 2260          STACK_PUSH(vertex_stack, vertex, l_stack1);
2261 2261          count = 0;
2262 2262          while ((ver = STACK_TOP(vertex_stack)) != NULL) {
2263 2263  
2264 2264                  STACK_POP(vertex_stack, l_stack1);
2265 2265                  for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
2266 2266                      ep = NEXT_ADJ(ep)) {
2267 2267                          lock = ep->to_vertex;
2268 2268                          if (COLORED(lock))
2269 2269                                  continue;
2270 2270                          COLOR(lock);
2271 2271                          if (IS_RECOMPUTE(lock))
2272 2272                                  count++;
2273 2273                          STACK_PUSH(vertex_stack, lock, l_stack1);
2274 2274                  }
2275 2275  
2276 2276          }
2277 2277          return (count);
2278 2278  }
2279 2279  
2280 2280  /*
2281 2281   * Called from flk_recompute_dependencies(), this routine decrements
2282 2282   * the barrier count of barrier vertices that are reachable from lock.
2283 2283   */
2284 2284  
2285 2285  static void
2286 2286  flk_update_barriers(lock_descriptor_t *lock)
2287 2287  {
2288 2288          lock_descriptor_t *vertex, *lck;
2289 2289          edge_t  *ep;
2290 2290          lock_descriptor_t *vertex_stack;
2291 2291  
2292 2292          STACK_INIT(vertex_stack);
2293 2293  
2294 2294          STACK_PUSH(vertex_stack, lock, l_stack1);
2295 2295  
2296 2296          while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2297 2297                  STACK_POP(vertex_stack, l_stack1);
2298 2298                  for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2299 2299                      ep = NEXT_IN(ep)) {
2300 2300                          lck = ep->from_vertex;
2301 2301                          if (COLORED(lck)) {
2302 2302                                  if (IS_BARRIER(lck)) {
2303 2303                                          ASSERT(lck->l_index > 0);
2304 2304                                          lck->l_index--;
2305 2305                                          if (lck->l_index == 0)
2306 2306                                                  lck->l_state &= ~BARRIER_LOCK;
2307 2307                                  }
2308 2308                                  continue;
2309 2309                          }
2310 2310                          COLOR(lck);
2311 2311                          if (IS_BARRIER(lck)) {
2312 2312                                  ASSERT(lck->l_index > 0);
2313 2313                                  lck->l_index--;
2314 2314                                  if (lck->l_index == 0)
2315 2315                                          lck->l_state &= ~BARRIER_LOCK;
2316 2316                          }
2317 2317                          STACK_PUSH(vertex_stack, lck, l_stack1);
2318 2318                  }
2319 2319          }
2320 2320  }
2321 2321  
2322 2322  /*
2323 2323   * Find all vertices that are reachable from 'lock' more than once,
2324 2324   * mark them as barrier vertices, and increment their barrier count.
2325 2325   * The barrier count is one less than the total number of paths from
2326 2326   * lock to that vertex.
2327 2327   */
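/*
 * Editor's illustration (not part of the original source): with
 * in-edges x -> lock, y -> lock, and x -> y, the reverse DFS below
 * reaches x twice -- once directly and once through y -- so x is
 * marked BARRIER_LOCK with l_index = 1 (paths minus one). During the
 * recompute walk, an arrival at a barrier merely decrements the count
 * and pops the vertex; only after the count reaches zero does a later
 * arrival actually recompute it, i.e., on its final path.
 */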
2328 2328  
2329 2329  static int
2330 2330  flk_find_barriers(lock_descriptor_t *lock)
2331 2331  {
2332 2332          lock_descriptor_t *vertex, *lck;
2333 2333          int found = 0;
2334 2334          edge_t  *ep;
2335 2335          lock_descriptor_t *vertex_stack;
2336 2336  
2337 2337          STACK_INIT(vertex_stack);
2338 2338  
2339 2339          STACK_PUSH(vertex_stack, lock, l_stack1);
2340 2340  
2341 2341          while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2342 2342                  STACK_POP(vertex_stack, l_stack1);
2343 2343                  for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2344 2344                      ep = NEXT_IN(ep)) {
2345 2345                          lck = ep->from_vertex;
2346 2346                          if (COLORED(lck)) {
2347 2347                                  /* this is a barrier */
2348 2348                                  lck->l_state |= BARRIER_LOCK;
2349 2349                                  /* index will have barrier count */
2350 2350                                  lck->l_index++;
2351 2351                                  if (!found)
2352 2352                                          found = 1;
2353 2353                                  continue;
2354 2354                          }
2355 2355                          COLOR(lck);
2356 2356                          lck->l_index = 0;
2357 2357                          STACK_PUSH(vertex_stack, lck, l_stack1);
2358 2358                  }
2359 2359          }
2360 2360          return (found);
2361 2361  }
2362 2362  
2363 2363  /*
2364 2364   * Finds the first lock that is mainly responsible for blocking this
2365 2365   * request.  If there is no such lock, request->l_flock.l_type is set to
2366 2366   * F_UNLCK.  Otherwise, request->l_flock is filled in with the particulars
2367 2367   * of the blocking lock.
2368 2368   *
2369 2369   * Note: It is possible a request is blocked by a sleeping lock because
2370 2370   * of the fairness policy used in flk_process_request() to construct the
2371 2371   * dependencies. (see comments before flk_process_request()).
2372 2372   */
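/*
 * Editor's note (not part of the original source): the second pass
 * below matters for read requests. Even when no active lock conflicts,
 * the fairness policy mentioned above can leave a reader blocked
 * behind a sleeping writer, so the sleeping writer is reported as the
 * blocker rather than answering F_UNLCK.
 */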
2373 2373  
2374 2374  static void
2375 2375  flk_get_first_blocking_lock(lock_descriptor_t *request)
2376 2376  {
2377 2377          graph_t *gp = request->l_graph;
2378 2378          vnode_t *vp = request->l_vnode;
2379 2379          lock_descriptor_t *lock, *blocker;
2380 2380  
2381 2381          ASSERT(MUTEX_HELD(&gp->gp_mutex));
2382 2382          blocker = NULL;
2383 2383          SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2384 2384  
2385 2385          if (lock) {
2386 2386                  do {
2387 2387                          if (BLOCKS(lock, request)) {
2388 2388                                  blocker = lock;
2389 2389                                  break;
2390 2390                          }
2391 2391                          lock = lock->l_next;
2392 2392                  } while (lock->l_vnode == vp);
2393 2393          }
2394 2394  
2395 2395          if (blocker == NULL && request->l_flock.l_type == F_RDLCK) {
2396 2396                  /*
2397 2397                   * No active lock is blocking this request, but if a read
2398 2398                   * lock is requested, it may also get blocked by a waiting
2399 2399                   * writer. So search all sleeping locks and see if there is
2400 2400                   * a writer waiting.
2401 2401                   */
2402 2402                  SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2403 2403                  if (lock) {
2404 2404                          do {
2405 2405                                  if (BLOCKS(lock, request)) {
2406 2406                                          blocker = lock;
2407 2407                                          break;
2408 2408                                  }
2409 2409                                  lock = lock->l_next;
2410 2410                          } while (lock->l_vnode == vp);
2411 2411                  }
2412 2412          }
2413 2413  
2414 2414          if (blocker) {
2415 2415                  report_blocker(blocker, request);
2416 2416          } else
2417 2417                  request->l_flock.l_type = F_UNLCK;
2418 2418  }
2419 2419  
2420 2420  /*
2421 2421   * Get the graph_t structure associated with a vnode.
2422 2422   * If 'initialize' is non-zero, and the graph_t structure for this vnode has
2423 2423   * not yet been initialized, then a new element is allocated and returned.
2424 2424   */
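/*
 * Editor's note (not part of the original source): the body below is
 * the usual optimistic-allocation pattern: allocate without the lock
 * held, recheck under flock_lock, and discard the speculative copy if
 * another thread won the race. A minimal sketch of the shape, with
 * hypothetical names:
 */
#if 0
	ngp = kmem_zalloc(sizeof (graph_t), KM_SLEEP);	/* no lock held */
	mutex_enter(&flock_lock);
	if (lock_graph[index] == NULL)
		lock_graph[index] = ngp;	/* we won the race */
	gp = lock_graph[index];
	mutex_exit(&flock_lock);
	if (gp != ngp)
		kmem_free(ngp, sizeof (graph_t));	/* we lost it */
#endif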
2425 2425  graph_t *
2426 2426  flk_get_lock_graph(vnode_t *vp, int initialize)
2427 2427  {
2428 2428          graph_t *gp;
2429 2429          graph_t *gp_alloc = NULL;
2430 2430          int index = HASH_INDEX(vp);
2431 2431  
2432 2432          if (initialize == FLK_USE_GRAPH) {
2433 2433                  mutex_enter(&flock_lock);
2434 2434                  gp = lock_graph[index];
2435 2435                  mutex_exit(&flock_lock);
2436 2436                  return (gp);
2437 2437          }
2438 2438  
2439 2439          ASSERT(initialize == FLK_INIT_GRAPH);
2440 2440  
2441 2441          if (lock_graph[index] == NULL) {
2442 2442  
2443 2443                  gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2444 2444  
2445 2445                  /* Initialize the graph */
2446 2446  
2447 2447                  gp_alloc->active_locks.l_next =
2448 2448                      gp_alloc->active_locks.l_prev =
2449 2449                      (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2450 2450                  gp_alloc->sleeping_locks.l_next =
2451 2451                      gp_alloc->sleeping_locks.l_prev =
2452 2452                      (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2453 2453                  gp_alloc->index = index;
2454 2454                  mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2455 2455          }
2456 2456  
2457 2457          mutex_enter(&flock_lock);
2458 2458  
2459 2459          gp = lock_graph[index];
2460 2460  
2461 2461          /* Recheck the value within flock_lock */
2462 2462          if (gp == NULL) {
2463 2463                  struct flock_globals *fg;
2464 2464  
2465 2465                  /* We must have previously allocated the graph_t structure */
2466 2466                  ASSERT(gp_alloc != NULL);
2467 2467                  lock_graph[index] = gp = gp_alloc;
2468 2468                  /*
2469 2469                   * The lockmgr status is only needed if KLM is loaded.
2470 2470                   */
2471 2471                  if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2472 2472                          fg = flk_get_globals();
2473 2473                          fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2474 2474                  }
2475 2475          }
2476 2476  
2477 2477          mutex_exit(&flock_lock);
2478 2478  
2479 2479          if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2480 2480                  /* There was a race to allocate the graph_t and we lost */
2481 2481                  mutex_destroy(&gp_alloc->gp_mutex);
2482 2482                  kmem_free(gp_alloc, sizeof (graph_t));
2483 2483          }
2484 2484  
2485 2485          return (gp);
2486 2486  }
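
/*
 * Illustrative aside (a hedged sketch, not code from this file):
 * flk_get_lock_graph() allocates optimistically before taking flock_lock,
 * rechecks under the lock, and discards its copy if another thread won the
 * race, because a sleeping allocation cannot be done while holding the
 * mutex.  The same shape in a minimal user-level form (slot, slot_lock and
 * get_slot are hypothetical names):
 */
#include <pthread.h>
#include <stdlib.h>

static void *slot;			/* shared; starts out NULL */
static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;

static void *
get_slot(void)
{
	void *obj, *tmp = NULL;

	if (slot == NULL)		/* unlocked peek, as above */
		tmp = malloc(64);	/* allocate before taking the lock */
	pthread_mutex_lock(&slot_lock);
	obj = slot;
	if (obj == NULL && tmp != NULL) {
		slot = obj = tmp;	/* we won the race; install ours */
		tmp = NULL;
	}
	pthread_mutex_unlock(&slot_lock);
	free(tmp);			/* lost the race: discard our copy */
	return (obj);
}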
2487 2487  
2488 2488  /*
2489 2489   * PSARC case 1997/292
2490 2490   */
2491 2491  int
2492 2492  cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2493 2493  {
2494 2494          lock_descriptor_t *lock;
2495 2495          int result = 0;
2496 2496          graph_t *gp;
2497 2497          int                     lock_nlmid;
2498 2498  
2499 2499          /*
2500 2500           * Check to see if node is booted as a cluster. If not, return.
2501 2501           */
2502 2502          if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2503 2503                  return (0);
2504 2504          }
2505 2505  
2506 2506          gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2507 2507          if (gp == NULL) {
2508 2508                  return (0);
2509 2509          }
2510 2510  
2511 2511          mutex_enter(&gp->gp_mutex);
2512 2512  
2513 2513          SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2514 2514  
2515 2515          if (lock) {
2516 2516                  while (lock->l_vnode == vp) {
2517 2517                          /* get NLM id from sysid */
2518 2518                          lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2519 2519  
2520 2520                          /*
2521 2521                           * If NLM server request _and_ nlmid of lock matches
2522 2522                           * nlmid of argument, then we've found a remote lock.
2523 2523                           */
2524 2524                          if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2525 2525                                  result = 1;
2526 2526                                  goto done;
2527 2527                          }
2528 2528                          lock = lock->l_next;
2529 2529                  }
2530 2530          }
2531 2531  
2532 2532          SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2533 2533  
2534 2534          if (lock) {
2535 2535                  while (lock->l_vnode == vp) {
2536 2536                          /* get NLM id from sysid */
2537 2537                          lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2538 2538  
2539 2539                          /*
2540 2540                           * If NLM server request _and_ nlmid of lock matches
2541 2541                           * nlmid of argument, then we've found a remote lock.
2542 2542                           */
2543 2543                          if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2544 2544                                  result = 1;
2545 2545                                  goto done;
2546 2546                          }
2547 2547                          lock = lock->l_next;
2548 2548                  }
2549 2549          }
2550 2550  
2551 2551  done:
2552 2552          mutex_exit(&gp->gp_mutex);
2553 2553          return (result);
2554 2554  }
2555 2555  
2556 2556  /*
2557 2557   * Determine whether there are any locks for the given vnode with a remote
2558 2558   * sysid.  Returns zero if not, non-zero if there are.
2559 2559   *
2560 2560   * Note that the return value from this function is potentially invalid
2561 2561   * once it has been returned.  The caller is responsible for providing its
2562 2562   * own synchronization mechanism to ensure that the return value is useful
2563 2563   * (e.g., see nfs_lockcompletion()).
2564 2564   */
2565 2565  int
2566 2566  flk_has_remote_locks(vnode_t *vp)
2567 2567  {
2568 2568          lock_descriptor_t *lock;
2569 2569          int result = 0;
2570 2570          graph_t *gp;
2571 2571  
2572 2572          gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2573 2573          if (gp == NULL) {
2574 2574                  return (0);
2575 2575          }
2576 2576  
2577 2577          mutex_enter(&gp->gp_mutex);
2578 2578  
2579 2579          SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2580 2580  
2581 2581          if (lock) {
2582 2582                  while (lock->l_vnode == vp) {
2583 2583                          if (IS_REMOTE(lock)) {
2584 2584                                  result = 1;
2585 2585                                  goto done;
2586 2586                          }
2587 2587                          lock = lock->l_next;
2588 2588                  }
2589 2589          }
2590 2590  
2591 2591          SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2592 2592  
2593 2593          if (lock) {
2594 2594                  while (lock->l_vnode == vp) {
2595 2595                          if (IS_REMOTE(lock)) {
2596 2596                                  result = 1;
2597 2597                                  goto done;
2598 2598                          }
2599 2599                          lock = lock->l_next;
2600 2600                  }
2601 2601          }
2602 2602  
2603 2603  done:
2604 2604          mutex_exit(&gp->gp_mutex);
2605 2605          return (result);
2606 2606  }
2607 2607  
2608 2608  /*
2609 2609   * Determine whether there are any locks for the given vnode with a remote
2610 2610   * sysid matching the given sysid.
2611 2611   * Used by the new (open source) NFS Lock Manager (NLM).
2612 2612   */
2613 2613  int
2614 2614  flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid)
2615 2615  {
2616 2616          lock_descriptor_t *lock;
2617 2617          int result = 0;
2618 2618          graph_t *gp;
2619 2619  
2620 2620          if (sysid == 0)
2621 2621                  return (0);
2622 2622  
2623 2623          gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2624 2624          if (gp == NULL) {
2625 2625                  return (0);
2626 2626          }
2627 2627  
2628 2628          mutex_enter(&gp->gp_mutex);
2629 2629  
2630 2630          SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2631 2631  
2632 2632          if (lock) {
2633 2633                  while (lock->l_vnode == vp) {
2634 2634                          if (lock->l_flock.l_sysid == sysid) {
2635 2635                                  result = 1;
2636 2636                                  goto done;
2637 2637                          }
2638 2638                          lock = lock->l_next;
2639 2639                  }
2640 2640          }
2641 2641  
2642 2642          SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2643 2643  
2644 2644          if (lock) {
2645 2645                  while (lock->l_vnode == vp) {
2646 2646                          if (lock->l_flock.l_sysid == sysid) {
2647 2647                                  result = 1;
2648 2648                                  goto done;
2649 2649                          }
2650 2650                          lock = lock->l_next;
2651 2651                  }
2652 2652          }
2653 2653  
2654 2654  done:
2655 2655          mutex_exit(&gp->gp_mutex);
2656 2656          return (result);
2657 2657  }
2658 2658  
2659 2659  /*
2660 2660   * Determine if there are any locks owned by the given sysid.
2661 2661   * Returns zero if not, non-zero if there are.  Note that this return code
2662 2662   * could be derived from flk_get_{sleeping,active}_locks, but this routine
2663 2663   * avoids all the memory allocations of those routines.
2664 2664   *
2665 2665   * This routine has the same synchronization issues as
2666 2666   * flk_has_remote_locks.
2667 2667   */
2668 2668  
2669 2669  int
2670 2670  flk_sysid_has_locks(int sysid, int lck_type)
2671 2671  {
2672 2672          int             has_locks = 0;
2673 2673          lock_descriptor_t       *lock;
2674 2674          graph_t         *gp;
2675 2675          int             i;
2676 2676  
2677 2677          for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2678 2678                  mutex_enter(&flock_lock);
2679 2679                  gp = lock_graph[i];
2680 2680                  mutex_exit(&flock_lock);
2681 2681                  if (gp == NULL) {
2682 2682                          continue;
2683 2683                  }
2684 2684  
2685 2685                  mutex_enter(&gp->gp_mutex);
2686 2686  
2687 2687                  if (lck_type & FLK_QUERY_ACTIVE) {
2688 2688                          for (lock = ACTIVE_HEAD(gp)->l_next;
2689 2689                              lock != ACTIVE_HEAD(gp) && !has_locks;
2690 2690                              lock = lock->l_next) {
2691 2691                                  if (lock->l_flock.l_sysid == sysid)
2692 2692                                          has_locks = 1;
2693 2693                          }
2694 2694                  }
2695 2695  
2696 2696                  if (lck_type & FLK_QUERY_SLEEPING) {
2697 2697                          for (lock = SLEEPING_HEAD(gp)->l_next;
2698 2698                              lock != SLEEPING_HEAD(gp) && !has_locks;
2699 2699                              lock = lock->l_next) {
2700 2700                                  if (lock->l_flock.l_sysid == sysid)
2701 2701                                          has_locks = 1;
2702 2702                          }
2703 2703                  }
2704 2704                  mutex_exit(&gp->gp_mutex);
2705 2705          }
2706 2706  
2707 2707          return (has_locks);
2708 2708  }
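
/*
 * Illustrative aside (a hedged sketch, not code from this file):
 * flk_sysid_has_locks() shows the locking order used by every whole-table
 * scan in this file: flock_lock is held only long enough to fetch the
 * bucket pointer, and each graph is then examined under its own gp_mutex.
 * The same two-level scheme in a minimal user-level form (all names here
 * are hypothetical):
 */
#include <pthread.h>
#include <stddef.h>

#define	NBUCKETS	16

struct bucket {
	pthread_mutex_t b_lock;		/* protects this bucket's contents */
	int b_data;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct bucket *table[NBUCKETS];	/* buckets are created lazily */

static void
scan_all(void (*fn)(struct bucket *))
{
	struct bucket *bp;
	int i;

	for (i = 0; i < NBUCKETS; i++) {
		pthread_mutex_lock(&table_lock);	/* fetch pointer only */
		bp = table[i];
		pthread_mutex_unlock(&table_lock);
		if (bp == NULL)
			continue;
		pthread_mutex_lock(&bp->b_lock);	/* then the bucket lock */
		fn(bp);
		pthread_mutex_unlock(&bp->b_lock);
	}
}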
2709 2709  
2710 2710  
2711 2711  /*
2712 2712   * PSARC case 1997/292
2713 2713   *
2714 2714   * Requires: "sysid" is a pair [nlmid, sysid].  The lower half is a 16-bit
2715 2715   *  quantity, the real sysid generated by the NLM server; the upper half
2716 2716   *  identifies the node of the cluster where the NLM server ran.
2717 2717   *  This routine is only called by an NLM server running in a cluster.
2718 2718   * Effects: Remove all locks held on behalf of the client identified
2719 2719   *  by "sysid."
2720 2720   */
2721 2721  void
2722 2722  cl_flk_remove_locks_by_sysid(int sysid)
2723 2723  {
2724 2724          graph_t *gp;
2725 2725          int i;
2726 2726          lock_descriptor_t *lock, *nlock;
2727 2727  
2728 2728          /*
2729 2729           * Check to see if node is booted as a cluster. If not, return.
2730 2730           */
2731 2731          if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2732 2732                  return;
2733 2733          }
2734 2734  
2735 2735          ASSERT(sysid != 0);
2736 2736          for (i = 0; i < HASH_SIZE; i++) {
2737 2737                  mutex_enter(&flock_lock);
2738 2738                  gp = lock_graph[i];
2739 2739                  mutex_exit(&flock_lock);
2740 2740  
2741 2741                  if (gp == NULL)
2742 2742                          continue;
2743 2743  
2744 2744                  mutex_enter(&gp->gp_mutex);     /*  get mutex on lock graph */
2745 2745  
2746 2746                  /* signal sleeping requests so that they bail out */
2747 2747                  lock = SLEEPING_HEAD(gp)->l_next;
2748 2748                  while (lock != SLEEPING_HEAD(gp)) {
2749 2749                          nlock = lock->l_next;
2750 2750                          if (lock->l_flock.l_sysid == sysid) {
2751 2751                                  INTERRUPT_WAKEUP(lock);
2752 2752                          }
2753 2753                          lock = nlock;
2754 2754                  }
2755 2755  
2756 2756                  /* delete active locks */
2757 2757                  lock = ACTIVE_HEAD(gp)->l_next;
2758 2758                  while (lock != ACTIVE_HEAD(gp)) {
2759 2759                          nlock = lock->l_next;
2760 2760                          if (lock->l_flock.l_sysid == sysid) {
2761 2761                                  flk_delete_active_lock(lock, 0);
2762 2762                                  flk_wakeup(lock, 1);
2763 2763                                  flk_free_lock(lock);
2764 2764                          }
2765 2765                          lock = nlock;
2766 2766                  }
2767 2767                  mutex_exit(&gp->gp_mutex);    /* release mutex on lock graph */
2768 2768          }
2769 2769  }
2770 2770  
2771 2771  /*
2772 2772   * Delete all locks in the system that belong to the sysid of the request.
2773 2773   */
2774 2774  
2775 2775  static void
2776 2776  flk_delete_locks_by_sysid(lock_descriptor_t *request)
2777 2777  {
2778 2778          int     sysid  = request->l_flock.l_sysid;
2779 2779          lock_descriptor_t *lock, *nlock;
2780 2780          graph_t *gp;
2781 2781          int i;
2782 2782  
2783 2783          ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
2784 2784          ASSERT(sysid != 0);
2785 2785  
2786 2786          mutex_exit(&request->l_graph->gp_mutex);
2787 2787  
2788 2788          for (i = 0; i < HASH_SIZE; i++) {
2789 2789                  mutex_enter(&flock_lock);
2790 2790                  gp = lock_graph[i];
2791 2791                  mutex_exit(&flock_lock);
2792 2792  
2793 2793                  if (gp == NULL)
2794 2794                          continue;
2795 2795  
2796 2796                  mutex_enter(&gp->gp_mutex);
2797 2797  
2798 2798                  /* signal sleeping requests so that they bail out */
2799 2799                  lock = SLEEPING_HEAD(gp)->l_next;
2800 2800                  while (lock != SLEEPING_HEAD(gp)) {
2801 2801                          nlock = lock->l_next;
2802 2802                          if (lock->l_flock.l_sysid == sysid) {
2803 2803                                  INTERRUPT_WAKEUP(lock);
2804 2804                          }
2805 2805                          lock = nlock;
2806 2806                  }
2807 2807  
2808 2808                  /* delete active locks */
2809 2809                  lock = ACTIVE_HEAD(gp)->l_next;
2810 2810                  while (lock != ACTIVE_HEAD(gp)) {
2811 2811                          nlock = lock->l_next;
2812 2812                          if (lock->l_flock.l_sysid == sysid) {
2813 2813                                  flk_delete_active_lock(lock, 0);
2814 2814                                  flk_wakeup(lock, 1);
2815 2815                                  flk_free_lock(lock);
2816 2816                          }
2817 2817                          lock = nlock;
2818 2818                  }
2819 2819                  mutex_exit(&gp->gp_mutex);
2820 2820          }
2821 2821  
2822 2822          mutex_enter(&request->l_graph->gp_mutex);
2823 2823  }
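
/*
 * Illustrative aside (an editorial comment, not from this file): the routine
 * above drops the request's own gp_mutex before the scan because the loop
 * re-acquires every graph's mutex in turn -- possibly including the very
 * graph the request belongs to -- and re-entering an already-held kernel
 * mutex is fatal.  The mutex is reacquired before returning so the caller's
 * locking assumptions still hold.
 */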
2824 2824  
2825 2825  /*
2826 2826   * Clustering: Deletes PXFS locks
2827 2827   * Effects: Delete all locks on files in the given file system and with the
2828 2828   *  given PXFS id.
2829 2829   */
2830 2830  void
2831 2831  cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
2832 2832  {
2833 2833          lock_descriptor_t *lock, *nlock;
2834 2834          graph_t *gp;
2835 2835          int i;
2836 2836  
2837 2837          for (i = 0; i < HASH_SIZE; i++) {
2838 2838                  mutex_enter(&flock_lock);
2839 2839                  gp = lock_graph[i];
2840 2840                  mutex_exit(&flock_lock);
2841 2841  
2842 2842                  if (gp == NULL)
2843 2843                          continue;
2844 2844  
2845 2845                  mutex_enter(&gp->gp_mutex);
2846 2846  
2847 2847                  /* signal sleeping requests so that they bail out */
2848 2848                  lock = SLEEPING_HEAD(gp)->l_next;
2849 2849                  while (lock != SLEEPING_HEAD(gp)) {
2850 2850                          nlock = lock->l_next;
2851 2851                          if (lock->l_vnode->v_vfsp == vfsp) {
2852 2852                                  ASSERT(IS_PXFS(lock));
2853 2853                                  if (GETPXFSID(lock->l_flock.l_sysid) ==
2854 2854                                      pxfsid) {
2855 2855                                          flk_set_state(lock,
2856 2856                                              FLK_CANCELLED_STATE);
2857 2857                                          flk_cancel_sleeping_lock(lock, 1);
2858 2858                                  }
2859 2859                          }
2860 2860                          lock = nlock;
2861 2861                  }
2862 2862  
2863 2863                  /* delete active locks */
2864 2864                  lock = ACTIVE_HEAD(gp)->l_next;
2865 2865                  while (lock != ACTIVE_HEAD(gp)) {
2866 2866                          nlock = lock->l_next;
2867 2867                          if (lock->l_vnode->v_vfsp == vfsp) {
2868 2868                                  ASSERT(IS_PXFS(lock));
2869 2869                                  if (GETPXFSID(lock->l_flock.l_sysid) ==
2870 2870                                      pxfsid) {
2871 2871                                          flk_delete_active_lock(lock, 0);
2872 2872                                          flk_wakeup(lock, 1);
2873 2873                                          flk_free_lock(lock);
2874 2874                                  }
2875 2875                          }
2876 2876                          lock = nlock;
2877 2877                  }
2878 2878                  mutex_exit(&gp->gp_mutex);
2879 2879          }
2880 2880  }
2881 2881  
2882 2882  /*
2883 2883   * Search for a sleeping lock manager lock which matches exactly this lock
2884 2884   * request; if one is found, fake a signal to cancel it.
2885 2885   *
2886 2886   * Return 1 if a matching lock was found, 0 otherwise.
2887 2887   */
2888 2888  
2889 2889  static int
2890 2890  flk_canceled(lock_descriptor_t *request)
2891 2891  {
2892 2892          lock_descriptor_t *lock, *nlock;
2893 2893          graph_t *gp = request->l_graph;
2894 2894          vnode_t *vp = request->l_vnode;
2895 2895  
2896 2896          ASSERT(MUTEX_HELD(&gp->gp_mutex));
2897 2897          ASSERT(IS_LOCKMGR(request));
2898 2898          SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2899 2899  
2900 2900          if (lock) {
2901 2901                  while (lock->l_vnode == vp) {
2902 2902                          nlock = lock->l_next;
2903 2903                          if (SAME_OWNER(lock, request) &&
2904 2904                              lock->l_start == request->l_start &&
2905 2905                              lock->l_end == request->l_end) {
2906 2906                                  INTERRUPT_WAKEUP(lock);
2907 2907                                  return (1);
2908 2908                          }
2909 2909                          lock = nlock;
2910 2910                  }
2911 2911          }
2912 2912          return (0);
2913 2913  }
2914 2914  
2915 2915  /*
2916 2916   * Remove all non-OFD locks for the vnode belonging to the given pid and sysid.
2917 2917   * Since OFD locks are pid-less, we can never match one against the incoming
2918 2918   * pid; OFD locks are instead removed earlier in the close() path via closef()
2919 2919   * and ofdcleanlock().
2920 2920   */
2921 2921  void
2922 2922  cleanlocks(vnode_t *vp, pid_t pid, int sysid)
2923 2923  {
2924 2924          graph_t *gp;
2925 2925          lock_descriptor_t *lock, *nlock;
2926 2926          lock_descriptor_t *link_stack;
2927 2927  
2928 2928          STACK_INIT(link_stack);
2929 2929  
2930 2930          gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2931 2931  
2932 2932          if (gp == NULL)
2933 2933                  return;
2934 2934          mutex_enter(&gp->gp_mutex);
2935 2935  
2936 2936          CHECK_SLEEPING_LOCKS(gp);
2937 2937          CHECK_ACTIVE_LOCKS(gp);
2938 2938  
2939 2939          SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2940 2940  
2941 2941          if (lock) {
2942 2942                  do {
2943 2943                          nlock = lock->l_next;
2944 2944                          if ((lock->l_flock.l_pid == pid ||
2945 2945                              pid == IGN_PID) &&
2946 2946                              lock->l_flock.l_sysid == sysid) {
2947 2947                                  CANCEL_WAKEUP(lock);
2948 2948                          }
2949 2949                          lock = nlock;
2950 2950                  } while (lock->l_vnode == vp);
2951 2951          }
2952 2952  
2953 2953          SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2954 2954  
2955 2955          if (lock) {
2956 2956                  do {
2957 2957                          nlock = lock->l_next;
2958 2958                          if ((lock->l_flock.l_pid == pid ||
2959 2959                              pid == IGN_PID) &&
2960 2960                              lock->l_flock.l_sysid == sysid) {
2961 2961                                  flk_delete_active_lock(lock, 0);
2962 2962                                  STACK_PUSH(link_stack, lock, l_stack);
2963 2963                          }
2964 2964                          lock = nlock;
2965 2965                  } while (lock->l_vnode == vp);
2966 2966          }
2967 2967  
2968 2968          while ((lock = STACK_TOP(link_stack)) != NULL) {
2969 2969                  STACK_POP(link_stack, l_stack);
2970 2970                  flk_wakeup(lock, 1);
2971 2971                  flk_free_lock(lock);
2972 2972          }
2973 2973  
2974 2974          CHECK_SLEEPING_LOCKS(gp);
2975 2975          CHECK_ACTIVE_LOCKS(gp);
2976 2976          CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2977 2977          mutex_exit(&gp->gp_mutex);
2978 2978  }
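
/*
 * Illustrative aside (a hedged sketch, not code from this file): cleanlocks()
 * unlinks matching active locks onto a private stack first and runs the
 * wakeup/free side effects only afterwards, so the side effects cannot
 * reshuffle the list mid-walk.  The same unlink-then-process shape on a
 * singly-linked list (node, clean_list and process are hypothetical names):
 */
#include <stddef.h>

struct node {
	struct node *next;
	int key;
};

static void
process(struct node *n)
{
	(void) n;			/* wakeup/free side effects go here */
}

static void
clean_list(struct node **headp, int key)
{
	struct node *n, **pp = headp, *stack = NULL;

	while ((n = *pp) != NULL) {
		if (n->key == key) {
			*pp = n->next;		/* unlink; do not advance pp */
			n->next = stack;	/* push onto the private stack */
			stack = n;
		} else {
			pp = &n->next;
		}
	}
	while ((n = stack) != NULL) {		/* now run the side effects */
		stack = n->next;
		process(n);
	}
}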
2979 2979  
2980 2980  
2981 2981  /*
2982 2982   * Called from 'fs' read and write routines for files that have mandatory
2983 2983   * locking enabled.
2984 2984   */
2985 2985  
2986 2986  int
2987 2987  chklock(
2988 2988          struct vnode    *vp,
2989 2989          int             iomode,
2990 2990          u_offset_t      offset,
2991 2991          ssize_t         len,
2992 2992          int             fmode,
2993 2993          caller_context_t *ct)
2994 2994  {
2995 2995          register int    i;
2996 2996          struct flock64  bf;
2997 2997          int             error = 0;
2998 2998  
2999 2999          bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
3000 3000          bf.l_whence = 0;
3001 3001          bf.l_start = offset;
3002 3002          bf.l_len = len;
3003 3003          if (ct == NULL) {
3004 3004                  bf.l_pid = curproc->p_pid;
3005 3005                  bf.l_sysid = 0;
3006 3006          } else {
3007 3007                  bf.l_pid = ct->cc_pid;
3008 3008                  bf.l_sysid = ct->cc_sysid;
3009 3009          }
3010 3010          i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
3011 3011          if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
3012 3012              bf.l_type != F_UNLCK)
3013 3013                  error = i ? i : EAGAIN;
3014 3014          return (error);
3015 3015  }
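
/*
 * Illustrative aside (a hedged kernel-side sketch, not code from this file):
 * a filesystem read routine gates its I/O on chklock() roughly as below.
 * MANDLOCK() is the real test used by filesystems such as ufs; the
 * surrounding function and VTOMYNODE() are hypothetical names:
 */
static int
myfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
    caller_context_t *ct)
{
	int error;

	if (MANDLOCK(vp, VTOMYNODE(vp)->m_mode)) {
		error = chklock(vp, FREAD, uiop->uio_loffset,
		    uiop->uio_resid, uiop->uio_fmode, ct);
		if (error != 0)
			return (error);
	}
	/* ... perform the actual read ... */
	return (0);
}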
3016 3016  
3017 3017  /*
3018 3018   * convoff - converts the given lock range (l_start, l_whence) so that
3019 3019   * its start is expressed relative to the given whence.
3020 3020   */
3021 3021  int
3022 3022  convoff(vp, lckdat, whence, offset)
3023 3023          struct vnode    *vp;
3024 3024          struct flock64  *lckdat;
3025 3025          int             whence;
3026 3026          offset_t        offset;
3027 3027  {
3028 3028          int             error;
3029 3029          struct vattr    vattr;
3030 3030  
3031 3031          if ((lckdat->l_whence == 2) || (whence == 2)) {
3032 3032                  vattr.va_mask = AT_SIZE;
3033 3033                  if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
3034 3034                          return (error);
3035 3035          }
3036 3036  
3037 3037          switch (lckdat->l_whence) {
3038 3038          case 1:
3039 3039                  lckdat->l_start += offset;
3040 3040                  break;
3041 3041          case 2:
3042 3042                  lckdat->l_start += vattr.va_size;
3043 3043                  /* FALLTHRU */
3044 3044          case 0:
3045 3045                  break;
3046 3046          default:
3047 3047                  return (EINVAL);
3048 3048          }
3049 3049  
3050 3050          if (lckdat->l_start < 0)
3051 3051                  return (EINVAL);
3052 3052  
3053 3053          switch (whence) {
3054 3054          case 1:
3055 3055                  lckdat->l_start -= offset;
3056 3056                  break;
3057 3057          case 2:
3058 3058                  lckdat->l_start -= vattr.va_size;
3059 3059                  /* FALLTHRU */
3060 3060          case 0:
3061 3061                  break;
3062 3062          default:
3063 3063                  return (EINVAL);
3064 3064          }
3065 3065  
3066 3066          lckdat->l_whence = (short)whence;
3067 3067          return (0);
3068 3068  }
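
/*
 * Illustrative aside (a worked example, not from this file): with file size
 * 1000, current offset 200, l_whence = 2 (SEEK_END) and l_start = -100, the
 * first switch above produces the absolute offset 1000 + (-100) = 900;
 * converting to whence = 1 (SEEK_CUR) then yields l_start = 900 - 200 = 700,
 * with l_whence set to 1.
 */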
3069 3069  
3070 3070  
3071 3071  /*      proc_graph function definitions */
3072 3072  
3073 3073  /*
3074 3074   * Check whether the new 'lock' introduces a deadlock.  If one is found,
3075 3075   * all of this lock's edges are freed and 1 is returned; otherwise 0.
3076 3076   */
3077 3077  
3078 3078  static int
3079 3079  flk_check_deadlock(lock_descriptor_t *lock)
3080 3080  {
3081 3081          proc_vertex_t   *start_vertex, *pvertex;
3082 3082          proc_vertex_t *dvertex;
3083 3083          proc_edge_t *pep, *ppep;
3084 3084          edge_t  *ep, *nep;
3085 3085          proc_vertex_t *process_stack;
3086 3086  
3087 3087          /*
3088 3088           * OFD style locks are not associated with any process so there is
3089 3089           * no proc graph for these. Thus we cannot, and do not, do deadlock
3090 3090           * detection.
3091 3091           */
3092 3092          if (lock->l_ofd != NULL)
3093 3093                  return (0);
3094 3094  
3095 3095          STACK_INIT(process_stack);
3096 3096  
3097 3097          mutex_enter(&flock_lock);
3098 3098          start_vertex = flk_get_proc_vertex(lock);
3099 3099          ASSERT(start_vertex != NULL);
3100 3100  
3101 3101          /* construct the edges from this process to other processes */
3102 3102  
3103 3103          ep = FIRST_ADJ(lock);
3104 3104          while (ep != HEAD(lock)) {
3105 3105                  proc_vertex_t *adj_proc;
3106 3106  
3107 3107                  adj_proc = flk_get_proc_vertex(ep->to_vertex);
3108 3108                  for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
3109 3109                          if (pep->to_proc == adj_proc) {
3110 3110                                  ASSERT(pep->refcount);
3111 3111                                  pep->refcount++;
3112 3112                                  break;
3113 3113                          }
3114 3114                  }
3115 3115                  if (pep == NULL) {
3116 3116                          pep = flk_get_proc_edge();
3117 3117                          pep->to_proc = adj_proc;
3118 3118                          pep->refcount = 1;
3119 3119                          adj_proc->incount++;
3120 3120                          pep->next = start_vertex->edge;
3121 3121                          start_vertex->edge = pep;
3122 3122                  }
3123 3123                  ep = NEXT_ADJ(ep);
3124 3124          }
3125 3125  
3126 3126          ep = FIRST_IN(lock);
3127 3127  
3128 3128          while (ep != HEAD(lock)) {
3129 3129                  proc_vertex_t *in_proc;
3130 3130  
3131 3131                  in_proc = flk_get_proc_vertex(ep->from_vertex);
3132 3132  
3133 3133                  for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
3134 3134                          if (pep->to_proc == start_vertex) {
3135 3135                                  ASSERT(pep->refcount);
3136 3136                                  pep->refcount++;
3137 3137                                  break;
3138 3138                          }
3139 3139                  }
3140 3140                  if (pep == NULL) {
3141 3141                          pep = flk_get_proc_edge();
3142 3142                          pep->to_proc = start_vertex;
3143 3143                          pep->refcount = 1;
3144 3144                          start_vertex->incount++;
3145 3145                          pep->next = in_proc->edge;
3146 3146                          in_proc->edge = pep;
3147 3147                  }
3148 3148                  ep = NEXT_IN(ep);
3149 3149          }
3150 3150  
3151 3151          if (start_vertex->incount == 0) {
3152 3152                  mutex_exit(&flock_lock);
3153 3153                  return (0);
3154 3154          }
3155 3155  
3156 3156          flk_proc_graph_uncolor();
3157 3157  
3158 3158          start_vertex->p_sedge = start_vertex->edge;
3159 3159  
3160 3160          STACK_PUSH(process_stack, start_vertex, p_stack);
3161 3161  
3162 3162          while ((pvertex = STACK_TOP(process_stack)) != NULL) {
3163 3163                  for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
3164 3164                          dvertex = pep->to_proc;
3165 3165                          if (!PROC_ARRIVED(dvertex)) {
3166 3166                                  STACK_PUSH(process_stack, dvertex, p_stack);
3167 3167                                  dvertex->p_sedge = dvertex->edge;
3168 3168                                  PROC_ARRIVE(pvertex);
3169 3169                                  pvertex->p_sedge = pep->next;
3170 3170                                  break;
3171 3171                          }
3172 3172                          if (!PROC_DEPARTED(dvertex))
3173 3173                                  goto deadlock;
3174 3174                  }
3175 3175                  if (pep == NULL) {
3176 3176                          PROC_DEPART(pvertex);
3177 3177                          STACK_POP(process_stack, p_stack);
3178 3178                  }
3179 3179          }
3180 3180          mutex_exit(&flock_lock);
3181 3181          return (0);
3182 3182  
3183 3183  deadlock:
3184 3184  
3185 3185          /* we remove all lock edges and proc edges */
3186 3186  
3187 3187          ep = FIRST_ADJ(lock);
3188 3188          while (ep != HEAD(lock)) {
3189 3189                  proc_vertex_t *adj_proc;
3190 3190                  adj_proc = flk_get_proc_vertex(ep->to_vertex);
3191 3191                  nep = NEXT_ADJ(ep);
3192 3192                  IN_LIST_REMOVE(ep);
3193 3193                  ADJ_LIST_REMOVE(ep);
3194 3194                  flk_free_edge(ep);
3195 3195                  ppep = start_vertex->edge;
3196 3196                  for (pep = start_vertex->edge; pep != NULL; ppep = pep,
3197 3197                      pep = ppep->next) {
3198 3198                          if (pep->to_proc == adj_proc) {
3199 3199                                  pep->refcount--;
3200 3200                                  if (pep->refcount == 0) {
3201 3201                                          if (pep == ppep) {
3202 3202                                                  start_vertex->edge = pep->next;
3203 3203                                          } else {
3204 3204                                                  ppep->next = pep->next;
3205 3205                                          }
3206 3206                                          adj_proc->incount--;
3207 3207                                          flk_proc_release(adj_proc);
3208 3208                                          flk_free_proc_edge(pep);
3209 3209                                  }
3210 3210                                  break;
3211 3211                          }
3212 3212                  }
3213 3213                  ep = nep;
3214 3214          }
3215 3215          ep = FIRST_IN(lock);
3216 3216          while (ep != HEAD(lock)) {
3217 3217                  proc_vertex_t *in_proc;
3218 3218                  in_proc = flk_get_proc_vertex(ep->from_vertex);
3219 3219                  nep = NEXT_IN(ep);
3220 3220                  IN_LIST_REMOVE(ep);
3221 3221                  ADJ_LIST_REMOVE(ep);
3222 3222                  flk_free_edge(ep);
3223 3223                  ppep = in_proc->edge;
3224 3224                  for (pep = in_proc->edge; pep != NULL; ppep = pep,
3225 3225                      pep = ppep->next) {
3226 3226                          if (pep->to_proc == start_vertex) {
3227 3227                                  pep->refcount--;
3228 3228                                  if (pep->refcount == 0) {
3229 3229                                          if (pep == ppep) {
3230 3230                                                  in_proc->edge = pep->next;
3231 3231                                          } else {
3232 3232                                                  ppep->next = pep->next;
3233 3233                                          }
3234 3234                                          start_vertex->incount--;
3235 3235                                          flk_proc_release(in_proc);
3236 3236                                          flk_free_proc_edge(pep);
3237 3237                                  }
3238 3238                                  break;
3239 3239                          }
3240 3240                  }
3241 3241                  ep = nep;
3242 3242          }
3243 3243          flk_proc_release(start_vertex);
3244 3244          mutex_exit(&flock_lock);
3245 3245          return (1);
3246 3246  }
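
/*
 * Illustrative aside (a hedged sketch, not code from this file): the deadlock
 * test above is an iterative depth-first search over the process graph, where
 * a vertex that has "arrived" but not yet "departed" is a gray vertex on the
 * current DFS path; reaching one again means a back edge, i.e. a cycle.  The
 * same test in its textbook recursive form (all names hypothetical):
 */
enum color { WHITE, GRAY, BLACK };

struct vertex {
	enum color v_color;
	int v_nedges;
	struct vertex **v_edges;
};

static int
has_cycle(struct vertex *v)
{
	int i;

	v->v_color = GRAY;			/* "arrived" */
	for (i = 0; i < v->v_nedges; i++) {
		struct vertex *w = v->v_edges[i];

		if (w->v_color == GRAY)		/* back edge: cycle found */
			return (1);
		if (w->v_color == WHITE && has_cycle(w))
			return (1);
	}
	v->v_color = BLACK;			/* "departed" */
	return (0);
}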
3247 3247  
3248 3248  /*
3249 3249   * Get a proc vertex.  If the lock's pvertex value indexes a vertex with the
3250 3250   * same owner we return that; otherwise we search the list and, failing a
3251 3251   * match, allocate a new vertex, growing the vertex list if necessary.
3252 3252   */
3253 3253  
3254 3254  static proc_vertex_t *
3255 3255  flk_get_proc_vertex(lock_descriptor_t *lock)
3256 3256  {
3257 3257          int i;
3258 3258          proc_vertex_t   *pv;
3259 3259          proc_vertex_t   **palloc;
3260 3260  
3261 3261          ASSERT(MUTEX_HELD(&flock_lock));
3262 3262          if (lock->pvertex != -1) {
3263 3263                  ASSERT(lock->pvertex >= 0);
3264 3264                  pv = pgraph.proc[lock->pvertex];
3265 3265                  if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
3266 3266                          return (pv);
3267 3267                  }
3268 3268          }
3269 3269          for (i = 0; i < pgraph.gcount; i++) {
3270 3270                  pv = pgraph.proc[i];
3271 3271                  if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
3272 3272                          lock->pvertex = pv->index = i;
3273 3273                          return (pv);
3274 3274                  }
3275 3275          }
3276 3276          pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
3277 3277          pv->pid = lock->l_flock.l_pid;
3278 3278          pv->sysid = lock->l_flock.l_sysid;
3279 3279          flk_proc_vertex_allocs++;
3280 3280          if (pgraph.free != 0) {
3281 3281                  for (i = 0; i < pgraph.gcount; i++) {
3282 3282                          if (pgraph.proc[i] == NULL) {
3283 3283                                  pgraph.proc[i] = pv;
3284 3284                                  lock->pvertex = pv->index = i;
3285 3285                                  pgraph.free--;
3286 3286                                  return (pv);
3287 3287                          }
3288 3288                  }
3289 3289          }
3290 3290          palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
3291 3291              sizeof (proc_vertex_t *), KM_SLEEP);
3292 3292  
3293 3293          if (pgraph.proc) {
3294 3294                  bcopy(pgraph.proc, palloc,
3295 3295                      pgraph.gcount * sizeof (proc_vertex_t *));
3296 3296  
3297 3297                  kmem_free(pgraph.proc,
3298 3298                      pgraph.gcount * sizeof (proc_vertex_t *));
3299 3299          }
3300 3300          pgraph.proc = palloc;
3301 3301          pgraph.free += (PROC_CHUNK - 1);
3302 3302          pv->index = lock->pvertex = pgraph.gcount;
3303 3303          pgraph.gcount += PROC_CHUNK;
3304 3304          pgraph.proc[pv->index] = pv;
3305 3305          return (pv);
3306 3306  }
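
/*
 * Illustrative aside (a hedged sketch, not code from this file): the vertex
 * table above grows by PROC_CHUNK slots at a time, copying the old pointer
 * array into the new one.  The same chunked-growth pattern in a minimal
 * user-level form (grow_table and CHUNK are hypothetical names):
 */
#include <stdlib.h>
#include <string.h>

#define	CHUNK	8

static void **
grow_table(void **tab, int *countp)
{
	void **ntab = calloc(*countp + CHUNK, sizeof (void *));

	if (ntab == NULL)
		return (NULL);
	if (tab != NULL) {
		(void) memcpy(ntab, tab, *countp * sizeof (void *));
		free(tab);
	}
	*countp += CHUNK;
	return (ntab);
}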
3307 3307  
3308 3308  /*
3309 3309   * Allocate a proc edge.
3310 3310   */
3311 3311  
3312 3312  static proc_edge_t *
3313 3313  flk_get_proc_edge()
3314 3314  {
3315 3315          proc_edge_t *pep;
3316 3316  
3317 3317          pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
3318 3318          flk_proc_edge_allocs++;
3319 3319          return (pep);
3320 3320  }
3321 3321  
3322 3322  /*
3323 3323   * Free the proc edge. Called whenever its reference count goes to zero.
3324 3324   */
3325 3325  
3326 3326  static void
3327 3327  flk_free_proc_edge(proc_edge_t *pep)
3328 3328  {
3329 3329          ASSERT(pep->refcount == 0);
3330 3330          kmem_free((void *)pep, sizeof (proc_edge_t));
3331 3331          flk_proc_edge_frees++;
3332 3332  }
3333 3333  
3334 3334  /*
3335 3335   * Uncolor the graph.  An explicit recoloring pass is done only when the
3335 3335   * mark value hits its maximum; otherwise the mark is simply incremented.
3336 3336   */
3337 3337  
3338 3338  static void
3339 3339  flk_proc_graph_uncolor()
3340 3340  {
3341 3341          int i;
3342 3342  
3343 3343          if (pgraph.mark == UINT_MAX) {
3344 3344                  for (i = 0; i < pgraph.gcount; i++)
3345 3345                          if (pgraph.proc[i] != NULL) {
3346 3346                                  pgraph.proc[i]->atime = 0;
3347 3347                                  pgraph.proc[i]->dtime = 0;
3348 3348                          }
3349 3349                  pgraph.mark = 1;
3350 3350          } else {
3351 3351                  pgraph.mark++;
3352 3352          }
3353 3353  }
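
/*
 * Illustrative aside (a hedged sketch, not code from this file): instead of
 * clearing every vertex's marks before each search, the code bumps a global
 * mark and treats "time == mark" as visited, paying for a full clearing pass
 * only when the counter would wrap.  The same generation-counter idiom in
 * miniature (NVERTS and the other names are hypothetical):
 */
#include <limits.h>

#define	NVERTS	64

static unsigned int mark = 1;
static unsigned int vtime[NVERTS];	/* all zero, hence all unvisited */

#define	VISITED(i)	(vtime[(i)] == mark)
#define	VISIT(i)	(vtime[(i)] = mark)

static void
new_search(void)
{
	int i;

	if (mark == UINT_MAX) {		/* wrapped: do the expensive clear */
		for (i = 0; i < NVERTS; i++)
			vtime[i] = 0;
		mark = 1;
	} else {
		mark++;			/* everything becomes unvisited */
	}
}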
3354 3354  
3355 3355  /*
3356 3356   * Release the proc vertex iff it has neither incoming nor outgoing edges.
3357 3357   */
3358 3358  
3359 3359  static void
3360 3360  flk_proc_release(proc_vertex_t *proc)
3361 3361  {
3362 3362          ASSERT(MUTEX_HELD(&flock_lock));
3363 3363          if (proc->edge == NULL && proc->incount == 0) {
3364 3364                  pgraph.proc[proc->index] = NULL;
3365 3365                  pgraph.free++;
3366 3366                  kmem_free(proc, sizeof (proc_vertex_t));
3367 3367                  flk_proc_vertex_frees++;
3368 3368          }
3369 3369  }
3370 3370  
3371 3371  /*
3372 3372   * Update the process graph to reflect a change in a lock graph.
3373 3373   * Note: this function should be called only after the lock graph has been
3374 3374   * correctly recomputed; otherwise a deadlock might be missed.  E.g.,
3375 3375   * flk_relation() calls this function after flk_recompute_dependencies();
3376 3376   * otherwise a process trying to lock a vnode hashed into another graph
3377 3377   * might sleep forever.
3378 3378   */
3379 3379  
3380 3380  static void
3381 3381  flk_update_proc_graph(edge_t *ep, int delete)
3382 3382  {
3383 3383          proc_vertex_t *toproc, *fromproc;
3384 3384          proc_edge_t *pep, *prevpep;
3385 3385  
3386 3386          mutex_enter(&flock_lock);
3387 3387  
3388 3388          /*
3389 3389           * OFD style locks are not associated with any process so there is
3390 3390           * no proc graph for these.
3391 3391           */
3392 3392          if (ep->from_vertex->l_ofd != NULL) {
3393 3393                  mutex_exit(&flock_lock);
3394 3394                  return;
3395 3395          }
3396 3396  
3397 3397          toproc = flk_get_proc_vertex(ep->to_vertex);
3398 3398          fromproc = flk_get_proc_vertex(ep->from_vertex);
3399 3399  
3400 3400          if (!delete)
3401 3401                  goto add;
3402 3402          pep = prevpep = fromproc->edge;
3403 3403  
3404 3404          ASSERT(pep != NULL);
3405 3405          while (pep != NULL) {
3406 3406                  if (pep->to_proc == toproc) {
3407 3407                          ASSERT(pep->refcount > 0);
3408 3408                          pep->refcount--;
3409 3409                          if (pep->refcount == 0) {
3410 3410                                  if (pep == prevpep) {
3411 3411                                          fromproc->edge = pep->next;
3412 3412                                  } else {
3413 3413                                          prevpep->next = pep->next;
3414 3414                                  }
3415 3415                                  toproc->incount--;
3416 3416                                  flk_proc_release(toproc);
3417 3417                                  flk_free_proc_edge(pep);
3418 3418                          }
3419 3419                          break;
3420 3420                  }
3421 3421                  prevpep = pep;
3422 3422                  pep = pep->next;
3423 3423          }
3424 3424          flk_proc_release(fromproc);
3425 3425          mutex_exit(&flock_lock);
3426 3426          return;
3427 3427  add:
3428 3428  
3429 3429          pep = fromproc->edge;
3430 3430  
3431 3431          while (pep != NULL) {
3432 3432                  if (pep->to_proc == toproc) {
3433 3433                          ASSERT(pep->refcount > 0);
3434 3434                          pep->refcount++;
3435 3435                          break;
3436 3436                  }
3437 3437                  pep = pep->next;
3438 3438          }
3439 3439          if (pep == NULL) {
3440 3440                  pep = flk_get_proc_edge();
3441 3441                  pep->to_proc = toproc;
3442 3442                  pep->refcount = 1;
3443 3443                  toproc->incount++;
3444 3444                  pep->next = fromproc->edge;
3445 3445                  fromproc->edge = pep;
3446 3446          }
3447 3447          mutex_exit(&flock_lock);
3448 3448  }
3449 3449  
3450 3450  /*
3451 3451   * Set the control status for lock manager requests.
3453 3453   */
3454 3454  
3455 3455  /*
3456 3456   * PSARC case 1997/292
3457 3457   *
3458 3458   * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3459 3459   * Effects: Set the state of the NLM server identified by "nlmid"
3460 3460   *   in the NLM registry to state "nlm_state."
3461 3461   *   Raises exception no_such_nlm if "nlmid" doesn't identify a known
3462 3462   *   NLM server to this LLM.
3463 3463   *   Note that when this routine is called with NLM_SHUTTING_DOWN there
3464 3464   *   may be locks requests that have gotten started but not finished.  In
3465 3465   *   particular, there may be blocking requests that are in the callback code
3466 3466   *   before sleeping (so they're not holding the lock for the graph).  If
3467 3467   *   such a thread reacquires the graph's lock (to go to sleep) after
3468 3468   *   NLM state in the NLM registry is set to a non-up value,
3469 3469   *   it will notice the status and bail out.  If the request gets
3470 3470   *   granted before the thread can check the NLM registry, let it
3471 3471   *   continue normally.  It will get flushed when we are called with NLM_DOWN.
3472 3472   *
3473 3473   * Modifies: nlm_reg_obj (global)
3474 3474   * Arguments:
3475 3475   *    nlmid     (IN):    id uniquely identifying an NLM server
3476 3476   *    nlm_state (IN):    NLM server state to change "nlmid" to
3477 3477   */
3478 3478  void
3479 3479  cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3480 3480  {
3481 3481          /*
3482 3482           * Check to see if node is booted as a cluster. If not, return.
3483 3483           */
3484 3484          if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3485 3485                  return;
3486 3486          }
3487 3487  
3488 3488          /*
3489 3489           * Check for development/debugging.  It is possible to boot a node
3490 3490           * in non-cluster mode, and then run a special script, currently
3491 3491           * available only to developers, to bring up the node as part of a
3492 3492           * cluster.  The problem is that running such a script does not
3493 3493           * result in the routine flk_init() being called and hence global array
3494 3494           * nlm_reg_status is NULL.  The NLM thinks it's in cluster mode,
3495 3495           * but the LLM needs to do an additional check to see if the global
3496 3496           * array has been created or not. If nlm_reg_status is NULL, then
3497 3497           * return, else continue.
3498 3498           */
3499 3499          if (nlm_reg_status == NULL) {
3500 3500                  return;
3501 3501          }
3502 3502  
3503 3503          ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3504 3504          mutex_enter(&nlm_reg_lock);
3505 3505  
3506 3506          if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3507 3507                  /*
3508 3508                   * If the NLM server "nlmid" is unknown in the NLM registry,
3509 3509                   * add it to the registry in the nlm shutting down state.
3510 3510                   */
3511 3511                  FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3512 3512                      FLK_NLM_SHUTTING_DOWN);
3513 3513          } else {
3514 3514                  /*
3515 3515                   * Change the state of the NLM server identified by "nlmid"
3516 3516                   * in the NLM registry to the argument "nlm_state."
3517 3517                   */
3518 3518                  FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3519 3519                      nlm_state);
3520 3520          }
3521 3521  
3522 3522          /*
3523 3523           *  The reason we must register the NLM server that is shutting down
3524 3524           *  with an LLM that doesn't already know about it (never sent a lock
3525 3525           *  request) is to handle correctly a race between shutdown and a new
3526 3526           *  lock request.  Suppose that a shutdown request from the NLM server
3527 3527           *  invokes this routine at the LLM, and a thread is spawned to
3528 3528           *  service the request. Now suppose a new lock request is in
3529 3529           *  progress and has already passed the first line of defense in
3530 3530   *  reclock(), which denies new lock requests from NLM servers
3531 3531           *  that are not in the NLM_UP state.  After the current routine
3532 3532           *  is invoked for both phases of shutdown, the routine will return,
3533 3533           *  having done nothing, and the lock request will proceed and
3534 3534           *  probably be granted.  The problem is that the shutdown was ignored
3535 3535           *  by the lock request because there was no record of that NLM server
3536 3536   *  shutting down.  We will be in the peculiar position of thinking
3537 3537   *  that we've shut down the NLM server and all locks at all LLMs have
3538 3538           *  been discarded, but in fact there's still one lock held.
3539 3539           *  The solution is to record the existence of NLM server and change
3540 3540           *  its state immediately to NLM_SHUTTING_DOWN.  The lock request in
3541 3541           *  progress may proceed because the next phase NLM_DOWN will catch
3542 3542           *  this lock and discard it.
3543 3543           */
3544 3544          mutex_exit(&nlm_reg_lock);
3545 3545  
3546 3546          switch (nlm_state) {
3547 3547          case FLK_NLM_UP:
3548 3548                  /*
3549 3549                   * Change the NLM state of all locks still held on behalf of
3550 3550                   * the NLM server identified by "nlmid" to NLM_UP.
3551 3551                   */
3552 3552                  cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3553 3553                  break;
3554 3554  
3555 3555          case FLK_NLM_SHUTTING_DOWN:
3556 3556                  /*
3557 3557                   * Wake up all sleeping locks for the NLM server identified
3558 3558                   * by "nlmid." Note that eventually all woken threads will
3559 3559                   * have their lock requests cancelled and descriptors
3560 3560                   * removed from the sleeping lock list.  Note that the NLM
3561 3561                   * server state associated with each lock descriptor is
3562 3562                   * changed to FLK_NLM_SHUTTING_DOWN.
3563 3563                   */
3564 3564                  cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3565 3565                  break;
3566 3566  
3567 3567          case FLK_NLM_DOWN:
3568 3568                  /*
3569 3569                   * Discard all active, granted locks for this NLM server
3570 3570                   * identified by "nlmid."
3571 3571                   */
3572 3572                  cl_flk_unlock_nlm_granted(nlmid);
3573 3573                  break;
3574 3574  
3575 3575          default:
3576 3576                  panic("cl_set_nlm_status: bad status (%d)", nlm_state);
3577 3577          }
3578 3578  }
3579 3579  
3580 3580  /*
3581 3581   * Set the control status for lock manager requests.
3582 3582   *
3583 3583   * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3584 3584   * may be lock requests that have gotten started but not finished.  In
3585 3585   * particular, there may be blocking requests that are in the callback code
3586 3586   * before sleeping (so they're not holding the lock for the graph).  If
3587 3587   * such a thread reacquires the graph's lock (to go to sleep) after
3588 3588   * flk_lockmgr_status is set to a non-up value, it will notice the status
3589 3589   * and bail out.  If the request gets granted before the thread can check
3590 3590   * flk_lockmgr_status, let it continue normally.  It will get flushed when
3591 3591   * we are called with FLK_LOCKMGR_DOWN.
3592 3592   */
3593 3593  
3594 3594  void
3595 3595  flk_set_lockmgr_status(flk_lockmgr_status_t status)
3596 3596  {
3597 3597          int i;
3598 3598          graph_t *gp;
3599 3599          struct flock_globals *fg;
3600 3600  
3601 3601          fg = flk_get_globals();
3602 3602          ASSERT(fg != NULL);
3603 3603  
3604 3604          mutex_enter(&flock_lock);
3605 3605          fg->flk_lockmgr_status = status;
3606 3606          mutex_exit(&flock_lock);
3607 3607  
3608 3608          /*
3609 3609           * If the lock manager is coming back up, all that's needed is to
3610 3610           * propagate this information to the graphs.  If the lock manager
3611 3611           * is going down, additional action is required, and each graph's
3612 3612           * copy of the state is updated atomically with this other action.
3613 3613           */
3614 3614          switch (status) {
3615 3615          case FLK_LOCKMGR_UP:
3616 3616                  for (i = 0; i < HASH_SIZE; i++) {
3617 3617                          mutex_enter(&flock_lock);
3618 3618                          gp = lock_graph[i];
3619 3619                          mutex_exit(&flock_lock);
3620 3620                          if (gp == NULL)
3621 3621                                  continue;
3622 3622                          mutex_enter(&gp->gp_mutex);
3623 3623                          fg->lockmgr_status[i] = status;
3624 3624                          mutex_exit(&gp->gp_mutex);
3625 3625                  }
3626 3626                  break;
3627 3627          case FLK_WAKEUP_SLEEPERS:
3628 3628                  wakeup_sleeping_lockmgr_locks(fg);
3629 3629                  break;
3630 3630          case FLK_LOCKMGR_DOWN:
3631 3631                  unlock_lockmgr_granted(fg);
3632 3632                  break;
3633 3633          default:
3634 3634                  panic("flk_set_lockmgr_status: bad status (%d)", status);
3635 3635                  break;
3636 3636          }
3637 3637  }
3638 3638  
3639 3639  /*
3640 3640   * This routine returns all the locks that are active or sleeping and are
3641 3641   * associated with a particular set of identifiers.  If lock_state != 0, then
3642 3642   * only locks that match the lock_state are returned. If lock_state == 0, then
3643 3643   * all locks are returned. If pid == NOPID, the pid is ignored.  If
3644 3644   * use_sysid is FALSE, then the sysid is ignored.  If vp is NULL, then the
3645 3645   * vnode pointer is ignored.
3646 3646   *
3647 3647   * A list containing the vnode pointer and an flock structure
3648 3648   * describing the lock is returned.  Each element in the list is
3649 3649   * dynamically allocated and must be freed by the caller.  The
3650 3650   * last item in the list is denoted by a NULL value in the ll_next
3651 3651   * field.
3652 3652   *
3653 3653   * The vnode pointers returned are held.  The caller is responsible
3654 3654   * for releasing these.  Note that the returned list is only a snapshot of
3655 3655   * the current lock information, and that it is a snapshot of a moving
3656 3656   * target (only one graph is locked at a time).
3657 3657   */
3658 3658  
3659 3659  locklist_t *
3660 3660  get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3661 3661                  pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3662 3662  {
3663 3663          lock_descriptor_t       *lock;
3664 3664          lock_descriptor_t       *graph_head;
3665 3665          locklist_t              listhead;
3666 3666          locklist_t              *llheadp;
3667 3667          locklist_t              *llp;
3668 3668          locklist_t              *lltp;
3669 3669          graph_t                 *gp;
3670 3670          int                     i;
3671 3671          int                     first_index; /* graph index */
3672 3672          int                     num_indexes; /* graph index */
3673 3673  
3674 3674          ASSERT((list_type == FLK_ACTIVE_STATE) ||
3675 3675              (list_type == FLK_SLEEPING_STATE));
3676 3676  
3677 3677          /*
3678 3678           * Get a pointer to something to use as a list head while building
3679 3679           * the rest of the list.
3680 3680           */
3681 3681          llheadp = &listhead;
3682 3682          lltp = llheadp;
3683 3683          llheadp->ll_next = (locklist_t *)NULL;
3684 3684  
3685 3685          /* Figure out which graphs we want to look at. */
3686 3686          if (vp == NULL) {
3687 3687                  first_index = 0;
3688 3688                  num_indexes = HASH_SIZE;
3689 3689          } else {
3690 3690                  first_index = HASH_INDEX(vp);
3691 3691                  num_indexes = 1;
3692 3692          }
3693 3693  
3694 3694          for (i = first_index; i < first_index + num_indexes; i++) {
3695 3695                  mutex_enter(&flock_lock);
3696 3696                  gp = lock_graph[i];
3697 3697                  mutex_exit(&flock_lock);
3698 3698                  if (gp == NULL) {
3699 3699                          continue;
3700 3700                  }
3701 3701  
3702 3702                  mutex_enter(&gp->gp_mutex);
3703 3703                  graph_head = (list_type == FLK_ACTIVE_STATE) ?
3704 3704                      ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3705 3705                  for (lock = graph_head->l_next;
3706 3706                      lock != graph_head;
3707 3707                      lock = lock->l_next) {
3708 3708                          if (use_sysid && lock->l_flock.l_sysid != sysid)
3709 3709                                  continue;
3710 3710                          if (pid != NOPID && lock->l_flock.l_pid != pid)
3711 3711                                  continue;
3712 3712                          if (vp != NULL && lock->l_vnode != vp)
3713 3713                                  continue;
3714 3714                          if (lock_state && !(lock_state & lock->l_state))
3715 3715                                  continue;
3716 3716                          if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3717 3717                                  continue;
3718 3718                          /*
3719 3719                           * A matching lock was found.  Allocate
3720 3720                           * space for a new locklist entry and fill
3721 3721                           * it in.
3722 3722                           */
3723 3723                          llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3724 3724                          lltp->ll_next = llp;
3725 3725                          VN_HOLD(lock->l_vnode);
3726 3726                          llp->ll_vp = lock->l_vnode;
3727 3727                          create_flock(lock, &(llp->ll_flock));
3728 3728                          llp->ll_next = (locklist_t *)NULL;
3729 3729                          lltp = llp;
3730 3730                  }
3731 3731                  mutex_exit(&gp->gp_mutex);
3732 3732          }
3733 3733  
3734 3734          llp = llheadp->ll_next;
3735 3735          return (llp);
3736 3736  }
3737 3737  
3738 3738  /*
3739 3739   * These two functions are simply interfaces to get_lock_list.  They return
3740 3740   * a list of sleeping or active locks for the given sysid and pid.  See
3741 3741   * get_lock_list for details.
3742 3742   *
3743 3743   * In either case we don't particularly care to specify the zone of interest;
3744 3744   * the sysid-space is global across zones, so the sysid will map to exactly one
3745 3745   * zone, and we'll return information for that zone.
3746 3746   */
3747 3747  
3748 3748  locklist_t *
3749 3749  flk_get_sleeping_locks(int sysid, pid_t pid)
3750 3750  {
3751 3751          return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
3752 3752              ALL_ZONES));
3753 3753  }
3754 3754  
3755 3755  locklist_t *
3756 3756  flk_get_active_locks(int sysid, pid_t pid)
3757 3757  {
3758 3758          return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
3759 3759              ALL_ZONES));
3760 3760  }
3761 3761  
3762 3762  /*
3763 3763   * Another interface to get_lock_list.  This one returns all the active
3764 3764   * locks for a given vnode.  Again, see get_lock_list for details.
3765 3765   *
3766 3766   * We don't need to specify which zone's locks we're interested in.  The matter
3767 3767   * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
3768 3768   * be used by multiple zones, so the list of locks will all be from the right
3769 3769   * zone.
3770 3770   */
3771 3771  
3772 3772  locklist_t *
3773 3773  flk_active_locks_for_vp(const vnode_t *vp)
3774 3774  {
3775 3775          return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
3776 3776              ALL_ZONES));
3777 3777  }
3778 3778  
3779 3779  /*
3780 3780   * Another interface to get_lock_list.  This one returns all the active
3781 3781   * nbmand locks for a given vnode.  Again, see get_lock_list for details.
3782 3782   *
3783 3783   * See the comment for flk_active_locks_for_vp() for why we don't care to
3784 3784   * specify the particular zone of interest.
3785 3785   */
3786 3786  locklist_t *
3787 3787  flk_active_nbmand_locks_for_vp(const vnode_t *vp)
3788 3788  {
3789 3789          return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3790 3790              NOPID, vp, ALL_ZONES));
3791 3791  }
3792 3792  
3793 3793  /*
3794 3794   * Another interface to get_lock_list.  This one returns all the active
3795 3795   * nbmand locks for a given pid.  Again, see get_lock_list for details.
3796 3796   *
3797 3797   * The zone doesn't need to be specified here; the locks held by a
3798 3798   * particular process will either be local (ie, non-NFS) or from the zone
3799 3799   * the process is executing in.  This is because other parts of the system
3800 3800   * ensure that an NFS vnode can't be used in a zone other than that in
3801 3801   * which it was opened.
3802 3802   */
3803 3803  locklist_t *
3804 3804  flk_active_nbmand_locks(pid_t pid)
3805 3805  {
3806 3806          return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3807 3807              pid, NULL, ALL_ZONES));
3808 3808  }
3809 3809  
3810 3810  /*
3811 3811   * Free up all entries in the locklist.
3812 3812   */
3813 3813  void
3814 3814  flk_free_locklist(locklist_t *llp)
3815 3815  {
3816 3816          locklist_t *next_llp;
3817 3817  
3818 3818          while (llp) {
3819 3819                  next_llp = llp->ll_next;
3820 3820                  VN_RELE(llp->ll_vp);
3821 3821                  kmem_free(llp, sizeof (*llp));
3822 3822                  llp = next_llp;
3823 3823          }
3824 3824  }
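
/*
 * [Editor's sketch, not part of flock.c]  To make the locklist contract
 * above concrete, a caller might consume a snapshot like this.  The
 * hypothetical print_active_locks() uses only interfaces shown in this
 * file (flk_get_active_locks(), flk_free_locklist()) and the locklist_t
 * fields ll_vp, ll_flock and ll_next.
 */
static void
print_active_locks(int sysid, pid_t pid)
{
        locklist_t *head, *llp;

        /* Snapshot of the active locks; each entry holds its vnode. */
        head = flk_get_active_locks(sysid, pid);
        for (llp = head; llp != NULL; llp = llp->ll_next) {
                cmn_err(CE_NOTE, "lock on vp %p: start %lld len %lld",
                    (void *)llp->ll_vp,
                    (longlong_t)llp->ll_flock.l_start,
                    (longlong_t)llp->ll_flock.l_len);
        }

        /* Drops the vnode holds and frees every element. */
        flk_free_locklist(head);
}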
3825 3825  
3826 3826  static void
3827 3827  cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
3828 3828  {
3829 3829          /*
3830 3830           * For each graph "lg" in the hash table lock_graph do
3831 3831           * a.  Get the list of sleeping locks
3832 3832           * b.  For each lock descriptor in the list do
3833 3833           *      i.   If the requested lock is an NLM server request AND
3834 3834           *              the nlmid is the same as the routine argument then
3835 3835           *              change the lock descriptor's state field to
3836 3836           *              "nlm_state."
3837 3837           * c.  Get the list of active locks
3838 3838           * d.  For each lock descriptor in the list do
3839 3839           *      i.   If the requested lock is an NLM server request AND
3840 3840           *              the nlmid is the same as the routine argument then
3841 3841           *              change the lock descriptor's state field to
3842 3842           *              "nlm_state."
3843 3843           */
3844 3844  
3845 3845          int                     i;
3846 3846          graph_t                 *gp;                    /* lock graph */
3847 3847          lock_descriptor_t       *lock;                  /* lock */
3848 3848          lock_descriptor_t       *nlock = NULL;          /* next lock */
3849 3849          int                     lock_nlmid;
3850 3850  
3851 3851          for (i = 0; i < HASH_SIZE; i++) {
3852 3852                  mutex_enter(&flock_lock);
3853 3853                  gp = lock_graph[i];
3854 3854                  mutex_exit(&flock_lock);
3855 3855                  if (gp == NULL) {
3856 3856                          continue;
3857 3857                  }
3858 3858  
3859 3859                  /* Get list of sleeping locks in current lock graph. */
3860 3860                  mutex_enter(&gp->gp_mutex);
3861 3861                  for (lock = SLEEPING_HEAD(gp)->l_next;
3862 3862                      lock != SLEEPING_HEAD(gp);
3863 3863                      lock = nlock) {
3864 3864                          nlock = lock->l_next;
3865 3865                          /* get NLM id */
3866 3866                          lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3867 3867  
3868 3868                          /*
3869 3869                           * If NLM server request AND nlmid of lock matches
3870 3870                           * nlmid of argument, then set the NLM state of the
3871 3871                           * lock to "nlm_state."
3872 3872                           */
3873 3873                          if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3874 3874                                  SET_NLM_STATE(lock, nlm_state);
3875 3875                          }
3876 3876                  }
3877 3877  
3878 3878                  /* Get list of active locks in current lock graph. */
3879 3879                  for (lock = ACTIVE_HEAD(gp)->l_next;
3880 3880                      lock != ACTIVE_HEAD(gp);
3881 3881                      lock = nlock) {
3882 3882                          nlock = lock->l_next;
3883 3883                          /* get NLM id */
3884 3884                          lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3885 3885  
3886 3886                          /*
3887 3887                           * If NLM server request AND nlmid of lock matches
3888 3888                           * nlmid of argument, then set the NLM state of the
3889 3889                           * lock to "nlm_state."
3890 3890                           */
3891 3891                          if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3892 3892                                  ASSERT(IS_ACTIVE(lock));
3893 3893                                  SET_NLM_STATE(lock, nlm_state);
3894 3894                          }
3895 3895                  }
3896 3896                  mutex_exit(&gp->gp_mutex);
3897 3897          }
3898 3898  }
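
/*
 * [Editor's sketch, not part of flock.c]  Both passes above key off
 * GETNLMID(l_sysid).  The presumption (sys/flock_impl.h has the
 * authoritative encoding) is that a cluster lock's 32-bit sysid carries
 * the issuing node's NLM id in its high bits, so that something like
 * the following recovers it; the 16/16 bit split is an assumption.
 */
static int
sketch_nlmid_of(int sysid)
{
        /* assumed layout: [ NLM id | per-node sysid ] */
        return ((int)((uint_t)sysid >> 16));
}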
3899 3899  
3900 3900  /*
3901 3901   * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
3902 3902   * Effects: Find all sleeping lock manager requests _only_ for the NLM server
3903 3903   *   identified by "nlmid." Poke those lock requests.
3904 3904   */
3905 3905  static void
3906 3906  cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
3907 3907  {
3908 3908          lock_descriptor_t *lock;
3909 3909          lock_descriptor_t *nlock = NULL; /* next lock */
3910 3910          int i;
3911 3911          graph_t *gp;
3912 3912          int     lock_nlmid;
3913 3913  
3914 3914          for (i = 0; i < HASH_SIZE; i++) {
3915 3915                  mutex_enter(&flock_lock);
3916 3916                  gp = lock_graph[i];
3917 3917                  mutex_exit(&flock_lock);
3918 3918                  if (gp == NULL) {
3919 3919                          continue;
3920 3920                  }
3921 3921  
3922 3922                  mutex_enter(&gp->gp_mutex);
3923 3923                  for (lock = SLEEPING_HEAD(gp)->l_next;
3924 3924                      lock != SLEEPING_HEAD(gp);
3925 3925                      lock = nlock) {
3926 3926                          nlock = lock->l_next;
3927 3927                          /*
3928 3928                           * If NLM server request _and_ nlmid of lock matches
3929 3929                           * nlmid of argument, then set the NLM state of the
3930 3930                           * lock to NLM_SHUTTING_DOWN, and wake up sleeping
3931 3931                           * request.
3932 3932                           */
3933 3933                          if (IS_LOCKMGR(lock)) {
3934 3934                                  /* get NLM id */
3935 3935                                  lock_nlmid =
3936 3936                                      GETNLMID(lock->l_flock.l_sysid);
3937 3937                                  if (nlmid == lock_nlmid) {
3938 3938                                          SET_NLM_STATE(lock,
3939 3939                                              FLK_NLM_SHUTTING_DOWN);
3940 3940                                          INTERRUPT_WAKEUP(lock);
3941 3941                                  }
3942 3942                          }
3943 3943                  }
3944 3944                  mutex_exit(&gp->gp_mutex);
3945 3945          }
3946 3946  }
3947 3947  
3948 3948  /*
3949 3949   * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
3950 3950   * Effects:  Find all active (granted) lock manager locks _only_ for the
3951 3951   *   NLM server identified by "nlmid" and release them.
3952 3952   */
3953 3953  static void
3954 3954  cl_flk_unlock_nlm_granted(int nlmid)
3955 3955  {
3956 3956          lock_descriptor_t *lock;
3957 3957          lock_descriptor_t *nlock = NULL; /* next lock */
3958 3958          int i;
3959 3959          graph_t *gp;
3960 3960          int     lock_nlmid;
3961 3961  
3962 3962          for (i = 0; i < HASH_SIZE; i++) {
3963 3963                  mutex_enter(&flock_lock);
3964 3964                  gp = lock_graph[i];
3965 3965                  mutex_exit(&flock_lock);
3966 3966                  if (gp == NULL) {
3967 3967                          continue;
3968 3968                  }
3969 3969  
3970 3970                  mutex_enter(&gp->gp_mutex);
3971 3971                  for (lock = ACTIVE_HEAD(gp)->l_next;
3972 3972                      lock != ACTIVE_HEAD(gp);
3973 3973                      lock = nlock) {
3974 3974                          nlock = lock->l_next;
3975 3975                          ASSERT(IS_ACTIVE(lock));
3976 3976  
3977 3977                          /*
3978 3978                           * If it's an NLM server request _and_ the nlmid of
3979 3979                           * the lock matches the nlmid argument, then remove
3980 3980                           * the active lock from the list, wake up blocked
3981 3981                           * threads, and free the storage for the lock.
3982 3982                           * Note that there's no need to set the NLM state
3983 3983                           * of this lock to FLK_NLM_DOWN because the lock
3984 3984                           * will be deleted anyway and its storage freed.
3985 3985                           */
3986 3986                          if (IS_LOCKMGR(lock)) {
3987 3987                                  /* get NLM id */
3988 3988                                  lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3989 3989                                  if (nlmid == lock_nlmid) {
3990 3990                                          flk_delete_active_lock(lock, 0);
3991 3991                                          flk_wakeup(lock, 1);
3992 3992                                          flk_free_lock(lock);
3993 3993                                  }
3994 3994                          }
3995 3995                  }
3996 3996                  mutex_exit(&gp->gp_mutex);
3997 3997          }
3998 3998  }
3999 3999  
4000 4000  /*
4001 4001   * Find all sleeping lock manager requests and poke them.
4002 4002   */
4003 4003  static void
4004 4004  wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
4005 4005  {
4006 4006          lock_descriptor_t *lock;
4007 4007          lock_descriptor_t *nlock = NULL; /* next lock */
4008 4008          int i;
4009 4009          graph_t *gp;
4010 4010          zoneid_t zoneid = getzoneid();
4011 4011  
4012 4012          for (i = 0; i < HASH_SIZE; i++) {
4013 4013                  mutex_enter(&flock_lock);
4014 4014                  gp = lock_graph[i];
4015 4015                  mutex_exit(&flock_lock);
4016 4016                  if (gp == NULL) {
4017 4017                          continue;
4018 4018                  }
4019 4019  
4020 4020                  mutex_enter(&gp->gp_mutex);
4021 4021                  fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
4022 4022                  for (lock = SLEEPING_HEAD(gp)->l_next;
4023 4023                      lock != SLEEPING_HEAD(gp);
4024 4024                      lock = nlock) {
4025 4025                          nlock = lock->l_next;
4026 4026                          if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
4027 4027                                  INTERRUPT_WAKEUP(lock);
4028 4028                          }
4029 4029                  }
4030 4030                  mutex_exit(&gp->gp_mutex);
4031 4031          }
4032 4032  }
4033 4033  
4034 4034  
4035 4035  /*
4036 4036   * Find all active (granted) lock manager locks and release them.
4037 4037   */
4038 4038  static void
4039 4039  unlock_lockmgr_granted(struct flock_globals *fg)
4040 4040  {
4041 4041          lock_descriptor_t *lock;
4042 4042          lock_descriptor_t *nlock = NULL; /* next lock */
4043 4043          int i;
4044 4044          graph_t *gp;
4045 4045          zoneid_t zoneid = getzoneid();
4046 4046  
4047 4047          for (i = 0; i < HASH_SIZE; i++) {
4048 4048                  mutex_enter(&flock_lock);
4049 4049                  gp = lock_graph[i];
4050 4050                  mutex_exit(&flock_lock);
4051 4051                  if (gp == NULL) {
4052 4052                          continue;
4053 4053                  }
4054 4054  
4055 4055                  mutex_enter(&gp->gp_mutex);
4056 4056                  fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
4057 4057                  for (lock = ACTIVE_HEAD(gp)->l_next;
4058 4058                      lock != ACTIVE_HEAD(gp);
4059 4059                      lock = nlock) {
4060 4060                          nlock = lock->l_next;
4061 4061                          if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
4062 4062                                  ASSERT(IS_ACTIVE(lock));
4063 4063                                  flk_delete_active_lock(lock, 0);
4064 4064                                  flk_wakeup(lock, 1);
4065 4065                                  flk_free_lock(lock);
4066 4066                          }
4067 4067                  }
4068 4068                  mutex_exit(&gp->gp_mutex);
4069 4069          }
4070 4070  }
4071 4071  
4072 4072  
4073 4073  /*
4074 4074   * Wait until a lock is granted, cancelled, or interrupted.
4075 4075   */
4076 4076  
4077 4077  static void
4078 4078  wait_for_lock(lock_descriptor_t *request)
4079 4079  {
4080 4080          graph_t *gp = request->l_graph;
4081 4081  
4082 4082          ASSERT(MUTEX_HELD(&gp->gp_mutex));
4083 4083  
4084 4084          while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
4085 4085              !(IS_INTERRUPTED(request))) {
4086 4086                  if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
4087 4087                          flk_set_state(request, FLK_INTERRUPTED_STATE);
4088 4088                          request->l_state |= INTERRUPTED_LOCK;
4089 4089                  }
4090 4090          }
4091 4091  }
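
/*
 * [Editor's sketch, not part of flock.c]  A typical caller of
 * wait_for_lock() sleeps and then decodes which of the three exit
 * conditions fired.  This mirrors how the setlock paths in this file
 * consume the result; the particular errno choices are illustrative.
 */
static int
sketch_sleep_on(lock_descriptor_t *request)
{
        graph_t *gp = request->l_graph;

        ASSERT(MUTEX_HELD(&gp->gp_mutex));
        wait_for_lock(request);

        if (IS_INTERRUPTED(request))
                return (EINTR);         /* a signal ended the sleep */
        if (IS_CANCELLED(request))
                return (EDEADLK);       /* e.g. deadlock detection */
        return (0);                     /* IS_GRANTED(request) holds */
}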
4092 4092  
4093 4093  /*
4094 4094   * Create an flock structure from the existing lock information
4095 4095   *
4096 4096   * This routine is used to create flock structures for the lock manager
4097 4097   * to use in a reclaim request.  Since the lock originated on this
4098 4098   * host, it must conform to UNIX semantics, so no checking is
4099 4099   * done to make sure it falls within the lower half of the 32-bit range.
4100 4100   */
4101 4101  
4102 4102  static void
4103 4103  create_flock(lock_descriptor_t *lp, flock64_t *flp)
4104 4104  {
4105 4105          ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
4106 4106          ASSERT(lp->l_end >= lp->l_start);
4107 4107  
4108 4108          flp->l_type = lp->l_type;
4109 4109          flp->l_whence = 0;
4110 4110          flp->l_start = lp->l_start;
4111 4111          flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
4112 4112              (lp->l_end - lp->l_start + 1);
4113 4113          flp->l_sysid = lp->l_flock.l_sysid;
4114 4114          flp->l_pid = lp->l_flock.l_pid;
4115 4115  }
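
/*
 * [Editor's note]  Two concrete cases of the conversion above:
 *
 *   l_start = 100, l_end = 199            ->  l_start = 100, l_len = 100
 *   l_start = 100, l_end = MAX_U_OFFSET_T ->  l_start = 100, l_len = 0
 *
 * i.e. a lock held to the end of file is reported with the conventional
 * "zero length means to EOF" encoding.
 */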
4116 4116  
4117 4117  /*
4118 4118   * Convert flock_t data describing a lock range into unsigned long starting
4119 4119   * and ending points, which are put into lock_request.  Returns 0 or an
4120 4120   * errno value.
4121 4121   * Large Files: max is passed by the caller and we return EOVERFLOW
4122 4122   * as defined by LFS API.
4123 4123   */
4124 4124  
4125 4125  int
4126 4126  flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
4127 4127      u_offset_t *start, u_offset_t *end, offset_t offset)
4128 4128  {
4129 4129          struct vattr    vattr;
4130 4130          int     error;
4131 4131  
4132 4132          /*
4133 4133           * Determine the starting point of the request
4134 4134           */
4135 4135          switch (flp->l_whence) {
4136 4136          case 0:         /* SEEK_SET */
4137 4137                  *start = (u_offset_t)flp->l_start;
4138 4138                  break;
4139 4139          case 1:         /* SEEK_CUR */
4140 4140                  *start = (u_offset_t)(flp->l_start + offset);
4141 4141                  break;
4142 4142          case 2:         /* SEEK_END */
4143 4143                  vattr.va_mask = AT_SIZE;
4144 4144                  if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
4145 4145                          return (error);
4146 4146                  *start = (u_offset_t)(flp->l_start + vattr.va_size);
4147 4147                  break;
4148 4148          default:
4149 4149                  return (EINVAL);
4150 4150          }
4151 4151  
4152 4152          /*
4153 4153           * Determine the range covered by the request.
4154 4154           */
4155 4155          if (flp->l_len == 0)
4156 4156                  *end = MAX_U_OFFSET_T;
4157 4157          else if ((offset_t)flp->l_len > 0) {
4158 4158                  *end = (u_offset_t)(*start + (flp->l_len - 1));
4159 4159          } else {
4160 4160                  /*
4161 4161                   * Negative length; why do we even allow this?
4162 4162                   * Because this allows easy specification of
4163 4163                   * the last n bytes of the file.
4164 4164                   */
4165 4165                  *end = *start;
4166 4166                  *start += (u_offset_t)flp->l_len;
4167 4167                  (*start)++;
4168 4168          }
4169 4169          return (0);
4170 4170  }
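
/*
 * [Editor's note]  Worked example of the negative-length case above:
 * with l_whence = 0 (SEEK_SET), l_start = 1000 and l_len = -10,
 *
 *   *end   = 1000
 *   *start = 1000 + (-10) + 1 = 991
 *
 * so the request covers the ten bytes [991, 1000] ending at the
 * original starting offset; this is the "last n bytes" idiom.
 */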
4171 4171  
4172 4172  /*
4173 4173   * Check the validity of lock data.  This can used by the NFS
4174 4174   * Check the validity of lock data.  This can be used by the NFS
4175 4175   * frlock routines to check data before contacting the server.  The
4176 4176   * server may support semantics less restrictive than the UNIX API,
4177 4177   * so the NFS client is required to do the checking itself.
4178 4178   */
4179 4179  
4180 4180  int
4181 4181  flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
4182 4182  {
4183 4183          /*
4184 4184           * The end (length) for local locking should never be greater
4185 4185           * than MAXEND. However, the representation for
4186 4186           * the entire file is MAX_U_OFFSET_T.
4187 4187           */
4188 4188          if ((start > max) ||
4189 4189              ((end > max) && (end != MAX_U_OFFSET_T))) {
4190 4190                  return (EINVAL);
4191 4191          }
4192 4192          if (start > end) {
4193 4193                  return (EINVAL);
4194 4194          }
4195 4195          return (0);
4196 4196  }
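
/*
 * [Editor's note]  For example, with max = MAXEND:
 *
 *   start = 10, end = 5               ->  EINVAL (start > end)
 *   start = 10, end = MAX_U_OFFSET_T  ->  0      (whole-file lock)
 *   start > max                       ->  EINVAL (beyond the maximum)
 */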
4197 4197  
4198 4198  /*
4199 4199   * Fill in request->l_flock with information about the lock blocking the
4200 4200   * request.  The complexity here is that lock manager requests are allowed
4201 4201   * to see into the upper part of the 32-bit address range, whereas local
4202 4202   * requests are only allowed to see signed values.
4203 4203   *
4204 4204   * What should be done when "blocker" is a lock manager lock that uses the
4205 4205   * upper portion of the 32-bit range, but "request" is local?  Since the
4206 4206   * request has already been determined to have been blocked by the blocker,
4207 4207   * at least some portion of "blocker" must be in the range of the request,
4208 4208   * or the request extends to the end of file.  For the first case, the
4209 4209   * portion in the lower range is returned with the indication that it goes
4210 4210   * "to EOF."  For the second case, the last byte of the lower range is
4211 4211   * returned with the indication that it goes "to EOF."
4212 4212   */
4213 4213  
4214 4214  static void
4215 4215  report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
4216 4216  {
4217 4217          flock64_t *flrp;                        /* l_flock portion of request */
4218 4218  
4219 4219          ASSERT(blocker != NULL);
4220 4220  
4221 4221          flrp = &request->l_flock;
4222 4222          flrp->l_whence = 0;
4223 4223          flrp->l_type = blocker->l_type;
4224 4224          flrp->l_pid = blocker->l_flock.l_pid;
4225 4225          flrp->l_sysid = blocker->l_flock.l_sysid;
4226 4226          request->l_ofd = blocker->l_ofd;
4227 4227  
4228 4228          if (IS_LOCKMGR(request)) {
4229 4229                  flrp->l_start = blocker->l_start;
4230 4230                  if (blocker->l_end == MAX_U_OFFSET_T)
4231 4231                          flrp->l_len = 0;
4232 4232                  else
4233 4233                          flrp->l_len = blocker->l_end - blocker->l_start + 1;
4234 4234          } else {
4235 4235                  if (blocker->l_start > MAXEND) {
4236 4236                          flrp->l_start = MAXEND;
4237 4237                          flrp->l_len = 0;
4238 4238                  } else {
4239 4239                          flrp->l_start = blocker->l_start;
4240 4240                          if (blocker->l_end == MAX_U_OFFSET_T)
4241 4241                                  flrp->l_len = 0;
4242 4242                          else
4243 4243                                  flrp->l_len = blocker->l_end -
4244 4244                                      blocker->l_start + 1;
4245 4245                  }
4246 4246          }
4247 4247  }
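
/*
 * [Editor's note]  Concretely: if a local request is blocked by a lock
 * manager lock spanning [MAXEND - 9, MAX_U_OFFSET_T], the local caller
 * sees l_start = MAXEND - 9, l_len = 0 ("to EOF").  If the blocker lies
 * entirely above MAXEND, say starting at MAXEND + 1, the caller sees
 * l_start = MAXEND, l_len = 0 instead.
 */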
4248 4248  
4249 4249  /*
4250 4250   * PSARC case 1997/292
4251 4251   */
4252 4252  /*
4253 4253   * This is the public routine exported by flock.h.
4254 4254   */
4255 4255  void
4256 4256  cl_flk_change_nlm_state_to_unknown(int nlmid)
4257 4257  {
4258 4258          /*
4259 4259           * Check to see if the node is booted as a cluster; if not, return.
4260 4260           */
4261 4261          if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
4262 4262                  return;
4263 4263          }
4264 4264  
4265 4265          /*
4266 4266           * See comment in cl_flk_set_nlm_status().
4267 4267           */
4268 4268          if (nlm_reg_status == NULL) {
4269 4269                  return;
4270 4270          }
4271 4271  
4272 4272          /*
4273 4273           * Protect the NLM registry state with a mutex.
4274 4274           */
4275 4275          ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
4276 4276          mutex_enter(&nlm_reg_lock);
4277 4277          FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
4278 4278          mutex_exit(&nlm_reg_lock);
4279 4279  }
4280 4280  
4281 4281  /*
4282 4282   * Return non-zero if the given I/O request conflicts with an active NBMAND
4283 4283   * lock.
4284 4284   * If svmand is non-zero, consider all active locks (System V mandatory
4285 4285   * locking), not just NBMAND locks.
4286 4286   */
4287 4287  
4288 4288  int
4289 4289  nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
4290 4290                  ssize_t length, int svmand, caller_context_t *ct)
4291 4291  {
4292 4292          int conflict = 0;
4293 4293          graph_t                 *gp;
4294 4294          lock_descriptor_t       *lock;
4295 4295          pid_t pid;
4296 4296          int sysid;
4297 4297  
4298 4298          if (ct == NULL) {
4299 4299                  pid = curproc->p_pid;
4300 4300                  sysid = 0;
4301 4301          } else {
4302 4302                  pid = ct->cc_pid;
4303 4303                  sysid = ct->cc_sysid;
4304 4304          }
4305 4305  
4306 4306          mutex_enter(&flock_lock);
4307 4307          gp = lock_graph[HASH_INDEX(vp)];
4308 4308          mutex_exit(&flock_lock);
4309 4309          if (gp == NULL)
4310 4310                  return (0);
4311 4311  
4312 4312          mutex_enter(&gp->gp_mutex);
4313 4313          SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4314 4314  
4315 4315          for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
4316 4316                  if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
4317 4317                      (lock->l_flock.l_sysid != sysid ||
4318 4318                      lock->l_flock.l_pid != pid) &&
4319 4319                      lock_blocks_io(op, offset, length,
4320 4320                      lock->l_type, lock->l_start, lock->l_end)) {
4321 4321                          conflict = 1;
4322 4322                          break;
4323 4323                  }
4324 4324          }
4325 4325          mutex_exit(&gp->gp_mutex);
4326 4326  
4327 4327          return (conflict);
4328 4328  }
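
/*
 * [Editor's sketch, not part of flock.c]  A filesystem read path might
 * consult this check roughly as follows.  nbl_need_check() and
 * nbl_start_crit()/nbl_end_crit() come from sys/nbmlock.h; passing 0
 * for svmand (rather than deriving it from the vnode's mode bits) keeps
 * the sketch short and is an assumption.
 */
static int
sketch_read_ok(vnode_t *vp, u_offset_t off, ssize_t len,
    caller_context_t *ct)
{
        int error = 0;

        if (nbl_need_check(vp)) {
                nbl_start_crit(vp, RW_READER);
                if (nbl_lock_conflict(vp, NBL_READ, off, len, 0, ct))
                        error = EACCES;
                nbl_end_crit(vp);
        }
        return (error);
}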
4329 4329  
4330 4330  /*
4331 4331   * Return non-zero if the given I/O request conflicts with the given lock.
4332 4332   */
4333 4333  
4334 4334  static int
4335 4335  lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
4336 4336              int lock_type, u_offset_t lock_start, u_offset_t lock_end)
4337 4337  {
4338 4338          ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
4339 4339          ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
4340 4340  
4341 4341          if (op == NBL_READ && lock_type == F_RDLCK)
4342 4342                  return (0);
4343 4343  
4344 4344          if (offset <= lock_start && lock_start < offset + length)
4345 4345                  return (1);
4346 4346          if (lock_start <= offset && offset <= lock_end)
4347 4347                  return (1);
4348 4348  
4349 4349          return (0);
4350 4350  }
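
/*
 * [Editor's note]  The only compatible combination above is a read
 * against a read lock; any other combination conflicts whenever the
 * ranges touch.  With an active lock covering [100, 199]:
 *
 *   I/O over [150, 249] (offset 150, length 100)  ->  conflict
 *   I/O over [200, 299] (offset 200, length 100)  ->  no conflict
 */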
4351 4351  
4352 4352  #ifdef DEBUG
4353 4353  static void
4354 4354  check_active_locks(graph_t *gp)
4355 4355  {
4356 4356          lock_descriptor_t *lock, *lock1;
4357 4357          edge_t  *ep;
4358 4358  
4359 4359          for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
4360 4360              lock = lock->l_next) {
4361 4361                  ASSERT(IS_ACTIVE(lock));
4362 4362                  ASSERT(NOT_BLOCKED(lock));
4363 4363                  ASSERT(!IS_BARRIER(lock));
4364 4364  
4365 4365                  ep = FIRST_IN(lock);
4366 4366  
4367 4367                  while (ep != HEAD(lock)) {
4368 4368                          ASSERT(IS_SLEEPING(ep->from_vertex));
4369 4369                          ASSERT(!NOT_BLOCKED(ep->from_vertex));
4370 4370                          ep = NEXT_IN(ep);
4371 4371                  }
4372 4372  
4373 4373                  for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
4374 4374                      lock1 = lock1->l_next) {
4375 4375                          if (lock1->l_vnode == lock->l_vnode) {
4376 4376                                  if (BLOCKS(lock1, lock)) {
4377 4377                                          cmn_err(CE_PANIC,
4378 4378                                              "active lock %p blocks %p",
4379 4379                                              (void *)lock1, (void *)lock);
4380 4380                                  } else if (BLOCKS(lock, lock1)) {
4381 4381                                          cmn_err(CE_PANIC,
4382 4382                                              "active lock %p blocks %p",
4383 4383                                              (void *)lock, (void *)lock1);
4384 4384                                  }
4385 4385                          }
4386 4386                  }
4387 4387          }
4388 4388  }
4389 4389  
4390 4390  /*
4391 4391   * Effect: This function checks whether the transition from 'old_state' to
4392 4392   *      'new_state' is a valid one.  It returns 0 if the transition is valid
4393 4393   *      and 1 if it is not.
4394 4394   *      For a map of valid transitions, see sys/flock_impl.h
4395 4395   */
4396 4396  static int
4397 4397  check_lock_transition(int old_state, int new_state)
4398 4398  {
4399 4399          switch (old_state) {
4400 4400          case FLK_INITIAL_STATE:
4401 4401                  if ((new_state == FLK_START_STATE) ||
4402 4402                      (new_state == FLK_SLEEPING_STATE) ||
4403 4403                      (new_state == FLK_ACTIVE_STATE) ||
4404 4404                      (new_state == FLK_DEAD_STATE)) {
4405 4405                          return (0);
4406 4406                  } else {
4407 4407                          return (1);
4408 4408                  }
4409 4409          case FLK_START_STATE:
4410 4410                  if ((new_state == FLK_ACTIVE_STATE) ||
4411 4411                      (new_state == FLK_DEAD_STATE)) {
4412 4412                          return (0);
4413 4413                  } else {
4414 4414                          return (1);
4415 4415                  }
4416 4416          case FLK_ACTIVE_STATE:
4417 4417                  if (new_state == FLK_DEAD_STATE) {
4418 4418                          return (0);
4419 4419                  } else {
4420 4420                          return (1);
4421 4421                  }
4422 4422          case FLK_SLEEPING_STATE:
4423 4423                  if ((new_state == FLK_GRANTED_STATE) ||
4424 4424                      (new_state == FLK_INTERRUPTED_STATE) ||
4425 4425                      (new_state == FLK_CANCELLED_STATE)) {
4426 4426                          return (0);
4427 4427                  } else {
4428 4428                          return (1);
4429 4429                  }
4430 4430          case FLK_GRANTED_STATE:
4431 4431                  if ((new_state == FLK_START_STATE) ||
4432 4432                      (new_state == FLK_INTERRUPTED_STATE) ||
4433 4433                      (new_state == FLK_CANCELLED_STATE)) {
4434 4434                          return (0);
4435 4435                  } else {
4436 4436                          return (1);
4437 4437                  }
4438 4438          case FLK_CANCELLED_STATE:
4439 4439                  if ((new_state == FLK_INTERRUPTED_STATE) ||
4440 4440                      (new_state == FLK_DEAD_STATE)) {
4441 4441                          return (0);
4442 4442                  } else {
4443 4443                          return (1);
4444 4444                  }
4445 4445          case FLK_INTERRUPTED_STATE:
4446 4446                  if (new_state == FLK_DEAD_STATE) {
4447 4447                          return (0);
4448 4448                  } else {
4449 4449                          return (1);
4450 4450                  }
4451 4451          case FLK_DEAD_STATE:
4452 4452                  /* May be set more than once */
4453 4453                  if (new_state == FLK_DEAD_STATE) {
4454 4454                          return (0);
4455 4455                  } else {
4456 4456                          return (1);
4457 4457                  }
4458 4458          default:
4459 4459                  return (1);
4460 4460          }
4461 4461  }
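
/*
 * [Editor's note]  The transitions accepted above, in table form
 * (rows are old_state, columns are new_state, "ok" returns 0):
 *
 *                START ACTIVE SLEEP GRANT CANCEL INTR DEAD
 *   INITIAL       ok    ok    ok     .     .      .    ok
 *   START          .    ok     .     .     .      .    ok
 *   ACTIVE         .     .     .     .     .      .    ok
 *   SLEEPING       .     .     .    ok    ok     ok     .
 *   GRANTED       ok     .     .     .    ok     ok     .
 *   CANCELLED      .     .     .     .     .     ok    ok
 *   INTERRUPTED    .     .     .     .     .      .    ok
 *   DEAD           .     .     .     .     .      .    ok
 */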
4462 4462  
4463 4463  static void
4464 4464  check_sleeping_locks(graph_t *gp)
4465 4465  {
4466 4466          lock_descriptor_t *lock1, *lock2;
4467 4467          edge_t *ep;
4468 4468          for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
4469 4469              lock1 = lock1->l_next) {
4470 4470                  ASSERT(!IS_BARRIER(lock1));
4471 4471                  for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
4472 4472                      lock2 = lock2->l_next) {
4473 4473                          if (lock1->l_vnode == lock2->l_vnode) {
4474 4474                                  if (BLOCKS(lock2, lock1)) {
4475 4475                                          ASSERT(!IS_GRANTED(lock1));
4476 4476                                          ASSERT(!NOT_BLOCKED(lock1));
4477 4477                                          path(lock1, lock2);
4478 4478                                  }
4479 4479                          }
4480 4480                  }
4481 4481  
4482 4482                  for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
4483 4483                      lock2 = lock2->l_next) {
4484 4484                          ASSERT(!IS_BARRIER(lock1));
4485 4485                          if (lock1->l_vnode == lock2->l_vnode) {
4486 4486                                  if (BLOCKS(lock2, lock1)) {
4487 4487                                          ASSERT(!IS_GRANTED(lock1));
4488 4488                                          ASSERT(!NOT_BLOCKED(lock1));
4489 4489                                          path(lock1, lock2);
4490 4490                                  }
4491 4491                          }
4492 4492                  }
4493 4493                  ep = FIRST_ADJ(lock1);
4494 4494                  while (ep != HEAD(lock1)) {
4495 4495                          ASSERT(BLOCKS(ep->to_vertex, lock1));
4496 4496                          ep = NEXT_ADJ(ep);
4497 4497                  }
4498 4498          }
4499 4499  }
4500 4500  
4501 4501  static int
4502 4502  level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
4503 4503  {
4504 4504          edge_t  *ep;
4505 4505          lock_descriptor_t       *vertex;
4506 4506          lock_descriptor_t *vertex_stack;
4507 4507  
4508 4508          STACK_INIT(vertex_stack);
4509 4509  
4510 4510          flk_graph_uncolor(lock1->l_graph);
4511 4511          ep = FIRST_ADJ(lock1);
4512 4512          ASSERT(ep != HEAD(lock1));
4513 4513          while (ep != HEAD(lock1)) {
4514 4514                  if (no_path)
4515 4515                          ASSERT(ep->to_vertex != lock2);
4516 4516                  STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4517 4517                  COLOR(ep->to_vertex);
4518 4518                  ep = NEXT_ADJ(ep);
4519 4519          }
4520 4520  
4521 4521          while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
4522 4522                  STACK_POP(vertex_stack, l_dstack);
4523 4523                  for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
4524 4524                      ep = NEXT_ADJ(ep)) {
4525 4525                          if (COLORED(ep->to_vertex))
4526 4526                                  continue;
4527 4527                          COLOR(ep->to_vertex);
4528 4528                          if (ep->to_vertex == lock2)
4529 4529                                  return (1);
4530 4530  
4531 4531                          STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4532 4532                  }
4533 4533          }
4534 4534          return (0);
4535 4535  }
4536 4536  
4537 4537  static void
4538 4538  check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
4539 4539  {
4540 4540          lock_descriptor_t *lock;
4541 4541  
4542 4542          /* Ignore OFD style locks since they're not process-wide. */
4543 4543          if (pid == 0)
4544 4544                  return;
4545 4545  
4546 4546          SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4547 4547  
4548 4548          if (lock) {
4549 4549                  while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
4550 4550                          if (lock->l_flock.l_pid == pid &&
4551 4551                              lock->l_flock.l_sysid == sysid)
4552 4552                                  cmn_err(CE_PANIC,
4553 4553                                      "owner pid %d's lock %p in active queue",
4554 4554                                      pid, (void *)lock);
4555 4555                          lock = lock->l_next;
4556 4556                  }
4557 4557          }
4558 4558          SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
4559 4559  
4560 4560          if (lock) {
4561 4561                  while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
4562 4562                          if (lock->l_flock.l_pid == pid &&
4563 4563                              lock->l_flock.l_sysid == sysid)
4564 4564                                  cmn_err(CE_PANIC,
4565 4565                                      "owner pid %d's lock %p in sleep queue",
4566 4566                                      pid, (void *)lock);
4567 4567                          lock = lock->l_next;
4568 4568                  }
4569 4569          }
4570 4570  }
4571 4571  
4572 4572  static int
4573 4573  level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4574 4574  {
4575 4575          edge_t *ep = FIRST_ADJ(lock1);
4576 4576  
4577 4577          while (ep != HEAD(lock1)) {
4578 4578                  if (ep->to_vertex == lock2)
4579 4579                          return (1);
4580 4580                  else
4581 4581                          ep = NEXT_ADJ(ep);
4582 4582          }
4583 4583          return (0);
4584 4584  }
4585 4585  
4586 4586  static int
4587 4587  no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4588 4588  {
4589 4589          return (!level_two_path(lock1, lock2, 1));
4590 4590  }
4591 4591  
4592 4592  static void
4593 4593  path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4594 4594  {
4595 4595          if (level_one_path(lock1, lock2)) {
4596 4596                  if (level_two_path(lock1, lock2, 0) != 0) {
4597 4597                          cmn_err(CE_WARN,
4598 4598                              "one edge one path from lock1 %p to lock2 %p",
4599 4599                              (void *)lock1, (void *)lock2);
4600 4600                  }
4601 4601          } else if (no_path(lock1, lock2)) {
4602 4602                  cmn_err(CE_PANIC,
4603 4603                      "No path from lock1 %p to lock2 %p",
4604 4604                      (void *)lock1, (void *)lock2);
4605 4605          }
4606 4606  }
4607 4607  #endif /* DEBUG */
  