NEX-3758 Support for remote stale lock detection
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>

@@ -26,11 +26,11 @@
 
 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
 /*      All Rights Reserved */
 
 /*
- * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  * Copyright 2015 Joyent, Inc.
  */
 
 #include <sys/flock_impl.h>
 #include <sys/vfs.h>

@@ -39,10 +39,14 @@
 #include <sys/clconf.h>
 #include <sys/cladm.h>
 #include <sys/nbmlock.h>
 #include <sys/cred.h>
 #include <sys/policy.h>
+#include <sys/list.h>
+#include <sys/sysmacros.h>
+#include <sys/socket.h>
+#include <inet/ip.h>
 
 /*
  * The following four variables are for statistics purposes and they are
  * not protected by locks. They may not be accurate but will at least be
  * close to the actual value.

@@ -157,14 +161,36 @@
         flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
 };
 
 zone_key_t flock_zone_key;
 
+/*
+ * Support for remote stale lock detection
+ *
+ * The sysid_to_host_translator_lock readers/writer lock protects
+ * sysid_to_host_translator_list.
+ *
+ * The sysid_to_host_translator_list is a list of sysid to host name
+ * translator functions.  New translators are added using the public
+ * flk_add_sysid_to_host_translator() call.
+ *
+ * The stale_lock_timeout is in seconds and determines how often remote
+ * stale lock checking runs.  When set to 0, remote stale lock checking
+ * is disabled.
+ */
+struct sysid_to_host_translator_entry {
+        sysid_to_host_translator_t translator;
+        list_node_t node;
+};
+static krwlock_t sysid_to_host_translator_lock;
+static list_t sysid_to_host_translator_list;
+volatile int stale_lock_timeout = 3600;         /* one hour, in seconds */
+
 static void create_flock(lock_descriptor_t *, flock64_t *);
 static lock_descriptor_t        *flk_get_lock(void);
 static void     flk_free_lock(lock_descriptor_t *lock);
-static void     flk_get_first_blocking_lock(lock_descriptor_t *request);
+static void     flk_get_first_blocking_lock(lock_descriptor_t *);
 static int flk_process_request(lock_descriptor_t *);
 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
 static edge_t *flk_get_edge(void);
 static int flk_wait_execute_request(lock_descriptor_t *);
 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
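
A consumer module registers its translator once at startup.  Below is a
minimal sketch, assuming sysid_to_host_translator_t matches the call site
in translate_sysid_to_host() added by this change (zoneid, sysid, a
sockaddr to fill in, a host type string, non-zero return on success).  The
nlm_sysid_to_addr() helper and the init hook are hypothetical stand-ins
for whatever the NLM module actually provides:

    /*
     * Translate an NLM sysid to the peer's address.  Returns non-zero on
     * success, 0 when this translator does not know the sysid.
     */
    static int
    nlm_sysid_to_host(zoneid_t zoneid, sysid_t sysid, struct sockaddr *sa,
        const char **type)
    {
            *type = "NLM";

            /* nlm_sysid_to_addr() is hypothetical; 0 means found */
            return (nlm_sysid_to_addr(zoneid, sysid, sa) == 0);
    }

    static void
    nlm_stale_lock_init(void)
    {
            flk_add_sysid_to_host_translator(nlm_sysid_to_host);
    }

Since stale_lock_timeout is an ordinary global, it can also be tuned
without a reboot (e.g. "set stale_lock_timeout = 600" in /etc/system, or
the corresponding mdb -kw write); setting it to 0 disables the checking.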

@@ -554,12 +580,11 @@
          */
 
         if ((lckdat->l_type == F_UNLCK) ||
             !((cmd & INOFLCK) || (cmd & SETFLCK))) {
                 lock_request = &stack_lock_request;
-                (void) bzero((caddr_t)lock_request,
-                    sizeof (lock_descriptor_t));
+                bzero(lock_request, sizeof (lock_descriptor_t));
 
                 /*
                  * following is added to make the assertions in
                  * flk_execute_request() to pass through
                  */

@@ -948,10 +973,18 @@
                 /* initialize all NLM states in array to NLM_UNKNOWN */
                 for (i = 0; i < nlm_status_size; i++) {
                         nlm_reg_status[i] = FLK_NLM_UNKNOWN;
                 }
         }
+
+        mutex_init(&flock_lock, NULL, MUTEX_DEFAULT, NULL);
+        mutex_init(&nlm_reg_lock, NULL, MUTEX_DEFAULT, NULL);
+
+        rw_init(&sysid_to_host_translator_lock, NULL, RW_DEFAULT, NULL);
+        list_create(&sysid_to_host_translator_list,
+            sizeof (struct sysid_to_host_translator_entry),
+            offsetof(struct sysid_to_host_translator_entry, node));
 }
 
 /*
  * Zone constructor/destructor callbacks to be executed when a zone is
  * created/destroyed.

@@ -1009,10 +1042,11 @@
 void
 flk_free_lock(lock_descriptor_t *lock)
 {
         file_t *fp;
 
+        ASSERT(lock->l_blocker >= 0);
         ASSERT(IS_DEAD(lock));
 
         if ((fp = lock->l_ofd) != NULL && fp->f_filock == (struct filock *)lock)
                 fp->f_filock = NULL;
 

@@ -1019,11 +1053,11 @@
         if (IS_REFERENCED(lock)) {
                 lock->l_state |= DELETED_LOCK;
                 return;
         }
         flk_lock_frees++;
-        kmem_free((void *)lock, sizeof (lock_descriptor_t));
+        kmem_free(lock, sizeof (lock_descriptor_t));
 }
 
 void
 flk_set_state(lock_descriptor_t *lock, int new_state)
 {

@@ -1059,10 +1093,268 @@
         }
         lock->l_status = new_state;
 }
 
 /*
+ * Support for remote stale lock detection
+ */
+
+void
+flk_add_sysid_to_host_translator(sysid_to_host_translator_t tr)
+{
+        struct sysid_to_host_translator_entry *te;
+
+        te = kmem_alloc(sizeof (struct sysid_to_host_translator_entry),
+            KM_SLEEP);
+
+        te->translator = tr;
+
+        rw_enter(&sysid_to_host_translator_lock, RW_WRITER);
+        list_insert_head(&sysid_to_host_translator_list, te);
+        rw_exit(&sysid_to_host_translator_lock);
+}
+
+static void
+translate_sysid_to_host(zoneid_t zoneid, sysid_t sysid, char *host, size_t hlen,
+    const char **type)
+{
+        struct sockaddr sa;
+        struct sysid_to_host_translator_entry *te;
+
+        /* Set defaults in case the translation fails */
+        *type = "?";
+        (void) strlcpy(host, "?", hlen);
+
+        rw_enter(&sysid_to_host_translator_lock, RW_READER);
+
+        for (te = list_head(&sysid_to_host_translator_list); te != NULL;
+            te = list_next(&sysid_to_host_translator_list, te)) {
+
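+                /*
+                 * A translator returns non-zero once it recognizes the
+                 * sysid; the first match terminates the search.
+                 */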
+                if (te->translator(zoneid, sysid, &sa, type) != 0) {
+                        rw_exit(&sysid_to_host_translator_lock);
+
+                        switch (sa.sa_family) {
+                        case AF_INET:
+                                (void) inet_ntop(AF_INET,
+                                    &((struct sockaddr_in *)&sa)->sin_addr,
+                                    host, hlen);
+                                break;
+                        case AF_INET6:
+                                (void) inet_ntop(AF_INET6,
+                                    &((struct sockaddr_in6 *)&sa)->sin6_addr,
+                                    host, hlen);
+                                break;
+                        default:
+                                break;
+                        }
+
+                        return;
+                }
+        }
+
+        rw_exit(&sysid_to_host_translator_lock);
+}
+
+static char *
+get_vnode_path(vnode_t *vp)
+{
+        size_t len;
+        char *ret;
+
+        mutex_enter(&vp->v_lock);
+        if (vp->v_path == NULL) {
+                mutex_exit(&vp->v_lock);
+                return (NULL);
+        }
+        len = strlen(vp->v_path) + 1;
+        mutex_exit(&vp->v_lock);
+
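+        /*
+         * Allocate with v_lock dropped (KM_SLEEP may block); v_path can
+         * change meanwhile, so it is re-checked below before the copy.
+         */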
+        ret = kmem_alloc(len, KM_SLEEP);
+
+        mutex_enter(&vp->v_lock);
+        if (vp->v_path == NULL || strlen(vp->v_path) + 1 != len) {
+                mutex_exit(&vp->v_lock);
+                kmem_free(ret, len);
+                return (NULL);
+        }
+        bcopy(vp->v_path, ret, len);
+        mutex_exit(&vp->v_lock);
+
+        return (ret);
+}
+
+static void
+flk_stale_lock_check(lock_descriptor_t *lock)
+{
+        char *path;
+
+        char host[INET6_ADDRSTRLEN];            /* host name */
+        const char *type;                       /* host type */
+
+        /* temporary variables for the cmn_err() call */
+        char *p, *t;            /* path, lock type */
+        pid_t pid;              /* pid */
+        void *v;                /* vnode */
+        u_offset_t s, e;        /* start, end */
+
+        ASSERT(MUTEX_HELD(&lock->l_graph->gp_mutex));
+
+        /*
+         * Bail out if this is not a remote lock, if the stale lock
+         * checking is disabled, or if the lock was already reported.
+         */
+        if (IS_LOCAL(lock) || stale_lock_timeout == 0 || lock->l_blocker < 0)
+                return;
+
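+        /*
+         * l_blocker encoding: 0 means this lock was never seen blocking
+         * anybody, a positive value is the hrtime when it was first seen
+         * blocking, and a negative value marks an already reported blocker.
+         */
+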
+        /* Seen first time? */
+        if (lock->l_blocker == 0) {
+                lock->l_blocker = gethrtime();
+                return;
+        }
+
+        /* Old enough? */
+        if ((gethrtime() - lock->l_blocker) / NANOSEC < stale_lock_timeout)
+                return;
+
+        translate_sysid_to_host(lock->l_zoneid, lock->l_flock.l_sysid, host,
+            sizeof (host), &type);
+        path = get_vnode_path(lock->l_vnode);
+
+        pid = lock->l_flock.l_pid;
+        v = (void *)lock->l_vnode;
+        p = path == NULL ? "?" : path;
+        t = lock->l_type == F_WRLCK ? "WR" : "RD";
+        s = lock->l_start;
+        e = lock->l_end;
+
+        /* Report the blocker as stale */
+        cmn_err(CE_NOTE, "!Stale lock (host: %s (%s), pid: %d, vnode: %p, "
+            "path: %s, %sLCK: %llu:%llu)", host, type, pid, v, p, t, s, e);
+
+        if (path != NULL)
+                strfree(path);
+
+        /* Mark this blocker as reported */
+        lock->l_blocker = -lock->l_blocker;
+}
+
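+/*
+ * Example of the events logged below: an already reported blocker covering
+ * 0:99 that is partially unlocked at 40:59 is SPLIT into 0:39 and 60:99,
+ * while unlocking 50:99 SHRINKs it to 0:49.  The l_blocker mark is reset
+ * by the caller (flk_relation()).
+ */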
+static void
+flk_stale_lock_shrink(lock_descriptor_t *lock, lock_descriptor_t *new)
+{
+        char *path;
+
+        char host[INET6_ADDRSTRLEN];            /* host name */
+        const char *type;                       /* host type */
+
+        /* temporary variables for the cmn_err() call */
+        char *p, *t;            /* path, lock type */
+        pid_t pid;              /* pid */
+        void *v;                /* vnode */
+        u_offset_t s, e;        /* start, end */
+        u_offset_t ns, ne;      /* new start, new end */
+
+        ASSERT(MUTEX_HELD(&lock->l_graph->gp_mutex));
+
+        translate_sysid_to_host(lock->l_zoneid, lock->l_flock.l_sysid, host,
+            sizeof (host), &type);
+        path = get_vnode_path(lock->l_vnode);
+
+        pid = lock->l_flock.l_pid;
+        v = (void *)lock->l_vnode;
+        p = path == NULL ? "?" : path;
+        t = lock->l_type == F_WRLCK ? "WR" : "RD";
+        s = lock->l_start;
+        e = lock->l_end;
+        ns = new->l_start;
+        ne = new->l_end;
+
+        cmn_err(CE_NOTE, "!Stale lock SHRINK (host: %s (%s), pid: %d, "
+            "vnode: %p, path: %s, %sLCK: %llu:%llu -> %llu:%llu)", host, type,
+            pid, v, p, t, s, e, ns, ne);
+
+        if (path != NULL)
+                strfree(path);
+}
+
+static void
+flk_stale_lock_split(lock_descriptor_t *lock, lock_descriptor_t *new1,
+    lock_descriptor_t *new2)
+{
+        char *path;
+
+        char host[INET6_ADDRSTRLEN];            /* host name */
+        const char *type;                       /* host type */
+
+        /* temporary variables for the cmn_err() call */
+        char *p, *t;            /* path, lock type */
+        pid_t pid;              /* pid */
+        void *v;                /* vnode */
+        u_offset_t s, e;        /* start, end */
+        u_offset_t n1s, n1e;    /* new1 start, new1 end */
+        u_offset_t n2s, n2e;    /* new2 start, new2 end */
+
+        ASSERT(MUTEX_HELD(&lock->l_graph->gp_mutex));
+
+        translate_sysid_to_host(lock->l_zoneid, lock->l_flock.l_sysid, host,
+            sizeof (host), &type);
+        path = get_vnode_path(lock->l_vnode);
+
+        pid = lock->l_flock.l_pid;
+        v = (void *)lock->l_vnode;
+        p = path == NULL ? "?" : path;
+        t = lock->l_type == F_WRLCK ? "WR" : "RD";
+        s = lock->l_start;
+        e = lock->l_end;
+        n1s = new1->l_start;
+        n1e = new1->l_end;
+        n2s = new2->l_start;
+        n2e = new2->l_end;
+
+        cmn_err(CE_NOTE, "!Stale lock SPLIT (host: %s (%s), pid: %d, "
+            "vnode: %p, path: %s, %sLCK: %llu:%llu -> %llu:%llu and %llu:%llu)",
+            host, type, pid, v, p, t, s, e, n1s, n1e, n2s, n2e);
+
+        if (path != NULL)
+                strfree(path);
+}
+
+static void
+flk_stale_lock_release(lock_descriptor_t *lock)
+{
+        char *path;
+
+        char host[INET6_ADDRSTRLEN];            /* host name */
+        const char *type;                       /* host type */
+
+        /* temporary variables for the cmn_err() call */
+        char *p, *t;            /* path, lock type */
+        pid_t pid;              /* pid */
+        void *v;                /* vnode */
+        u_offset_t s, e;        /* start, end */
+
+        ASSERT(MUTEX_HELD(&lock->l_graph->gp_mutex));
+
+        translate_sysid_to_host(lock->l_zoneid, lock->l_flock.l_sysid, host,
+            sizeof (host), &type);
+        path = get_vnode_path(lock->l_vnode);
+
+        pid = lock->l_flock.l_pid;
+        v = (void *)lock->l_vnode;
+        p = path == NULL ? "?" : path;
+        t = lock->l_type == F_WRLCK ? "WR" : "RD";
+        s = lock->l_start;
+        e = lock->l_end;
+
+        cmn_err(CE_NOTE, "!Stale lock RELEASE (host: %s (%s), pid: %d, "
+            "vnode: %p, path: %s, %sLCK: %llu:%llu)", host, type, pid, v, p, t,
+            s, e);
+
+        if (path != NULL)
+                strfree(path);
+}
+
+/*
  * Routine that checks whether there are any blocking locks in the system.
  *
  * The policy followed is if a write lock is sleeping we don't allow read
  * locks before this write lock even though there may not be any active
  * locks corresponding to the read locks' region.

@@ -1097,16 +1389,17 @@
          * check active locks
          */
 
         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
 
-
         if (lock) {
                 do {
                         if (BLOCKS(lock, request)) {
-                                if (!request_will_wait)
+                                if (!request_will_wait) {
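+                                        /*
+                                         * Record or report this active
+                                         * blocker before failing the
+                                         * non-blocking request.
+                                         */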
+                                        flk_stale_lock_check(lock);
                                         return (EAGAIN);
+                                }
                                 request_blocked_by_active = 1;
                                 break;
                         }
                         /*
                          * Grant lock if it is for the same owner holding active

@@ -1122,10 +1415,11 @@
         }
 
         if (!request_blocked_by_active) {
                         lock_descriptor_t *lk[1];
                         lock_descriptor_t *first_glock = NULL;
+
                 /*
                  * Shall we grant this?! NO!!
                  * What about those locks that were just granted and still
                  * in sleep queue. Those threads are woken up and so locks
                  * are almost active.

@@ -1178,11 +1472,12 @@
                 }
                 lock = first_glock;
                 if (lock) {
                         do {
                                 if (IS_GRANTED(lock)) {
-                                flk_recompute_dependencies(lock, lk, 1, 0);
+                                        flk_recompute_dependencies(lock, lk, 1,
+                                            0);
                                 }
                                 lock = lock->l_prev;
                         } while ((lock->l_vnode == vp));
                 }
                 request->l_state &= ~RECOMPUTE_LOCK;

@@ -1240,16 +1535,17 @@
                         }
                         lock = lock->l_next;
                 } while (lock->l_vnode == vp);
         }
 
-/*
- * found_covering_lock == 2 iff at this point 'request' has paths
- * to all locks that blocks 'request'. found_covering_lock == 1 iff at this
- * point 'request' has paths to all locks that blocks 'request' whose owners
- * are not same as the one that covers 'request' (covered_by above) and
- * we can have locks whose owner is same as covered_by in the active list.
- */
+        /*
+         * found_covering_lock == 2 iff at this point 'request' has paths to
+         * all locks that block 'request'. found_covering_lock == 1 iff at
+         * this point 'request' has paths to all locks that block 'request'
+         * whose owners are not the same as the one that covers 'request'
+         * (covered_by above) and we can have locks whose owner is the same
+         * as covered_by in the active list.
+         */
 
         if (request_blocked_by_active && found_covering_lock != 2) {
                 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
                 ASSERT(lock != NULL);

@@ -1319,24 +1615,22 @@
         if (IS_IO_LOCK(request))
                 return (0);
 
         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
 
-        if (lock == NULL && request->l_type == F_UNLCK)
-                return (0);
-        if (lock == NULL) {
-                flk_insert_active_lock(request);
-                return (0);
-        }
-
-        do {
-                lock1 = lock->l_next;
-                if (SAME_OWNER(request, lock)) {
-                        done_searching = flk_relation(lock, request);
-                }
-                lock = lock1;
-        } while (lock->l_vnode == vp && !done_searching);
+        if (lock != NULL) {
+                /*
+                 * There are some active locks, so check for relations.
+                 */
+                do {
+                        lock1 = lock->l_next;
+                        if (SAME_OWNER(request, lock)) {
+                                done_searching = flk_relation(lock, request);
+                        }
+                        lock = lock1;
+                } while (lock->l_vnode == vp && !done_searching);
+        }
 
         /*
          * insert in active queue
          */
 

@@ -1597,14 +1891,15 @@
         edge_frees++;
         kmem_cache_free(flk_edge_cache, (void *)ep);
 }
 
 /*
- * Check the relationship of request with lock and perform the
- * recomputation of dependencies, break lock if required, and return
- * 1 if request cannot have any more relationship with the next
+ * Check the relationship of 'request' with 'lock' and perform the
+ * recomputation of dependencies, break 'lock' if required, and return
+ * 1 if 'request' cannot have any more relationship with the next
  * active locks.
+ *
  * The 'lock' and 'request' are compared and in case of overlap we
  * delete the 'lock' and form new locks to represent the non-overlapped
  * portion of original 'lock'. This function has side effects such as
  * 'lock' will be freed, new locks will be added to the active list.
  */

@@ -1611,17 +1906,18 @@
 
 static int
 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
 {
         int lock_effect;
-        lock_descriptor_t *lock1, *lock2;
         lock_descriptor_t *topology[3];
         int nvertex = 0;
         int i;
         edge_t  *ep;
-        graph_t *gp = (lock->l_graph);
+        graph_t *gp = lock->l_graph;
+        boolean_t mergeable;
 
+        ASSERT(request->l_blocker == 0);
 
         CHECK_SLEEPING_LOCKS(gp);
         CHECK_ACTIVE_LOCKS(gp);
 
         ASSERT(MUTEX_HELD(&gp->gp_mutex));

@@ -1637,147 +1933,104 @@
             lock->l_type == F_RDLCK)
                 lock_effect = FLK_UPGRADE;
         else
                 lock_effect = FLK_STAY_SAME;
 
+        /*
+         * The 'lock' and 'request' are merged only in case the effect of
+         * both locks is the same (FLK_STAY_SAME) and their blocker status
+         * (l_blocker) is the same as well.  We do not merge 'lock' and
+         * 'request' with different l_blocker values because such a merge
+         * might affect the stale lock detection: it could either cause
+         * false positives, or miss some stale locks.
+         */
+        mergeable = lock_effect == FLK_STAY_SAME &&
+            lock->l_blocker == request->l_blocker;
+
         if (lock->l_end < request->l_start) {
-                if (lock->l_end == request->l_start - 1 &&
-                    lock_effect == FLK_STAY_SAME) {
-                        topology[0] = request;
+                /* If the 'lock' is just next to 'request', try to merge them */
+                if (lock->l_end == request->l_start - 1 && mergeable) {
                         request->l_start = lock->l_start;
-                        nvertex = 1;
                         goto recompute;
-                } else {
-                        return (0);
                 }
-        }
+
+                /* Otherwise, they do not overlap, so return immediately */
+                return (0);
+        }
 
-        if (lock->l_start > request->l_end) {
-                if (request->l_end == lock->l_start - 1 &&
-                    lock_effect == FLK_STAY_SAME) {
-                        topology[0] = request;
+        if (request->l_end < lock->l_start) {
+                /* If the 'request' is just next to 'lock', try to merge them */
+                if (request->l_end == lock->l_start - 1 && mergeable) {
                         request->l_end = lock->l_end;
-                        nvertex = 1;
                         goto recompute;
-                } else {
-                        return (1);
                 }
-        }
+
+                /* Otherwise, they do not overlap, so return immediately */
+                return (1);
+        }
 
-        if (request->l_end < lock->l_end) {
-                if (request->l_start > lock->l_start) {
-                        if (lock_effect == FLK_STAY_SAME) {
-                                request->l_start = lock->l_start;
-                                request->l_end = lock->l_end;
-                                topology[0] = request;
-                                nvertex = 1;
-                        } else {
-                                lock1 = flk_get_lock();
-                                lock2 = flk_get_lock();
-                                COPY(lock1, lock);
-                                COPY(lock2, lock);
-                                lock1->l_start = lock->l_start;
-                                lock1->l_end = request->l_start - 1;
-                                lock2->l_start = request->l_end + 1;
-                                lock2->l_end = lock->l_end;
-                                topology[0] = lock1;
-                                topology[1] = lock2;
-                                topology[2] = request;
-                                nvertex = 3;
-                        }
-                } else if (request->l_start < lock->l_start) {
-                        if (lock_effect == FLK_STAY_SAME) {
-                                request->l_end = lock->l_end;
-                                topology[0] = request;
-                                nvertex = 1;
-                        } else {
-                                lock1 = flk_get_lock();
-                                COPY(lock1, lock);
-                                lock1->l_start = request->l_end + 1;
-                                topology[0] = lock1;
-                                topology[1] = request;
-                                nvertex = 2;
-                        }
-                } else  {
-                        if (lock_effect == FLK_STAY_SAME) {
-                                request->l_start = lock->l_start;
-                                request->l_end = lock->l_end;
-                                topology[0] = request;
-                                nvertex = 1;
-                        } else {
-                                lock1 = flk_get_lock();
-                                COPY(lock1, lock);
-                                lock1->l_start = request->l_end + 1;
-                                topology[0] = lock1;
-                                topology[1] = request;
-                                nvertex = 2;
-                        }
-                }
-        } else if (request->l_end > lock->l_end) {
-                if (request->l_start > lock->l_start)  {
-                        if (lock_effect == FLK_STAY_SAME) {
-                                request->l_start = lock->l_start;
-                                topology[0] = request;
-                                nvertex = 1;
-                        } else {
-                                lock1 = flk_get_lock();
-                                COPY(lock1, lock);
-                                lock1->l_end = request->l_start - 1;
-                                topology[0] = lock1;
-                                topology[1] = request;
-                                nvertex = 2;
-                        }
-                } else if (request->l_start < lock->l_start)  {
-                        topology[0] = request;
-                        nvertex = 1;
-                } else {
-                        topology[0] = request;
-                        nvertex = 1;
-                }
-        } else {
-                if (request->l_start > lock->l_start) {
-                        if (lock_effect == FLK_STAY_SAME) {
-                                request->l_start = lock->l_start;
-                                topology[0] = request;
-                                nvertex = 1;
-                        } else {
-                                lock1 = flk_get_lock();
-                                COPY(lock1, lock);
-                                lock1->l_end = request->l_start - 1;
-                                topology[0] = lock1;
-                                topology[1] = request;
-                                nvertex = 2;
-                        }
-                } else if (request->l_start < lock->l_start) {
-                        topology[0] = request;
-                        nvertex = 1;
-                } else {
-                        if (lock_effect !=  FLK_UNLOCK) {
-                                topology[0] = request;
-                                nvertex = 1;
-                        } else {
-                                flk_delete_active_lock(lock, 0);
-                                flk_wakeup(lock, 1);
-                                flk_free_lock(lock);
-                                CHECK_SLEEPING_LOCKS(gp);
-                                CHECK_ACTIVE_LOCKS(gp);
-                                return (1);
-                        }
-                }
-        }
+        /*
+         * Here we are sure the 'lock' and 'request' overlap, so the 'request'
+         * will replace the 'lock' (either fully, or at least partially).
+         */
+
+        /*
+         * If the 'request' does not fully cover the 'lock' at the start,
+         * either move the start of the 'request' to cover the 'lock', or split
+         * the 'lock'.
+         */
+        if (lock->l_start < request->l_start) {
+                if (mergeable) {
+                        request->l_start = lock->l_start;
+                } else {
+                        lock_descriptor_t *new_lock = flk_get_lock();
+
+                        COPY(new_lock, lock);
+                        new_lock->l_end = request->l_start - 1;
+
+                        topology[nvertex++] = new_lock;
+                }
+        }
+
+        /*
+         * If the 'request' does not fully cover the 'lock' at the end, either
+         * move the end of the 'request' to cover the 'lock', or split the
+         * 'lock'.
+         */
+        if (request->l_end < lock->l_end) {
+                if (mergeable) {
+                        request->l_end = lock->l_end;
+                } else {
+                        lock_descriptor_t *new_lock = flk_get_lock();
+
+                        COPY(new_lock, lock);
+                        new_lock->l_start = request->l_end + 1;
+
+                        topology[nvertex++] = new_lock;
+                }
+        }
+
+        /*
+         * Log the blocker change
+         */
+        if (nvertex > 0 && lock->l_blocker < 0) {
+                if (nvertex == 1)
+                        flk_stale_lock_shrink(lock, topology[0]);
+                if (nvertex == 2)
+                        flk_stale_lock_split(lock, topology[0], topology[1]);
+
+                lock->l_blocker = 0;
+        }
 
 recompute:
-
         /*
          * For unlock we don't send the 'request' to for recomputing
          * dependencies because no lock will add an edge to this.
          */
+        if (lock_effect != FLK_UNLOCK)
+                topology[nvertex++] = request;
 
-        if (lock_effect == FLK_UNLOCK) {
-                topology[nvertex-1] = NULL;
-                nvertex--;
-        }
         for (i = 0; i < nvertex; i++) {
                 topology[i]->l_state |= RECOMPUTE_LOCK;
                 topology[i]->l_color = NO_COLOR;
         }
 

@@ -1785,37 +2038,33 @@
 
         /*
          * we remove the adjacent edges for all vertices' to this vertex
          * 'lock'.
          */
-
         ep = FIRST_IN(lock);
         while (ep != HEAD(lock)) {
                 ADJ_LIST_REMOVE(ep);
                 ep = NEXT_IN(ep);
         }
 
         flk_delete_active_lock(lock, 0);
 
         /* We are ready for recomputing the dependencies now */
-
         flk_recompute_dependencies(lock, topology, nvertex, 1);
 
         for (i = 0; i < nvertex; i++) {
                 topology[i]->l_state &= ~RECOMPUTE_LOCK;
                 topology[i]->l_color = NO_COLOR;
         }
 
-
         if (lock_effect == FLK_UNLOCK) {
                 nvertex++;
         }
         for (i = 0; i < nvertex - 1; i++) {
                 flk_insert_active_lock(topology[i]);
         }
 
-
         if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
                 flk_wakeup(lock, 0);
         } else {
                 ep = FIRST_IN(lock);
                 while (ep != HEAD(lock)) {

@@ -1890,10 +2139,16 @@
         ASSERT(NOT_BLOCKED(lock));
         ASSERT(IS_ACTIVE(lock));
 
         ASSERT((vp->v_filocks != NULL));
 
+        if (lock->l_blocker < 0) {
+                /* Log the blocker release */
+                flk_stale_lock_release(lock);
+                lock->l_blocker = 0;
+        }
+
         if (vp->v_filocks == (struct filock *)lock) {
                 vp->v_filocks = (struct filock *)
                     ((lock->l_next->l_vnode == vp) ? lock->l_next :
                     NULL);
         }

@@ -2058,11 +2313,11 @@
 
         /*
          * free the topology
          */
         if (nvertex)
-                kmem_free((void *)topology,
+                kmem_free(topology,
                     (nvertex * sizeof (lock_descriptor_t *)));
         /*
          * Possibility of some locks unblocked now
          */
 

@@ -2994,11 +3249,11 @@
 
 int
 chklock(struct vnode *vp, int iomode, u_offset_t offset, ssize_t len, int fmode,
     caller_context_t *ct)
 {
-        register int    i;
+        int             i;
         struct flock64  bf;
         int             error = 0;
 
         bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
         bf.l_whence = 0;

@@ -3325,11 +3580,11 @@
 
 static void
 flk_free_proc_edge(proc_edge_t *pep)
 {
         ASSERT(pep->refcount == 0);
-        kmem_free((void *)pep, sizeof (proc_edge_t));
+        kmem_free(pep, sizeof (proc_edge_t));
         flk_proc_edge_frees++;
 }
 
 /*
  * Color the graph explicitly done only when the mark value hits max value.

@@ -4067,29 +4322,54 @@
                 }
                 mutex_exit(&gp->gp_mutex);
         }
 }
 
-
 /*
  * Wait until a lock is granted, cancelled, or interrupted.
  */
 
 static void
 wait_for_lock(lock_descriptor_t *request)
 {
         graph_t *gp = request->l_graph;
+        vnode_t *vp = request->l_vnode;
 
         ASSERT(MUTEX_HELD(&gp->gp_mutex));
 
         while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
             !(IS_INTERRUPTED(request))) {
-                if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
-                        flk_set_state(request, FLK_INTERRUPTED_STATE);
-                        request->l_state |= INTERRUPTED_LOCK;
-                }
+                lock_descriptor_t *lock;
+
+                if (stale_lock_timeout == 0) {
+                        /* The stale lock detection is disabled */
+                        if (cv_wait_sig(&request->l_cv, &gp->gp_mutex) == 0) {
+                                flk_set_state(request, FLK_INTERRUPTED_STATE);
+                                request->l_state |= INTERRUPTED_LOCK;
+                        }
+
+                        continue;
+                }
+
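+                /*
+                 * Find the first active lock on this vnode that blocks the
+                 * request and run the stale check on it.
+                 */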
+                SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
+
+                if (lock != NULL) {
+                        do {
+                                if (BLOCKS(lock, request)) {
+                                        flk_stale_lock_check(lock);
+                                        break;
+                                }
+                                lock = lock->l_next;
+                        } while (lock->l_vnode == vp);
+                }
+
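+                /*
+                 * Sleep at most stale_lock_timeout seconds so the blocker
+                 * age is re-checked periodically even when nobody signals
+                 * the request.
+                 */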
+                if (cv_timedwait_sig(&request->l_cv, &gp->gp_mutex,
+                    ddi_get_lbolt() + SEC_TO_TICK(stale_lock_timeout)) == 0) {
+                        flk_set_state(request, FLK_INTERRUPTED_STATE);
+                        request->l_state |= INTERRUPTED_LOCK;
+                }
         }
 }
 
 /*
  * Create an flock structure from the existing lock information
  *

@@ -4137,11 +4417,11 @@
         case 1:         /* SEEK_CUR */
                 *start = (u_offset_t)(flp->l_start + offset);
                 break;
         case 2:         /* SEEK_END */
                 vattr.va_mask = AT_SIZE;
-                if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
+                if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
                         return (error);
                 *start = (u_offset_t)(flp->l_start + vattr.va_size);
                 break;
         default:
                 return (EINVAL);

@@ -4170,19 +4450,19 @@
 /*
  * Check the validity of lock data.  This can be used by the NFS
  * frlock routines to check data before contacting the server.  The
  * server must support semantics that aren't as restrictive as
  * the UNIX API, so the NFS client is required to check.
- * The maximum is now passed in by the caller.
+ * The maximum is passed in by the caller.
  */
 
 int
 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
 {
         /*
          * The end (length) for local locking should never be greater
-         * than MAXEND. However, the representation for
+         * than max. However, the representation for
          * the entire file is MAX_U_OFFSET_T.
          */
         if ((start > max) ||
             ((end > max) && (end != MAX_U_OFFSET_T))) {
                 return (EINVAL);