NEX-3758 Support for remote stale lock detection
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>


  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  28 /*      All Rights Reserved */
  29 
  30 /*
  31  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  32  * Copyright 2015 Joyent, Inc.
  33  */
  34 
  35 #include <sys/flock_impl.h>
  36 #include <sys/vfs.h>
  37 #include <sys/t_lock.h>           /* for <sys/callb.h> */
  38 #include <sys/callb.h>
  39 #include <sys/clconf.h>
  40 #include <sys/cladm.h>
  41 #include <sys/nbmlock.h>
  42 #include <sys/cred.h>
  43 #include <sys/policy.h>
  44 
  45 /*
  46  * The following variables are for statistics purposes and they are
  47  * not protected by locks. They may not be accurate but will at least be
  48  * close to the actual values.
  49  */
  50 
  51 int     flk_lock_allocs;
  52 int     flk_lock_frees;
  53 int     edge_allocs;
  54 int     edge_frees;
  55 int     flk_proc_vertex_allocs;
  56 int     flk_proc_edge_allocs;
  57 int     flk_proc_vertex_frees;
  58 int     flk_proc_edge_frees;
  59 
  60 static kmutex_t flock_lock;
  61 
  62 #ifdef DEBUG
  63 int check_debug = 0;


 142  * running, and so whether to allow lock manager requests or not.
 143  *
 144  * Thus, on a per-zone basis we maintain a ``global'' variable
 145  * (flk_lockmgr_status), protected by flock_lock, and set when the lock
 146  * manager is determined to be changing state (starting or stopping).
 147  *
 148  * Each graph/zone pair also has a copy of this variable, which is protected by
 149  * the graph's mutex.
 150  *
 151  * The per-graph copies are used to synchronize lock requests with shutdown
 152  * requests.  The global copy is used to initialize the per-graph field when a
 153  * new graph is created.
 154  */
 155 struct flock_globals {
 156         flk_lockmgr_status_t flk_lockmgr_status;
 157         flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
 158 };
 159 
 160 zone_key_t flock_zone_key;
 161 
 162 static void create_flock(lock_descriptor_t *, flock64_t *);
 163 static lock_descriptor_t        *flk_get_lock(void);
 164 static void     flk_free_lock(lock_descriptor_t *lock);
 165 static void     flk_get_first_blocking_lock(lock_descriptor_t *request);
 166 static int flk_process_request(lock_descriptor_t *);
 167 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
 168 static edge_t *flk_get_edge(void);
 169 static int flk_wait_execute_request(lock_descriptor_t *);
 170 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
 171 static void flk_insert_active_lock(lock_descriptor_t *);
 172 static void flk_delete_active_lock(lock_descriptor_t *, int);
 173 static void flk_insert_sleeping_lock(lock_descriptor_t *);
 174 static void flk_graph_uncolor(graph_t *);
 175 static void flk_wakeup(lock_descriptor_t *, int);
 176 static void flk_free_edge(edge_t *);
 177 static void flk_recompute_dependencies(lock_descriptor_t *,
 178                         lock_descriptor_t **,  int, int);
 179 static int flk_find_barriers(lock_descriptor_t *);
 180 static void flk_update_barriers(lock_descriptor_t *);
 181 static int flk_color_reachables(lock_descriptor_t *);
 182 static int flk_canceled(lock_descriptor_t *);
 183 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
 184 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
 185 static void wait_for_lock(lock_descriptor_t *);


 539         lock_descriptor_t       *lock_request;
 540         int error = 0;
 541         graph_t *gp;
 542         int                     nlmid;
 543 
 544         /*
 545          * Check access permissions
 546          */
 547         if ((cmd & SETFLCK) &&
 548             ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
 549             (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
 550                 return (EBADF);
 551 
 552         /*
 553          * for query and unlock we use the stack_lock_request
 554          */
 555 
 556         if ((lckdat->l_type == F_UNLCK) ||
 557             !((cmd & INOFLCK) || (cmd & SETFLCK))) {
 558                 lock_request = &stack_lock_request;
 559                 (void) bzero((caddr_t)lock_request,
 560                     sizeof (lock_descriptor_t));
 561 
 562                 /*
 563                  * The following is added so that the assertions in
 564                  * flk_execute_request() pass.
 565                  */
 566 
 567                 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
 568                 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
 569                 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
 570                 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
 571                 lock_request->l_status = FLK_INITIAL_STATE;
 572         } else {
 573                 lock_request = flk_get_lock();
 574         }
 575         lock_request->l_state = 0;
 576         lock_request->l_vnode = vp;
 577         lock_request->l_zoneid = getzoneid();
 578 
 579         /*
 580          * Convert the request range into the canonical start and end


 933                  * in the cluster.  This number will be the size of the nlm
 934                  * registry status array.  We add 1 because we will be using
 935                  * all entries indexed from 0 to maxnodeid; e.g., from 0
 936                  * to 64, for a total of 65 entries.
 937                  */
 938                 nlm_status_size = clconf_maximum_nodeid() + 1;
 939         } else {
 940                 nlm_status_size = 0;
 941         }
 942 
 943         if (nlm_status_size != 0) {     /* booted as a cluster */
 944                 nlm_reg_status = (flk_nlm_status_t *)
 945                     kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
 946                     KM_SLEEP);
 947 
 948                 /* initialize all NLM states in array to NLM_UNKNOWN */
 949                 for (i = 0; i < nlm_status_size; i++) {
 950                         nlm_reg_status[i] = FLK_NLM_UNKNOWN;
 951                 }
 952         }
 953 }
 954 
 955 /*
 956  * Zone constructor/destructor callbacks to be executed when a zone is
 957  * created/destroyed.
 958  */
 959 /* ARGSUSED */
 960 void *
 961 flk_zone_init(zoneid_t zoneid)
 962 {
 963         struct flock_globals *fg;
 964         uint_t i;
 965 
 966         fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
 967         fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
 968         for (i = 0; i < HASH_SIZE; i++)
 969                 fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
 970         return (fg);
 971 }
 972 


 994         l->l_edge.edge_in_next = &l->l_edge;
 995         l->l_edge.edge_in_prev = &l->l_edge;
 996         l->l_edge.edge_adj_next = &l->l_edge;
 997         l->l_edge.edge_adj_prev = &l->l_edge;
 998         l->pvertex = -1;
 999         l->l_status = FLK_INITIAL_STATE;
1000         flk_lock_allocs++;
1001         return (l);
1002 }
1003 
1004 /*
1005  * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag
1006  * when some thread has a reference to it as in reclock().
1007  */
1008 
1009 void
1010 flk_free_lock(lock_descriptor_t *lock)
1011 {
1012         file_t *fp;
1013 
1014         ASSERT(IS_DEAD(lock));
1015 
1016         if ((fp = lock->l_ofd) != NULL && fp->f_filock == (struct filock *)lock)
1017                 fp->f_filock = NULL;
1018 
1019         if (IS_REFERENCED(lock)) {
1020                 lock->l_state |= DELETED_LOCK;
1021                 return;
1022         }
1023         flk_lock_frees++;
1024         kmem_free((void *)lock, sizeof (lock_descriptor_t));
1025 }
1026 
1027 void
1028 flk_set_state(lock_descriptor_t *lock, int new_state)
1029 {
1030         /*
1031          * Locks in the sleeping list may be woken up in a number of ways,
1032          * and more than once.  If a sleeping lock is signaled awake more
1033          * than once, then it may or may not change state depending on its
1034          * current state.
1035          * Also note that NLM locks that are sleeping could be moved to an
1036          * interrupted state more than once if the unlock request is
1037          * retransmitted by the NLM client - the second time around, this is
1038          * just a nop.
1039          * The ordering of being signaled awake is:
1040          * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
1041          * The checks below implement this ordering.
1042          */
1043         if (IS_INTERRUPTED(lock)) {
1044                 if ((new_state == FLK_CANCELLED_STATE) ||
1045                     (new_state == FLK_GRANTED_STATE) ||
1046                     (new_state == FLK_INTERRUPTED_STATE)) {
1047                         return;
1048                 }
1049         }
1050         if (IS_CANCELLED(lock)) {
1051                 if ((new_state == FLK_GRANTED_STATE) ||
1052                     (new_state == FLK_CANCELLED_STATE)) {
1053                         return;
1054                 }
1055         }
1056         CHECK_LOCK_TRANSITION(lock->l_status, new_state);
1057         if (IS_PXFS(lock)) {
1058                 cl_flk_state_transition_notify(lock, lock->l_status, new_state);
1059         }
1060         lock->l_status = new_state;
1061 }
1062 
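For illustration of the ordering above: if an NLM client retransmits a
cancel after a sleeping lock has already been interrupted, the lock is in
FLK_INTERRUPTED_STATE when flk_set_state(lock, FLK_CANCELLED_STATE) is
called, so the first check returns early and l_status is left unchanged.
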
1063 /*
1064  * Routine that checks whether there are any blocking locks in the system.
1065  *
 1066  * The policy is that if a write lock is sleeping, we don't allow read
 1067  * locks before this write lock, even though there may not be any active
 1068  * locks corresponding to the read locks' region.
 1069  *
 1070  * The flk_add_edge() function adds an edge between l1 and l2 iff there
 1071  * is no path between l1 and l2. This is done to have a "minimum
 1072  * storage representation" of the dependency graph.
 1073  *
 1074  * Another property of the graph: since only the new request throws
 1075  * edges to the existing locks in the graph, the graph is always
 1076  * topologically ordered.
1077  */
1078 
1079 static int
1080 flk_process_request(lock_descriptor_t *request)
1081 {
1082         graph_t *gp = request->l_graph;
1083         lock_descriptor_t *lock;
1084         int request_blocked_by_active = 0;
1085         int request_blocked_by_granted = 0;
1086         int request_blocked_by_sleeping = 0;
1087         vnode_t *vp = request->l_vnode;
1088         int     error = 0;
1089         int request_will_wait = 0;
1090         int found_covering_lock = 0;
1091         lock_descriptor_t *covered_by = NULL;
1092 
1093         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1094         request_will_wait = IS_WILLING_TO_SLEEP(request);
1095 
1096         /*
1097          * check active locks
1098          */
1099 
1100         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1101 
1102 
1103         if (lock) {
1104                 do {
1105                         if (BLOCKS(lock, request)) {
1106                                 if (!request_will_wait)
1107                                         return (EAGAIN);
1108                                 request_blocked_by_active = 1;
1109                                 break;
1110                         }
1111                         /*
 1112                          * Grant the lock if the same owner already holds
 1113                          * an active lock that covers the request.
1114                          */
1115 
1116                         if (SAME_OWNER(lock, request) &&
1117                             COVERS(lock, request) &&
1118                             (request->l_type == F_RDLCK))
1119                                 return (flk_execute_request(request));
1120                         lock = lock->l_next;
1121                 } while (lock->l_vnode == vp);
1122         }
1123 
1124         if (!request_blocked_by_active) {
 1125                 lock_descriptor_t *lk[1];
 1126                 lock_descriptor_t *first_glock = NULL;
1127                 /*
1128                  * Shall we grant this?! NO!!
 1129                  * What about those locks that were just granted and are
 1130                  * still in the sleep queue?  Those threads have been woken
 1131                  * up, so the locks are almost active.
1132                  */
1133                 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1134                 if (lock) {
1135                         do {
1136                                 if (BLOCKS(lock, request)) {
1137                                         if (IS_GRANTED(lock)) {
1138                                                 request_blocked_by_granted = 1;
1139                                         } else {
1140                                                 request_blocked_by_sleeping = 1;
1141                                         }
1142                                 }
1143 
1144                                 lock = lock->l_next;
1145                         } while ((lock->l_vnode == vp));
1146                         first_glock = lock->l_prev;


1163                         /*
1164                          * If we have a sleeping writer in the requested
1165                          * lock's range, block.
1166                          */
1167                         goto block;
1168                 }
1169 
1170                 lk[0] = request;
1171                 request->l_state |= RECOMPUTE_LOCK;
1172                 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1173                 if (lock) {
1174                         do {
1175                                 flk_recompute_dependencies(lock, lk, 1, 0);
1176                                 lock = lock->l_next;
1177                         } while (lock->l_vnode == vp);
1178                 }
1179                 lock = first_glock;
1180                 if (lock) {
1181                         do {
1182                                 if (IS_GRANTED(lock)) {
1183                                 flk_recompute_dependencies(lock, lk, 1, 0);
1184                                 }
1185                                 lock = lock->l_prev;
1186                         } while ((lock->l_vnode == vp));
1187                 }
1188                 request->l_state &= ~RECOMPUTE_LOCK;
1189                 if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
1190                         return (EDEADLK);
1191                 return (flk_execute_request(request));
1192         }
1193 
1194 block:
1195         if (request_will_wait)
1196                 flk_graph_uncolor(gp);
1197 
1198         /* check sleeping locks */
1199 
1200         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1201 
1202         /*
1203          * If we find a sleeping write lock that is a superset of the


1225                                             !SAME_OWNER(lock, covered_by)) {
1226                                                 found_covering_lock++;
1227                                                 break;
1228                                         }
1229                                         found_covering_lock = 1;
1230                                         covered_by = lock;
1231                                 }
1232                                 if (found_covering_lock &&
1233                                     !SAME_OWNER(lock, covered_by)) {
1234                                         lock = lock->l_next;
1235                                         continue;
1236                                 }
1237                                 if ((error = flk_add_edge(request, lock,
1238                                     !found_covering_lock, 0)))
1239                                         return (error);
1240                         }
1241                         lock = lock->l_next;
1242                 } while (lock->l_vnode == vp);
1243         }
1244 
1245 /*
 1246  * found_covering_lock == 2 iff at this point 'request' has paths
 1247  * to all locks that block 'request'. found_covering_lock == 1 iff at this
 1248  * point 'request' has paths to all locks that block 'request' whose owners
 1249  * are not the same as the one that covers 'request' (covered_by above), and
 1250  * locks whose owner is the same as covered_by may remain in the active list.
1251  */
1252 
1253         if (request_blocked_by_active && found_covering_lock != 2) {
1254                 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1255                 ASSERT(lock != NULL);
1256                 do {
1257                         if (BLOCKS(lock, request)) {
1258                                 if (found_covering_lock &&
1259                                     !SAME_OWNER(lock, covered_by)) {
1260                                         lock = lock->l_next;
1261                                         continue;
1262                                 }
1263                                 if ((error = flk_add_edge(request, lock,
1264                                     CHECK_CYCLE, 0)))
1265                                         return (error);
1266                         }
1267                         lock = lock->l_next;
1268                 } while (lock->l_vnode == vp);
1269         }
1270 


1304         vnode_t *vp = request->l_vnode;
1305         lock_descriptor_t       *lock, *lock1;
1306         int done_searching = 0;
1307 
1308         CHECK_SLEEPING_LOCKS(gp);
1309         CHECK_ACTIVE_LOCKS(gp);
1310 
1311         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1312 
1313         flk_set_state(request, FLK_START_STATE);
1314 
1315         ASSERT(NOT_BLOCKED(request));
1316 
1317         /* IO_LOCK requests are only to check status */
1318 
1319         if (IS_IO_LOCK(request))
1320                 return (0);
1321 
1322         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1323 
1324         if (lock == NULL && request->l_type == F_UNLCK)
1325                 return (0);
1326         if (lock == NULL) {
1327                 flk_insert_active_lock(request);
1328                 return (0);
1329         }
1330 
1331         do {
1332                 lock1 = lock->l_next;
1333                 if (SAME_OWNER(request, lock)) {
1334                         done_searching = flk_relation(lock, request);
1335                 }
1336                 lock = lock1;
1337         } while (lock->l_vnode == vp && !done_searching);
1338 
1339         /*
1340          * insert in active queue
1341          */
1342 
1343         if (request->l_type != F_UNLCK)
1344                 flk_insert_active_lock(request);
1345 
1346         return (0);
1347 }
1348 
1349 /*
 1350  * 'request' is blocked by someone, so we put it into the sleep queue.
1351  */
1352 static int
1353 flk_wait_execute_request(lock_descriptor_t *request)
1354 {
1355         graph_t *gp = request->l_graph;
1356         callb_cpr_t     *cprp;          /* CPR info from callback */
1357         struct flock_globals *fg;


1582 
1583         ASSERT(flk_edge_cache != NULL);
1584 
1585         ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1586         edge_allocs++;
1587         return (ep);
1588 }
1589 
1590 /*
1591  * Free the edge structure.
1592  */
1593 
1594 static void
1595 flk_free_edge(edge_t *ep)
1596 {
1597         edge_frees++;
1598         kmem_cache_free(flk_edge_cache, (void *)ep);
1599 }
1600 
1601 /*
 1602  * Check the relationship of 'request' with 'lock', recompute the
 1603  * dependencies, break the lock if required, and return
 1604  * 1 if 'request' cannot have any further relationship with the next
 1605  * active locks.
 1606  * 'lock' and 'request' are compared, and in case of overlap we
 1607  * delete 'lock' and form new locks to represent the non-overlapping
 1608  * portion of the original 'lock'. This function has side effects:
 1609  * 'lock' will be freed, and new locks will be added to the active list.
1610  */
1611 
1612 static int
1613 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1614 {
1615         int lock_effect;
1616         lock_descriptor_t *lock1, *lock2;
1617         lock_descriptor_t *topology[3];
1618         int nvertex = 0;
1619         int i;
1620         edge_t  *ep;
1621         graph_t *gp = (lock->l_graph);
1622 
1623 
1624         CHECK_SLEEPING_LOCKS(gp);
1625         CHECK_ACTIVE_LOCKS(gp);
1626 
1627         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1628 
1629         topology[0] = topology[1] = topology[2] = NULL;
1630 
1631         if (request->l_type == F_UNLCK)
1632                 lock_effect = FLK_UNLOCK;
1633         else if (request->l_type == F_RDLCK &&
1634             lock->l_type == F_WRLCK)
1635                 lock_effect = FLK_DOWNGRADE;
1636         else if (request->l_type == F_WRLCK &&
1637             lock->l_type == F_RDLCK)
1638                 lock_effect = FLK_UPGRADE;
1639         else
1640                 lock_effect = FLK_STAY_SAME;
1641 
1642         if (lock->l_end < request->l_start) {
1643                 if (lock->l_end == request->l_start - 1 &&
1644                     lock_effect == FLK_STAY_SAME) {
1645                         topology[0] = request;
1646                         request->l_start = lock->l_start;
1647                         nvertex = 1;
1648                         goto recompute;
1649                 } else {
1650                         return (0);
1651                 }
1652         }
1653 
1654         if (lock->l_start > request->l_end) {
1655                 if (request->l_end == lock->l_start - 1 &&
1656                     lock_effect == FLK_STAY_SAME) {
1657                         topology[0] = request;
1658                         request->l_end = lock->l_end;
1659                         nvertex = 1;
1660                         goto recompute;
1661                 } else {
1662                         return (1);
1663                 }
1664         }
1665 
1666         if (request->l_end < lock->l_end) {
1667                 if (request->l_start > lock->l_start) {
1668                         if (lock_effect == FLK_STAY_SAME) {
1669                                 request->l_start = lock->l_start;
1670                                 request->l_end = lock->l_end;
1671                                 topology[0] = request;
1672                                 nvertex = 1;
1673                         } else {
1674                                 lock1 = flk_get_lock();
1675                                 lock2 = flk_get_lock();
1676                                 COPY(lock1, lock);
1677                                 COPY(lock2, lock);
1678                                 lock1->l_start = lock->l_start;
1679                                 lock1->l_end = request->l_start - 1;
1680                                 lock2->l_start = request->l_end + 1;
1681                                 lock2->l_end = lock->l_end;
1682                                 topology[0] = lock1;
1683                                 topology[1] = lock2;
1684                                 topology[2] = request;
1685                                 nvertex = 3;
1686                         }
1687                 } else if (request->l_start < lock->l_start) {
1688                         if (lock_effect == FLK_STAY_SAME) {
1689                                 request->l_end = lock->l_end;
1690                                 topology[0] = request;
1691                                 nvertex = 1;
1692                         } else {
1693                                 lock1 = flk_get_lock();
1694                                 COPY(lock1, lock);
1695                                 lock1->l_start = request->l_end + 1;
1696                                 topology[0] = lock1;
1697                                 topology[1] = request;
1698                                 nvertex = 2;
1699                         }
1700                 } else  {
1701                         if (lock_effect == FLK_STAY_SAME) {
1702                                 request->l_start = lock->l_start;
1703                                 request->l_end = lock->l_end;
1704                                 topology[0] = request;
1705                                 nvertex = 1;
1706                         } else {
1707                                 lock1 = flk_get_lock();
1708                                 COPY(lock1, lock);
1709                                 lock1->l_start = request->l_end + 1;
1710                                 topology[0] = lock1;
1711                                 topology[1] = request;
1712                                 nvertex = 2;
1713                         }
1714                 }
1715         } else if (request->l_end > lock->l_end) {
1716                 if (request->l_start > lock->l_start)  {
1717                         if (lock_effect == FLK_STAY_SAME) {
1718                                 request->l_start = lock->l_start;
1719                                 topology[0] = request;
1720                                 nvertex = 1;
1721                         } else {
1722                                 lock1 = flk_get_lock();
1723                                 COPY(lock1, lock);
1724                                 lock1->l_end = request->l_start - 1;
1725                                 topology[0] = lock1;
1726                                 topology[1] = request;
1727                                 nvertex = 2;
1728                         }
1729                 } else if (request->l_start < lock->l_start)  {
1730                         topology[0] = request;
1731                         nvertex = 1;
1732                 } else {
1733                         topology[0] = request;
1734                         nvertex = 1;
1735                 }
1736         } else {
1737                 if (request->l_start > lock->l_start) {
1738                         if (lock_effect == FLK_STAY_SAME) {
1739                                 request->l_start = lock->l_start;
1740                                 topology[0] = request;
1741                                 nvertex = 1;
1742                         } else {
1743                                 lock1 = flk_get_lock();
1744                                 COPY(lock1, lock);
1745                                 lock1->l_end = request->l_start - 1;
1746                                 topology[0] = lock1;
1747                                 topology[1] = request;
1748                                 nvertex = 2;
1749                         }
1750                 } else if (request->l_start < lock->l_start) {
1751                         topology[0] = request;
1752                         nvertex = 1;
1753                 } else {
1754                         if (lock_effect !=  FLK_UNLOCK) {
1755                                 topology[0] = request;
1756                                 nvertex = 1;
1757                         } else {
1758                                 flk_delete_active_lock(lock, 0);
1759                                 flk_wakeup(lock, 1);
1760                                 flk_free_lock(lock);
1761                                 CHECK_SLEEPING_LOCKS(gp);
1762                                 CHECK_ACTIVE_LOCKS(gp);
1763                                 return (1);
1764                         }
1765                 }
1766         }
1767 
1768 recompute:
1769 
1770         /*
 1771          * For unlock we don't send the 'request' for recomputing
 1772          * dependencies, because no lock will add an edge to it.
1773          */
1774 
1775         if (lock_effect == FLK_UNLOCK) {
1776                 topology[nvertex-1] = NULL;
1777                 nvertex--;
1778         }
1779         for (i = 0; i < nvertex; i++) {
1780                 topology[i]->l_state |= RECOMPUTE_LOCK;
1781                 topology[i]->l_color = NO_COLOR;
1782         }
1783 
1784         ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1785 
1786         /*
 1787          * We remove each in-edge of this vertex, 'lock', from the
 1788          * adjacency list of the vertex it comes from.
1789          */
1790 
1791         ep = FIRST_IN(lock);
1792         while (ep != HEAD(lock)) {
1793                 ADJ_LIST_REMOVE(ep);
1794                 ep = NEXT_IN(ep);
1795         }
1796 
1797         flk_delete_active_lock(lock, 0);
1798 
 1799         /* We are now ready to recompute the dependencies */
1800 
1801         flk_recompute_dependencies(lock, topology, nvertex, 1);
1802 
1803         for (i = 0; i < nvertex; i++) {
1804                 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1805                 topology[i]->l_color = NO_COLOR;
1806         }
1807 
1808 
1809         if (lock_effect == FLK_UNLOCK) {
1810                 nvertex++;
1811         }
1812         for (i = 0; i < nvertex - 1; i++) {
1813                 flk_insert_active_lock(topology[i]);
1814         }
1815 
1816 
1817         if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1818                 flk_wakeup(lock, 0);
1819         } else {
1820                 ep = FIRST_IN(lock);
1821                 while (ep != HEAD(lock)) {
1822                         lock->l_sedge = NEXT_IN(ep);
1823                         IN_LIST_REMOVE(ep);
1824                         flk_update_proc_graph(ep, 1);
1825                         flk_free_edge(ep);
1826                         ep = lock->l_sedge;
1827                 }
1828         }
1829         flk_free_lock(lock);
1830 
1831         CHECK_SLEEPING_LOCKS(gp);
1832         CHECK_ACTIVE_LOCKS(gp);
1833         return (0);
1834 }
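
A worked example of the overlap handling above: suppose an active write
lock covers bytes [0, 99] and the same owner unlocks [40, 59].  This falls
into the branch where the request's range lies strictly inside the lock's
range with lock_effect == FLK_UNLOCK, so two fragments are built:
lock1 = [0, 39] and lock2 = [60, 99], giving topology[] = { lock1, lock2,
request } and nvertex == 3.  At the recompute label the unlock request
itself is dropped from topology[], dependencies are recomputed for the
fragments, and the two fragments are inserted into the active list while
the original 'lock' is freed.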
1835 
1836 /*


1875 /*
 1876  * Delete the active lock: performs one of two functions depending on
 1877  * the value of the second parameter. One is to remove the lock from the
 1878  * active lists only; the other is to both remove and free the lock.
1879  */
1880 
1881 static void
1882 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1883 {
1884         vnode_t *vp = lock->l_vnode;
1885         graph_t *gp = lock->l_graph;
1886 
1887         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1888         if (free_lock)
1889                 ASSERT(NO_DEPENDENTS(lock));
1890         ASSERT(NOT_BLOCKED(lock));
1891         ASSERT(IS_ACTIVE(lock));
1892 
1893         ASSERT((vp->v_filocks != NULL));
1894 
1895         if (vp->v_filocks == (struct filock *)lock) {
1896                 vp->v_filocks = (struct filock *)
1897                     ((lock->l_next->l_vnode == vp) ? lock->l_next :
1898                     NULL);
1899         }
1900         lock->l_next->l_prev = lock->l_prev;
1901         lock->l_prev->l_next = lock->l_next;
1902         lock->l_next = lock->l_prev = NULL;
1903         flk_set_state(lock, FLK_DEAD_STATE);
1904         lock->l_state &= ~ACTIVE_LOCK;
1905 
1906         if (free_lock)
1907                 flk_free_lock(lock);
1908         CHECK_ACTIVE_LOCKS(gp);
1909         CHECK_SLEEPING_LOCKS(gp);
1910 }
1911 
1912 /*
1913  * Insert into the sleep queue.
1914  */


2043                 request->l_sedge = NEXT_ADJ(ep);
2044                 ADJ_LIST_REMOVE(ep);
2045                 flk_update_proc_graph(ep, 1);
2046                 flk_free_edge(ep);
2047                 ep = request->l_sedge;
2048         }
2049 
2050 
2051         /*
2052          * unset the RECOMPUTE flag in those vertices
2053          */
2054 
2055         for (i = 0; i < nvertex; i++) {
2056                 topology[i]->l_state &= ~RECOMPUTE_LOCK;
2057         }
2058 
2059         /*
2060          * free the topology
2061          */
2062         if (nvertex)
2063                 kmem_free((void *)topology,
2064                     (nvertex * sizeof (lock_descriptor_t *)));
2065         /*
 2066          * Some locks may be unblocked now.
2067          */
2068 
2069         flk_wakeup(request, 0);
2070 
2071         /*
 2072          * We expect to have a correctly recomputed graph now.
2073          */
2074         flk_set_state(request, FLK_DEAD_STATE);
2075         flk_free_lock(request);
2076         CHECK_SLEEPING_LOCKS(gp);
2077         CHECK_ACTIVE_LOCKS(gp);
2078 
2079 }
2080 
2081 /*
 2082  * Uncoloring the graph is simply to increment the mark value of the graph,
 2083  * and only when wraparound takes place will we color all vertices in


2979                 flk_wakeup(lock, 1);
2980                 flk_free_lock(lock);
2981         }
2982 
2983         CHECK_SLEEPING_LOCKS(gp);
2984         CHECK_ACTIVE_LOCKS(gp);
2985         CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2986         mutex_exit(&gp->gp_mutex);
2987 }
2988 
2989 
2990 /*
2991  * Called from 'fs' read and write routines for files that have mandatory
2992  * locking enabled.
2993  */
2994 
2995 int
2996 chklock(struct vnode *vp, int iomode, u_offset_t offset, ssize_t len, int fmode,
2997     caller_context_t *ct)
2998 {
2999         register int    i;
3000         struct flock64  bf;
3001         int             error = 0;
3002 
3003         bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
3004         bf.l_whence = 0;
3005         bf.l_start = offset;
3006         bf.l_len = len;
3007         if (ct == NULL) {
3008                 bf.l_pid = curproc->p_pid;
3009                 bf.l_sysid = 0;
3010         } else {
3011                 bf.l_pid = ct->cc_pid;
3012                 bf.l_sysid = ct->cc_sysid;
3013         }
3014         i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
3015         if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
3016             bf.l_type != F_UNLCK)
3017                 error = i ? i : EAGAIN;
3018         return (error);
3019 }
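
For context, a minimal sketch (not part of this change) of how a file
system read path typically calls chklock(); the "myfs" names are invented,
while MANDLOCK() is the standard mandatory-locking test from
<sys/vnode.h>:

static int
myfs_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
    caller_context_t *ct)
{
        mode_t mode = 0;        /* would be the file's mode, e.g. ip->i_mode */
        int error = 0;

        if (MANDLOCK(vp, mode)) {
                /*
                 * Mandatory locking is in force: fail the I/O if a
                 * conflicting record lock covers the requested range.
                 */
                error = chklock(vp, FREAD, uiop->uio_loffset,
                    uiop->uio_resid, uiop->uio_fmode, ct);
                if (error != 0)
                        return (error);
        }

        /* ... proceed with the actual read ... */
        return (error);
}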


3310  */
3311 
3312 static proc_edge_t *
3313 flk_get_proc_edge()
3314 {
3315         proc_edge_t *pep;
3316 
3317         pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
3318         flk_proc_edge_allocs++;
3319         return (pep);
3320 }
3321 
3322 /*
3323  * Free the proc edge. Called whenever its reference count goes to zero.
3324  */
3325 
3326 static void
3327 flk_free_proc_edge(proc_edge_t *pep)
3328 {
3329         ASSERT(pep->refcount == 0);
3330         kmem_free((void *)pep, sizeof (proc_edge_t));
3331         flk_proc_edge_frees++;
3332 }
3333 
3334 /*
 3335  * Color the graph explicitly; done only when the mark value hits the max.
3336  */
3337 
3338 static void
3339 flk_proc_graph_uncolor()
3340 {
3341         int i;
3342 
3343         if (pgraph.mark == UINT_MAX) {
3344                 for (i = 0; i < pgraph.gcount; i++)
3345                         if (pgraph.proc[i] != NULL) {
3346                                 pgraph.proc[i]->atime = 0;
3347                                 pgraph.proc[i]->dtime = 0;
3348                         }
3349                 pgraph.mark = 1;
3350         } else {


4052                         continue;
4053                 }
4054 
4055                 mutex_enter(&gp->gp_mutex);
4056                 fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
4057                 for (lock = ACTIVE_HEAD(gp)->l_next;
4058                     lock != ACTIVE_HEAD(gp);
4059                     lock = nlock) {
4060                         nlock = lock->l_next;
4061                         if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
4062                                 ASSERT(IS_ACTIVE(lock));
4063                                 flk_delete_active_lock(lock, 0);
4064                                 flk_wakeup(lock, 1);
4065                                 flk_free_lock(lock);
4066                         }
4067                 }
4068                 mutex_exit(&gp->gp_mutex);
4069         }
4070 }
4071 
4072 
4073 /*
4074  * Wait until a lock is granted, cancelled, or interrupted.
4075  */
4076 
4077 static void
4078 wait_for_lock(lock_descriptor_t *request)
4079 {
4080         graph_t *gp = request->l_graph;
4081 
4082         ASSERT(MUTEX_HELD(&gp->gp_mutex));
4083 
4084         while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
4085             !(IS_INTERRUPTED(request))) {
4086                 if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
4087                         flk_set_state(request, FLK_INTERRUPTED_STATE);
4088                         request->l_state |= INTERRUPTED_LOCK;
4089                 }
4090         }
4091 }
4092 
4093 /*
4094  * Create an flock structure from the existing lock information
4095  *
4096  * This routine is used to create flock structures for the lock manager
 4097  * to use in a reclaim request.  Since the lock originated on this
 4098  * host, it must conform to UNIX semantics, so no checking is
4099  * done to make sure it falls within the lower half of the 32-bit range.
4100  */
4101 
4102 static void
4103 create_flock(lock_descriptor_t *lp, flock64_t *flp)
4104 {
4105         ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
4106         ASSERT(lp->l_end >= lp->l_start);
4107 
4108         flp->l_type = lp->l_type;
4109         flp->l_whence = 0;
4110         flp->l_start = lp->l_start;


4122 
4123 int
4124 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
4125     u_offset_t *start, u_offset_t *end, offset_t offset)
4126 {
4127         struct vattr    vattr;
4128         int     error;
4129 
4130         /*
4131          * Determine the starting point of the request
4132          */
4133         switch (flp->l_whence) {
4134         case 0:         /* SEEK_SET */
4135                 *start = (u_offset_t)flp->l_start;
4136                 break;
4137         case 1:         /* SEEK_CUR */
4138                 *start = (u_offset_t)(flp->l_start + offset);
4139                 break;
4140         case 2:         /* SEEK_END */
4141                 vattr.va_mask = AT_SIZE;
4142                 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
4143                         return (error);
4144                 *start = (u_offset_t)(flp->l_start + vattr.va_size);
4145                 break;
4146         default:
4147                 return (EINVAL);
4148         }
4149 
4150         /*
4151          * Determine the range covered by the request.
4152          */
4153         if (flp->l_len == 0)
4154                 *end = MAX_U_OFFSET_T;
4155         else if ((offset_t)flp->l_len > 0) {
4156                 *end = (u_offset_t)(*start + (flp->l_len - 1));
4157         } else {
4158                 /*
 4159                  * Negative length; why do we even allow this?
4160                  * Because this allows easy specification of
4161                  * the last n bytes of the file.
4162                  */
4163                 *end = *start;
4164                 *start += (u_offset_t)flp->l_len;
4165                 (*start)++;
4166         }
4167         return (0);
4168 }
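
To make the negative-length case above concrete: with l_whence == 0
(SEEK_SET), l_start == 100 and l_len == -10, the code computes
*end = 100 and then *start = 100 + (-10) + 1 = 91, yielding the 10-byte
range [91, 100], i.e. the last n bytes ending at the given offset.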
4169 
4170 /*
 4171  * Check the validity of lock data.  This can be used by the NFS
4172  * frlock routines to check data before contacting the server.  The
4173  * server must support semantics that aren't as restrictive as
4174  * the UNIX API, so the NFS client is required to check.
4175  * The maximum is now passed in by the caller.
4176  */
4177 
4178 int
4179 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
4180 {
4181         /*
4182          * The end (length) for local locking should never be greater
4183          * than MAXEND. However, the representation for
4184          * the entire file is MAX_U_OFFSET_T.
4185          */
4186         if ((start > max) ||
4187             ((end > max) && (end != MAX_U_OFFSET_T))) {
4188                 return (EINVAL);
4189         }
4190         if (start > end) {
4191                 return (EINVAL);
4192         }
4193         return (0);
4194 }
4195 
4196 /*
4197  * Fill in request->l_flock with information about the lock blocking the
4198  * request.  The complexity here is that lock manager requests are allowed
4199  * to see into the upper part of the 32-bit address range, whereas local
4200  * requests are only allowed to see signed values.
4201  *
4202  * What should be done when "blocker" is a lock manager lock that uses the
4203  * upper portion of the 32-bit range, but "request" is local?  Since the




  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  28 /*      All Rights Reserved */
  29 
  30 /*
  31  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  32  * Copyright 2015 Joyent, Inc.
  33  */
  34 
  35 #include <sys/flock_impl.h>
  36 #include <sys/vfs.h>
  37 #include <sys/t_lock.h>           /* for <sys/callb.h> */
  38 #include <sys/callb.h>
  39 #include <sys/clconf.h>
  40 #include <sys/cladm.h>
  41 #include <sys/nbmlock.h>
  42 #include <sys/cred.h>
  43 #include <sys/policy.h>
  44 #include <sys/list.h>
  45 #include <sys/sysmacros.h>
  46 #include <sys/socket.h>
  47 #include <inet/ip.h>
  48 
  49 /*
  50  * The following variables are for statistics purposes and they are
  51  * not protected by locks. They may not be accurate but will at least be
  52  * close to the actual values.
  53  */
  54 
  55 int     flk_lock_allocs;
  56 int     flk_lock_frees;
  57 int     edge_allocs;
  58 int     edge_frees;
  59 int     flk_proc_vertex_allocs;
  60 int     flk_proc_edge_allocs;
  61 int     flk_proc_vertex_frees;
  62 int     flk_proc_edge_frees;
  63 
  64 static kmutex_t flock_lock;
  65 
  66 #ifdef DEBUG
  67 int check_debug = 0;


 146  * running, and so whether to allow lock manager requests or not.
 147  *
 148  * Thus, on a per-zone basis we maintain a ``global'' variable
 149  * (flk_lockmgr_status), protected by flock_lock, and set when the lock
 150  * manager is determined to be changing state (starting or stopping).
 151  *
 152  * Each graph/zone pair also has a copy of this variable, which is protected by
 153  * the graph's mutex.
 154  *
 155  * The per-graph copies are used to synchronize lock requests with shutdown
 156  * requests.  The global copy is used to initialize the per-graph field when a
 157  * new graph is created.
 158  */
 159 struct flock_globals {
 160         flk_lockmgr_status_t flk_lockmgr_status;
 161         flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
 162 };
 163 
 164 zone_key_t flock_zone_key;
 165 
 166 /*
 167  * Support for remote stale lock detection
 168  *
 169  * The sysid_to_host_translator_lock reader/writer lock protects
 170  * sysid_to_host_translator_list.
 171  *
 172  * sysid_to_host_translator_list is a list of sysid-to-host-name translator
 173  * functions.  New translators are added using the public
 174  * flk_add_sysid_to_host_translator() call.
 175  *
 176  * stale_lock_timeout is in seconds and determines how long a remote lock
 177  * must have been blocking before it is reported as stale.  When set to 0,
 178  * remote stale lock checking is disabled.
 179  */
 180 struct sysid_to_host_translator_entry {
 181         sysid_to_host_translator_t translator;
 182         list_node_t node;
 183 };
 184 static krwlock_t sysid_to_host_translator_lock;
 185 static list_t sysid_to_host_translator_list;
 186 volatile int stale_lock_timeout = 3600;         /* one hour, in seconds */
 187 
 188 static void create_flock(lock_descriptor_t *, flock64_t *);
 189 static lock_descriptor_t        *flk_get_lock(void);
 190 static void     flk_free_lock(lock_descriptor_t *lock);
 191 static void     flk_get_first_blocking_lock(lock_descriptor_t *);
 192 static int flk_process_request(lock_descriptor_t *);
 193 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
 194 static edge_t *flk_get_edge(void);
 195 static int flk_wait_execute_request(lock_descriptor_t *);
 196 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
 197 static void flk_insert_active_lock(lock_descriptor_t *);
 198 static void flk_delete_active_lock(lock_descriptor_t *, int);
 199 static void flk_insert_sleeping_lock(lock_descriptor_t *);
 200 static void flk_graph_uncolor(graph_t *);
 201 static void flk_wakeup(lock_descriptor_t *, int);
 202 static void flk_free_edge(edge_t *);
 203 static void flk_recompute_dependencies(lock_descriptor_t *,
 204                         lock_descriptor_t **,  int, int);
 205 static int flk_find_barriers(lock_descriptor_t *);
 206 static void flk_update_barriers(lock_descriptor_t *);
 207 static int flk_color_reachables(lock_descriptor_t *);
 208 static int flk_canceled(lock_descriptor_t *);
 209 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
 210 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
 211 static void wait_for_lock(lock_descriptor_t *);


 565         lock_descriptor_t       *lock_request;
 566         int error = 0;
 567         graph_t *gp;
 568         int                     nlmid;
 569 
 570         /*
 571          * Check access permissions
 572          */
 573         if ((cmd & SETFLCK) &&
 574             ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
 575             (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
 576                 return (EBADF);
 577 
 578         /*
 579          * for query and unlock we use the stack_lock_request
 580          */
 581 
 582         if ((lckdat->l_type == F_UNLCK) ||
 583             !((cmd & INOFLCK) || (cmd & SETFLCK))) {
 584                 lock_request = &stack_lock_request;
 585                 bzero(lock_request, sizeof (lock_descriptor_t));
 586 
 587                 /*
 588                  * The following is added so that the assertions in
 589                  * flk_execute_request() pass.
 590                  */
 591 
 592                 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
 593                 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
 594                 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
 595                 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
 596                 lock_request->l_status = FLK_INITIAL_STATE;
 597         } else {
 598                 lock_request = flk_get_lock();
 599         }
 600         lock_request->l_state = 0;
 601         lock_request->l_vnode = vp;
 602         lock_request->l_zoneid = getzoneid();
 603 
 604         /*
 605          * Convert the request range into the canonical start and end


 958                  * in the cluster.  This number will be the size of the nlm
 959                  * registry status array.  We add 1 because we will be using
 960                  * all entries indexed from 0 to maxnodeid; e.g., from 0
 961                  * to 64, for a total of 65 entries.
 962                  */
 963                 nlm_status_size = clconf_maximum_nodeid() + 1;
 964         } else {
 965                 nlm_status_size = 0;
 966         }
 967 
 968         if (nlm_status_size != 0) {     /* booted as a cluster */
 969                 nlm_reg_status = (flk_nlm_status_t *)
 970                     kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
 971                     KM_SLEEP);
 972 
 973                 /* initialize all NLM states in array to NLM_UNKNOWN */
 974                 for (i = 0; i < nlm_status_size; i++) {
 975                         nlm_reg_status[i] = FLK_NLM_UNKNOWN;
 976                 }
 977         }
 978 
 979         mutex_init(&flock_lock, NULL, MUTEX_DEFAULT, NULL);
 980         mutex_init(&nlm_reg_lock, NULL, MUTEX_DEFAULT, NULL);
 981 
 982         rw_init(&sysid_to_host_translator_lock, NULL, RW_DEFAULT, NULL);
 983         list_create(&sysid_to_host_translator_list,
 984             sizeof (struct sysid_to_host_translator_entry),
 985             offsetof(struct sysid_to_host_translator_entry, node));
 986 }
 987 
 988 /*
 989  * Zone constructor/destructor callbacks to be executed when a zone is
 990  * created/destroyed.
 991  */
 992 /* ARGSUSED */
 993 void *
 994 flk_zone_init(zoneid_t zoneid)
 995 {
 996         struct flock_globals *fg;
 997         uint_t i;
 998 
 999         fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
1000         fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
1001         for (i = 0; i < HASH_SIZE; i++)
1002                 fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
1003         return (fg);
1004 }
1005 


1027         l->l_edge.edge_in_next = &l->l_edge;
1028         l->l_edge.edge_in_prev = &l->l_edge;
1029         l->l_edge.edge_adj_next = &l->l_edge;
1030         l->l_edge.edge_adj_prev = &l->l_edge;
1031         l->pvertex = -1;
1032         l->l_status = FLK_INITIAL_STATE;
1033         flk_lock_allocs++;
1034         return (l);
1035 }
1036 
1037 /*
1038  * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag
1039  * when some thread has a reference to it as in reclock().
1040  */
1041 
1042 void
1043 flk_free_lock(lock_descriptor_t *lock)
1044 {
1045         file_t *fp;
1046 
1047         ASSERT(lock->l_blocker >= 0);
1048         ASSERT(IS_DEAD(lock));
1049 
1050         if ((fp = lock->l_ofd) != NULL && fp->f_filock == (struct filock *)lock)
1051                 fp->f_filock = NULL;
1052 
1053         if (IS_REFERENCED(lock)) {
1054                 lock->l_state |= DELETED_LOCK;
1055                 return;
1056         }
1057         flk_lock_frees++;
1058         kmem_free(lock, sizeof (lock_descriptor_t));
1059 }
1060 
1061 void
1062 flk_set_state(lock_descriptor_t *lock, int new_state)
1063 {
1064         /*
1065          * Locks in the sleeping list may be woken up in a number of ways,
1066          * and more than once.  If a sleeping lock is signaled awake more
1067          * than once, then it may or may not change state depending on its
1068          * current state.
1069          * Also note that NLM locks that are sleeping could be moved to an
1070          * interrupted state more than once if the unlock request is
1071          * retransmitted by the NLM client - the second time around, this is
1072          * just a nop.
1073          * The ordering of being signaled awake is:
1074          * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
1075          * The checks below implement this ordering.
1076          */
1077         if (IS_INTERRUPTED(lock)) {
1078                 if ((new_state == FLK_CANCELLED_STATE) ||
1079                     (new_state == FLK_GRANTED_STATE) ||
1080                     (new_state == FLK_INTERRUPTED_STATE)) {
1081                         return;
1082                 }
1083         }
1084         if (IS_CANCELLED(lock)) {
1085                 if ((new_state == FLK_GRANTED_STATE) ||
1086                     (new_state == FLK_CANCELLED_STATE)) {
1087                         return;
1088                 }
1089         }
1090         CHECK_LOCK_TRANSITION(lock->l_status, new_state);
1091         if (IS_PXFS(lock)) {
1092                 cl_flk_state_transition_notify(lock, lock->l_status, new_state);
1093         }
1094         lock->l_status = new_state;
1095 }
1096 
1097 /*
1098  * Support for the remote stale lock detection
1099  */
1100 
1101 void
1102 flk_add_sysid_to_host_translator(sysid_to_host_translator_t tr)
1103 {
1104         struct sysid_to_host_translator_entry *te;
1105 
1106         te = kmem_alloc(sizeof (struct sysid_to_host_translator_entry),
1107             KM_SLEEP);
1108 
1109         te->translator = tr;
1110 
1111         rw_enter(&sysid_to_host_translator_lock, RW_WRITER);
1112         list_insert_head(&sysid_to_host_translator_list, te);
1113         rw_exit(&sysid_to_host_translator_lock);
1114 }
1115 
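As an illustration of this interface, a hypothetical translator is
sketched below.  The calling convention is inferred from
translate_sysid_to_host() just below: a translator returns non-zero on
success after filling in the address and the host-type string.  The
lookup helper mymod_sysid_to_addr() is invented for this example.

static int
mymod_translator(zoneid_t zoneid, sysid_t sysid, struct sockaddr *sa,
    const char **type)
{
        /* mymod_sysid_to_addr() is a hypothetical address lookup */
        if (mymod_sysid_to_addr(zoneid, sysid, sa) != 0)
                return (0);     /* unknown sysid; let other translators try */

        *type = "NLM";          /* the kind of remote host we track */
        return (1);             /* translation succeeded */
}

/* Registered once, e.g. from the module's _init() routine: */
flk_add_sysid_to_host_translator(mymod_translator);
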
1116 static void
1117 translate_sysid_to_host(zoneid_t zoneid, sysid_t sysid, char *host, size_t hlen,
1118     const char **type)
1119 {
1120         struct sockaddr sa;
1121         struct sysid_to_host_translator_entry *te;
1122 
 1123         /* Some defaults in case the translation fails */
1124         *type = "?";
1125         (void) strlcpy(host, "?", hlen);
1126 
1127         rw_enter(&sysid_to_host_translator_lock, RW_READER);
1128 
1129         for (te = list_head(&sysid_to_host_translator_list); te != NULL;
1130             te = list_next(&sysid_to_host_translator_list, te)) {
1131 
1132                 if (te->translator(zoneid, sysid, &sa, type) != 0) {
1133                         rw_exit(&sysid_to_host_translator_lock);
1134 
1135                         switch (sa.sa_family) {
1136                         case AF_INET:
1137                                 (void) inet_ntop(AF_INET,
1138                                     &((struct sockaddr_in *)&sa)->sin_addr,
1139                                     host, hlen);
1140                                 break;
1141                         case AF_INET6:
1142                                 (void) inet_ntop(AF_INET6,
1143                                     &((struct sockaddr_in6 *)&sa)->sin6_addr,
1144                                     host, hlen);
1145                                 break;
1146                         default:
1147                                 break;
1148                         }
1149 
1150                         return;
1151                 }
1152         }
1153 
1154         rw_exit(&sysid_to_host_translator_lock);
1155 }
1156 
1157 static char *
1158 get_vnode_path(vnode_t *vp)
1159 {
1160         size_t len;
1161         char *ret;
1162 
1163         mutex_enter(&vp->v_lock);
1164         if (vp->v_path == NULL) {
1165                 mutex_exit(&vp->v_lock);
1166                 return (NULL);
1167         }
1168         len = strlen(vp->v_path) + 1;
1169         mutex_exit(&vp->v_lock);
1170 
1171         ret = kmem_alloc(len, KM_SLEEP);
1172 
1173         mutex_enter(&vp->v_lock);
1174         if (vp->v_path == NULL || strlen(vp->v_path) + 1 != len) {
1175                 mutex_exit(&vp->v_lock);
1176                 kmem_free(ret, len);
1177                 return (NULL);
1178         }
1179         bcopy(vp->v_path, ret, len);
1180         mutex_exit(&vp->v_lock);
1181 
1182         return (ret);
1183 }
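
The double-checked copy above avoids holding v_lock across the sleeping
kmem_alloc(): v_path may be freed or replaced while the lock is dropped,
so the length is re-verified after reacquiring v_lock and the copy is
abandoned on a mismatch.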
1184 
1185 static void
1186 flk_stale_lock_check(lock_descriptor_t *lock)
1187 {
1188         char *path;
1189 
1190         char host[INET6_ADDRSTRLEN];            /* host name */
1191         const char *type;                       /* host type */
1192 
1193         /* temporary variables for the cmn_err() call */
1194         char *p, *t;            /* path, lock type */
1195         pid_t pid;              /* pid */
1196         void *v;                /* vnode */
1197         u_offset_t s, e;        /* start, end */
1198 
1199         ASSERT(MUTEX_HELD(&lock->l_graph->gp_mutex));
1200 
1201         /*
 1202          * Either this is not a remote lock, or stale lock checking is
 1203          * disabled, or the lock has already been reported.
1204          */
1205         if (IS_LOCAL(lock) || stale_lock_timeout == 0 || lock->l_blocker < 0)
1206                 return;
1207 
1208         /* Seen first time? */
1209         if (lock->l_blocker == 0) {
1210                 lock->l_blocker = gethrtime();
1211                 return;
1212         }
1213 
1214         /* Old enough? */
1215         if ((gethrtime() - lock->l_blocker) / NANOSEC < stale_lock_timeout)
1216                 return;
1217 
1218         translate_sysid_to_host(lock->l_zoneid, lock->l_flock.l_sysid, host,
1219             sizeof (host), &type);
1220         path = get_vnode_path(lock->l_vnode);
1221 
1222         pid = lock->l_flock.l_pid;
1223         v = (void *)lock->l_vnode;
1224         p = path == NULL ? "?" : path;
1225         t = lock->l_type == F_WRLCK ? "WR" : "RD";
1226         s = lock->l_start;
1227         e = lock->l_end;
1228 
1229         /* Report the blocker as stale */
1230         cmn_err(CE_NOTE, "!Stale lock (host: %s (%s), pid: %d, vnode: %p, "
1231             "path: %s, %sLCK: %llu:%llu)", host, type, pid, v, p, t, s, e);
1232 
1233         if (path != NULL)
1234                 strfree(path);
1235 
1236         /* Mark this blocker as reported */
1237         lock->l_blocker = -lock->l_blocker;
1238 }
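/*
 * Annotation: l_blocker encodes the whole detection state in one field:
 * zero means this lock has not yet been seen blocking anyone, a positive
 * value is the gethrtime() timestamp of the first observation, and a
 * negative value (the negated timestamp) marks a blocker that has already
 * been reported.  The timeout comparison above is in seconds.  Assuming
 * stale_lock_timeout is a plain global tunable (its declaration is outside
 * this excerpt), it could presumably be set in /etc/system, e.g.:
 *
 *	set stale_lock_timeout = 3600
 *
 * Setting it to 0 disables stale lock detection entirely.
 */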
1239 
1240 static void
1241 flk_stale_lock_shrink(lock_descriptor_t *lock, lock_descriptor_t *new)
1242 {
1243         char *path;
1244 
1245         char host[INET6_ADDRSTRLEN];            /* host name */
1246         const char *type;                       /* host type */
1247 
1248         /* temporary variables for the cmn_err() call */
1249         char *p, *t;            /* path, lock type */
1250         pid_t pid;              /* pid */
1251         void *v;                /* vnode */
1252         u_offset_t s, e;        /* start, end */
1253         u_offset_t ns, ne;      /* new start, new end */
1254 
1255         ASSERT(MUTEX_HELD(&lock->l_graph->gp_mutex));
1256 
1257         translate_sysid_to_host(lock->l_zoneid, lock->l_flock.l_sysid, host,
1258             sizeof (host), &type);
1259         path = get_vnode_path(lock->l_vnode);
1260 
1261         pid = lock->l_flock.l_pid;
1262         v = (void *)lock->l_vnode;
1263         p = path == NULL ? "?" : path;
1264         t = lock->l_type == F_WRLCK ? "WR" : "RD";
1265         s = lock->l_start;
1266         e = lock->l_end;
1267         ns = new->l_start;
1268         ne = new->l_end;
1269 
1270         cmn_err(CE_NOTE, "!Stale lock SHRINK (host: %s (%s), pid: %d, "
1271             "vnode: %p, path: %s, %sLCK: %llu:%llu -> %llu:%llu)", host, type,
1272             pid, v, p, t, s, e, ns, ne);
1273 
1274         if (path != NULL)
1275                 strfree(path);
1276 }
1277 
1278 static void
1279 flk_stale_lock_split(lock_descriptor_t *lock, lock_descriptor_t *new1,
1280     lock_descriptor_t *new2)
1281 {
1282         char *path;
1283 
1284         char host[INET6_ADDRSTRLEN];            /* host name */
1285         const char *type;                       /* host type */
1286 
1287         /* temporary variables for the cmn_err() call */
1288         char *p, *t;            /* path, lock type */
1289         pid_t pid;              /* pid */
1290         void *v;                /* vnode */
1291         u_offset_t s, e;        /* start, end */
1292         u_offset_t n1s, n1e;    /* new1 start, new1 end */
1293         u_offset_t n2s, n2e;    /* new2 start, new2 end */
1294 
1295         ASSERT(MUTEX_HELD(&lock->l_graph->gp_mutex));
1296 
1297         translate_sysid_to_host(lock->l_zoneid, lock->l_flock.l_sysid, host,
1298             sizeof (host), &type);
1299         path = get_vnode_path(lock->l_vnode);
1300 
1301         pid = lock->l_flock.l_pid;
1302         v = (void *)lock->l_vnode;
1303         p = path == NULL ? "?" : path;
1304         t = lock->l_type == F_WRLCK ? "WR" : "RD";
1305         s = lock->l_start;
1306         e = lock->l_end;
1307         n1s = new1->l_start;
1308         n1e = new1->l_end;
1309         n2s = new2->l_start;
1310         n2e = new2->l_end;
1311 
1312         cmn_err(CE_NOTE, "!Stale lock SPLIT (host: %s (%s), pid: %d, "
1313             "vnode: %p, path: %s, %sLCK: %llu:%llu -> %llu:%llu and %llu:%llu)",
1314             host, type, pid, v, p, t, s, e, n1s, n1e, n2s, n2e);
1315 
1316         if (path != NULL)
1317                 strfree(path);
1318 }
1319 
1320 static void
1321 flk_stale_lock_release(lock_descriptor_t *lock)
1322 {
1323         char *path;
1324 
1325         char host[INET6_ADDRSTRLEN];            /* host name */
1326         const char *type;                       /* host type */
1327 
1328         /* temporary variables for the cmn_err() call */
1329         char *p, *t;            /* path, lock type */
1330         pid_t pid;              /* pid */
1331         void *v;                /* vnode */
1332         u_offset_t s, e;        /* start, end */
1333 
1334         ASSERT(MUTEX_HELD(&lock->l_graph->gp_mutex));
1335 
1336         translate_sysid_to_host(lock->l_zoneid, lock->l_flock.l_sysid, host,
1337             sizeof (host), &type);
1338         path = get_vnode_path(lock->l_vnode);
1339 
1340         pid = lock->l_flock.l_pid;
1341         v = (void *)lock->l_vnode;
1342         p = path == NULL ? "?" : path;
1343         t = lock->l_type == F_WRLCK ? "WR" : "RD";
1344         s = lock->l_start;
1345         e = lock->l_end;
1346 
1347         cmn_err(CE_NOTE, "!Stale lock RELEASE (host: %s (%s), pid: %d, "
1348             "vnode: %p, path: %s, %sLCK: %llu:%llu)", host, type, pid, v, p, t,
1349             s, e);
1350 
1351         if (path != NULL)
1352                 strfree(path);
1353 }
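/*
 * Annotation: the leading '!' in the cmn_err() format strings above sends
 * these NOTICE messages to the system log only, not to the console.  With
 * hypothetical values, a report would read roughly:
 *
 *	NOTICE: Stale lock (host: 192.0.2.10 (NLM), pid: 1234,
 *	    vnode: ffffff01abcd1234, path: /export/data/file, WRLCK: 0:1023)
 *
 * where the host, type, and remaining fields are made-up examples.
 */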
1354 
1355 /*
1356  * Routine that checks whether there are any blocking locks in the system.
1357  *
1358  * The policy followed is that if a write lock is sleeping, we don't allow
1359  * read locks before this write lock, even though there may not be any
1360  * active locks corresponding to the read locks' region.
1361  *
1362  * The flk_add_edge() function adds an edge between l1 and l2 iff there
1363  * is no path between l1 and l2.  This is done to have a "minimum
1364  * storage representation" of the dependency graph.
1365  *
1366  * Another property of the graph is that, since only the new request throws
1367  * edges to the existing locks in the graph, the graph is always
1368  * topologically ordered.
1369  */
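/*
 * For example (annotation): if a new request R blocks on existing locks
 * L1 and L2, an edge R -> L1 has already been added, and a path from L1
 * to L2 exists, then flk_add_edge(R, L2, ...) adds no new edge, because
 * R can already reach L2 through L1.  That is the "minimum storage
 * representation" referred to above.
 */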
1370 
1371 static int
1372 flk_process_request(lock_descriptor_t *request)
1373 {
1374         graph_t *gp = request->l_graph;
1375         lock_descriptor_t *lock;
1376         int request_blocked_by_active = 0;
1377         int request_blocked_by_granted = 0;
1378         int request_blocked_by_sleeping = 0;
1379         vnode_t *vp = request->l_vnode;
1380         int     error = 0;
1381         int request_will_wait = 0;
1382         int found_covering_lock = 0;
1383         lock_descriptor_t *covered_by = NULL;
1384 
1385         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1386         request_will_wait = IS_WILLING_TO_SLEEP(request);
1387 
1388         /*
1389          * check active locks
1390          */
1391 
1392         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1393 

1394         if (lock) {
1395                 do {
1396                         if (BLOCKS(lock, request)) {
1397                                 if (!request_will_wait) {
1398                                         flk_stale_lock_check(lock);
1399                                         return (EAGAIN);
1400                                 }
1401                                 request_blocked_by_active = 1;
1402                                 break;
1403                         }
1404                         /*
1405                          * Grant the lock if it is for the same owner
1406                          * holding an active lock that covers the request.
1407                          */
1408 
1409                         if (SAME_OWNER(lock, request) &&
1410                             COVERS(lock, request) &&
1411                             (request->l_type == F_RDLCK))
1412                                 return (flk_execute_request(request));
1413                         lock = lock->l_next;
1414                 } while (lock->l_vnode == vp);
1415         }
1416 
1417         if (!request_blocked_by_active) {
1418                 lock_descriptor_t *lk[1];
1419                 lock_descriptor_t *first_glock = NULL;
1420 
1421                 /*
1422                  * Shall we grant this?! NO!!
1423                  * What about those locks that were just granted and are
1424                  * still in the sleep queue?  Their threads have been woken
1425                  * up, so the locks are almost active.
1426                  */
1427                 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1428                 if (lock) {
1429                         do {
1430                                 if (BLOCKS(lock, request)) {
1431                                         if (IS_GRANTED(lock)) {
1432                                                 request_blocked_by_granted = 1;
1433                                         } else {
1434                                                 request_blocked_by_sleeping = 1;
1435                                         }
1436                                 }
1437 
1438                                 lock = lock->l_next;
1439                         } while ((lock->l_vnode == vp));
1440                         first_glock = lock->l_prev;


1457                         /*
1458                          * If we have a sleeping writer in the requested
1459                          * lock's range, block.
1460                          */
1461                         goto block;
1462                 }
1463 
1464                 lk[0] = request;
1465                 request->l_state |= RECOMPUTE_LOCK;
1466                 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1467                 if (lock) {
1468                         do {
1469                                 flk_recompute_dependencies(lock, lk, 1, 0);
1470                                 lock = lock->l_next;
1471                         } while (lock->l_vnode == vp);
1472                 }
1473                 lock = first_glock;
1474                 if (lock) {
1475                         do {
1476                                 if (IS_GRANTED(lock)) {
1477                                         flk_recompute_dependencies(lock, lk, 1,
1478                                             0);
1479                                 }
1480                                 lock = lock->l_prev;
1481                         } while ((lock->l_vnode == vp));
1482                 }
1483                 request->l_state &= ~RECOMPUTE_LOCK;
1484                 if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
1485                         return (EDEADLK);
1486                 return (flk_execute_request(request));
1487         }
1488 
1489 block:
1490         if (request_will_wait)
1491                 flk_graph_uncolor(gp);
1492 
1493         /* check sleeping locks */
1494 
1495         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1496 
1497         /*
1498          * If we find a sleeping write lock that is a superset of the


1520                                             !SAME_OWNER(lock, covered_by)) {
1521                                                 found_covering_lock++;
1522                                                 break;
1523                                         }
1524                                         found_covering_lock = 1;
1525                                         covered_by = lock;
1526                                 }
1527                                 if (found_covering_lock &&
1528                                     !SAME_OWNER(lock, covered_by)) {
1529                                         lock = lock->l_next;
1530                                         continue;
1531                                 }
1532                                 if ((error = flk_add_edge(request, lock,
1533                                     !found_covering_lock, 0)))
1534                                         return (error);
1535                         }
1536                         lock = lock->l_next;
1537                 } while (lock->l_vnode == vp);
1538         }
1539 
1540         /*
1541          * found_covering_lock == 2 iff at this point 'request' has paths to
1542          * all locks that block 'request'.  found_covering_lock == 1 iff at
1543          * this point 'request' has paths to all locks that block 'request'
1544          * whose owners are not the same as the one that covers 'request'
1545          * (covered_by above), and there may be locks whose owner is the
1546          * same as covered_by in the active list.
1547          */
1548 
1549         if (request_blocked_by_active && found_covering_lock != 2) {
1550                 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1551                 ASSERT(lock != NULL);
1552                 do {
1553                         if (BLOCKS(lock, request)) {
1554                                 if (found_covering_lock &&
1555                                     !SAME_OWNER(lock, covered_by)) {
1556                                         lock = lock->l_next;
1557                                         continue;
1558                                 }
1559                                 if ((error = flk_add_edge(request, lock,
1560                                     CHECK_CYCLE, 0)))
1561                                         return (error);
1562                         }
1563                         lock = lock->l_next;
1564                 } while (lock->l_vnode == vp);
1565         }
1566 


1600         vnode_t *vp = request->l_vnode;
1601         lock_descriptor_t       *lock, *lock1;
1602         int done_searching = 0;
1603 
1604         CHECK_SLEEPING_LOCKS(gp);
1605         CHECK_ACTIVE_LOCKS(gp);
1606 
1607         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1608 
1609         flk_set_state(request, FLK_START_STATE);
1610 
1611         ASSERT(NOT_BLOCKED(request));
1612 
1613         /* IO_LOCK requests are only to check status */
1614 
1615         if (IS_IO_LOCK(request))
1616                 return (0);
1617 
1618         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1619 
1620         if (lock != NULL) {
1621                 /*
1622                  * There are some active locks, so check for relations.
1623                  */
1624                 do {
1625                         lock1 = lock->l_next;
1626                         if (SAME_OWNER(request, lock)) {
1627                                 done_searching = flk_relation(lock, request);
1628                         }
1629                         lock = lock1;
1630                 } while (lock->l_vnode == vp && !done_searching);
1631         }
1632 
1633         /*
1634          * insert in active queue
1635          */
1636 
1637         if (request->l_type != F_UNLCK)
1638                 flk_insert_active_lock(request);
1639 
1640         return (0);
1641 }
1642 
1643 /*
1644  * 'request' is blocked by someone, therefore we put it into the sleep queue.
1645  */
1646 static int
1647 flk_wait_execute_request(lock_descriptor_t *request)
1648 {
1649         graph_t *gp = request->l_graph;
1650         callb_cpr_t     *cprp;          /* CPR info from callback */
1651         struct flock_globals *fg;


1876 
1877         ASSERT(flk_edge_cache != NULL);
1878 
1879         ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1880         edge_allocs++;
1881         return (ep);
1882 }
1883 
1884 /*
1885  * Free the edge structure.
1886  */
1887 
1888 static void
1889 flk_free_edge(edge_t *ep)
1890 {
1891         edge_frees++;
1892         kmem_cache_free(flk_edge_cache, (void *)ep);
1893 }
1894 
1895 /*
1896  * Check the relationship of 'request' with 'lock', perform the
1897  * recomputation of dependencies, break 'lock' if required, and return
1898  * 1 if 'request' cannot have any further relationship with the
1899  * remaining active locks.
1900  *
1901  * 'lock' and 'request' are compared, and in case of overlap we delete
1902  * 'lock' and form new locks to represent the non-overlapping portion of
1903  * the original 'lock'.  This function has side effects: 'lock' will be
1904  * freed and new locks will be added to the active list.
1905  */
1906 
1907 static int
1908 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1909 {
1910         int lock_effect;

1911         lock_descriptor_t *topology[3];
1912         int nvertex = 0;
1913         int i;
1914         edge_t  *ep;
1915         graph_t *gp = lock->l_graph;
1916         boolean_t mergeable;
1917 
1918         ASSERT(request->l_blocker == 0);
1919 
1920         CHECK_SLEEPING_LOCKS(gp);
1921         CHECK_ACTIVE_LOCKS(gp);
1922 
1923         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1924 
1925         topology[0] = topology[1] = topology[2] = NULL;
1926 
1927         if (request->l_type == F_UNLCK)
1928                 lock_effect = FLK_UNLOCK;
1929         else if (request->l_type == F_RDLCK &&
1930             lock->l_type == F_WRLCK)
1931                 lock_effect = FLK_DOWNGRADE;
1932         else if (request->l_type == F_WRLCK &&
1933             lock->l_type == F_RDLCK)
1934                 lock_effect = FLK_UPGRADE;
1935         else
1936                 lock_effect = FLK_STAY_SAME;
1937 
1938         /*
1939          * 'lock' and 'request' are merged only when the effect of both
1940          * locks is the same (FLK_STAY_SAME) and their blocker status
1941          * (l_blocker) is the same as well.  We do not merge 'lock' and
1942          * 'request' with different l_blocker values because such a merge
1943          * might affect the stale lock detection: it could cause either
1944          * false positives, or miss some stale locks.
1945          */
1946         mergeable = lock_effect == FLK_STAY_SAME &&
1947             lock->l_blocker == request->l_blocker;
1948 
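        /*
         * Annotation: for example, with an active lock covering [0, 99] and
         * a same-owner, same-type request for [100, 199] whose l_blocker
         * state matches, the adjacency checks below fold both into a single
         * descriptor covering [0, 199] rather than keeping two fragments.
         */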
1949         if (lock->l_end < request->l_start) {
1950                 /* If the 'lock' is just next to 'request', try to merge them */
1951                 if (lock->l_end == request->l_start - 1 && mergeable) {

1952                         request->l_start = lock->l_start;

1953                         goto recompute;
1954                 }
1955 
1956                 /* Otherwise, they do not overlap, so return immediately */
1957                 return (0);
1958         }

1959 
1960         if (request->l_end < lock->l_start) {
1961                 /* If the 'request' is just next to 'lock', try to merge them */
1962                 if (request->l_end == lock->l_start - 1 && mergeable) {

1963                         request->l_end = lock->l_end;

1964                         goto recompute;
1965                 }
1966 
1967                 /* Otherwise, they do not overlap, so return immediately */
1968                 return (1);
1969         }

1970 
1971         /*
1972          * Here we are sure the 'lock' and 'request' overlaps, so the 'request'
1973          * will replace the 'lock' (either fully, or at least partially).
1974          */
1975 
1976         /*
1977          * If the 'request' does not fully cover the 'lock' at the start,
1978          * either move the start of the 'request' to cover the 'lock', or split
1979          * the 'lock'.
1980          */
1981         if (lock->l_start < request->l_start) {
1982                 if (mergeable) {
1983                         request->l_start = lock->l_start;
1984                 } else {
1985                         lock_descriptor_t *new_lock = flk_get_lock();
1986 
1987                         COPY(new_lock, lock);
1988                         new_lock->l_end = request->l_start - 1;
1989 
1990                         topology[nvertex++] = new_lock;
1991                 }
1992         }
1993 
1994         /*
1995          * If the 'request' does not fully cover the 'lock' at the end, either
1996          * move the end of the 'request' to cover the 'lock', or split the
1997          * 'lock'.
1998          */
1999         if (request->l_end < lock->l_end) {
2000                 if (mergeable) {
2001                         request->l_end = lock->l_end;


2002                 } else {
2003                         lock_descriptor_t *new_lock = flk_get_lock();
2004 
2005                         COPY(new_lock, lock);
2006                         new_lock->l_start = request->l_end + 1;
2007 
2008                         topology[nvertex++] = new_lock;
2009                 }
2010         }
2011 
2012         /*
2013          * Log the blocker change
2014          */
2015         if (nvertex > 0 && lock->l_blocker < 0) {
2016                 if (nvertex == 1)
2017                         flk_stale_lock_shrink(lock, topology[0]);
2018                 if (nvertex == 2)
2019                         flk_stale_lock_split(lock, topology[0], topology[1]);
2020 
2021                 lock->l_blocker = 0;
2022         }
2023 
2024 recompute:

2025         /*
2026          * For unlock we don't send the 'request' in for recomputing
2027          * dependencies because no lock will add an edge to it.
2028          */
2029         if (lock_effect != FLK_UNLOCK)
2030                 topology[nvertex++] = request;
2031 
2032         for (i = 0; i < nvertex; i++) {
2033                 topology[i]->l_state |= RECOMPUTE_LOCK;
2034                 topology[i]->l_color = NO_COLOR;
2035         }
2036 
2037         ASSERT(FIRST_ADJ(lock) == HEAD(lock));
2038 
2039         /*
2040          * We remove the adjacent edges of all vertices pointing to this
2041          * vertex, 'lock'.
2042          */

2043         ep = FIRST_IN(lock);
2044         while (ep != HEAD(lock)) {
2045                 ADJ_LIST_REMOVE(ep);
2046                 ep = NEXT_IN(ep);
2047         }
2048 
2049         flk_delete_active_lock(lock, 0);
2050 
2051         /* We are ready for recomputing the dependencies now */

2052         flk_recompute_dependencies(lock, topology, nvertex, 1);
2053 
2054         for (i = 0; i < nvertex; i++) {
2055                 topology[i]->l_state &= ~RECOMPUTE_LOCK;
2056                 topology[i]->l_color = NO_COLOR;
2057         }
2058 

2059         if (lock_effect == FLK_UNLOCK) {
2060                 nvertex++;
2061         }
2062         for (i = 0; i < nvertex - 1; i++) {
2063                 flk_insert_active_lock(topology[i]);
2064         }
2065 

2066         if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
2067                 flk_wakeup(lock, 0);
2068         } else {
2069                 ep = FIRST_IN(lock);
2070                 while (ep != HEAD(lock)) {
2071                         lock->l_sedge = NEXT_IN(ep);
2072                         IN_LIST_REMOVE(ep);
2073                         flk_update_proc_graph(ep, 1);
2074                         flk_free_edge(ep);
2075                         ep = lock->l_sedge;
2076                 }
2077         }
2078         flk_free_lock(lock);
2079 
2080         CHECK_SLEEPING_LOCKS(gp);
2081         CHECK_ACTIVE_LOCKS(gp);
2082         return (0);
2083 }
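/*
 * Annotation: a worked example of the shrink/split handling above.
 * Suppose an active write lock covers [0, 199] and the same owner unlocks
 * [50, 149].  Neither end of 'lock' is covered by the request, so two new
 * descriptors are created: topology[0] = [0, 49] and topology[1] =
 * [150, 199] (nvertex == 2); had 'lock' already been reported as a stale
 * blocker (l_blocker < 0), flk_stale_lock_split() logs the range change
 * before the mark is cleared.  The nvertex++ in the FLK_UNLOCK case
 * compensates for 'request' never being appended to topology[], so the
 * insertion loop (i < nvertex - 1) re-inserts every remainder lock while
 * never re-inserting the unlock request itself.
 */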
2084 
2085 /*


2124 /*
2125  * Delete the active lock: performs one of two functions depending on the
2126  * value of the second parameter.  One is to remove the lock from the
2127  * active lists only; the other is to both remove and free the lock.
2128  */
2129 
2130 static void
2131 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
2132 {
2133         vnode_t *vp = lock->l_vnode;
2134         graph_t *gp = lock->l_graph;
2135 
2136         ASSERT(MUTEX_HELD(&gp->gp_mutex));
2137         if (free_lock)
2138                 ASSERT(NO_DEPENDENTS(lock));
2139         ASSERT(NOT_BLOCKED(lock));
2140         ASSERT(IS_ACTIVE(lock));
2141 
2142         ASSERT((vp->v_filocks != NULL));
2143 
2144         if (lock->l_blocker < 0) {
2145                 /* Log the blocker release */
2146                 flk_stale_lock_release(lock);
2147                 lock->l_blocker = 0;
2148         }
2149 
2150         if (vp->v_filocks == (struct filock *)lock) {
2151                 vp->v_filocks = (struct filock *)
2152                     ((lock->l_next->l_vnode == vp) ? lock->l_next :
2153                     NULL);
2154         }
2155         lock->l_next->l_prev = lock->l_prev;
2156         lock->l_prev->l_next = lock->l_next;
2157         lock->l_next = lock->l_prev = NULL;
2158         flk_set_state(lock, FLK_DEAD_STATE);
2159         lock->l_state &= ~ACTIVE_LOCK;
2160 
2161         if (free_lock)
2162                 flk_free_lock(lock);
2163         CHECK_ACTIVE_LOCKS(gp);
2164         CHECK_SLEEPING_LOCKS(gp);
2165 }
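/*
 * Annotation: flk_delete_active_lock() is where a previously reported
 * stale blocker (l_blocker < 0) is logged as released when it leaves the
 * active list (shrink and split transitions are logged separately in
 * flk_relation()).  Note also that v_filocks points at the first active
 * lock of the vnode; when that head lock is deleted, the pointer is
 * advanced to the next lock on the same vnode, or cleared.
 */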
2166 
2167 /*
2168  * Insert into the sleep queue.
2169  */


2298                 request->l_sedge = NEXT_ADJ(ep);
2299                 ADJ_LIST_REMOVE(ep);
2300                 flk_update_proc_graph(ep, 1);
2301                 flk_free_edge(ep);
2302                 ep = request->l_sedge;
2303         }
2304 
2305 
2306         /*
2307          * unset the RECOMPUTE flag in those vertices
2308          */
2309 
2310         for (i = 0; i < nvertex; i++) {
2311                 topology[i]->l_state &= ~RECOMPUTE_LOCK;
2312         }
2313 
2314         /*
2315          * free the topology
2316          */
2317         if (nvertex)
2318                 kmem_free(topology,
2319                     (nvertex * sizeof (lock_descriptor_t *)));
2320         /*
2321          * Some locks may be unblocked now.
2322          */
2323 
2324         flk_wakeup(request, 0);
2325 
2326         /*
2327          * We expect to have a correctly recomputed graph now.
2328          */
2329         flk_set_state(request, FLK_DEAD_STATE);
2330         flk_free_lock(request);
2331         CHECK_SLEEPING_LOCKS(gp);
2332         CHECK_ACTIVE_LOCKS(gp);
2333 
2334 }
2335 
2336 /*
2337  * Uncoloring the graph is simply done by incrementing the mark value of the
2338  * graph, and only when wrap-around takes place will we color all vertices in


3234                 flk_wakeup(lock, 1);
3235                 flk_free_lock(lock);
3236         }
3237 
3238         CHECK_SLEEPING_LOCKS(gp);
3239         CHECK_ACTIVE_LOCKS(gp);
3240         CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
3241         mutex_exit(&gp->gp_mutex);
3242 }
3243 
3244 
3245 /*
3246  * Called from 'fs' read and write routines for files that have mandatory
3247  * locking enabled.
3248  */
3249 
3250 int
3251 chklock(struct vnode *vp, int iomode, u_offset_t offset, ssize_t len, int fmode,
3252     caller_context_t *ct)
3253 {
3254         int             i;
3255         struct flock64  bf;
3256         int             error = 0;
3257 
3258         bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
3259         bf.l_whence = 0;
3260         bf.l_start = offset;
3261         bf.l_len = len;
3262         if (ct == NULL) {
3263                 bf.l_pid = curproc->p_pid;
3264                 bf.l_sysid = 0;
3265         } else {
3266                 bf.l_pid = ct->cc_pid;
3267                 bf.l_sysid = ct->cc_sysid;
3268         }
3269         i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
3270         if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
3271             bf.l_type != F_UNLCK)
3272                 error = i ? i : EAGAIN;
3273         return (error);
3274 }
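/*
 * Illustrative sketch (annotation, not part of the original change): how a
 * filesystem read path typically consults chklock() when mandatory locking
 * is in force.  example_fs_read() and example_va_mode() are hypothetical;
 * MANDLOCK() and the chklock() arguments follow the pattern used by the
 * in-tree filesystems.
 */
static int
example_fs_read(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct)
{
	int error = 0;

	/* Only regular files with the mandatory-locking mode bits qualify */
	if (MANDLOCK(vp, example_va_mode(vp))) {
		error = chklock(vp, FREAD, uiop->uio_loffset,
		    uiop->uio_resid, uiop->uio_fmode, ct);
		if (error != 0)
			return (error);
	}

	/* ... perform the actual read ... */
	return (error);
}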


3565  */
3566 
3567 static proc_edge_t *
3568 flk_get_proc_edge()
3569 {
3570         proc_edge_t *pep;
3571 
3572         pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
3573         flk_proc_edge_allocs++;
3574         return (pep);
3575 }
3576 
3577 /*
3578  * Free the proc edge. Called whenever its reference count goes to zero.
3579  */
3580 
3581 static void
3582 flk_free_proc_edge(proc_edge_t *pep)
3583 {
3584         ASSERT(pep->refcount == 0);
3585         kmem_free(pep, sizeof (proc_edge_t));
3586         flk_proc_edge_frees++;
3587 }
3588 
3589 /*
3590  * Explicitly uncolor the graph; done only when the mark value hits its max.
3591  */
3592 
3593 static void
3594 flk_proc_graph_uncolor()
3595 {
3596         int i;
3597 
3598         if (pgraph.mark == UINT_MAX) {
3599                 for (i = 0; i < pgraph.gcount; i++)
3600                         if (pgraph.proc[i] != NULL) {
3601                                 pgraph.proc[i]->atime = 0;
3602                                 pgraph.proc[i]->dtime = 0;
3603                         }
3604                 pgraph.mark = 1;
3605         } else {


4307                         continue;
4308                 }
4309 
4310                 mutex_enter(&gp->gp_mutex);
4311                 fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
4312                 for (lock = ACTIVE_HEAD(gp)->l_next;
4313                     lock != ACTIVE_HEAD(gp);
4314                     lock = nlock) {
4315                         nlock = lock->l_next;
4316                         if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
4317                                 ASSERT(IS_ACTIVE(lock));
4318                                 flk_delete_active_lock(lock, 0);
4319                                 flk_wakeup(lock, 1);
4320                                 flk_free_lock(lock);
4321                         }
4322                 }
4323                 mutex_exit(&gp->gp_mutex);
4324         }
4325 }
4326 

4327 /*
4328  * Wait until a lock is granted, cancelled, or interrupted.
4329  */
4330 
4331 static void
4332 wait_for_lock(lock_descriptor_t *request)
4333 {
4334         graph_t *gp = request->l_graph;
4335         vnode_t *vp = request->l_vnode;
4336 
4337         ASSERT(MUTEX_HELD(&gp->gp_mutex));
4338 
4339         while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
4340             !(IS_INTERRUPTED(request))) {
4341                 lock_descriptor_t *lock;
4342 
4343                 if (stale_lock_timeout == 0) {
4344                         /* The stale lock detection is disabled */
4345                         if (cv_wait_sig(&request->l_cv, &gp->gp_mutex) == 0) {
4346                                 flk_set_state(request, FLK_INTERRUPTED_STATE);
4347                                 request->l_state |= INTERRUPTED_LOCK;
4348                         }
4349 
4350                         continue;
4351                 }
4352 
4353                 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4354 
4355                 if (lock != NULL) {
4356                         do {
4357                                 if (BLOCKS(lock, request)) {
4358                                         flk_stale_lock_check(lock);
4359                                         break;
4360                                 }
4361                                 lock = lock->l_next;
4362                         } while (lock->l_vnode == vp);
4363                 }
4364 
4365                 if (cv_timedwait_sig(&request->l_cv, &gp->gp_mutex,
4366                     ddi_get_lbolt() + SEC_TO_TICK(stale_lock_timeout)) == 0) {
4367                         flk_set_state(request, FLK_INTERRUPTED_STATE);
4368                         request->l_state |= INTERRUPTED_LOCK;
4369                 }
4370         }
4371 }
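/*
 * Annotation on wait_for_lock(): with stale lock detection enabled, a
 * sleeping requester no longer blocks indefinitely in cv_wait_sig().
 * Instead it wakes up every stale_lock_timeout seconds, walks the active
 * locks on its vnode, and runs flk_stale_lock_check() against the first
 * one that blocks it, so a remote blocker is aged and eventually reported
 * even if nothing else touches the graph.  cv_timedwait_sig() returns 0
 * only when interrupted by a signal, which is why the timeout path simply
 * loops and re-checks.
 */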
4372 
4373 /*
4374  * Create an flock structure from the existing lock information
4375  *
4376  * This routine is used to create flock structures for the lock manager
4377  * to use in a reclaim request.  Since the lock originated on this
4378  * host, it must conform to UNIX semantics, so no checking is
4379  * done to make sure it falls within the lower half of the 32-bit range.
4380  */
4381 
4382 static void
4383 create_flock(lock_descriptor_t *lp, flock64_t *flp)
4384 {
4385         ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
4386         ASSERT(lp->l_end >= lp->l_start);
4387 
4388         flp->l_type = lp->l_type;
4389         flp->l_whence = 0;
4390         flp->l_start = lp->l_start;


4402 
4403 int
4404 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
4405     u_offset_t *start, u_offset_t *end, offset_t offset)
4406 {
4407         struct vattr    vattr;
4408         int     error;
4409 
4410         /*
4411          * Determine the starting point of the request
4412          */
4413         switch (flp->l_whence) {
4414         case 0:         /* SEEK_SET */
4415                 *start = (u_offset_t)flp->l_start;
4416                 break;
4417         case 1:         /* SEEK_CUR */
4418                 *start = (u_offset_t)(flp->l_start + offset);
4419                 break;
4420         case 2:         /* SEEK_END */
4421                 vattr.va_mask = AT_SIZE;
4422                 if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
4423                         return (error);
4424                 *start = (u_offset_t)(flp->l_start + vattr.va_size);
4425                 break;
4426         default:
4427                 return (EINVAL);
4428         }
4429 
4430         /*
4431          * Determine the range covered by the request.
4432          */
4433         if (flp->l_len == 0)
4434                 *end = MAX_U_OFFSET_T;
4435         else if ((offset_t)flp->l_len > 0) {
4436                 *end = (u_offset_t)(*start + (flp->l_len - 1));
4437         } else {
4438                 /*
4439                  * Negative length; why do we even allow this?
4440                  * Because it allows easy specification of
4441                  * the last n bytes of the file.
4442                  */
4443                 *end = *start;
4444                 *start += (u_offset_t)flp->l_len;
4445                 (*start)++;
4446         }
4447         return (0);
4448 }
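/*
 * Worked example (annotation): with l_whence = SEEK_SET, l_start = 1000,
 * and l_len = -100, the code above yields *end = 1000, then *start =
 * 1000 - 100 + 1 = 901, i.e. the lock covers the 100 bytes [901, 1000]
 * ending at the original start offset.  A zero l_len still means
 * "from *start to the end of the file" (MAX_U_OFFSET_T).
 */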
4449 
4450 /*
4451  * Check the validity of lock data.  This can be used by the NFS
4452  * frlock routines to check data before contacting the server.  The
4453  * server must support semantics that aren't as restrictive as
4454  * the UNIX API, so the NFS client is required to check.
4455  * The maximum is passed in by the caller.
4456  */
4457 
4458 int
4459 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
4460 {
4461         /*
4462          * The end (length) for local locking should never be greater
4463          * than max. However, the representation for
4464          * the entire file is MAX_U_OFFSET_T.
4465          */
4466         if ((start > max) ||
4467             ((end > max) && (end != MAX_U_OFFSET_T))) {
4468                 return (EINVAL);
4469         }
4470         if (start > end) {
4471                 return (EINVAL);
4472         }
4473         return (0);
4474 }
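/*
 * Annotation: for example, a range of [5, 3] fails with EINVAL
 * (start > end), while [0, MAX_U_OFFSET_T] passes regardless of max,
 * because the whole-file representation is explicitly exempted from the
 * max check.
 */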
4475 
4476 /*
4477  * Fill in request->l_flock with information about the lock blocking the
4478  * request.  The complexity here is that lock manager requests are allowed
4479  * to see into the upper part of the 32-bit address range, whereas local
4480  * requests are only allowed to see signed values.
4481  *
4482  * What should be done when "blocker" is a lock manager lock that uses the
4483  * upper portion of the 32-bit range, but "request" is local?  Since the