11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 /*
31 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
32 * Copyright 2015 Joyent, Inc.
33 */
34
35 #include <sys/flock_impl.h>
36 #include <sys/vfs.h>
37 #include <sys/t_lock.h> /* for <sys/callb.h> */
38 #include <sys/callb.h>
39 #include <sys/clconf.h>
40 #include <sys/cladm.h>
41 #include <sys/nbmlock.h>
42 #include <sys/cred.h>
43 #include <sys/policy.h>
44
45 /*
46 * The following variables are for statistics purposes and they are
47 * not protected by locks. They may not be accurate but will at least be
48 * close to the actual values.
49 */
50
51 int flk_lock_allocs;
52 int flk_lock_frees;
53 int edge_allocs;
54 int edge_frees;
55 int flk_proc_vertex_allocs;
56 int flk_proc_edge_allocs;
57 int flk_proc_vertex_frees;
58 int flk_proc_edge_frees;
59
60 static kmutex_t flock_lock;
61
62 #ifdef DEBUG
63 int check_debug = 0;
142 * running, and so whether to allow lock manager requests or not.
143 *
144 * Thus, on a per-zone basis we maintain a ``global'' variable
145 * (flk_lockmgr_status), protected by flock_lock, and set when the lock
146 * manager is determined to be changing state (starting or stopping).
147 *
148 * Each graph/zone pair also has a copy of this variable, which is protected by
149 * the graph's mutex.
150 *
151 * The per-graph copies are used to synchronize lock requests with shutdown
152 * requests. The global copy is used to initialize the per-graph field when a
153 * new graph is created.
154 */
155 struct flock_globals {
156 flk_lockmgr_status_t flk_lockmgr_status;
157 flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
158 };
159
160 zone_key_t flock_zone_key;
161
162 static void create_flock(lock_descriptor_t *, flock64_t *);
163 static lock_descriptor_t *flk_get_lock(void);
164 static void flk_free_lock(lock_descriptor_t *lock);
165 static void flk_get_first_blocking_lock(lock_descriptor_t *request);
166 static int flk_process_request(lock_descriptor_t *);
167 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
168 static edge_t *flk_get_edge(void);
169 static int flk_wait_execute_request(lock_descriptor_t *);
170 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
171 static void flk_insert_active_lock(lock_descriptor_t *);
172 static void flk_delete_active_lock(lock_descriptor_t *, int);
173 static void flk_insert_sleeping_lock(lock_descriptor_t *);
174 static void flk_graph_uncolor(graph_t *);
175 static void flk_wakeup(lock_descriptor_t *, int);
176 static void flk_free_edge(edge_t *);
177 static void flk_recompute_dependencies(lock_descriptor_t *,
178 lock_descriptor_t **, int, int);
179 static int flk_find_barriers(lock_descriptor_t *);
180 static void flk_update_barriers(lock_descriptor_t *);
181 static int flk_color_reachables(lock_descriptor_t *);
182 static int flk_canceled(lock_descriptor_t *);
183 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
184 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
185 static void wait_for_lock(lock_descriptor_t *);
539 lock_descriptor_t *lock_request;
540 int error = 0;
541 graph_t *gp;
542 int nlmid;
543
544 /*
545 * Check access permissions
546 */
547 if ((cmd & SETFLCK) &&
548 ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
549 (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
550 return (EBADF);
551
552 /*
553 * for query and unlock we use the stack_lock_request
554 */
555
556 if ((lckdat->l_type == F_UNLCK) ||
557 !((cmd & INOFLCK) || (cmd & SETFLCK))) {
558 lock_request = &stack_lock_request;
559 (void) bzero((caddr_t)lock_request,
560 sizeof (lock_descriptor_t));
561
562 /*
563 * the following is added to make the assertions in
564 * flk_execute_request() pass
565 */
566
567 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
568 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
569 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
570 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
571 lock_request->l_status = FLK_INITIAL_STATE;
572 } else {
573 lock_request = flk_get_lock();
574 }
575 lock_request->l_state = 0;
576 lock_request->l_vnode = vp;
577 lock_request->l_zoneid = getzoneid();
578
579 /*
580 * Convert the request range into the canonical start and end
933 * in the cluster. This number will be the size of the nlm
934 * registry status array. We add 1 because we will be using
935 * all entries indexed from 0 to maxnodeid; e.g., from 0
936 * to 64, for a total of 65 entries.
937 */
938 nlm_status_size = clconf_maximum_nodeid() + 1;
939 } else {
940 nlm_status_size = 0;
941 }
942
943 if (nlm_status_size != 0) { /* booted as a cluster */
944 nlm_reg_status = (flk_nlm_status_t *)
945 kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
946 KM_SLEEP);
947
948 /* initialize all NLM states in array to NLM_UNKNOWN */
949 for (i = 0; i < nlm_status_size; i++) {
950 nlm_reg_status[i] = FLK_NLM_UNKNOWN;
951 }
952 }
953 }
954
955 /*
956 * Zone constructor/destructor callbacks to be executed when a zone is
957 * created/destroyed.
958 */
959 /* ARGSUSED */
960 void *
961 flk_zone_init(zoneid_t zoneid)
962 {
963 struct flock_globals *fg;
964 uint_t i;
965
966 fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
967 fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
968 for (i = 0; i < HASH_SIZE; i++)
969 fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
970 return (fg);
971 }
972
994 l->l_edge.edge_in_next = &l->l_edge;
995 l->l_edge.edge_in_prev = &l->l_edge;
996 l->l_edge.edge_adj_next = &l->l_edge;
997 l->l_edge.edge_adj_prev = &l->l_edge;
998 l->pvertex = -1;
999 l->l_status = FLK_INITIAL_STATE;
1000 flk_lock_allocs++;
1001 return (l);
1002 }
1003
1004 /*
1005 * Free a lock_descriptor structure. If some thread still has a
1006 * reference to it (as in reclock()), just set the DELETED_LOCK flag.
1007 */
1008
1009 void
1010 flk_free_lock(lock_descriptor_t *lock)
1011 {
1012 file_t *fp;
1013
1014 ASSERT(IS_DEAD(lock));
1015
1016 if ((fp = lock->l_ofd) != NULL && fp->f_filock == (struct filock *)lock)
1017 fp->f_filock = NULL;
1018
1019 if (IS_REFERENCED(lock)) {
1020 lock->l_state |= DELETED_LOCK;
1021 return;
1022 }
1023 flk_lock_frees++;
1024 kmem_free((void *)lock, sizeof (lock_descriptor_t));
1025 }
1026
1027 void
1028 flk_set_state(lock_descriptor_t *lock, int new_state)
1029 {
1030 /*
1031 * Locks in the sleeping list may be woken up in a number of ways,
1032 * and more than once. If a sleeping lock is signaled awake more
1033 * than once, then it may or may not change state depending on its
1034 * current state.
1035 * Also note that NLM locks that are sleeping could be moved to an
1036 * interrupted state more than once if the unlock request is
1037 * retransmitted by the NLM client - the second time around, this is
1038 * just a nop.
1039 * The ordering of being signaled awake is:
1040 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
1041 * The checks below implement this ordering.
1042 */
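/*
 * For example, a lock already moved to FLK_INTERRUPTED_STATE stays
 * interrupted even if a later wakeup tries to set FLK_GRANTED_STATE or
 * FLK_CANCELLED_STATE, and a cancelled lock similarly ignores a later
 * FLK_GRANTED_STATE.
 */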
1043 if (IS_INTERRUPTED(lock)) {
1044 if ((new_state == FLK_CANCELLED_STATE) ||
1045 (new_state == FLK_GRANTED_STATE) ||
1046 (new_state == FLK_INTERRUPTED_STATE)) {
1047 return;
1048 }
1049 }
1050 if (IS_CANCELLED(lock)) {
1051 if ((new_state == FLK_GRANTED_STATE) ||
1052 (new_state == FLK_CANCELLED_STATE)) {
1053 return;
1054 }
1055 }
1056 CHECK_LOCK_TRANSITION(lock->l_status, new_state);
1057 if (IS_PXFS(lock)) {
1058 cl_flk_state_transition_notify(lock, lock->l_status, new_state);
1059 }
1060 lock->l_status = new_state;
1061 }
1062
1063 /*
1064 * Routine that checks whether there are any blocking locks in the system.
1065 *
1066 * The policy followed is that if a write lock is sleeping, we don't allow
1067 * read locks ahead of this write lock even though there may not be any
1068 * active locks corresponding to the read locks' region.
1069 *
1070 * The flk_add_edge() function adds an edge between l1 and l2 iff there
1071 * is no path between l1 and l2. This is done to have a "minimum
1072 * storage representation" of the dependency graph.
1073 *
1074 * Another property of the graph is that, since only the new request adds
1075 * edges to the existing locks in the graph, the graph is always
1076 * topologically ordered.
1077 */
1078
1079 static int
1080 flk_process_request(lock_descriptor_t *request)
1081 {
1082 graph_t *gp = request->l_graph;
1083 lock_descriptor_t *lock;
1084 int request_blocked_by_active = 0;
1085 int request_blocked_by_granted = 0;
1086 int request_blocked_by_sleeping = 0;
1087 vnode_t *vp = request->l_vnode;
1088 int error = 0;
1089 int request_will_wait = 0;
1090 int found_covering_lock = 0;
1091 lock_descriptor_t *covered_by = NULL;
1092
1093 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1094 request_will_wait = IS_WILLING_TO_SLEEP(request);
1095
1096 /*
1097 * check active locks
1098 */
1099
1100 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1101
1102
1103 if (lock) {
1104 do {
1105 if (BLOCKS(lock, request)) {
1106 if (!request_will_wait)
1107 return (EAGAIN);
1108 request_blocked_by_active = 1;
1109 break;
1110 }
1111 /*
1112 * Grant lock if it is for the same owner holding active
1113 * lock that covers the request.
1114 */
1115
1116 if (SAME_OWNER(lock, request) &&
1117 COVERS(lock, request) &&
1118 (request->l_type == F_RDLCK))
1119 return (flk_execute_request(request));
1120 lock = lock->l_next;
1121 } while (lock->l_vnode == vp);
1122 }
1123
1124 if (!request_blocked_by_active) {
1125 lock_descriptor_t *lk[1];
1126 lock_descriptor_t *first_glock = NULL;
1127 /*
1128 * Shall we grant this?! NO!!
1129 * What about those locks that were just granted and are still
1130 * in the sleep queue? Those threads have been woken up, so the
1131 * locks are almost active.
1132 */
1133 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1134 if (lock) {
1135 do {
1136 if (BLOCKS(lock, request)) {
1137 if (IS_GRANTED(lock)) {
1138 request_blocked_by_granted = 1;
1139 } else {
1140 request_blocked_by_sleeping = 1;
1141 }
1142 }
1143
1144 lock = lock->l_next;
1145 } while ((lock->l_vnode == vp));
1146 first_glock = lock->l_prev;
1163 /*
1164 * If we have a sleeping writer in the requested
1165 * lock's range, block.
1166 */
1167 goto block;
1168 }
1169
1170 lk[0] = request;
1171 request->l_state |= RECOMPUTE_LOCK;
1172 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1173 if (lock) {
1174 do {
1175 flk_recompute_dependencies(lock, lk, 1, 0);
1176 lock = lock->l_next;
1177 } while (lock->l_vnode == vp);
1178 }
1179 lock = first_glock;
1180 if (lock) {
1181 do {
1182 if (IS_GRANTED(lock)) {
1183 flk_recompute_dependencies(lock, lk, 1, 0);
1184 }
1185 lock = lock->l_prev;
1186 } while ((lock->l_vnode == vp));
1187 }
1188 request->l_state &= ~RECOMPUTE_LOCK;
1189 if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
1190 return (EDEADLK);
1191 return (flk_execute_request(request));
1192 }
1193
1194 block:
1195 if (request_will_wait)
1196 flk_graph_uncolor(gp);
1197
1198 /* check sleeping locks */
1199
1200 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1201
1202 /*
1203 * If we find a sleeping write lock that is a superset of the
1225 !SAME_OWNER(lock, covered_by)) {
1226 found_covering_lock++;
1227 break;
1228 }
1229 found_covering_lock = 1;
1230 covered_by = lock;
1231 }
1232 if (found_covering_lock &&
1233 !SAME_OWNER(lock, covered_by)) {
1234 lock = lock->l_next;
1235 continue;
1236 }
1237 if ((error = flk_add_edge(request, lock,
1238 !found_covering_lock, 0)))
1239 return (error);
1240 }
1241 lock = lock->l_next;
1242 } while (lock->l_vnode == vp);
1243 }
1244
1245 /*
1246 * found_covering_lock == 2 iff at this point 'request' has paths
1247 * to all locks that block 'request'. found_covering_lock == 1 iff at this
1248 * point 'request' has paths to all locks that block 'request' whose owners
1249 * are not the same as the one that covers 'request' (covered_by above), and
1250 * we can have locks whose owner is the same as covered_by in the active list.
1251 */
1252
1253 if (request_blocked_by_active && found_covering_lock != 2) {
1254 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1255 ASSERT(lock != NULL);
1256 do {
1257 if (BLOCKS(lock, request)) {
1258 if (found_covering_lock &&
1259 !SAME_OWNER(lock, covered_by)) {
1260 lock = lock->l_next;
1261 continue;
1262 }
1263 if ((error = flk_add_edge(request, lock,
1264 CHECK_CYCLE, 0)))
1265 return (error);
1266 }
1267 lock = lock->l_next;
1268 } while (lock->l_vnode == vp);
1269 }
1270
1304 vnode_t *vp = request->l_vnode;
1305 lock_descriptor_t *lock, *lock1;
1306 int done_searching = 0;
1307
1308 CHECK_SLEEPING_LOCKS(gp);
1309 CHECK_ACTIVE_LOCKS(gp);
1310
1311 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1312
1313 flk_set_state(request, FLK_START_STATE);
1314
1315 ASSERT(NOT_BLOCKED(request));
1316
1317 /* IO_LOCK requests are only to check status */
1318
1319 if (IS_IO_LOCK(request))
1320 return (0);
1321
1322 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1323
1324 if (lock == NULL && request->l_type == F_UNLCK)
1325 return (0);
1326 if (lock == NULL) {
1327 flk_insert_active_lock(request);
1328 return (0);
1329 }
1330
1331 do {
1332 lock1 = lock->l_next;
1333 if (SAME_OWNER(request, lock)) {
1334 done_searching = flk_relation(lock, request);
1335 }
1336 lock = lock1;
1337 } while (lock->l_vnode == vp && !done_searching);
1338
1339 /*
1340 * insert in active queue
1341 */
1342
1343 if (request->l_type != F_UNLCK)
1344 flk_insert_active_lock(request);
1345
1346 return (0);
1347 }
1348
1349 /*
1350 * 'request' is blocked by someone, therefore we put it into the sleep queue.
1351 */
1352 static int
1353 flk_wait_execute_request(lock_descriptor_t *request)
1354 {
1355 graph_t *gp = request->l_graph;
1356 callb_cpr_t *cprp; /* CPR info from callback */
1357 struct flock_globals *fg;
1582
1583 ASSERT(flk_edge_cache != NULL);
1584
1585 ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1586 edge_allocs++;
1587 return (ep);
1588 }
1589
1590 /*
1591 * Free the edge structure.
1592 */
1593
1594 static void
1595 flk_free_edge(edge_t *ep)
1596 {
1597 edge_frees++;
1598 kmem_cache_free(flk_edge_cache, (void *)ep);
1599 }
1600
1601 /*
1602 * Check the relationship of 'request' with 'lock' and perform the
1603 * recomputation of dependencies, break 'lock' if required, and return
1604 * 1 if 'request' cannot have any more relationship with the next
1605 * active locks.
1606 * The 'lock' and 'request' are compared and in case of overlap we
1607 * delete the 'lock' and form new locks to represent the non-overlapped
1608 * portion of the original 'lock'. This function has side effects: 'lock'
1609 * will be freed and new locks will be added to the active list.
1610 */
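/*
 * An illustrative example (values chosen only for exposition): if an
 * active read 'lock' covers [100, 199] and 'request' is a write lock
 * (FLK_UPGRADE) over [140, 159], the code below replaces 'lock' with two
 * read locks covering [100, 139] and [160, 199]; the request, still
 * covering [140, 159], is later inserted into the active queue by the
 * caller (flk_execute_request). All three pieces go through the
 * dependency recomputation.
 */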
1611
1612 static int
1613 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1614 {
1615 int lock_effect;
1616 lock_descriptor_t *lock1, *lock2;
1617 lock_descriptor_t *topology[3];
1618 int nvertex = 0;
1619 int i;
1620 edge_t *ep;
1621 graph_t *gp = (lock->l_graph);
1622
1623
1624 CHECK_SLEEPING_LOCKS(gp);
1625 CHECK_ACTIVE_LOCKS(gp);
1626
1627 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1628
1629 topology[0] = topology[1] = topology[2] = NULL;
1630
1631 if (request->l_type == F_UNLCK)
1632 lock_effect = FLK_UNLOCK;
1633 else if (request->l_type == F_RDLCK &&
1634 lock->l_type == F_WRLCK)
1635 lock_effect = FLK_DOWNGRADE;
1636 else if (request->l_type == F_WRLCK &&
1637 lock->l_type == F_RDLCK)
1638 lock_effect = FLK_UPGRADE;
1639 else
1640 lock_effect = FLK_STAY_SAME;
1641
1642 if (lock->l_end < request->l_start) {
1643 if (lock->l_end == request->l_start - 1 &&
1644 lock_effect == FLK_STAY_SAME) {
1645 topology[0] = request;
1646 request->l_start = lock->l_start;
1647 nvertex = 1;
1648 goto recompute;
1649 } else {
1650 return (0);
1651 }
1652 }
1653
1654 if (lock->l_start > request->l_end) {
1655 if (request->l_end == lock->l_start - 1 &&
1656 lock_effect == FLK_STAY_SAME) {
1657 topology[0] = request;
1658 request->l_end = lock->l_end;
1659 nvertex = 1;
1660 goto recompute;
1661 } else {
1662 return (1);
1663 }
1664 }
1665
1666 if (request->l_end < lock->l_end) {
1667 if (request->l_start > lock->l_start) {
1668 if (lock_effect == FLK_STAY_SAME) {
1669 request->l_start = lock->l_start;
1670 request->l_end = lock->l_end;
1671 topology[0] = request;
1672 nvertex = 1;
1673 } else {
1674 lock1 = flk_get_lock();
1675 lock2 = flk_get_lock();
1676 COPY(lock1, lock);
1677 COPY(lock2, lock);
1678 lock1->l_start = lock->l_start;
1679 lock1->l_end = request->l_start - 1;
1680 lock2->l_start = request->l_end + 1;
1681 lock2->l_end = lock->l_end;
1682 topology[0] = lock1;
1683 topology[1] = lock2;
1684 topology[2] = request;
1685 nvertex = 3;
1686 }
1687 } else if (request->l_start < lock->l_start) {
1688 if (lock_effect == FLK_STAY_SAME) {
1689 request->l_end = lock->l_end;
1690 topology[0] = request;
1691 nvertex = 1;
1692 } else {
1693 lock1 = flk_get_lock();
1694 COPY(lock1, lock);
1695 lock1->l_start = request->l_end + 1;
1696 topology[0] = lock1;
1697 topology[1] = request;
1698 nvertex = 2;
1699 }
1700 } else {
1701 if (lock_effect == FLK_STAY_SAME) {
1702 request->l_start = lock->l_start;
1703 request->l_end = lock->l_end;
1704 topology[0] = request;
1705 nvertex = 1;
1706 } else {
1707 lock1 = flk_get_lock();
1708 COPY(lock1, lock);
1709 lock1->l_start = request->l_end + 1;
1710 topology[0] = lock1;
1711 topology[1] = request;
1712 nvertex = 2;
1713 }
1714 }
1715 } else if (request->l_end > lock->l_end) {
1716 if (request->l_start > lock->l_start) {
1717 if (lock_effect == FLK_STAY_SAME) {
1718 request->l_start = lock->l_start;
1719 topology[0] = request;
1720 nvertex = 1;
1721 } else {
1722 lock1 = flk_get_lock();
1723 COPY(lock1, lock);
1724 lock1->l_end = request->l_start - 1;
1725 topology[0] = lock1;
1726 topology[1] = request;
1727 nvertex = 2;
1728 }
1729 } else if (request->l_start < lock->l_start) {
1730 topology[0] = request;
1731 nvertex = 1;
1732 } else {
1733 topology[0] = request;
1734 nvertex = 1;
1735 }
1736 } else {
1737 if (request->l_start > lock->l_start) {
1738 if (lock_effect == FLK_STAY_SAME) {
1739 request->l_start = lock->l_start;
1740 topology[0] = request;
1741 nvertex = 1;
1742 } else {
1743 lock1 = flk_get_lock();
1744 COPY(lock1, lock);
1745 lock1->l_end = request->l_start - 1;
1746 topology[0] = lock1;
1747 topology[1] = request;
1748 nvertex = 2;
1749 }
1750 } else if (request->l_start < lock->l_start) {
1751 topology[0] = request;
1752 nvertex = 1;
1753 } else {
1754 if (lock_effect != FLK_UNLOCK) {
1755 topology[0] = request;
1756 nvertex = 1;
1757 } else {
1758 flk_delete_active_lock(lock, 0);
1759 flk_wakeup(lock, 1);
1760 flk_free_lock(lock);
1761 CHECK_SLEEPING_LOCKS(gp);
1762 CHECK_ACTIVE_LOCKS(gp);
1763 return (1);
1764 }
1765 }
1766 }
1767
1768 recompute:
1769
1770 /*
1771 * For unlock we don't send the 'request' for recomputing
1772 * dependencies because no lock will add an edge to it.
1773 */
1774
1775 if (lock_effect == FLK_UNLOCK) {
1776 topology[nvertex-1] = NULL;
1777 nvertex--;
1778 }
1779 for (i = 0; i < nvertex; i++) {
1780 topology[i]->l_state |= RECOMPUTE_LOCK;
1781 topology[i]->l_color = NO_COLOR;
1782 }
1783
1784 ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1785
1786 /*
1787 * we remove each incoming edge of this vertex 'lock' from the
1788 * adjacency list of the vertex it comes from.
1789 */
1790
1791 ep = FIRST_IN(lock);
1792 while (ep != HEAD(lock)) {
1793 ADJ_LIST_REMOVE(ep);
1794 ep = NEXT_IN(ep);
1795 }
1796
1797 flk_delete_active_lock(lock, 0);
1798
1799 /* We are ready for recomputing the dependencies now */
1800
1801 flk_recompute_dependencies(lock, topology, nvertex, 1);
1802
1803 for (i = 0; i < nvertex; i++) {
1804 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1805 topology[i]->l_color = NO_COLOR;
1806 }
1807
1808
1809 if (lock_effect == FLK_UNLOCK) {
1810 nvertex++;
1811 }
1812 for (i = 0; i < nvertex - 1; i++) {
1813 flk_insert_active_lock(topology[i]);
1814 }
1815
1816
1817 if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1818 flk_wakeup(lock, 0);
1819 } else {
1820 ep = FIRST_IN(lock);
1821 while (ep != HEAD(lock)) {
1822 lock->l_sedge = NEXT_IN(ep);
1823 IN_LIST_REMOVE(ep);
1824 flk_update_proc_graph(ep, 1);
1825 flk_free_edge(ep);
1826 ep = lock->l_sedge;
1827 }
1828 }
1829 flk_free_lock(lock);
1830
1831 CHECK_SLEEPING_LOCKS(gp);
1832 CHECK_ACTIVE_LOCKS(gp);
1833 return (0);
1834 }
1835
1836 /*
1875 /*
1876 * Delete the active lock: performs one of two functions depending on
1877 * the value of the second parameter. One is to remove from the active
1878 * lists only; the other is to both remove and free the lock.
1879 */
1880
1881 static void
1882 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1883 {
1884 vnode_t *vp = lock->l_vnode;
1885 graph_t *gp = lock->l_graph;
1886
1887 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1888 if (free_lock)
1889 ASSERT(NO_DEPENDENTS(lock));
1890 ASSERT(NOT_BLOCKED(lock));
1891 ASSERT(IS_ACTIVE(lock));
1892
1893 ASSERT((vp->v_filocks != NULL));
1894
1895 if (vp->v_filocks == (struct filock *)lock) {
1896 vp->v_filocks = (struct filock *)
1897 ((lock->l_next->l_vnode == vp) ? lock->l_next :
1898 NULL);
1899 }
1900 lock->l_next->l_prev = lock->l_prev;
1901 lock->l_prev->l_next = lock->l_next;
1902 lock->l_next = lock->l_prev = NULL;
1903 flk_set_state(lock, FLK_DEAD_STATE);
1904 lock->l_state &= ~ACTIVE_LOCK;
1905
1906 if (free_lock)
1907 flk_free_lock(lock);
1908 CHECK_ACTIVE_LOCKS(gp);
1909 CHECK_SLEEPING_LOCKS(gp);
1910 }
1911
1912 /*
1913 * Insert into the sleep queue.
1914 */
2043 request->l_sedge = NEXT_ADJ(ep);
2044 ADJ_LIST_REMOVE(ep);
2045 flk_update_proc_graph(ep, 1);
2046 flk_free_edge(ep);
2047 ep = request->l_sedge;
2048 }
2049
2050
2051 /*
2052 * unset the RECOMPUTE flag in those vertices
2053 */
2054
2055 for (i = 0; i < nvertex; i++) {
2056 topology[i]->l_state &= ~RECOMPUTE_LOCK;
2057 }
2058
2059 /*
2060 * free the topology
2061 */
2062 if (nvertex)
2063 kmem_free((void *)topology,
2064 (nvertex * sizeof (lock_descriptor_t *)));
2065 /*
2066 * Some locks may be unblocked now
2067 */
2068
2069 flk_wakeup(request, 0);
2070
2071 /*
2072 * we expect to have a correctly recomputed graph now.
2073 */
2074 flk_set_state(request, FLK_DEAD_STATE);
2075 flk_free_lock(request);
2076 CHECK_SLEEPING_LOCKS(gp);
2077 CHECK_ACTIVE_LOCKS(gp);
2078
2079 }
2080
2081 /*
2082 * Uncoloring the graph is simply to increment the mark value of the graph,
2083 * and only when wrap around takes place will we color all vertices in
2979 flk_wakeup(lock, 1);
2980 flk_free_lock(lock);
2981 }
2982
2983 CHECK_SLEEPING_LOCKS(gp);
2984 CHECK_ACTIVE_LOCKS(gp);
2985 CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2986 mutex_exit(&gp->gp_mutex);
2987 }
2988
2989
2990 /*
2991 * Called from 'fs' read and write routines for files that have mandatory
2992 * locking enabled.
2993 */
2994
2995 int
2996 chklock(struct vnode *vp, int iomode, u_offset_t offset, ssize_t len, int fmode,
2997 caller_context_t *ct)
2998 {
2999 register int i;
3000 struct flock64 bf;
3001 int error = 0;
3002
3003 bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
3004 bf.l_whence = 0;
3005 bf.l_start = offset;
3006 bf.l_len = len;
3007 if (ct == NULL) {
3008 bf.l_pid = curproc->p_pid;
3009 bf.l_sysid = 0;
3010 } else {
3011 bf.l_pid = ct->cc_pid;
3012 bf.l_sysid = ct->cc_sysid;
3013 }
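/*
 * Both cases below pass INOFLCK; SLPFLCK (sleep if blocked, as the name
 * suggests) is added only when the caller did not ask for non-blocking
 * I/O via FNDELAY/FNONBLOCK.
 */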
3014 i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
3015 if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
3016 bf.l_type != F_UNLCK)
3017 error = i ? i : EAGAIN;
3018 return (error);
3019 }
3310 */
3311
3312 static proc_edge_t *
3313 flk_get_proc_edge()
3314 {
3315 proc_edge_t *pep;
3316
3317 pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
3318 flk_proc_edge_allocs++;
3319 return (pep);
3320 }
3321
3322 /*
3323 * Free the proc edge. Called whenever its reference count goes to zero.
3324 */
3325
3326 static void
3327 flk_free_proc_edge(proc_edge_t *pep)
3328 {
3329 ASSERT(pep->refcount == 0);
3330 kmem_free((void *)pep, sizeof (proc_edge_t));
3331 flk_proc_edge_frees++;
3332 }
3333
3334 /*
3335 * Color the graph explicitly; this is done only when the mark value hits the max value.
3336 */
3337
3338 static void
3339 flk_proc_graph_uncolor()
3340 {
3341 int i;
3342
3343 if (pgraph.mark == UINT_MAX) {
3344 for (i = 0; i < pgraph.gcount; i++)
3345 if (pgraph.proc[i] != NULL) {
3346 pgraph.proc[i]->atime = 0;
3347 pgraph.proc[i]->dtime = 0;
3348 }
3349 pgraph.mark = 1;
3350 } else {
4052 continue;
4053 }
4054
4055 mutex_enter(&gp->gp_mutex);
4056 fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
4057 for (lock = ACTIVE_HEAD(gp)->l_next;
4058 lock != ACTIVE_HEAD(gp);
4059 lock = nlock) {
4060 nlock = lock->l_next;
4061 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
4062 ASSERT(IS_ACTIVE(lock));
4063 flk_delete_active_lock(lock, 0);
4064 flk_wakeup(lock, 1);
4065 flk_free_lock(lock);
4066 }
4067 }
4068 mutex_exit(&gp->gp_mutex);
4069 }
4070 }
4071
4072
4073 /*
4074 * Wait until a lock is granted, cancelled, or interrupted.
4075 */
4076
4077 static void
4078 wait_for_lock(lock_descriptor_t *request)
4079 {
4080 graph_t *gp = request->l_graph;
4081
4082 ASSERT(MUTEX_HELD(&gp->gp_mutex));
4083
4084 while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
4085 !(IS_INTERRUPTED(request))) {
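/*
 * cv_wait_sig() returns 0 when the wait is interrupted by a signal; in
 * that case mark the request interrupted, which also terminates this
 * wait loop.
 */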
4086 if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
4087 flk_set_state(request, FLK_INTERRUPTED_STATE);
4088 request->l_state |= INTERRUPTED_LOCK;
4089 }
4090 }
4091 }
4092
4093 /*
4094 * Create an flock structure from the existing lock information
4095 *
4096 * This routine is used to create flock structures for the lock manager
4097 * to use in a reclaim request. Since the lock originated on this
4098 * host, it must conform to UNIX semantics, so no checking is
4099 * done to make sure it falls within the lower half of the 32-bit range.
4100 */
4101
4102 static void
4103 create_flock(lock_descriptor_t *lp, flock64_t *flp)
4104 {
4105 ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
4106 ASSERT(lp->l_end >= lp->l_start);
4107
4108 flp->l_type = lp->l_type;
4109 flp->l_whence = 0;
4110 flp->l_start = lp->l_start;
4122
4123 int
4124 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
4125 u_offset_t *start, u_offset_t *end, offset_t offset)
4126 {
4127 struct vattr vattr;
4128 int error;
4129
4130 /*
4131 * Determine the starting point of the request
4132 */
4133 switch (flp->l_whence) {
4134 case 0: /* SEEK_SET */
4135 *start = (u_offset_t)flp->l_start;
4136 break;
4137 case 1: /* SEEK_CUR */
4138 *start = (u_offset_t)(flp->l_start + offset);
4139 break;
4140 case 2: /* SEEK_END */
4141 vattr.va_mask = AT_SIZE;
4142 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
4143 return (error);
4144 *start = (u_offset_t)(flp->l_start + vattr.va_size);
4145 break;
4146 default:
4147 return (EINVAL);
4148 }
4149
4150 /*
4151 * Determine the range covered by the request.
4152 */
4153 if (flp->l_len == 0)
4154 *end = MAX_U_OFFSET_T;
4155 else if ((offset_t)flp->l_len > 0) {
4156 *end = (u_offset_t)(*start + (flp->l_len - 1));
4157 } else {
4158 /*
4159 * Negative length; why do we even allow this?
4160 * Because this allows easy specification of
4161 * the last n bytes of the file.
4162 */
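/*
 * For example (illustrative values): if *start is 100 and l_len is -10,
 * this yields *end = 100 and *start = 91, i.e. the ten bytes at offsets
 * 91 through 100 inclusive.
 */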
4163 *end = *start;
4164 *start += (u_offset_t)flp->l_len;
4165 (*start)++;
4166 }
4167 return (0);
4168 }
4169
4170 /*
4171 * Check the validity of lock data. This can be used by the NFS
4172 * frlock routines to check data before contacting the server. The
4173 * server must support semantics that aren't as restrictive as
4174 * the UNIX API, so the NFS client is required to check.
4175 * The maximum is now passed in by the caller.
4176 */
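/*
 * For example (illustrative values): start = 10, end = 5 fails with
 * EINVAL because start > end, while start = 0 with end = MAX_U_OFFSET_T
 * (the whole file) passes for any sane max.
 */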
4177
4178 int
4179 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
4180 {
4181 /*
4182 * The end (length) for local locking should never be greater
4183 * than MAXEND. However, the representation for
4184 * the entire file is MAX_U_OFFSET_T.
4185 */
4186 if ((start > max) ||
4187 ((end > max) && (end != MAX_U_OFFSET_T))) {
4188 return (EINVAL);
4189 }
4190 if (start > end) {
4191 return (EINVAL);
4192 }
4193 return (0);
4194 }
4195
4196 /*
4197 * Fill in request->l_flock with information about the lock blocking the
4198 * request. The complexity here is that lock manager requests are allowed
4199 * to see into the upper part of the 32-bit address range, whereas local
4200 * requests are only allowed to see signed values.
4201 *
4202 * What should be done when "blocker" is a lock manager lock that uses the
4203 * upper portion of the 32-bit range, but "request" is local? Since the
|
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 /*
31 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
32 * Copyright 2015 Joyent, Inc.
33 */
34
35 #include <sys/flock_impl.h>
36 #include <sys/vfs.h>
37 #include <sys/t_lock.h> /* for <sys/callb.h> */
38 #include <sys/callb.h>
39 #include <sys/clconf.h>
40 #include <sys/cladm.h>
41 #include <sys/nbmlock.h>
42 #include <sys/cred.h>
43 #include <sys/policy.h>
44 #include <sys/list.h>
45 #include <sys/sysmacros.h>
46 #include <sys/socket.h>
47 #include <inet/ip.h>
48
49 /*
50 * The following variables are for statistics purposes and they are
51 * not protected by locks. They may not be accurate but will at least be
52 * close to the actual values.
53 */
54
55 int flk_lock_allocs;
56 int flk_lock_frees;
57 int edge_allocs;
58 int edge_frees;
59 int flk_proc_vertex_allocs;
60 int flk_proc_edge_allocs;
61 int flk_proc_vertex_frees;
62 int flk_proc_edge_frees;
63
64 static kmutex_t flock_lock;
65
66 #ifdef DEBUG
67 int check_debug = 0;
146 * running, and so whether to allow lock manager requests or not.
147 *
148 * Thus, on a per-zone basis we maintain a ``global'' variable
149 * (flk_lockmgr_status), protected by flock_lock, and set when the lock
150 * manager is determined to be changing state (starting or stopping).
151 *
152 * Each graph/zone pair also has a copy of this variable, which is protected by
153 * the graph's mutex.
154 *
155 * The per-graph copies are used to synchronize lock requests with shutdown
156 * requests. The global copy is used to initialize the per-graph field when a
157 * new graph is created.
158 */
159 struct flock_globals {
160 flk_lockmgr_status_t flk_lockmgr_status;
161 flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
162 };
163
164 zone_key_t flock_zone_key;
165
166 /*
167 * Support for remote stale lock detection
168 *
169 * The sysid_to_host_translator_lock readers/writer lock protects
170 * sysid_to_host_translator_list.
171 *
172 * The sysid_to_host_translator_list is a list of sysid to host name translator
173 * functions. The new translators are added using the public
174 * flk_add_sysid_to_host_translator() call.
175 *
176 * The stale_lock_timeout is in seconds and it determines the interval for the
177 * remote stale lock checking. When set to 0, the remote stale lock checking
178 * is disabled.
179 */
180 struct sysid_to_host_translator_entry {
181 sysid_to_host_translator_t translator;
182 list_node_t node;
183 };
184 static krwlock_t sysid_to_host_translator_lock;
185 static list_t sysid_to_host_translator_list;
186 volatile int stale_lock_timeout = 3600; /* one hour, in seconds */
187
188 static void create_flock(lock_descriptor_t *, flock64_t *);
189 static lock_descriptor_t *flk_get_lock(void);
190 static void flk_free_lock(lock_descriptor_t *lock);
191 static void flk_get_first_blocking_lock(lock_descriptor_t *);
192 static int flk_process_request(lock_descriptor_t *);
193 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
194 static edge_t *flk_get_edge(void);
195 static int flk_wait_execute_request(lock_descriptor_t *);
196 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
197 static void flk_insert_active_lock(lock_descriptor_t *);
198 static void flk_delete_active_lock(lock_descriptor_t *, int);
199 static void flk_insert_sleeping_lock(lock_descriptor_t *);
200 static void flk_graph_uncolor(graph_t *);
201 static void flk_wakeup(lock_descriptor_t *, int);
202 static void flk_free_edge(edge_t *);
203 static void flk_recompute_dependencies(lock_descriptor_t *,
204 lock_descriptor_t **, int, int);
205 static int flk_find_barriers(lock_descriptor_t *);
206 static void flk_update_barriers(lock_descriptor_t *);
207 static int flk_color_reachables(lock_descriptor_t *);
208 static int flk_canceled(lock_descriptor_t *);
209 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
210 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
211 static void wait_for_lock(lock_descriptor_t *);
565 lock_descriptor_t *lock_request;
566 int error = 0;
567 graph_t *gp;
568 int nlmid;
569
570 /*
571 * Check access permissions
572 */
573 if ((cmd & SETFLCK) &&
574 ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
575 (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
576 return (EBADF);
577
578 /*
579 * for query and unlock we use the stack_lock_request
580 */
581
582 if ((lckdat->l_type == F_UNLCK) ||
583 !((cmd & INOFLCK) || (cmd & SETFLCK))) {
584 lock_request = &stack_lock_request;
585 bzero(lock_request, sizeof (lock_descriptor_t));
586
587 /*
588 * the following is added to make the assertions in
589 * flk_execute_request() pass
590 */
591
592 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
593 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
594 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
595 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
596 lock_request->l_status = FLK_INITIAL_STATE;
597 } else {
598 lock_request = flk_get_lock();
599 }
600 lock_request->l_state = 0;
601 lock_request->l_vnode = vp;
602 lock_request->l_zoneid = getzoneid();
603
604 /*
605 * Convert the request range into the canonical start and end
958 * in the cluster. This number will be the size of the nlm
959 * registry status array. We add 1 because we will be using
960 * all entries indexed from 0 to maxnodeid; e.g., from 0
961 * to 64, for a total of 65 entries.
962 */
963 nlm_status_size = clconf_maximum_nodeid() + 1;
964 } else {
965 nlm_status_size = 0;
966 }
967
968 if (nlm_status_size != 0) { /* booted as a cluster */
969 nlm_reg_status = (flk_nlm_status_t *)
970 kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
971 KM_SLEEP);
972
973 /* initialize all NLM states in array to NLM_UNKNOWN */
974 for (i = 0; i < nlm_status_size; i++) {
975 nlm_reg_status[i] = FLK_NLM_UNKNOWN;
976 }
977 }
978
979 mutex_init(&flock_lock, NULL, MUTEX_DEFAULT, NULL);
980 mutex_init(&nlm_reg_lock, NULL, MUTEX_DEFAULT, NULL);
981
982 rw_init(&sysid_to_host_translator_lock, NULL, RW_DEFAULT, NULL);
983 list_create(&sysid_to_host_translator_list,
984 sizeof (struct sysid_to_host_translator_entry),
985 offsetof(struct sysid_to_host_translator_entry, node));
986 }
987
988 /*
989 * Zone constructor/destructor callbacks to be executed when a zone is
990 * created/destroyed.
991 */
992 /* ARGSUSED */
993 void *
994 flk_zone_init(zoneid_t zoneid)
995 {
996 struct flock_globals *fg;
997 uint_t i;
998
999 fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
1000 fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
1001 for (i = 0; i < HASH_SIZE; i++)
1002 fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
1003 return (fg);
1004 }
1005
1027 l->l_edge.edge_in_next = &l->l_edge;
1028 l->l_edge.edge_in_prev = &l->l_edge;
1029 l->l_edge.edge_adj_next = &l->l_edge;
1030 l->l_edge.edge_adj_prev = &l->l_edge;
1031 l->pvertex = -1;
1032 l->l_status = FLK_INITIAL_STATE;
1033 flk_lock_allocs++;
1034 return (l);
1035 }
1036
1037 /*
1038 * Free a lock_descriptor structure. If some thread still has a
1039 * reference to it (as in reclock()), just set the DELETED_LOCK flag.
1040 */
1041
1042 void
1043 flk_free_lock(lock_descriptor_t *lock)
1044 {
1045 file_t *fp;
1046
1047 ASSERT(lock->l_blocker >= 0);
1048 ASSERT(IS_DEAD(lock));
1049
1050 if ((fp = lock->l_ofd) != NULL && fp->f_filock == (struct filock *)lock)
1051 fp->f_filock = NULL;
1052
1053 if (IS_REFERENCED(lock)) {
1054 lock->l_state |= DELETED_LOCK;
1055 return;
1056 }
1057 flk_lock_frees++;
1058 kmem_free(lock, sizeof (lock_descriptor_t));
1059 }
1060
1061 void
1062 flk_set_state(lock_descriptor_t *lock, int new_state)
1063 {
1064 /*
1065 * Locks in the sleeping list may be woken up in a number of ways,
1066 * and more than once. If a sleeping lock is signaled awake more
1067 * than once, then it may or may not change state depending on its
1068 * current state.
1069 * Also note that NLM locks that are sleeping could be moved to an
1070 * interrupted state more than once if the unlock request is
1071 * retransmitted by the NLM client - the second time around, this is
1072 * just a nop.
1073 * The ordering of being signaled awake is:
1074 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
1075 * The checks below implement this ordering.
1076 */
1077 if (IS_INTERRUPTED(lock)) {
1078 if ((new_state == FLK_CANCELLED_STATE) ||
1079 (new_state == FLK_GRANTED_STATE) ||
1080 (new_state == FLK_INTERRUPTED_STATE)) {
1081 return;
1082 }
1083 }
1084 if (IS_CANCELLED(lock)) {
1085 if ((new_state == FLK_GRANTED_STATE) ||
1086 (new_state == FLK_CANCELLED_STATE)) {
1087 return;
1088 }
1089 }
1090 CHECK_LOCK_TRANSITION(lock->l_status, new_state);
1091 if (IS_PXFS(lock)) {
1092 cl_flk_state_transition_notify(lock, lock->l_status, new_state);
1093 }
1094 lock->l_status = new_state;
1095 }
1096
1097 /*
1098 * Support for remote stale lock detection
1099 */
1100
1101 void
1102 flk_add_sysid_to_host_translator(sysid_to_host_translator_t tr)
1103 {
1104 struct sysid_to_host_translator_entry *te;
1105
1106 te = kmem_alloc(sizeof (struct sysid_to_host_translator_entry),
1107 KM_SLEEP);
1108
1109 te->translator = tr;
1110
1111 rw_enter(&sysid_to_host_translator_lock, RW_WRITER);
1112 list_insert_head(&sysid_to_host_translator_list, te);
1113 rw_exit(&sysid_to_host_translator_lock);
1114 }
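/*
 * A hypothetical usage sketch (not part of this file): a consumer such
 * as an NLM module could register a translator roughly as follows,
 * assuming sysid_to_host_translator_t matches the call site in
 * translate_sysid_to_host() below, i.e. it fills in a sockaddr plus a
 * type string and returns non-zero on success:
 *
 *	static int
 *	my_sysid_to_host(zoneid_t zoneid, sysid_t sysid,
 *	    struct sockaddr *sa, const char **type)
 *	{
 *		bzero(sa, sizeof (*sa));
 *		sa->sa_family = AF_INET;
 *		*type = "EXAMPLE";
 *		return (1);
 *	}
 *
 *	flk_add_sysid_to_host_translator(my_sysid_to_host);
 */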
1115
1116 static void
1117 translate_sysid_to_host(zoneid_t zoneid, sysid_t sysid, char *host, size_t hlen,
1118 const char **type)
1119 {
1120 struct sockaddr sa;
1121 struct sysid_to_host_translator_entry *te;
1122
1123 /* Set some defaults in case the translation fails */
1124 *type = "?";
1125 (void) strlcpy(host, "?", hlen);
1126
1127 rw_enter(&sysid_to_host_translator_lock, RW_READER);
1128
1129 for (te = list_head(&sysid_to_host_translator_list); te != NULL;
1130 te = list_next(&sysid_to_host_translator_list, te)) {
1131
1132 if (te->translator(zoneid, sysid, &sa, type) != 0) {
1133 rw_exit(&sysid_to_host_translator_lock);
1134
1135 switch (sa.sa_family) {
1136 case AF_INET:
1137 (void) inet_ntop(AF_INET,
1138 &((struct sockaddr_in *)&sa)->sin_addr,
1139 host, hlen);
1140 break;
1141 case AF_INET6:
1142 (void) inet_ntop(AF_INET6,
1143 &((struct sockaddr_in6 *)&sa)->sin6_addr,
1144 host, hlen);
1145 break;
1146 default:
1147 break;
1148 }
1149
1150 return;
1151 }
1152 }
1153
1154 rw_exit(&sysid_to_host_translator_lock);
1155 }
1156
1157 static char *
1158 get_vnode_path(vnode_t *vp)
1159 {
1160 size_t len;
1161 char *ret;
1162
1163 mutex_enter(&vp->v_lock);
1164 if (vp->v_path == NULL) {
1165 mutex_exit(&vp->v_lock);
1166 return (NULL);
1167 }
1168 len = strlen(vp->v_path) + 1;
1169 mutex_exit(&vp->v_lock);
1170
1171 ret = kmem_alloc(len, KM_SLEEP);
1172
1173 mutex_enter(&vp->v_lock);
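/*
 * v_lock was dropped across the (potentially sleeping) allocation, so
 * re-check that v_path is still present and still the same length
 * before copying it.
 */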
1174 if (vp->v_path == NULL || strlen(vp->v_path) + 1 != len) {
1175 mutex_exit(&vp->v_lock);
1176 kmem_free(ret, len);
1177 return (NULL);
1178 }
1179 bcopy(vp->v_path, ret, len);
1180 mutex_exit(&vp->v_lock);
1181
1182 return (ret);
1183 }
1184
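/*
 * As used below, a lock's l_blocker field encodes its blocker state:
 * 0 means the lock has not yet been seen blocking another request, a
 * positive value is the gethrtime() timestamp of when it was first seen
 * blocking, and a negative value (the negated timestamp) means the lock
 * has already been reported as stale. With the default stale_lock_timeout
 * of 3600 seconds, a remote blocker is reported after it has been
 * blocking for roughly an hour.
 */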
1185 static void
1186 flk_stale_lock_check(lock_descriptor_t *lock)
1187 {
1188 char *path;
1189
1190 char host[INET6_ADDRSTRLEN]; /* host name */
1191 const char *type; /* host type */
1192
1193 /* temporary variables for the cmn_err() call */
1194 char *p, *t; /* path, lock type */
1195 pid_t pid; /* pid */
1196 void *v; /* vnode */
1197 u_offset_t s, e; /* start, end */
1198
1199 ASSERT(MUTEX_HELD(&lock->l_graph->gp_mutex));
1200
1201 /*
1202 * Either not a remote lock, or the stale lock checking is disabled, or
1203 * the lock is already reported.
1204 */
1205 if (IS_LOCAL(lock) || stale_lock_timeout == 0 || lock->l_blocker < 0)
1206 return;
1207
1208 /* Seen first time? */
1209 if (lock->l_blocker == 0) {
1210 lock->l_blocker = gethrtime();
1211 return;
1212 }
1213
1214 /* Old enough? */
1215 if ((gethrtime() - lock->l_blocker) / NANOSEC < stale_lock_timeout)
1216 return;
1217
1218 translate_sysid_to_host(lock->l_zoneid, lock->l_flock.l_sysid, host,
1219 sizeof (host), &type);
1220 path = get_vnode_path(lock->l_vnode);
1221
1222 pid = lock->l_flock.l_pid;
1223 v = (void *)lock->l_vnode;
1224 p = path == NULL ? "?" : path;
1225 t = lock->l_type == F_WRLCK ? "WR" : "RD";
1226 s = lock->l_start;
1227 e = lock->l_end;
1228
1229 /* Report the blocker as stale */
1230 cmn_err(CE_NOTE, "!Stale lock (host: %s (%s), pid: %d, vnode: %p, "
1231 "path: %s, %sLCK: %llu:%llu)", host, type, pid, v, p, t, s, e);
1232
1233 if (path != NULL)
1234 strfree(path);
1235
1236 /* Mark this blocker as reported */
1237 lock->l_blocker = -lock->l_blocker;
1238 }
1239
1240 static void
1241 flk_stale_lock_shrink(lock_descriptor_t *lock, lock_descriptor_t *new)
1242 {
1243 char *path;
1244
1245 char host[INET6_ADDRSTRLEN]; /* host name */
1246 const char *type; /* host type */
1247
1248 /* temporary variables for the cmn_err() call */
1249 char *p, *t; /* path, lock type */
1250 pid_t pid; /* pid */
1251 void *v; /* vnode */
1252 u_offset_t s, e; /* start, end */
1253 u_offset_t ns, ne; /* new start, new end */
1254
1255 ASSERT(MUTEX_HELD(&lock->l_graph->gp_mutex));
1256
1257 translate_sysid_to_host(lock->l_zoneid, lock->l_flock.l_sysid, host,
1258 sizeof (host), &type);
1259 path = get_vnode_path(lock->l_vnode);
1260
1261 pid = lock->l_flock.l_pid;
1262 v = (void *)lock->l_vnode;
1263 p = path == NULL ? "?" : path;
1264 t = lock->l_type == F_WRLCK ? "WR" : "RD";
1265 s = lock->l_start;
1266 e = lock->l_end;
1267 ns = new->l_start;
1268 ne = new->l_end;
1269
1270 cmn_err(CE_NOTE, "!Stale lock SHRINK (host: %s (%s), pid: %d, "
1271 "vnode: %p, path: %s, %sLCK: %llu:%llu -> %llu:%llu)", host, type,
1272 pid, v, p, t, s, e, ns, ne);
1273
1274 if (path != NULL)
1275 strfree(path);
1276 }
1277
1278 static void
1279 flk_stale_lock_split(lock_descriptor_t *lock, lock_descriptor_t *new1,
1280 lock_descriptor_t *new2)
1281 {
1282 char *path;
1283
1284 char host[INET6_ADDRSTRLEN]; /* host name */
1285 const char *type; /* host type */
1286
1287 /* temporary variables for the cmn_err() call */
1288 char *p, *t; /* path, lock type */
1289 pid_t pid; /* pid */
1290 void *v; /* vnode */
1291 u_offset_t s, e; /* start, end */
1292 u_offset_t n1s, n1e; /* new1 start, new1 end */
1293 u_offset_t n2s, n2e; /* new2 start, new2 end */
1294
1295 ASSERT(MUTEX_HELD(&lock->l_graph->gp_mutex));
1296
1297 translate_sysid_to_host(lock->l_zoneid, lock->l_flock.l_sysid, host,
1298 sizeof (host), &type);
1299 path = get_vnode_path(lock->l_vnode);
1300
1301 pid = lock->l_flock.l_pid;
1302 v = (void *)lock->l_vnode;
1303 p = path == NULL ? "?" : path;
1304 t = lock->l_type == F_WRLCK ? "WR" : "RD";
1305 s = lock->l_start;
1306 e = lock->l_end;
1307 n1s = new1->l_start;
1308 n1e = new1->l_end;
1309 n2s = new2->l_start;
1310 n2e = new2->l_end;
1311
1312 cmn_err(CE_NOTE, "!Stale lock SPLIT (host: %s (%s), pid: %d, "
1313 "vnode: %p, path: %s, %sLCK: %llu:%llu -> %llu:%llu and %llu:%llu)",
1314 host, type, pid, v, p, t, s, e, n1s, n1e, n2s, n2e);
1315
1316 if (path != NULL)
1317 strfree(path);
1318 }
1319
1320 static void
1321 flk_stale_lock_release(lock_descriptor_t *lock)
1322 {
1323 char *path;
1324
1325 char host[INET6_ADDRSTRLEN]; /* host name */
1326 const char *type; /* host type */
1327
1328 /* temporary variables for the cmn_err() call */
1329 char *p, *t; /* path, lock type */
1330 pid_t pid; /* pid */
1331 void *v; /* vnode */
1332 u_offset_t s, e; /* start, end */
1333
1334 ASSERT(MUTEX_HELD(&lock->l_graph->gp_mutex));
1335
1336 translate_sysid_to_host(lock->l_zoneid, lock->l_flock.l_sysid, host,
1337 sizeof (host), &type);
1338 path = get_vnode_path(lock->l_vnode);
1339
1340 pid = lock->l_flock.l_pid;
1341 v = (void *)lock->l_vnode;
1342 p = path == NULL ? "?" : path;
1343 t = lock->l_type == F_WRLCK ? "WR" : "RD";
1344 s = lock->l_start;
1345 e = lock->l_end;
1346
1347 cmn_err(CE_NOTE, "!Stale lock RELEASE (host: %s (%s), pid: %d, "
1348 "vnode: %p, path: %s, %sLCK: %llu:%llu)", host, type, pid, v, p, t,
1349 s, e);
1350
1351 if (path != NULL)
1352 strfree(path);
1353 }
1354
1355 /*
1356 * Routine that checks whether there are any blocking locks in the system.
1357 *
1358 * The policy followed is that if a write lock is sleeping, we don't allow
1359 * read locks ahead of this write lock even though there may not be any
1360 * active locks corresponding to the read locks' region.
1361 *
1362 * The flk_add_edge() function adds an edge between l1 and l2 iff there
1363 * is no path between l1 and l2. This is done to have a "minimum
1364 * storage representation" of the dependency graph.
1365 *
1366 * Another property of the graph is that, since only the new request adds
1367 * edges to the existing locks in the graph, the graph is always
1368 * topologically ordered.
1369 */
1370
1371 static int
1372 flk_process_request(lock_descriptor_t *request)
1373 {
1374 graph_t *gp = request->l_graph;
1375 lock_descriptor_t *lock;
1376 int request_blocked_by_active = 0;
1377 int request_blocked_by_granted = 0;
1378 int request_blocked_by_sleeping = 0;
1379 vnode_t *vp = request->l_vnode;
1380 int error = 0;
1381 int request_will_wait = 0;
1382 int found_covering_lock = 0;
1383 lock_descriptor_t *covered_by = NULL;
1384
1385 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1386 request_will_wait = IS_WILLING_TO_SLEEP(request);
1387
1388 /*
1389 * check active locks
1390 */
1391
1392 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1393
1394 if (lock) {
1395 do {
1396 if (BLOCKS(lock, request)) {
1397 if (!request_will_wait) {
1398 flk_stale_lock_check(lock);
1399 return (EAGAIN);
1400 }
1401 request_blocked_by_active = 1;
1402 break;
1403 }
1404 /*
1405 * Grant lock if it is for the same owner holding active
1406 * lock that covers the request.
1407 */
1408
1409 if (SAME_OWNER(lock, request) &&
1410 COVERS(lock, request) &&
1411 (request->l_type == F_RDLCK))
1412 return (flk_execute_request(request));
1413 lock = lock->l_next;
1414 } while (lock->l_vnode == vp);
1415 }
1416
1417 if (!request_blocked_by_active) {
1418 lock_descriptor_t *lk[1];
1419 lock_descriptor_t *first_glock = NULL;
1420
1421 /*
1422 * Shall we grant this?! NO!!
1423 * What about those locks that were just granted and are still
1424 * in the sleep queue? Those threads have been woken up, so the
1425 * locks are almost active.
1426 */
1427 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1428 if (lock) {
1429 do {
1430 if (BLOCKS(lock, request)) {
1431 if (IS_GRANTED(lock)) {
1432 request_blocked_by_granted = 1;
1433 } else {
1434 request_blocked_by_sleeping = 1;
1435 }
1436 }
1437
1438 lock = lock->l_next;
1439 } while ((lock->l_vnode == vp));
1440 first_glock = lock->l_prev;
1457 /*
1458 * If we have a sleeping writer in the requested
1459 * lock's range, block.
1460 */
1461 goto block;
1462 }
1463
1464 lk[0] = request;
1465 request->l_state |= RECOMPUTE_LOCK;
1466 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1467 if (lock) {
1468 do {
1469 flk_recompute_dependencies(lock, lk, 1, 0);
1470 lock = lock->l_next;
1471 } while (lock->l_vnode == vp);
1472 }
1473 lock = first_glock;
1474 if (lock) {
1475 do {
1476 if (IS_GRANTED(lock)) {
1477 flk_recompute_dependencies(lock, lk, 1,
1478 0);
1479 }
1480 lock = lock->l_prev;
1481 } while ((lock->l_vnode == vp));
1482 }
1483 request->l_state &= ~RECOMPUTE_LOCK;
1484 if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
1485 return (EDEADLK);
1486 return (flk_execute_request(request));
1487 }
1488
1489 block:
1490 if (request_will_wait)
1491 flk_graph_uncolor(gp);
1492
1493 /* check sleeping locks */
1494
1495 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1496
1497 /*
1498 * If we find a sleeping write lock that is a superset of the
1520 !SAME_OWNER(lock, covered_by)) {
1521 found_covering_lock++;
1522 break;
1523 }
1524 found_covering_lock = 1;
1525 covered_by = lock;
1526 }
1527 if (found_covering_lock &&
1528 !SAME_OWNER(lock, covered_by)) {
1529 lock = lock->l_next;
1530 continue;
1531 }
1532 if ((error = flk_add_edge(request, lock,
1533 !found_covering_lock, 0)))
1534 return (error);
1535 }
1536 lock = lock->l_next;
1537 } while (lock->l_vnode == vp);
1538 }
1539
1540 /*
1541 * found_covering_lock == 2 iff at this point 'request' has paths to
1542 * all locks that block 'request'. found_covering_lock == 1 iff at
1543 * this point 'request' has paths to all locks that block 'request'
1544 * whose owners are not the same as the one that covers 'request'
1545 * (covered_by above), and we can have locks whose owner is the same
1546 * as covered_by in the active list.
1547 */
1548
1549 if (request_blocked_by_active && found_covering_lock != 2) {
1550 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1551 ASSERT(lock != NULL);
1552 do {
1553 if (BLOCKS(lock, request)) {
1554 if (found_covering_lock &&
1555 !SAME_OWNER(lock, covered_by)) {
1556 lock = lock->l_next;
1557 continue;
1558 }
1559 if ((error = flk_add_edge(request, lock,
1560 CHECK_CYCLE, 0)))
1561 return (error);
1562 }
1563 lock = lock->l_next;
1564 } while (lock->l_vnode == vp);
1565 }
1566
1600 vnode_t *vp = request->l_vnode;
1601 lock_descriptor_t *lock, *lock1;
1602 int done_searching = 0;
1603
1604 CHECK_SLEEPING_LOCKS(gp);
1605 CHECK_ACTIVE_LOCKS(gp);
1606
1607 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1608
1609 flk_set_state(request, FLK_START_STATE);
1610
1611 ASSERT(NOT_BLOCKED(request));
1612
1613 /* IO_LOCK requests are only to check status */
1614
1615 if (IS_IO_LOCK(request))
1616 return (0);
1617
1618 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1619
1620 if (lock != NULL) {
1621 /*
1622 * There are some active locks so check for relations
1623 */
1624 do {
1625 lock1 = lock->l_next;
1626 if (SAME_OWNER(request, lock)) {
1627 done_searching = flk_relation(lock, request);
1628 }
1629 lock = lock1;
1630 } while (lock->l_vnode == vp && !done_searching);
1631 }
1632
1633 /*
1634 * insert in active queue
1635 */
1636
1637 if (request->l_type != F_UNLCK)
1638 flk_insert_active_lock(request);
1639
1640 return (0);
1641 }
1642
1643 /*
1644 * 'request' is blocked by someone, therefore we put it into the sleep queue.
1645 */
1646 static int
1647 flk_wait_execute_request(lock_descriptor_t *request)
1648 {
1649 graph_t *gp = request->l_graph;
1650 callb_cpr_t *cprp; /* CPR info from callback */
1651 struct flock_globals *fg;
1876
1877 ASSERT(flk_edge_cache != NULL);
1878
1879 ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1880 edge_allocs++;
1881 return (ep);
1882 }
1883
1884 /*
1885 * Free the edge structure.
1886 */
1887
1888 static void
1889 flk_free_edge(edge_t *ep)
1890 {
1891 edge_frees++;
1892 kmem_cache_free(flk_edge_cache, (void *)ep);
1893 }
1894
1895 /*
1896 * Check the relationship of 'request' with 'lock' and perform the
1897 * recomputation of dependencies, break 'lock' if required, and return
1898 * 1 if 'request' cannot have any more relationship with the next
1899 * active locks.
1900 *
1901 * The 'lock' and 'request' are compared and in case of overlap we
1902 * delete the 'lock' and form new locks to represent the non-overlapped
1903 * portion of the original 'lock'. This function has side effects: 'lock'
1904 * will be freed and new locks will be added to the active list.
1905 */
1906
1907 static int
1908 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1909 {
1910 int lock_effect;
1911 lock_descriptor_t *topology[3];
1912 int nvertex = 0;
1913 int i;
1914 edge_t *ep;
1915 graph_t *gp = lock->l_graph;
1916 boolean_t mergeable;
1917
1918 ASSERT(request->l_blocker == 0);
1919
1920 CHECK_SLEEPING_LOCKS(gp);
1921 CHECK_ACTIVE_LOCKS(gp);
1922
1923 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1924
1925 topology[0] = topology[1] = topology[2] = NULL;
1926
1927 if (request->l_type == F_UNLCK)
1928 lock_effect = FLK_UNLOCK;
1929 else if (request->l_type == F_RDLCK &&
1930 lock->l_type == F_WRLCK)
1931 lock_effect = FLK_DOWNGRADE;
1932 else if (request->l_type == F_WRLCK &&
1933 lock->l_type == F_RDLCK)
1934 lock_effect = FLK_UPGRADE;
1935 else
1936 lock_effect = FLK_STAY_SAME;
1937
1938 /*
1939 * The 'lock' and 'request' are merged only in the case that the effect
1940 * of both locks is the same (FLK_STAY_SAME) and their blocker status
1941 * (l_blocker) is the same as well. We do not merge 'lock' and 'request'
1942 * with different l_blocker values because such a merge might affect the
1943 * stale lock detection: it might cause either false positives or missed
1944 * stale locks.
1945 */
1946 mergeable = lock_effect == FLK_STAY_SAME &&
1947 lock->l_blocker == request->l_blocker;
1948
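/*
 * For example (illustrative values): two read locks from the same owner
 * covering [0, 49] ('lock') and [50, 99] ('request'), with equal
 * l_blocker values, are adjacent and mergeable, so 'request' is simply
 * widened to [0, 99] before the recomputation below.
 */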
1949 if (lock->l_end < request->l_start) {
1950 /* If the 'lock' is just next to 'request', try to merge them */
1951 if (lock->l_end == request->l_start - 1 && mergeable) {
1952 request->l_start = lock->l_start;
1953 goto recompute;
1954 }
1955
1956 /* Otherwise, they do not overlap, so return immediately */
1957 return (0);
1958 }
1959
1960 if (request->l_end < lock->l_start) {
1961 /* If the 'request' is just next to 'lock', try to merge them */
1962 if (request->l_end == lock->l_start - 1 && mergeable) {
1963 request->l_end = lock->l_end;
1964 goto recompute;
1965 }
1966
1967 /* Otherwise, they do not overlap, so return immediately */
1968 return (1);
1969 }
1970
1971 /*
1972 * Here we are sure the 'lock' and 'request' overlap, so the 'request'
1973 * will replace the 'lock' (either fully, or at least partially).
1974 */
1975
1976 /*
1977 * If the 'request' does not fully cover the 'lock' at the start,
1978 * either move the start of the 'request' to cover the 'lock', or split
1979 * the 'lock'.
1980 */
1981 if (lock->l_start < request->l_start) {
1982 if (mergeable) {
1983 request->l_start = lock->l_start;
1984 } else {
1985 lock_descriptor_t *new_lock = flk_get_lock();
1986
1987 COPY(new_lock, lock);
1988 new_lock->l_end = request->l_start - 1;
1989
1990 topology[nvertex++] = new_lock;
1991 }
1992 }
1993
1994 /*
1995 * If the 'request' does not fully cover the 'lock' at the end, either
1996 * move the end of the 'request' to cover the 'lock', or split the
1997 * 'lock'.
1998 */
1999 if (request->l_end < lock->l_end) {
2000 if (mergeable) {
2001 request->l_end = lock->l_end;
2002 } else {
2003 lock_descriptor_t *new_lock = flk_get_lock();
2004
2005 COPY(new_lock, lock);
2006 new_lock->l_start = request->l_end + 1;
2007
2008 topology[nvertex++] = new_lock;
2009 }
2010 }
2011
2012 /*
2013 * Log the blocker change
2014 */
2015 if (nvertex > 0 && lock->l_blocker < 0) {
2016 if (nvertex == 1)
2017 flk_stale_lock_shrink(lock, topology[0]);
2018 if (nvertex == 2)
2019 flk_stale_lock_split(lock, topology[0], topology[1]);
2020
2021 lock->l_blocker = 0;
2022 }
2023
2024 recompute:
2025 /*
2026 * For unlock we don't send the 'request' for recomputing
2027 * dependencies because no lock will add an edge to it.
2028 */
2029 if (lock_effect != FLK_UNLOCK)
2030 topology[nvertex++] = request;
2031
2032 for (i = 0; i < nvertex; i++) {
2033 topology[i]->l_state |= RECOMPUTE_LOCK;
2034 topology[i]->l_color = NO_COLOR;
2035 }
2036
2037 ASSERT(FIRST_ADJ(lock) == HEAD(lock));
2038
2039 /*
2040 * we remove each incoming edge of this vertex 'lock' from the
2041 * adjacency list of the vertex it comes from.
2042 */
2043 ep = FIRST_IN(lock);
2044 while (ep != HEAD(lock)) {
2045 ADJ_LIST_REMOVE(ep);
2046 ep = NEXT_IN(ep);
2047 }
2048
2049 flk_delete_active_lock(lock, 0);
2050
2051 /* We are ready for recomputing the dependencies now */
2052 flk_recompute_dependencies(lock, topology, nvertex, 1);
2053
2054 for (i = 0; i < nvertex; i++) {
2055 topology[i]->l_state &= ~RECOMPUTE_LOCK;
2056 topology[i]->l_color = NO_COLOR;
2057 }
2058
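/*
 * Reinsert the remnants of the original 'lock' into the active list.
 * For FLK_UNLOCK the 'request' was never appended to the topology
 * above, so nvertex is bumped to make the loop cover every remnant;
 * otherwise the loop stops one short, skipping the final slot that
 * holds the 'request' itself.
 */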
2059 if (lock_effect == FLK_UNLOCK) {
2060 nvertex++;
2061 }
2062 for (i = 0; i < nvertex - 1; i++) {
2063 flk_insert_active_lock(topology[i]);
2064 }
2065
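/*
 * If the 'lock' went away or became weaker (unlock or downgrade),
 * wake up the requests that were blocked on it so they can retry.
 * Otherwise just detach and free its remaining in-edges, updating
 * the process graph as each edge is removed.
 */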
2066 if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
2067 flk_wakeup(lock, 0);
2068 } else {
2069 ep = FIRST_IN(lock);
2070 while (ep != HEAD(lock)) {
2071 lock->l_sedge = NEXT_IN(ep);
2072 IN_LIST_REMOVE(ep);
2073 flk_update_proc_graph(ep, 1);
2074 flk_free_edge(ep);
2075 ep = lock->l_sedge;
2076 }
2077 }
2078 flk_free_lock(lock);
2079
2080 CHECK_SLEEPING_LOCKS(gp);
2081 CHECK_ACTIVE_LOCKS(gp);
2082 return (0);
2083 }
2084
2085 /*
2124 /*
2125 * Delete the active lock: depending on the value of the second
2126 * parameter, either just remove the lock from the active lists,
2127 * or remove it and free it as well.
2128 */
2129
2130 static void
2131 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
2132 {
2133 vnode_t *vp = lock->l_vnode;
2134 graph_t *gp = lock->l_graph;
2135
2136 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2137 if (free_lock)
2138 ASSERT(NO_DEPENDENTS(lock));
2139 ASSERT(NOT_BLOCKED(lock));
2140 ASSERT(IS_ACTIVE(lock));
2141
2142 ASSERT((vp->v_filocks != NULL));
2143
2144 if (lock->l_blocker < 0) {
2145 /* Log the blocker release */
2146 flk_stale_lock_release(lock);
2147 lock->l_blocker = 0;
2148 }
2149
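/*
 * If this lock is at the head of the vnode's list of file locks,
 * advance v_filocks to the next lock on the same vnode, or clear it
 * when no such lock remains.
 */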
2150 if (vp->v_filocks == (struct filock *)lock) {
2151 vp->v_filocks = (struct filock *)
2152 ((lock->l_next->l_vnode == vp) ? lock->l_next :
2153 NULL);
2154 }
2155 lock->l_next->l_prev = lock->l_prev;
2156 lock->l_prev->l_next = lock->l_next;
2157 lock->l_next = lock->l_prev = NULL;
2158 flk_set_state(lock, FLK_DEAD_STATE);
2159 lock->l_state &= ~ACTIVE_LOCK;
2160
2161 if (free_lock)
2162 flk_free_lock(lock);
2163 CHECK_ACTIVE_LOCKS(gp);
2164 CHECK_SLEEPING_LOCKS(gp);
2165 }
2166
2167 /*
2168 * Insert into the sleep queue.
2169 */
2298 request->l_sedge = NEXT_ADJ(ep);
2299 ADJ_LIST_REMOVE(ep);
2300 flk_update_proc_graph(ep, 1);
2301 flk_free_edge(ep);
2302 ep = request->l_sedge;
2303 }
2304
2305
2306 /*
2307 * unset the RECOMPUTE flag in those vertices
2308 */
2309
2310 for (i = 0; i < nvertex; i++) {
2311 topology[i]->l_state &= ~RECOMPUTE_LOCK;
2312 }
2313
2314 /*
2315 * free the topology
2316 */
2317 if (nvertex)
2318 kmem_free(topology,
2319 (nvertex * sizeof (lock_descriptor_t *)));
2320 /*
2321 * Some sleeping requests may have become unblocked now, so wake them up.
2322 */
2323
2324 flk_wakeup(request, 0);
2325
2326 /*
2327 * The dependency graph should be correctly recomputed at this point.
2328 */
2329 flk_set_state(request, FLK_DEAD_STATE);
2330 flk_free_lock(request);
2331 CHECK_SLEEPING_LOCKS(gp);
2332 CHECK_ACTIVE_LOCKS(gp);
2333
2334 }
2335
2336 /*
2337 * Uncoloring the graph simply increments the graph's mark value; only
2338 * when the mark wraps around do we explicitly color all vertices in
3234 flk_wakeup(lock, 1);
3235 flk_free_lock(lock);
3236 }
3237
3238 CHECK_SLEEPING_LOCKS(gp);
3239 CHECK_ACTIVE_LOCKS(gp);
3240 CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
3241 mutex_exit(&gp->gp_mutex);
3242 }
3243
3244
3245 /*
3246 * Called from 'fs' read and write routines for files that have mandatory
3247 * locking enabled.
3248 */
3249
3250 int
3251 chklock(struct vnode *vp, int iomode, u_offset_t offset, ssize_t len, int fmode,
3252 caller_context_t *ct)
3253 {
3254 int i;
3255 struct flock64 bf;
3256 int error = 0;
3257
3258 bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
3259 bf.l_whence = 0;
3260 bf.l_start = offset;
3261 bf.l_len = len;
3262 if (ct == NULL) {
3263 bf.l_pid = curproc->p_pid;
3264 bf.l_sysid = 0;
3265 } else {
3266 bf.l_pid = ct->cc_pid;
3267 bf.l_sysid = ct->cc_sysid;
3268 }
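/*
 * Ask reclock() whether a conflicting record lock covers the I/O
 * range described by 'bf'. When FNDELAY or FNONBLOCK is set the check
 * does not sleep on a conflict (no SLPFLCK). If anything other than
 * F_UNLCK comes back in bf.l_type, the range is locked, so fail with
 * the reclock() error, or with EAGAIN when reclock() itself succeeded.
 */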
3269 i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
3270 if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
3271 bf.l_type != F_UNLCK)
3272 error = i ? i : EAGAIN;
3273 return (error);
3274 }
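/*
 * For illustration only (not part of this file): a filesystem read
 * routine would typically probe for conflicting mandatory locks
 * before starting the transfer, roughly as follows ('uiop' and the
 * cached attributes 'va' are assumed caller state):
 *
 *	if (MANDLOCK(vp, va.va_mode)) {
 *		error = chklock(vp, FREAD, uiop->uio_loffset,
 *		    uiop->uio_resid, uiop->uio_fmode, ct);
 *		if (error != 0)
 *			return (error);
 *	}
 */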
3565 */
3566
3567 static proc_edge_t *
3568 flk_get_proc_edge()
3569 {
3570 proc_edge_t *pep;
3571
3572 pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
3573 flk_proc_edge_allocs++;
3574 return (pep);
3575 }
3576
3577 /*
3578 * Free the proc edge. Called whenever its reference count goes to zero.
3579 */
3580
3581 static void
3582 flk_free_proc_edge(proc_edge_t *pep)
3583 {
3584 ASSERT(pep->refcount == 0);
3585 kmem_free(pep, sizeof (proc_edge_t));
3586 flk_proc_edge_frees++;
3587 }
3588
3589 /*
3590 * Uncolor the proc graph; explicit recoloring is done only when the mark value hits its max.
3591 */
3592
3593 static void
3594 flk_proc_graph_uncolor()
3595 {
3596 int i;
3597
3598 if (pgraph.mark == UINT_MAX) {
3599 for (i = 0; i < pgraph.gcount; i++)
3600 if (pgraph.proc[i] != NULL) {
3601 pgraph.proc[i]->atime = 0;
3602 pgraph.proc[i]->dtime = 0;
3603 }
3604 pgraph.mark = 1;
3605 } else {
4307 continue;
4308 }
4309
4310 mutex_enter(&gp->gp_mutex);
4311 fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
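/*
 * Walk this graph's active list and tear down every lock manager
 * lock that belongs to the zone being shut down, waking up anything
 * that was blocked on it.
 */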
4312 for (lock = ACTIVE_HEAD(gp)->l_next;
4313 lock != ACTIVE_HEAD(gp);
4314 lock = nlock) {
4315 nlock = lock->l_next;
4316 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
4317 ASSERT(IS_ACTIVE(lock));
4318 flk_delete_active_lock(lock, 0);
4319 flk_wakeup(lock, 1);
4320 flk_free_lock(lock);
4321 }
4322 }
4323 mutex_exit(&gp->gp_mutex);
4324 }
4325 }
4326
4327 /*
4328 * Wait until a lock is granted, cancelled, or interrupted.
4329 */
4330
4331 static void
4332 wait_for_lock(lock_descriptor_t *request)
4333 {
4334 graph_t *gp = request->l_graph;
4335 vnode_t *vp = request->l_vnode;
4336
4337 ASSERT(MUTEX_HELD(&gp->gp_mutex));
4338
4339 while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
4340 !(IS_INTERRUPTED(request))) {
4341 lock_descriptor_t *lock;
4342
4343 if (stale_lock_timeout == 0) {
4344 /* The stale lock detection is disabled */
4345 if (cv_wait_sig(&request->l_cv, &gp->gp_mutex) == 0) {
4346 flk_set_state(request, FLK_INTERRUPTED_STATE);
4347 request->l_state |= INTERRUPTED_LOCK;
4348 }
4349
4350 continue;
4351 }
4352
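/*
 * Stale lock detection is enabled: find an active lock on this vnode
 * that blocks the request and check whether it has gone stale, then
 * wait with a timeout so the check is repeated periodically.
 */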
4353 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4354
4355 if (lock != NULL) {
4356 do {
4357 if (BLOCKS(lock, request)) {
4358 flk_stale_lock_check(lock);
4359 break;
4360 }
4361 lock = lock->l_next;
4362 } while (lock->l_vnode == vp);
4363 }
4364
4365 if (cv_timedwait_sig(&request->l_cv, &gp->gp_mutex,
4366 ddi_get_lbolt() + SEC_TO_TICK(stale_lock_timeout)) == 0) {
4367 flk_set_state(request, FLK_INTERRUPTED_STATE);
4368 request->l_state |= INTERRUPTED_LOCK;
4369 }
4370 }
4371 }
4372
4373 /*
4374 * Create an flock structure from the existing lock information.
4375 *
4376 * This routine is used to create flock structures for the lock manager
4377 * to use in a reclaim request. Since the lock originated on this
4378 * host, it must conform to UNIX semantics, so no checking is done
4379 * to make sure it falls within the lower half of the 32-bit range.
4380 */
4381
4382 static void
4383 create_flock(lock_descriptor_t *lp, flock64_t *flp)
4384 {
4385 ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
4386 ASSERT(lp->l_end >= lp->l_start);
4387
4388 flp->l_type = lp->l_type;
4389 flp->l_whence = 0;
4390 flp->l_start = lp->l_start;
4402
4403 int
4404 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
4405 u_offset_t *start, u_offset_t *end, offset_t offset)
4406 {
4407 struct vattr vattr;
4408 int error;
4409
4410 /*
4411 * Determine the starting point of the request
4412 */
4413 switch (flp->l_whence) {
4414 case 0: /* SEEK_SET */
4415 *start = (u_offset_t)flp->l_start;
4416 break;
4417 case 1: /* SEEK_CUR */
4418 *start = (u_offset_t)(flp->l_start + offset);
4419 break;
4420 case 2: /* SEEK_END */
4421 vattr.va_mask = AT_SIZE;
4422 if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
4423 return (error);
4424 *start = (u_offset_t)(flp->l_start + vattr.va_size);
4425 break;
4426 default:
4427 return (EINVAL);
4428 }
4429
4430 /*
4431 * Determine the range covered by the request.
4432 */
4433 if (flp->l_len == 0)
4434 *end = MAX_U_OFFSET_T;
4435 else if ((offset_t)flp->l_len > 0) {
4436 *end = (u_offset_t)(*start + (flp->l_len - 1));
4437 } else {
4438 /*
4439 * Negative length; why do we even allow this?
4440 * Because it allows easy specification of
4441 * the last n bytes of the file.
4442 */
4443 *end = *start;
4444 *start += (u_offset_t)flp->l_len;
4445 (*start)++;
4446 }
4447 return (0);
4448 }
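/*
 * Worked example (illustration only): for a 100-byte file and a
 * request with l_whence == 2 (SEEK_END) and l_start == -10, the
 * starting point above becomes offset 90, and then:
 *
 *	l_len == 0	*start = 90, *end = MAX_U_OFFSET_T (to EOF)
 *	l_len == 5	*start = 90, *end = 94
 *	l_len == -5	*start = 86, *end = 90
 *			(the five bytes ending at offset 90)
 */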
4449
4450 /*
4451 * Check the validity of lock data. This can be used by the NFS
4452 * frlock routines to check data before contacting the server. The
4453 * server must support semantics that aren't as restrictive as
4454 * the UNIX API, so the NFS client is required to check.
4455 * The maximum is passed in by the caller.
4456 */
4457
4458 int
4459 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
4460 {
4461 /*
4462 * The end (length) for local locking should never be greater
4463 * than max. However, the representation for
4464 * the entire file is MAX_U_OFFSET_T.
4465 */
4466 if ((start > max) ||
4467 ((end > max) && (end != MAX_U_OFFSET_T))) {
4468 return (EINVAL);
4469 }
4470 if (start > end) {
4471 return (EINVAL);
4472 }
4473 return (0);
4474 }
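/*
 * For illustration only (not part of this file): an NFS frlock
 * routine would typically convert the caller's flock64 into a
 * [start, end] range and validate it before contacting the server:
 *
 *	if ((error = flk_convert_lock_data(vp, bfp, &start, &end,
 *	    offset)) != 0)
 *		return (error);
 *	if ((error = flk_check_lock_data(start, end, MAXEND)) != 0)
 *		return (error);
 *
 * The maximum passed in (MAXEND here) depends on what offsets the
 * particular client and protocol version can represent.
 */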
4475
4476 /*
4477 * Fill in request->l_flock with information about the lock blocking the
4478 * request. The complexity here is that lock manager requests are allowed
4479 * to see into the upper part of the 32-bit address range, whereas local
4480 * requests are only allowed to see signed values.
4481 *
4482 * What should be done when "blocker" is a lock manager lock that uses the
4483 * upper portion of the 32-bit range, but "request" is local? Since the