Print this page
14619 Race between udp_activate() and conn_get_socket_info()
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/inet/ip/ipclassifier.c
+++ new/usr/src/uts/common/inet/ip/ipclassifier.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
|
↓ open down ↓ |
13 lines elided |
↑ open up ↑ |
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
24 - * Copyright 2020 Joyent, Inc.
24 + * Copyright 2022 Joyent, Inc.
25 25 */
26 26
27 27 /*
28 28 * IP PACKET CLASSIFIER
29 29 *
30 30 * The IP packet classifier provides mapping between IP packets and persistent
31 31 * connection state for connection-oriented protocols. It also provides
32 32 * interface for managing connection states.
33 33 *
34 34 * The connection state is kept in conn_t data structure and contains, among
35 35 * other things:
36 36 *
37 37 * o local/remote address and ports
38 38 * o Transport protocol
39 39 * o squeue for the connection (for TCP only)
40 40 * o reference counter
41 41 * o Connection state
42 42 * o hash table linkage
43 43 * o interface/ire information
44 44 * o credentials
45 45 * o ipsec policy
46 46 * o send and receive functions.
47 47 * o mutex lock.
48 48 *
49 49 * Connections use a reference counting scheme. They are freed when the
50 50 * reference counter drops to zero. A reference is incremented when connection
51 51 * is placed in a list or table, when incoming packet for the connection arrives
52 52 * and when connection is processed via squeue (squeue processing may be
53 53 * asynchronous and the reference protects the connection from being destroyed
54 54 * before its processing is finished).
55 55 *
56 56 * conn_recv is used to pass up packets to the ULP.
57 57 * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
58 58 * a listener, and changes to tcp_input_listener as the listener has picked a
59 59 * good squeue. For other cases it is set to tcp_input_data.
60 60 *
61 61 * conn_recvicmp is used to pass up ICMP errors to the ULP.
62 62 *
63 63 * Classifier uses several hash tables:
64 64 *
65 65 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state
66 66 * ipcl_bind_fanout: contains all connections in BOUND state
67 67 * ipcl_proto_fanout: IPv4 protocol fanout
68 68 * ipcl_proto_fanout_v6: IPv6 protocol fanout
69 69 * ipcl_udp_fanout: contains all UDP connections
70 70 * ipcl_iptun_fanout: contains all IP tunnel connections
71 71 * ipcl_globalhash_fanout: contains all connections
72 72 *
73 73 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
74 74 * which need to view all existing connections.
75 75 *
76 76 * All tables are protected by per-bucket locks. When both per-bucket lock and
77 77 * connection lock need to be held, the per-bucket lock should be acquired
78 78 * first, followed by the connection lock.
79 79 *
80 80 * All functions doing search in one of these tables increment a reference
81 81 * counter on the connection found (if any). This reference should be dropped
82 82 * when the caller has finished processing the connection.
83 83 *
84 84 *
85 85 * INTERFACES:
86 86 * ===========
87 87 *
88 88 * Connection Lookup:
89 89 * ------------------
90 90 *
91 91 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
92 92 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
93 93 *
94 94 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
95 95 * it can't find any associated connection. If the connection is found, its
96 96 * reference counter is incremented.
97 97 *
98 98 * mp: mblock, containing packet header. The full header should fit
99 99 * into a single mblock. It should also contain at least full IP
100 100 * and TCP or UDP header.
101 101 *
102 102 * protocol: Either IPPROTO_TCP or IPPROTO_UDP.
103 103 *
104 104 * hdr_len: The size of IP header. It is used to find TCP or UDP header in
105 105 * the packet.
106 106 *
107 107 * ira->ira_zoneid: The zone in which the returned connection must be; the
108 108 * zoneid corresponding to the ire_zoneid on the IRE located for
109 109 * the packet's destination address.
110 110 *
111 111 * ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
112 112 * IRAF_TX_SHARED_ADDR flags
113 113 *
114 114 * For TCP connections, the lookup order is as follows:
115 115 * 5-tuple {src, dst, protocol, local port, remote port}
116 116 * lookup in ipcl_conn_fanout table.
117 117 * 3-tuple {dst, remote port, protocol} lookup in
118 118 * ipcl_bind_fanout table.
119 119 *
120 120 * For UDP connections, a 5-tuple {src, dst, protocol, local port,
121 121 * remote port} lookup is done on ipcl_udp_fanout. Note that,
122 122 * these interfaces do not handle cases where a packets belongs
123 123 * to multiple UDP clients, which is handled in IP itself.
124 124 *
125 125 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
126 126 * determine which actual zone gets the segment. This is used only in a
127 127 * labeled environment. The matching rules are:
128 128 *
129 129 * - If it's not a multilevel port, then the label on the packet selects
130 130 * the zone. Unlabeled packets are delivered to the global zone.
131 131 *
132 132 * - If it's a multilevel port, then only the zone registered to receive
133 133 * packets on that port matches.
134 134 *
135 135 * Also, in a labeled environment, packet labels need to be checked. For fully
136 136 * bound TCP connections, we can assume that the packet label was checked
137 137 * during connection establishment, and doesn't need to be checked on each
138 138 * packet. For others, though, we need to check for strict equality or, for
139 139 * multilevel ports, membership in the range or set. This part currently does
140 140 * a tnrh lookup on each packet, but could be optimized to use cached results
141 141 * if that were necessary. (SCTP doesn't come through here, but if it did,
142 142 * we would apply the same rules as TCP.)
143 143 *
144 144 * An implication of the above is that fully-bound TCP sockets must always use
145 145 * distinct 4-tuples; they can't be discriminated by label alone.
146 146 *
147 147 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
148 148 * as there's no connection set-up handshake and no shared state.
149 149 *
150 150 * Labels on looped-back packets within a single zone do not need to be
151 151 * checked, as all processes in the same zone have the same label.
152 152 *
153 153 * Finally, for unlabeled packets received by a labeled system, special rules
154 154 * apply. We consider only the MLP if there is one. Otherwise, we prefer a
155 155 * socket in the zone whose label matches the default label of the sender, if
156 156 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the
157 157 * receiver's label must dominate the sender's default label.
158 158 *
159 159 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
160 160 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
161 161 * ip_stack);
162 162 *
163 163 * Lookup routine to find a exact match for {src, dst, local port,
164 164 * remote port) for TCP connections in ipcl_conn_fanout. The address and
165 165 * ports are read from the IP and TCP header respectively.
166 166 *
167 167 * conn_t *ipcl_lookup_listener_v4(lport, laddr, protocol,
168 168 * zoneid, ip_stack);
169 169 * conn_t *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
170 170 * zoneid, ip_stack);
171 171 *
172 172 * Lookup routine to find a listener with the tuple {lport, laddr,
173 173 * protocol} in the ipcl_bind_fanout table. For IPv6, an additional
174 174 * parameter interface index is also compared.
175 175 *
176 176 * void ipcl_walk(func, arg, ip_stack)
177 177 *
178 178 * Apply 'func' to every connection available. The 'func' is called as
179 179 * (*func)(connp, arg). The walk is non-atomic so connections may be
180 180 * created and destroyed during the walk. The CONN_CONDEMNED and
181 181 * CONN_INCIPIENT flags ensure that connections which are newly created
182 182 * or being destroyed are not selected by the walker.
183 183 *
184 184 * Table Updates
185 185 * -------------
186 186 *
187 187 * int ipcl_conn_insert(connp);
188 188 * int ipcl_conn_insert_v4(connp);
189 189 * int ipcl_conn_insert_v6(connp);
190 190 *
191 191 * Insert 'connp' in the ipcl_conn_fanout.
192 192 * Arguments :
193 193 * connp conn_t to be inserted
194 194 *
195 195 * Return value :
196 196 * 0 if connp was inserted
197 197 * EADDRINUSE if the connection with the same tuple
198 198 * already exists.
199 199 *
200 200 * int ipcl_bind_insert(connp);
201 201 * int ipcl_bind_insert_v4(connp);
202 202 * int ipcl_bind_insert_v6(connp);
203 203 *
204 204 * Insert 'connp' in ipcl_bind_fanout.
205 205 * Arguments :
206 206 * connp conn_t to be inserted
207 207 *
208 208 *
209 209 * void ipcl_hash_remove(connp);
210 210 *
211 211 * Removes the 'connp' from the connection fanout table.
212 212 *
213 213 * Connection Creation/Destruction
214 214 * -------------------------------
215 215 *
216 216 * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
217 217 *
218 218 * Creates a new conn based on the type flag, inserts it into
219 219 * globalhash table.
220 220 *
221 221 * type: This flag determines the type of conn_t which needs to be
222 222 * created i.e., which kmem_cache it comes from.
223 223 * IPCL_TCPCONN indicates a TCP connection
224 224 * IPCL_SCTPCONN indicates a SCTP connection
225 225 * IPCL_UDPCONN indicates a UDP conn_t.
226 226 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t.
227 227 * IPCL_RTSCONN indicates a RTS conn_t.
228 228 * IPCL_IPCCONN indicates all other connections.
229 229 *
230 230 * void ipcl_conn_destroy(connp)
231 231 *
232 232 * Destroys the connection state, removes it from the global
233 233 * connection hash table and frees its memory.
234 234 */
235 235
236 236 #include <sys/types.h>
237 237 #include <sys/stream.h>
238 238 #include <sys/stropts.h>
239 239 #include <sys/sysmacros.h>
240 240 #include <sys/strsubr.h>
241 241 #include <sys/strsun.h>
242 242 #define _SUN_TPI_VERSION 2
243 243 #include <sys/ddi.h>
244 244 #include <sys/cmn_err.h>
245 245 #include <sys/debug.h>
246 246
247 247 #include <sys/systm.h>
248 248 #include <sys/param.h>
249 249 #include <sys/kmem.h>
250 250 #include <sys/isa_defs.h>
251 251 #include <inet/common.h>
252 252 #include <netinet/ip6.h>
253 253 #include <netinet/icmp6.h>
254 254
255 255 #include <inet/ip.h>
256 256 #include <inet/ip_if.h>
257 257 #include <inet/ip_ire.h>
258 258 #include <inet/ip6.h>
259 259 #include <inet/ip_ndp.h>
260 260 #include <inet/ip_impl.h>
261 261 #include <inet/udp_impl.h>
262 262 #include <inet/sctp_ip.h>
263 263 #include <inet/sctp/sctp_impl.h>
264 264 #include <inet/rawip_impl.h>
265 265 #include <inet/rts_impl.h>
266 266 #include <inet/iptun/iptun_impl.h>
267 267
268 268 #include <sys/cpuvar.h>
269 269
270 270 #include <inet/ipclassifier.h>
271 271 #include <inet/tcp.h>
272 272 #include <inet/ipsec_impl.h>
273 273
274 274 #include <sys/tsol/tnet.h>
275 275 #include <sys/sockio.h>
276 276
277 277 /* Old value for compatibility. Setable in /etc/system */
278 278 uint_t tcp_conn_hash_size = 0;
279 279
280 280 /* New value. Zero means choose automatically. Setable in /etc/system */
281 281 uint_t ipcl_conn_hash_size = 0;
282 282 uint_t ipcl_conn_hash_memfactor = 8192;
283 283 uint_t ipcl_conn_hash_maxsize = 82500;
284 284
285 285 /* bind/udp fanout table size */
286 286 uint_t ipcl_bind_fanout_size = 512;
287 287 uint_t ipcl_udp_fanout_size = 16384;
288 288
289 289 /* Raw socket fanout size. Must be a power of 2. */
290 290 uint_t ipcl_raw_fanout_size = 256;
291 291
292 292 /*
293 293 * The IPCL_IPTUN_HASH() function works best with a prime table size. We
294 294 * expect that most large deployments would have hundreds of tunnels, and
295 295 * thousands in the extreme case.
296 296 */
297 297 uint_t ipcl_iptun_fanout_size = 6143;
298 298
299 299 /*
300 300 * Power of 2^N Primes useful for hashing for N of 0-28,
301 301 * these primes are the nearest prime <= 2^N - 2^(N-2).
302 302 */
303 303
304 304 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \
305 305 6143, 12281, 24571, 49139, 98299, 196597, 393209, \
306 306 786431, 1572853, 3145721, 6291449, 12582893, 25165813, \
307 307 50331599, 100663291, 201326557, 0}
308 308
309 309 /*
310 310 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
311 311 * are aligned on cache lines.
312 312 */
313 313 typedef union itc_s {
314 314 conn_t itc_conn;
315 315 char itcu_filler[CACHE_ALIGN(conn_s)];
316 316 } itc_t;
317 317
318 318 struct kmem_cache *tcp_conn_cache;
319 319 struct kmem_cache *ip_conn_cache;
320 320 extern struct kmem_cache *sctp_conn_cache;
321 321 struct kmem_cache *udp_conn_cache;
322 322 struct kmem_cache *rawip_conn_cache;
323 323 struct kmem_cache *rts_conn_cache;
324 324
325 325 extern void tcp_timermp_free(tcp_t *);
326 326 extern mblk_t *tcp_timermp_alloc(int);
327 327
328 328 static int ip_conn_constructor(void *, void *, int);
329 329 static void ip_conn_destructor(void *, void *);
330 330
331 331 static int tcp_conn_constructor(void *, void *, int);
332 332 static void tcp_conn_destructor(void *, void *);
333 333
334 334 static int udp_conn_constructor(void *, void *, int);
335 335 static void udp_conn_destructor(void *, void *);
336 336
337 337 static int rawip_conn_constructor(void *, void *, int);
338 338 static void rawip_conn_destructor(void *, void *);
339 339
340 340 static int rts_conn_constructor(void *, void *, int);
341 341 static void rts_conn_destructor(void *, void *);
342 342
/*
 * Global (for all stack instances) init routine
 *
 * Creates the kmem caches backing each conn_t flavor.  Every object is
 * an itc_t (a cache-line aligned conn_t) followed by the protocol
 * private state (tcp_t, udp_t, icmp_t or rts_t); ip_conn_cache holds
 * bare conn_t objects with no trailing state.  Only the TCP cache
 * installs a reclaim callback (tcp_conn_reclaim) so cached connections
 * can be released under memory pressure.
 */
void
ipcl_g_init(void)
{
	ip_conn_cache = kmem_cache_create("ip_conn_cache",
	    sizeof (conn_t), CACHE_ALIGN_SIZE,
	    ip_conn_constructor, ip_conn_destructor,
	    NULL, NULL, NULL, 0);

	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
	    tcp_conn_constructor, tcp_conn_destructor,
	    tcp_conn_reclaim, NULL, NULL, 0);

	udp_conn_cache = kmem_cache_create("udp_conn_cache",
	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
	    udp_conn_constructor, udp_conn_destructor,
	    NULL, NULL, NULL, 0);

	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
	    rawip_conn_constructor, rawip_conn_destructor,
	    NULL, NULL, NULL, 0);

	rts_conn_cache = kmem_cache_create("rts_conn_cache",
	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
	    rts_conn_constructor, rts_conn_destructor,
	    NULL, NULL, NULL, 0);
}
374 374
/*
 * ipclassifier initialization routine, sets up hash tables.
 *
 * Sizes the TCP conn fanout from /etc/system overrides (or from
 * available memory), then allocates and initializes every per-stack
 * fanout table and its per-bucket locks.
 */
void
ipcl_init(ip_stack_t *ipst)
{
	int i;
	int sizes[] = P2Ps();

	/*
	 * Calculate size of conn fanout table from /etc/system settings
	 */
	if (ipcl_conn_hash_size != 0) {
		ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
	} else if (tcp_conn_hash_size != 0) {
		/* Old tunable honored for compatibility */
		ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
	} else {
		extern pgcnt_t freemem;

		/* Scale with free memory, capped at the tunable maximum */
		ipst->ips_ipcl_conn_fanout_size =
		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;

		if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
			ipst->ips_ipcl_conn_fanout_size =
			    ipcl_conn_hash_maxsize;
		}
	}

	/*
	 * Round the computed size up to a prime from the P2Ps() table.
	 * The scan starts at index 9 (383), which acts as the minimum
	 * fanout size.
	 */
	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
		if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
			break;
		}
	}
	if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
		/* Out of range, use the 2^16 value */
		ipst->ips_ipcl_conn_fanout_size = sizes[16];
	}

	/* Take values from /etc/system */
	ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
	ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
	ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
	ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;

	ASSERT(ipst->ips_ipcl_conn_fanout == NULL);

	/* TCP connected (5-tuple) fanout */
	ipst->ips_ipcl_conn_fanout = kmem_zalloc(
	    ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);

	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	/* Bound/listener fanout */
	ipst->ips_ipcl_bind_fanout = kmem_zalloc(
	    ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);

	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	/* Per-protocol fanouts, indexed directly by IP protocol number */
	ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
	    sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < IPPROTO_MAX; i++) {
		mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
	    sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < IPPROTO_MAX; i++) {
		mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	/* Routing-socket clients share a single list head */
	ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
	mutex_init(&ipst->ips_rts_clients->connf_lock,
	    NULL, MUTEX_DEFAULT, NULL);

	ipst->ips_ipcl_udp_fanout = kmem_zalloc(
	    ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_raw_fanout = kmem_zalloc(
	    ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	/* Global hash holding every conn in the stack (for walkers) */
	ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
	    sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}
483 483
484 484 void
485 485 ipcl_g_destroy(void)
486 486 {
487 487 kmem_cache_destroy(ip_conn_cache);
488 488 kmem_cache_destroy(tcp_conn_cache);
489 489 kmem_cache_destroy(udp_conn_cache);
490 490 kmem_cache_destroy(rawip_conn_cache);
491 491 kmem_cache_destroy(rts_conn_cache);
492 492 }
493 493
/*
 * All user-level and kernel use of the stack must be gone
 * by now.
 *
 * Frees every per-stack fanout table allocated by ipcl_init().  Each
 * bucket is asserted empty before its lock is destroyed; the table
 * pointers are NULLed so stale references are caught.
 */
void
ipcl_destroy(ip_stack_t *ipst)
{
	int i;

	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_conn_fanout = NULL;

	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_bind_fanout = NULL;

	for (i = 0; i < IPPROTO_MAX; i++) {
		ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_proto_fanout_v4,
	    IPPROTO_MAX * sizeof (connf_t));
	ipst->ips_ipcl_proto_fanout_v4 = NULL;

	for (i = 0; i < IPPROTO_MAX; i++) {
		ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_proto_fanout_v6,
	    IPPROTO_MAX * sizeof (connf_t));
	ipst->ips_ipcl_proto_fanout_v6 = NULL;

	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_udp_fanout = NULL;

	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_iptun_fanout,
	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
	ipst->ips_ipcl_iptun_fanout = NULL;

	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_raw_fanout = NULL;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_globalhash_fanout,
	    sizeof (connf_t) * CONN_G_HASH_SIZE);
	ipst->ips_ipcl_globalhash_fanout = NULL;

	/* The routing-socket client list must be empty as well */
	ASSERT(ipst->ips_rts_clients->connf_head == NULL);
	mutex_destroy(&ipst->ips_rts_clients->connf_lock);
	kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
	ipst->ips_rts_clients = NULL;
}
572 572
/*
 * conn creation routine: initializes the conn, sets the reference
 * count and inserts it in the global hash table.
 *
 * type selects the kmem cache (IPCL_TCPCONN, IPCL_UDPCONN, ...);
 * sleep is the kmem allocation flag (KM_SLEEP/KM_NOSLEEP).  Returns
 * NULL on allocation failure.
 */
conn_t *
ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
{
	conn_t	*connp;
	struct kmem_cache *conn_cache;

	switch (type) {
	case IPCL_SCTPCONN:
		/*
		 * SCTP conns come from a cache owned by the SCTP module
		 * and take a separate init path (sctp_conn_init), so they
		 * return here instead of using the common code below.
		 */
		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
			return (NULL);
		sctp_conn_init(connp);
		netstack_hold(ns);
		connp->conn_netstack = ns;
		connp->conn_ixa->ixa_ipst = ns->netstack_ip;
		connp->conn_ixa->ixa_conn_id = (long)connp;
		ipcl_globalhash_insert(connp);
		return (connp);

	case IPCL_TCPCONN:
		conn_cache = tcp_conn_cache;
		break;

	case IPCL_UDPCONN:
		conn_cache = udp_conn_cache;
		break;

	case IPCL_RAWIPCONN:
		conn_cache = rawip_conn_cache;
		break;

	case IPCL_RTSCONN:
		conn_cache = rts_conn_cache;
		break;

	case IPCL_IPCCONN:
		conn_cache = ip_conn_cache;
		break;

	default:
		/*
		 * NOTE(review): on a non-DEBUG kernel the ASSERT vanishes
		 * and execution falls through to kmem_cache_alloc(NULL)
		 * below; callers must never pass an unknown type.
		 */
		conn_cache = NULL;
		connp = NULL;
		ASSERT(0);
	}

	if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
		return (NULL);

	/* Initial reference for the caller */
	connp->conn_ref = 1;
	netstack_hold(ns);
	connp->conn_netstack = ns;
	connp->conn_ixa->ixa_ipst = ns->netstack_ip;
	connp->conn_ixa->ixa_conn_id = (long)connp;
	ipcl_globalhash_insert(connp);
	return (connp);
}
632 632
/*
 * Final destruction of a conn_t: caller must hold the last reference
 * (conn_ref is asserted 0).  Releases credentials, cached headers,
 * IPsec state and the global-hash linkage, then resets the object and
 * returns it to the kmem cache it came from.  The netstack reference
 * taken in ipcl_conn_create() is dropped here (for SCTP, inside
 * sctp_free()).
 */
void
ipcl_conn_destroy(conn_t *connp)
{
	mblk_t	*mp;
	netstack_t	*ns = connp->conn_netstack;

	ASSERT(!MUTEX_HELD(&connp->conn_lock));
	ASSERT(connp->conn_ref == 0);
	ASSERT(connp->conn_ioctlref == 0);

	DTRACE_PROBE1(conn__destroy, conn_t *, connp);

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
		/* ixa_cred done in ipcl_conn_cleanup below */
	}

	/* Free the cached IP + transport header template, if any */
	if (connp->conn_ht_iphc != NULL) {
		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
		connp->conn_ht_iphc = NULL;
		connp->conn_ht_iphc_allocated = 0;
		connp->conn_ht_iphc_len = 0;
		connp->conn_ht_ulp = NULL;
		connp->conn_ht_ulp_len = 0;
	}
	ip_pkt_free(&connp->conn_xmit_ipp);

	ipcl_globalhash_remove(connp);

	/* Release latched IPsec state and policy references */
	if (connp->conn_latch != NULL) {
		IPLATCH_REFRELE(connp->conn_latch);
		connp->conn_latch = NULL;
	}
	if (connp->conn_latch_in_policy != NULL) {
		IPPOL_REFRELE(connp->conn_latch_in_policy);
		connp->conn_latch_in_policy = NULL;
	}
	if (connp->conn_latch_in_action != NULL) {
		IPACT_REFRELE(connp->conn_latch_in_action);
		connp->conn_latch_in_action = NULL;
	}
	if (connp->conn_policy != NULL) {
		IPPH_REFRELE(connp->conn_policy, ns);
		connp->conn_policy = NULL;
	}

	if (connp->conn_ipsec_opt_mp != NULL) {
		freemsg(connp->conn_ipsec_opt_mp);
		connp->conn_ipsec_opt_mp = NULL;
	}

	if (connp->conn_flags & IPCL_TCPCONN) {
		tcp_t *tcp = connp->conn_tcp;

		tcp_free(tcp);
		/* Preserve the timer mblk across the bzero() below */
		mp = tcp->tcp_timercache;

		tcp->tcp_tcps = NULL;

		/*
		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
		 * the mblk.
		 */
		if (tcp->tcp_rsrv_mp != NULL) {
			freeb(tcp->tcp_rsrv_mp);
			tcp->tcp_rsrv_mp = NULL;
			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
		}

		ipcl_conn_cleanup(connp);
		connp->conn_flags = IPCL_TCPCONN;
		if (ns != NULL) {
			ASSERT(tcp->tcp_tcps == NULL);
			connp->conn_netstack = NULL;
			connp->conn_ixa->ixa_ipst = NULL;
			netstack_rele(ns);
		}

		/* Scrub the tcp_t so the cached object starts clean */
		bzero(tcp, sizeof (tcp_t));

		tcp->tcp_timercache = mp;
		tcp->tcp_connp = connp;
		kmem_cache_free(tcp_conn_cache, connp);
		return;
	}

	if (connp->conn_flags & IPCL_SCTPCONN) {
		ASSERT(ns != NULL);
		sctp_free(connp);
		return;
	}

	ipcl_conn_cleanup(connp);
	if (ns != NULL) {
		connp->conn_netstack = NULL;
		connp->conn_ixa->ixa_ipst = NULL;
		netstack_rele(ns);
	}

	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
	if (connp->conn_flags & IPCL_UDPCONN) {
		connp->conn_flags = IPCL_UDPCONN;
		kmem_cache_free(udp_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
		connp->conn_flags = IPCL_RAWIPCONN;
		connp->conn_proto = IPPROTO_ICMP;
		connp->conn_ixa->ixa_protocol = connp->conn_proto;
		kmem_cache_free(rawip_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RTSCONN) {
		connp->conn_flags = IPCL_RTSCONN;
		kmem_cache_free(rts_conn_cache, connp);
	} else {
		connp->conn_flags = IPCL_IPCCONN;
		ASSERT(connp->conn_flags & IPCL_IPCCONN);
		ASSERT(connp->conn_priv == NULL);
		kmem_cache_free(ip_conn_cache, connp);
	}
}
752 752
753 753 /*
754 754 * Running in cluster mode - deregister listener information
755 755 */
756 756 static void
757 757 ipcl_conn_unlisten(conn_t *connp)
758 758 {
759 759 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
760 760 ASSERT(connp->conn_lport != 0);
761 761
762 762 if (cl_inet_unlisten != NULL) {
763 763 sa_family_t addr_family;
764 764 uint8_t *laddrp;
765 765
766 766 if (connp->conn_ipversion == IPV6_VERSION) {
767 767 addr_family = AF_INET6;
768 768 laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
769 769 } else {
770 770 addr_family = AF_INET;
771 771 laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
772 772 }
773 773 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
774 774 IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
775 775 }
776 776 connp->conn_flags &= ~IPCL_CL_LISTENER;
777 777 }
778 778
/*
 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 * which table the conn belonged to). So for debugging we can see which hash
 * table this connection was in.
 *
 * Unlinks connp from its current fanout bucket (if any) under the
 * bucket lock, deregisters cluster listener state when needed, and
 * drops the reference the fanout held on the conn.  The caller must
 * not hold conn_lock (bucket lock before conn lock is the ordering
 * rule).
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}
807 807
808 808 void
809 809 ipcl_hash_remove(conn_t *connp)
810 810 {
811 811 uint8_t protocol = connp->conn_proto;
812 812
813 813 IPCL_HASH_REMOVE(connp);
814 814 if (protocol == IPPROTO_RSVP)
815 815 ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
816 816 }
817 817
/*
 * The whole purpose of this function is allow removal of
 * a conn_t from the connected hash for timewait reclaim.
 * This is essentially a TW reclaim fastpath where timewait
 * collector checks under fanout lock (so no one else can
 * get access to the conn_t) that refcnt is 2 i.e. one for
 * TCP and one for the classifier hash list. If ref count
 * is indeed 2, we can just remove the conn under lock and
 * avoid cleaning up the conn under squeue. This gives us
 * improved performance.
 */
void
ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp)
{
	ASSERT(MUTEX_HELD(&connfp->connf_lock));
	ASSERT(MUTEX_HELD(&connp->conn_lock));
	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);

	/* Unlink from the bucket's doubly-linked list */
	if ((connp)->conn_next != NULL) {
		(connp)->conn_next->conn_prev = (connp)->conn_prev;
	}
	if ((connp)->conn_prev != NULL) {
		(connp)->conn_prev->conn_next = (connp)->conn_next;
	} else {
		connfp->connf_head = (connp)->conn_next;
	}
	(connp)->conn_fanout = NULL;
	(connp)->conn_next = NULL;
	(connp)->conn_prev = NULL;
	(connp)->conn_flags |= IPCL_REMOVED;
	/*
	 * Drop the classifier's reference directly; with both locks held
	 * and the refcnt verified at 2, nobody else can race with us.
	 */
	ASSERT((connp)->conn_ref == 2);
	(connp)->conn_ref--;
}
851 851
/*
 * IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp)
 *
 * Insert `connp' at the head of `connfp', whose connf_lock the caller
 * already holds.  The conn must not currently be on any fanout
 * (asserted).  Marks the conn IPCL_CONNECTED (clearing IPCL_REMOVED)
 * and takes a reference on behalf of the hash list.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}
866 866
/*
 * IPCL_HASH_INSERT_CONNECTED(connfp, connp)
 *
 * Remove `connp' from any fanout it is currently on, then insert it at
 * the head of `connfp' as a connected conn, taking connf_lock itself.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}
873 873
/*
 * IPCL_HASH_INSERT_BOUND(connfp, connp)
 *
 * Insert a conn bound to a specific local address into `connfp'.
 * Conns with a specific address are kept ahead of the wildcard
 * (_IPCL_V4_MATCH_ANY) entries in the bucket: the walk below finds the
 * first wildcard entry and `connp' is inserted just before it (or at
 * the tail when there is none).  Marks the conn IPCL_BOUND and takes a
 * reference for the hash list.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp)	{			\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {		\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}
900 900
/*
 * IPCL_HASH_INSERT_WILDCARD(connfp, connp)
 *
 * Insert a conn bound to the wildcard address into `connfp'.  A
 * v4-mapped wildcard conn is inserted ahead of the first unspecified
 * (v6 ::) entry belonging to the same zone -- presumably so v4
 * wildcard binds are matched before v6 ones; confirm against the
 * classifier walk order.  Otherwise the conn is appended at the tail.
 * Marks the conn IPCL_BOUND and takes a reference for the hash list.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&	\
		    connp->conn_zoneid == next->conn_zoneid) {		\
			(connp)->conn_next = next;			\
			/*						\
			 * NOTE(review): next->conn_prev should already	\
			 * equal prev here, which would make this	\
			 * assignment a no-op -- confirm.		\
			 */						\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
930 930
/*
 * Out-of-line wrapper so callers outside this file can use the
 * IPCL_HASH_INSERT_WILDCARD macro.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
936 936
937 937 /*
938 938 * Because the classifier is used to classify inbound packets, the destination
939 939 * address is meant to be our local tunnel address (tunnel source), and the
940 940 * source the remote tunnel address (tunnel destination).
941 941 *
942 942 * Note that conn_proto can't be used for fanout since the upper protocol
943 943 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
944 944 */
945 945 conn_t *
946 946 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
947 947 {
948 948 connf_t *connfp;
949 949 conn_t *connp;
950 950
951 951 /* first look for IPv4 tunnel links */
952 952 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
953 953 mutex_enter(&connfp->connf_lock);
954 954 for (connp = connfp->connf_head; connp != NULL;
955 955 connp = connp->conn_next) {
956 956 if (IPCL_IPTUN_MATCH(connp, *dst, *src))
957 957 break;
958 958 }
959 959 if (connp != NULL)
960 960 goto done;
961 961
962 962 mutex_exit(&connfp->connf_lock);
963 963
964 964 /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
965 965 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
966 966 INADDR_ANY)];
967 967 mutex_enter(&connfp->connf_lock);
968 968 for (connp = connfp->connf_head; connp != NULL;
969 969 connp = connp->conn_next) {
970 970 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
971 971 break;
972 972 }
973 973 done:
974 974 if (connp != NULL)
975 975 CONN_INC_REF(connp);
976 976 mutex_exit(&connfp->connf_lock);
977 977 return (connp);
978 978 }
979 979
980 980 conn_t *
981 981 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
982 982 {
983 983 connf_t *connfp;
984 984 conn_t *connp;
985 985
986 986 /* Look for an IPv6 tunnel link */
987 987 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
988 988 mutex_enter(&connfp->connf_lock);
989 989 for (connp = connfp->connf_head; connp != NULL;
990 990 connp = connp->conn_next) {
991 991 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
992 992 CONN_INC_REF(connp);
993 993 break;
994 994 }
995 995 }
996 996 mutex_exit(&connfp->connf_lock);
997 997 return (connp);
998 998 }
999 999
/*
 * This function is used only for inserting SCTP raw socket now.
 * This may change later.
 *
 * Note that only one raw socket can be bound to a port. The param
 * lport is in network byte order.
 *
 * Returns 0 on success, or EADDRNOTAVAIL when another raw socket in
 * the same zone and family already claims the port with an overlapping
 * local address (either side wildcard/v4-mapped-any, or the same
 * specific address).
 */
static int
ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
{
	connf_t	*connfp;
	conn_t	*oconnp;
	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* Check for existing raw socket already bound to the port. */
	mutex_enter(&connfp->connf_lock);
	for (oconnp = connfp->connf_head; oconnp != NULL;
	    oconnp = oconnp->conn_next) {
		if (oconnp->conn_lport == lport &&
		    oconnp->conn_zoneid == connp->conn_zoneid &&
		    oconnp->conn_family == connp->conn_family &&
		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
		    &connp->conn_laddr_v6))) {
			break;
		}
	}
	mutex_exit(&connfp->connf_lock);
	if (oconnp != NULL)
		return (EADDRNOTAVAIL);

	/*
	 * NOTE(review): connf_lock is dropped between the duplicate
	 * check above and the insert below; confirm callers serialize
	 * concurrent binds to the same port.
	 *
	 * Pick the insertion flavor from how much of the address pair
	 * is specified: connected (faddr set), bound (laddr only), or
	 * wildcard.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		} else {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		}
	} else {
		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
	}
	return (0);
}
1049 1049
1050 1050 static int
1051 1051 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1052 1052 {
1053 1053 connf_t *connfp;
1054 1054 conn_t *tconnp;
1055 1055 ipaddr_t laddr = connp->conn_laddr_v4;
1056 1056 ipaddr_t faddr = connp->conn_faddr_v4;
1057 1057
1058 1058 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1059 1059 mutex_enter(&connfp->connf_lock);
1060 1060 for (tconnp = connfp->connf_head; tconnp != NULL;
1061 1061 tconnp = tconnp->conn_next) {
1062 1062 if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1063 1063 /* A tunnel is already bound to these addresses. */
1064 1064 mutex_exit(&connfp->connf_lock);
1065 1065 return (EADDRINUSE);
1066 1066 }
1067 1067 }
1068 1068 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1069 1069 mutex_exit(&connfp->connf_lock);
1070 1070 return (0);
1071 1071 }
1072 1072
1073 1073 static int
1074 1074 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1075 1075 {
1076 1076 connf_t *connfp;
1077 1077 conn_t *tconnp;
1078 1078 in6_addr_t *laddr = &connp->conn_laddr_v6;
1079 1079 in6_addr_t *faddr = &connp->conn_faddr_v6;
1080 1080
1081 1081 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1082 1082 mutex_enter(&connfp->connf_lock);
1083 1083 for (tconnp = connfp->connf_head; tconnp != NULL;
1084 1084 tconnp = tconnp->conn_next) {
1085 1085 if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1086 1086 /* A tunnel is already bound to these addresses. */
1087 1087 mutex_exit(&connfp->connf_lock);
1088 1088 return (EADDRINUSE);
1089 1089 }
1090 1090 }
1091 1091 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1092 1092 mutex_exit(&connfp->connf_lock);
1093 1093 return (0);
1094 1094 }
1095 1095
1096 1096 /*
1097 1097 * Check for a MAC exemption conflict on a labeled system. Note that for
1098 1098 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1099 1099 * transport layer. This check is for binding all other protocols.
1100 1100 *
1101 1101 * Returns true if there's a conflict.
1102 1102 */
1103 1103 static boolean_t
1104 1104 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1105 1105 {
1106 1106 connf_t *connfp;
1107 1107 conn_t *tconn;
1108 1108
1109 1109 connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1110 1110 mutex_enter(&connfp->connf_lock);
1111 1111 for (tconn = connfp->connf_head; tconn != NULL;
1112 1112 tconn = tconn->conn_next) {
1113 1113 /* We don't allow v4 fallback for v6 raw socket */
1114 1114 if (connp->conn_family != tconn->conn_family)
1115 1115 continue;
1116 1116 /* If neither is exempt, then there's no conflict */
1117 1117 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1118 1118 (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1119 1119 continue;
1120 1120 /* We are only concerned about sockets for a different zone */
1121 1121 if (connp->conn_zoneid == tconn->conn_zoneid)
1122 1122 continue;
1123 1123 /* If both are bound to different specific addrs, ok */
1124 1124 if (connp->conn_laddr_v4 != INADDR_ANY &&
1125 1125 tconn->conn_laddr_v4 != INADDR_ANY &&
1126 1126 connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1127 1127 continue;
1128 1128 /* These two conflict; fail */
1129 1129 break;
1130 1130 }
1131 1131 mutex_exit(&connfp->connf_lock);
1132 1132 return (tconn != NULL);
1133 1133 }
1134 1134
1135 1135 static boolean_t
1136 1136 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1137 1137 {
1138 1138 connf_t *connfp;
1139 1139 conn_t *tconn;
1140 1140
1141 1141 connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1142 1142 mutex_enter(&connfp->connf_lock);
1143 1143 for (tconn = connfp->connf_head; tconn != NULL;
1144 1144 tconn = tconn->conn_next) {
1145 1145 /* We don't allow v4 fallback for v6 raw socket */
1146 1146 if (connp->conn_family != tconn->conn_family)
1147 1147 continue;
1148 1148 /* If neither is exempt, then there's no conflict */
1149 1149 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1150 1150 (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1151 1151 continue;
1152 1152 /* We are only concerned about sockets for a different zone */
1153 1153 if (connp->conn_zoneid == tconn->conn_zoneid)
1154 1154 continue;
1155 1155 /* If both are bound to different addrs, ok */
1156 1156 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1157 1157 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1158 1158 !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1159 1159 &tconn->conn_laddr_v6))
1160 1160 continue;
1161 1161 /* These two conflict; fail */
1162 1162 break;
1163 1163 }
1164 1164 mutex_exit(&connfp->connf_lock);
1165 1165 return (tconn != NULL);
1166 1166 }
1167 1167
1168 1168 /*
1169 1169 * (v4, v6) bind hash insertion routines
1170 1170 * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1171 1171 */
1172 1172
1173 1173 int
1174 1174 ipcl_bind_insert(conn_t *connp)
1175 1175 {
1176 1176 if (connp->conn_ipversion == IPV6_VERSION)
1177 1177 return (ipcl_bind_insert_v6(connp));
1178 1178 else
1179 1179 return (ipcl_bind_insert_v4(connp));
1180 1180 }
1181 1181
/*
 * Insert a freshly-bound v4 conn into the appropriate fanout:
 *  - iptun conns take the iptun insertion path;
 *  - UDP goes on the UDP fanout, other non-TCP/SCTP protocols on the
 *    per-protocol fanout (after the MAC-exempt conflict check on
 *    labeled systems), with connected/bound/wildcard placement chosen
 *    by how much of the address pair is specified;
 *  - TCP goes on the bind fanout and, when clustering is loaded
 *    (cl_inet_listen != NULL), the cluster module is notified of the
 *    new listener;
 *  - SCTP is handled by ipcl_sctp_hash_insert().
 * Returns 0 or an errno (EADDRINUSE on a labeled-system conflict, or
 * whatever ipcl_sctp_hash_insert() returns).
 */
int
ipcl_bind_insert_v4(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	default:
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/*
		 * A new RSVP bind changes packet dispatch; recompute the
		 * ill input functions stack-wide.
		 */
		if (protocol == IPPROTO_RSVP)
			ill_set_inputfn_all(ipst);
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Tell the cluster module about the new listener. */
		if (cl_inet_listen != NULL) {
			ASSERT(connp->conn_ipversion == IPV4_VERSION);
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, AF_INET,
			    (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1246 1246
/*
 * IPv6 counterpart of ipcl_bind_insert_v4().  Same fanout selection by
 * protocol; the cluster listener notification additionally picks the
 * address family from conn_ipversion since a v6 socket may be bound to
 * a v4 address.  Returns 0 or an errno.
 */
int
ipcl_bind_insert_v6(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp)) {
		return (ipcl_iptun_hash_insert_v6(connp, ipst));
	}

	switch (protocol) {
	default:
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Tell the cluster module about the new listener. */
		if (cl_inet_listen != NULL) {
			sa_family_t	addr_family;
			uint8_t		*laddrp;

			if (connp->conn_ipversion == IPV6_VERSION) {
				addr_family = AF_INET6;
				laddrp =
				    (uint8_t *)&connp->conn_bound_addr_v6;
			} else {
				addr_family = AF_INET;
				laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
			}
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1319 1319
1320 1320 /*
1321 1321 * ipcl_conn_hash insertion routines.
1322 1322 * The caller has already set conn_proto and the addresses/ports in the conn_t.
1323 1323 */
1324 1324
1325 1325 int
1326 1326 ipcl_conn_insert(conn_t *connp)
1327 1327 {
1328 1328 if (connp->conn_ipversion == IPV6_VERSION)
1329 1329 return (ipcl_conn_insert_v6(connp));
1330 1330 else
1331 1331 return (ipcl_conn_insert_v4(connp));
1332 1332 }
1333 1333
/*
 * Insert a connected v4 conn into the proper fanout:
 *  - TCP: the conn fanout, after verifying under connf_lock that the
 *    5-tuple (plus zone) is not already present (EADDRINUSE if it is);
 *  - SCTP: re-inserted via ipcl_sctp_hash_insert() after removing any
 *    earlier bind-time entry;
 *  - UDP and other protocols: the UDP or per-protocol fanout with
 *    connected/bound/wildcard placement, after the MAC-exempt conflict
 *    check on labeled systems.
 * iptun conns take their own insertion path.  Returns 0 or an errno.
 */
int
ipcl_conn_insert_v4(conn_t *connp)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:
		/*
		 * For TCP, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * INADDR_LOOPBACK as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_faddr_v4,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
			    connp->conn_faddr_v4, connp->conn_laddr_v4,
			    connp->conn_ports) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * IPCL_HASH_REMOVE must not run with connf_lock
			 * held, so drop and retake it around the remove.
			 * NOTE(review): a concurrent insert of the same
			 * tuple could slip in while the lock is dropped
			 * -- confirm callers serialize this.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);
		ASSERT(connp->conn_recvicmp != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings. For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic. For all others, it's
		 * done here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1431 1431
/*
 * IPv6 counterpart of ipcl_conn_insert_v4().  In addition to the tuple
 * and zone checks, a TCP conn bound to an interface (conn_bound_if)
 * only conflicts with an existing conn bound to the same interface or
 * to none.  Returns 0 or an errno.
 */
int
ipcl_conn_insert_v6(conn_t *connp)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;
	uint_t	ifindex = connp->conn_bound_if;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert_v6(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:

		/*
		 * For tcp, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * ipv6_loopback as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			/* NOTE: need to match zoneid. Bug in onnv-gate */
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
			    connp->conn_faddr_v6, connp->conn_laddr_v6,
			    connp->conn_ports) &&
			    (tconnp->conn_bound_if == 0 ||
			    tconnp->conn_bound_if == ifindex) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * IPCL_HASH_REMOVE must not run with connf_lock
			 * held, so drop and retake it around the remove.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/* Remove any earlier bind-time hash entry first. */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1519 1519
/*
 * v4 packet classifying function. looks up the fanout table to
 * find the conn, the packet belongs to. returns the conn with
 * the reference held, null otherwise.
 *
 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
 * Lookup" comment block are applied. Labels are also checked as described
 * above. If the packet is from the inside (looped back), and is from the same
 * zone, then label checks are omitted.
 */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	uint32_t ports;
	conn_t	*connp;
	uint16_t  *up;
	zoneid_t	zoneid = ira->ira_zoneid;

	ipha = (ipha_t *)mp->b_rptr;
	/*
	 * `up' points at the 16-bit source/destination port pair just
	 * past the IP header; the same offset is used for both the TCP
	 * and UDP cases below.
	 */
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		/* First try a fully-specified (connected) match. */
		ports = *(uint32_t *)up;
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * No connected conn; fall back to a listener in the
		 * bind fanout, keyed by local port only.
		 */
		mutex_exit(&connfp->connf_lock);
		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_local.
		 *
		 * Note that we don't check tsol_receive_local for
		 * the connected case.
		 */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		/* up[0] is the remote (source) port, up[1] the local one. */
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
				break;
		}

		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);

		break;

	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		/* IP-in-IP / IPv6-in-IPv4: classify to a tunnel conn. */
		return (ipcl_iptun_classify_v4(&ipha->ipha_src,
		    &ipha->ipha_dst, ipst));
	}

	return (NULL);
}
1669 1669
1670 1670 conn_t *
1671 1671 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1672 1672 ip_recv_attr_t *ira, ip_stack_t *ipst)
1673 1673 {
1674 1674 ip6_t *ip6h;
1675 1675 connf_t *connfp, *bind_connfp;
1676 1676 uint16_t lport;
1677 1677 uint16_t fport;
1678 1678 tcpha_t *tcpha;
1679 1679 uint32_t ports;
1680 1680 conn_t *connp;
1681 1681 uint16_t *up;
1682 1682 zoneid_t zoneid = ira->ira_zoneid;
1683 1683
1684 1684 ip6h = (ip6_t *)mp->b_rptr;
1685 1685
1686 1686 switch (protocol) {
1687 1687 case IPPROTO_TCP:
1688 1688 tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
1689 1689 up = &tcpha->tha_lport;
1690 1690 ports = *(uint32_t *)up;
1691 1691
1692 1692 connfp =
1693 1693 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
1694 1694 ports, ipst)];
1695 1695 mutex_enter(&connfp->connf_lock);
1696 1696 for (connp = connfp->connf_head; connp != NULL;
1697 1697 connp = connp->conn_next) {
1698 1698 if (IPCL_CONN_MATCH_V6(connp, protocol,
1699 1699 ip6h->ip6_src, ip6h->ip6_dst, ports) &&
1700 1700 (connp->conn_zoneid == zoneid ||
1701 1701 connp->conn_allzones ||
1702 1702 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1703 1703 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1704 1704 (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1705 1705 break;
1706 1706 }
1707 1707
1708 1708 if (connp != NULL) {
1709 1709 /*
1710 1710 * We have a fully-bound TCP connection.
1711 1711 *
1712 1712 * For labeled systems, there's no need to check the
1713 1713 * label here. It's known to be good as we checked
1714 1714 * before allowing the connection to become bound.
1715 1715 */
1716 1716 CONN_INC_REF(connp);
1717 1717 mutex_exit(&connfp->connf_lock);
1718 1718 return (connp);
1719 1719 }
1720 1720
1721 1721 mutex_exit(&connfp->connf_lock);
1722 1722
1723 1723 lport = up[1];
1724 1724 bind_connfp =
1725 1725 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1726 1726 mutex_enter(&bind_connfp->connf_lock);
1727 1727 for (connp = bind_connfp->connf_head; connp != NULL;
1728 1728 connp = connp->conn_next) {
1729 1729 if (IPCL_BIND_MATCH_V6(connp, protocol,
1730 1730 ip6h->ip6_dst, lport) &&
1731 1731 (connp->conn_zoneid == zoneid ||
1732 1732 connp->conn_allzones ||
1733 1733 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1734 1734 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1735 1735 (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1736 1736 break;
1737 1737 }
1738 1738
1739 1739 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1740 1740 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1741 1741 ira, connp)) {
1742 1742 DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
1743 1743 char *, "connp(1) could not receive mp(2)",
1744 1744 conn_t *, connp, mblk_t *, mp);
1745 1745 connp = NULL;
1746 1746 }
1747 1747
1748 1748 if (connp != NULL) {
1749 1749 /* Have a listner at least */
1750 1750 CONN_INC_REF(connp);
1751 1751 mutex_exit(&bind_connfp->connf_lock);
1752 1752 return (connp);
1753 1753 }
1754 1754
1755 1755 mutex_exit(&bind_connfp->connf_lock);
1756 1756 break;
1757 1757
1758 1758 case IPPROTO_UDP:
1759 1759 up = (uint16_t *)&mp->b_rptr[hdr_len];
1760 1760 lport = up[1];
1761 1761 fport = up[0];
1762 1762 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1763 1763 mutex_enter(&connfp->connf_lock);
1764 1764 for (connp = connfp->connf_head; connp != NULL;
1765 1765 connp = connp->conn_next) {
1766 1766 if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1767 1767 fport, ip6h->ip6_src) &&
1768 1768 (connp->conn_zoneid == zoneid ||
1769 1769 connp->conn_allzones ||
1770 1770 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1771 1771 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1772 1772 (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1773 1773 break;
1774 1774 }
1775 1775
1776 1776 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1777 1777 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1778 1778 ira, connp)) {
1779 1779 DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
1780 1780 char *, "connp(1) could not receive mp(2)",
1781 1781 conn_t *, connp, mblk_t *, mp);
1782 1782 connp = NULL;
1783 1783 }
1784 1784
1785 1785 if (connp != NULL) {
1786 1786 CONN_INC_REF(connp);
1787 1787 mutex_exit(&connfp->connf_lock);
1788 1788 return (connp);
1789 1789 }
1790 1790
1791 1791 /*
1792 1792 * We shouldn't come here for multicast/broadcast packets
1793 1793 */
1794 1794 mutex_exit(&connfp->connf_lock);
1795 1795 break;
1796 1796 case IPPROTO_ENCAP:
1797 1797 case IPPROTO_IPV6:
1798 1798 return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
1799 1799 &ip6h->ip6_dst, ipst));
1800 1800 }
1801 1801
1802 1802 return (NULL);
1803 1803 }
1804 1804
1805 1805 /*
1806 1806 * wrapper around ipcl_classify_(v4,v6) routines.
1807 1807 */
1808 1808 conn_t *
1809 1809 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1810 1810 {
1811 1811 if (ira->ira_flags & IRAF_IS_IPV4) {
1812 1812 return (ipcl_classify_v4(mp, ira->ira_protocol,
1813 1813 ira->ira_ip_hdr_length, ira, ipst));
1814 1814 } else {
1815 1815 return (ipcl_classify_v6(mp, ira->ira_protocol,
1816 1816 ira->ira_ip_hdr_length, ira, ipst));
1817 1817 }
1818 1818 }
1819 1819
/*
 * Only used to classify SCTP RAW sockets.
 *
 * Two-pass lookup in the raw fanout: first the bucket hashed on the
 * packet's local port (full 4-tuple match when the socket has a bound
 * peer, otherwise a bind match), then the wildcard (port 0) bucket.
 * On a labeled system the first-pass candidate must additionally pass
 * tsol_receive_local().  Returns the conn with a reference held (the
 * caller must CONN_DEC_REF) or NULL if nothing matched.
 */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
    ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	connf_t		*connfp;
	conn_t		*connp;
	in_port_t	lport;
	int		ipversion;
	const void	*dst;
	zoneid_t	zoneid = ira->ira_zoneid;

	/* The local port is the second uint16 of the combined ports word. */
	lport = ((uint16_t *)&ports)[1];
	if (ira->ira_flags & IRAF_IS_IPV4) {
		dst = (const void *)&ipha->ipha_dst;
		ipversion = IPV4_VERSION;
	} else {
		dst = (const void *)&ip6h->ip6_dst;
		ipversion = IPV6_VERSION;
	}

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
			/* Socket has a peer: require a full 4-tuple match. */
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    ipha->ipha_src, ipha->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ip6h->ip6_src, ip6h->ip6_dst, ports))
					continue;
			}
		} else {
			/* No bound peer: match local address/port only. */
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    ipha->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ip6h->ip6_dst, lport))
					continue;
			}
		}

		/* Addresses/ports match; now the zone must match too. */
		if (connp->conn_zoneid == zoneid ||
		    connp->conn_allzones ||
		    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
		    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
		    (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
			break;
	}

	if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
	    !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	if (connp != NULL)
		goto found;
	mutex_exit(&connfp->connf_lock);

	/* Try to look for a wildcard SCTP RAW socket match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IPCL_ZONE_MATCH(connp, zoneid))
			continue;

		if (ipversion == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	ASSERT(connp != NULL);
	/* Take the caller's reference before dropping the fanout lock. */
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}
1926 1926
1927 1927 /* ARGSUSED */
1928 1928 static int
1929 1929 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
1930 1930 {
1931 1931 itc_t *itc = (itc_t *)buf;
1932 1932 conn_t *connp = &itc->itc_conn;
1933 1933 tcp_t *tcp = (tcp_t *)&itc[1];
1934 1934
1935 1935 bzero(connp, sizeof (conn_t));
1936 1936 bzero(tcp, sizeof (tcp_t));
1937 1937
1938 1938 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1939 1939 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1940 1940 cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
1941 1941 tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
1942 1942 if (tcp->tcp_timercache == NULL)
1943 1943 return (ENOMEM);
1944 1944 connp->conn_tcp = tcp;
1945 1945 connp->conn_flags = IPCL_TCPCONN;
1946 1946 connp->conn_proto = IPPROTO_TCP;
1947 1947 tcp->tcp_connp = connp;
1948 1948 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1949 1949
1950 1950 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1951 1951 if (connp->conn_ixa == NULL) {
1952 1952 tcp_timermp_free(tcp);
1953 1953 return (ENOMEM);
1954 1954 }
1955 1955 connp->conn_ixa->ixa_refcnt = 1;
1956 1956 connp->conn_ixa->ixa_protocol = connp->conn_proto;
1957 1957 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
1958 1958 return (0);
1959 1959 }
1960 1960
1961 1961 /* ARGSUSED */
1962 1962 static void
1963 1963 tcp_conn_destructor(void *buf, void *cdrarg)
1964 1964 {
1965 1965 itc_t *itc = (itc_t *)buf;
1966 1966 conn_t *connp = &itc->itc_conn;
1967 1967 tcp_t *tcp = (tcp_t *)&itc[1];
1968 1968
1969 1969 ASSERT(connp->conn_flags & IPCL_TCPCONN);
1970 1970 ASSERT(tcp->tcp_connp == connp);
1971 1971 ASSERT(connp->conn_tcp == tcp);
1972 1972 tcp_timermp_free(tcp);
1973 1973 mutex_destroy(&connp->conn_lock);
1974 1974 cv_destroy(&connp->conn_cv);
1975 1975 cv_destroy(&connp->conn_sq_cv);
1976 1976 rw_destroy(&connp->conn_ilg_lock);
1977 1977
1978 1978 /* Can be NULL if constructor failed */
1979 1979 if (connp->conn_ixa != NULL) {
1980 1980 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
1981 1981 ASSERT(connp->conn_ixa->ixa_ire == NULL);
1982 1982 ASSERT(connp->conn_ixa->ixa_nce == NULL);
1983 1983 ixa_refrele(connp->conn_ixa);
1984 1984 }
1985 1985 }
1986 1986
1987 1987 /* ARGSUSED */
1988 1988 static int
1989 1989 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
1990 1990 {
1991 1991 itc_t *itc = (itc_t *)buf;
1992 1992 conn_t *connp = &itc->itc_conn;
1993 1993
1994 1994 bzero(connp, sizeof (conn_t));
1995 1995 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1996 1996 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1997 1997 connp->conn_flags = IPCL_IPCCONN;
1998 1998 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1999 1999
2000 2000 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2001 2001 if (connp->conn_ixa == NULL)
2002 2002 return (ENOMEM);
2003 2003 connp->conn_ixa->ixa_refcnt = 1;
2004 2004 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2005 2005 return (0);
2006 2006 }
2007 2007
2008 2008 /* ARGSUSED */
2009 2009 static void
2010 2010 ip_conn_destructor(void *buf, void *cdrarg)
2011 2011 {
2012 2012 itc_t *itc = (itc_t *)buf;
2013 2013 conn_t *connp = &itc->itc_conn;
2014 2014
2015 2015 ASSERT(connp->conn_flags & IPCL_IPCCONN);
2016 2016 ASSERT(connp->conn_priv == NULL);
2017 2017 mutex_destroy(&connp->conn_lock);
2018 2018 cv_destroy(&connp->conn_cv);
2019 2019 rw_destroy(&connp->conn_ilg_lock);
2020 2020
2021 2021 /* Can be NULL if constructor failed */
2022 2022 if (connp->conn_ixa != NULL) {
2023 2023 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2024 2024 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2025 2025 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2026 2026 ixa_refrele(connp->conn_ixa);
2027 2027 }
2028 2028 }
2029 2029
2030 2030 /* ARGSUSED */
2031 2031 static int
2032 2032 udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2033 2033 {
2034 2034 itc_t *itc = (itc_t *)buf;
2035 2035 conn_t *connp = &itc->itc_conn;
2036 2036 udp_t *udp = (udp_t *)&itc[1];
2037 2037
2038 2038 bzero(connp, sizeof (conn_t));
2039 2039 bzero(udp, sizeof (udp_t));
2040 2040
2041 2041 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2042 2042 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2043 2043 connp->conn_udp = udp;
2044 2044 connp->conn_flags = IPCL_UDPCONN;
2045 2045 connp->conn_proto = IPPROTO_UDP;
2046 2046 udp->udp_connp = connp;
2047 2047 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2048 2048 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2049 2049 if (connp->conn_ixa == NULL)
2050 2050 return (ENOMEM);
2051 2051 connp->conn_ixa->ixa_refcnt = 1;
2052 2052 connp->conn_ixa->ixa_protocol = connp->conn_proto;
2053 2053 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2054 2054 return (0);
2055 2055 }
2056 2056
2057 2057 /* ARGSUSED */
2058 2058 static void
2059 2059 udp_conn_destructor(void *buf, void *cdrarg)
2060 2060 {
2061 2061 itc_t *itc = (itc_t *)buf;
2062 2062 conn_t *connp = &itc->itc_conn;
2063 2063 udp_t *udp = (udp_t *)&itc[1];
2064 2064
2065 2065 ASSERT(connp->conn_flags & IPCL_UDPCONN);
2066 2066 ASSERT(udp->udp_connp == connp);
2067 2067 ASSERT(connp->conn_udp == udp);
2068 2068 mutex_destroy(&connp->conn_lock);
2069 2069 cv_destroy(&connp->conn_cv);
2070 2070 rw_destroy(&connp->conn_ilg_lock);
2071 2071
2072 2072 /* Can be NULL if constructor failed */
2073 2073 if (connp->conn_ixa != NULL) {
2074 2074 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2075 2075 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2076 2076 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2077 2077 ixa_refrele(connp->conn_ixa);
2078 2078 }
2079 2079 }
2080 2080
2081 2081 /* ARGSUSED */
2082 2082 static int
2083 2083 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2084 2084 {
2085 2085 itc_t *itc = (itc_t *)buf;
2086 2086 conn_t *connp = &itc->itc_conn;
2087 2087 icmp_t *icmp = (icmp_t *)&itc[1];
2088 2088
2089 2089 bzero(connp, sizeof (conn_t));
2090 2090 bzero(icmp, sizeof (icmp_t));
2091 2091
2092 2092 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2093 2093 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2094 2094 connp->conn_icmp = icmp;
2095 2095 connp->conn_flags = IPCL_RAWIPCONN;
2096 2096 connp->conn_proto = IPPROTO_ICMP;
2097 2097 icmp->icmp_connp = connp;
2098 2098 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2099 2099 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2100 2100 if (connp->conn_ixa == NULL)
2101 2101 return (ENOMEM);
2102 2102 connp->conn_ixa->ixa_refcnt = 1;
2103 2103 connp->conn_ixa->ixa_protocol = connp->conn_proto;
2104 2104 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2105 2105 return (0);
2106 2106 }
2107 2107
2108 2108 /* ARGSUSED */
2109 2109 static void
2110 2110 rawip_conn_destructor(void *buf, void *cdrarg)
2111 2111 {
2112 2112 itc_t *itc = (itc_t *)buf;
2113 2113 conn_t *connp = &itc->itc_conn;
2114 2114 icmp_t *icmp = (icmp_t *)&itc[1];
2115 2115
2116 2116 ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2117 2117 ASSERT(icmp->icmp_connp == connp);
2118 2118 ASSERT(connp->conn_icmp == icmp);
2119 2119 mutex_destroy(&connp->conn_lock);
2120 2120 cv_destroy(&connp->conn_cv);
2121 2121 rw_destroy(&connp->conn_ilg_lock);
2122 2122
2123 2123 /* Can be NULL if constructor failed */
2124 2124 if (connp->conn_ixa != NULL) {
2125 2125 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2126 2126 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2127 2127 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2128 2128 ixa_refrele(connp->conn_ixa);
2129 2129 }
2130 2130 }
2131 2131
2132 2132 /* ARGSUSED */
2133 2133 static int
2134 2134 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2135 2135 {
2136 2136 itc_t *itc = (itc_t *)buf;
2137 2137 conn_t *connp = &itc->itc_conn;
2138 2138 rts_t *rts = (rts_t *)&itc[1];
2139 2139
2140 2140 bzero(connp, sizeof (conn_t));
2141 2141 bzero(rts, sizeof (rts_t));
2142 2142
2143 2143 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2144 2144 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2145 2145 connp->conn_rts = rts;
2146 2146 connp->conn_flags = IPCL_RTSCONN;
2147 2147 rts->rts_connp = connp;
2148 2148 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2149 2149 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2150 2150 if (connp->conn_ixa == NULL)
2151 2151 return (ENOMEM);
2152 2152 connp->conn_ixa->ixa_refcnt = 1;
2153 2153 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2154 2154 return (0);
2155 2155 }
2156 2156
2157 2157 /* ARGSUSED */
2158 2158 static void
2159 2159 rts_conn_destructor(void *buf, void *cdrarg)
2160 2160 {
2161 2161 itc_t *itc = (itc_t *)buf;
2162 2162 conn_t *connp = &itc->itc_conn;
2163 2163 rts_t *rts = (rts_t *)&itc[1];
2164 2164
2165 2165 ASSERT(connp->conn_flags & IPCL_RTSCONN);
2166 2166 ASSERT(rts->rts_connp == connp);
2167 2167 ASSERT(connp->conn_rts == rts);
2168 2168 mutex_destroy(&connp->conn_lock);
2169 2169 cv_destroy(&connp->conn_cv);
2170 2170 rw_destroy(&connp->conn_ilg_lock);
2171 2171
2172 2172 /* Can be NULL if constructor failed */
2173 2173 if (connp->conn_ixa != NULL) {
2174 2174 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2175 2175 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2176 2176 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2177 2177 ixa_refrele(connp->conn_ixa);
2178 2178 }
2179 2179 }
2180 2180
/*
 * Called as part of ipcl_conn_destroy to assert and clear any pointers
 * in the conn_t.
 *
 * Below we list all the pointers in the conn_t as a documentation aid.
 * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
 * If you add any pointers to the conn_t please add an ASSERT here
 * and #ifdef it out if it can't be actually asserted to be NULL.
 * In any case, we bzero most of the conn_t at the end of the function.
 */
void
ipcl_conn_cleanup(conn_t *connp)
{
	ip_xmit_attr_t	*ixa;

	ASSERT(connp->conn_latch == NULL);
	ASSERT(connp->conn_latch_in_policy == NULL);
	ASSERT(connp->conn_latch_in_action == NULL);
#ifdef notdef
	ASSERT(connp->conn_rq == NULL);
	ASSERT(connp->conn_wq == NULL);
#endif
	ASSERT(connp->conn_cred == NULL);
	ASSERT(connp->conn_g_fanout == NULL);
	ASSERT(connp->conn_g_next == NULL);
	ASSERT(connp->conn_g_prev == NULL);
	ASSERT(connp->conn_policy == NULL);
	ASSERT(connp->conn_fanout == NULL);
	ASSERT(connp->conn_next == NULL);
	ASSERT(connp->conn_prev == NULL);
	ASSERT(connp->conn_oper_pending_ill == NULL);
	ASSERT(connp->conn_ilg == NULL);
	ASSERT(connp->conn_drain_next == NULL);
	ASSERT(connp->conn_drain_prev == NULL);
#ifdef notdef
	/* conn_idl is not cleared when removed from idl list */
	ASSERT(connp->conn_idl == NULL);
#endif
	ASSERT(connp->conn_ipsec_opt_mp == NULL);
#ifdef notdef
	/* conn_netstack is cleared by the caller; needed by ixa_cleanup */
	ASSERT(connp->conn_netstack == NULL);
#endif

	ASSERT(connp->conn_helper_info == NULL);
	/*
	 * The transmit attributes are kept across reuse of the conn_t:
	 * they are only cleaned, not released, and the flags are reset.
	 */
	ASSERT(connp->conn_ixa != NULL);
	ixa = connp->conn_ixa;
	ASSERT(ixa->ixa_refcnt == 1);
	/* Need to preserve ixa_protocol */
	ixa_cleanup(ixa);
	ixa->ixa_flags = 0;

	/*
	 * Clear out the conn_t fields that are not preserved.  Everything
	 * from conn_start_clr to the end of the structure is zeroed; the
	 * fields laid out before conn_start_clr (e.g. the locks and the
	 * conn_ixa pointer handled above) survive for reuse.
	 */
	bzero(&connp->conn_start_clr,
	    sizeof (conn_t) -
	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
}
2238 2238
/*
 * All conns are inserted in a global multi-list for the benefit of
 * walkers. The walk is guaranteed to walk all open conns at the time
 * of the start of the walk exactly once. This property is needed to
 * achieve some cleanups during unplumb of interfaces. This is achieved
 * as follows.
 *
 * ipcl_conn_create and ipcl_conn_destroy are the only functions that
 * call the insert and delete functions below at creation and deletion
 * time respectively. The conn never moves or changes its position in this
 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
 * won't increase due to walkers, once the conn deletion has started. Note
 * that we can't remove the conn from the global list and then wait for
 * the refcnt to drop to zero, since walkers would then see a truncated
 * list. CONN_INCIPIENT ensures that walkers don't start looking at
 * conns until ip_open is ready to make them globally visible.
 * The global round robin multi-list locks are held only to get the
 * next member/insertion/deletion and contention should be negligible
 * if the multi-list is much greater than the number of cpus.
 */
void
ipcl_globalhash_insert(conn_t *connp)
{
	int	index;
	struct connf_s	*connfp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	/*
	 * No need for atomic here. Approximate even distribution
	 * in the global lists is sufficient.
	 */
	ipst->ips_conn_g_index++;
	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);

	connp->conn_g_prev = NULL;
	/*
	 * Mark as INCIPIENT, so that walkers will ignore this
	 * for now, till ip_open is ready to make it visible globally.
	 * This must happen before the conn is linked into the bucket.
	 */
	connp->conn_state_flags |= CONN_INCIPIENT;

	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
	/* Insert at the head of the list */
	mutex_enter(&connfp->connf_lock);
	connp->conn_g_next = connfp->connf_head;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp;
	connfp->connf_head = connp;

	/* The fanout bucket this conn points to */
	connp->conn_g_fanout = connfp;

	mutex_exit(&connfp->connf_lock);
}
2293 2293
2294 2294 void
2295 2295 ipcl_globalhash_remove(conn_t *connp)
2296 2296 {
2297 2297 struct connf_s *connfp;
2298 2298
2299 2299 /*
2300 2300 * We were never inserted in the global multi list.
2301 2301 * IPCL_NONE variety is never inserted in the global multilist
2302 2302 * since it is presumed to not need any cleanup and is transient.
2303 2303 */
2304 2304 if (connp->conn_g_fanout == NULL)
2305 2305 return;
2306 2306
2307 2307 connfp = connp->conn_g_fanout;
2308 2308 mutex_enter(&connfp->connf_lock);
2309 2309 if (connp->conn_g_prev != NULL)
2310 2310 connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2311 2311 else
2312 2312 connfp->connf_head = connp->conn_g_next;
2313 2313 if (connp->conn_g_next != NULL)
2314 2314 connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2315 2315 mutex_exit(&connfp->connf_lock);
2316 2316
2317 2317 /* Better to stumble on a null pointer than to corrupt memory */
2318 2318 connp->conn_g_next = NULL;
2319 2319 connp->conn_g_prev = NULL;
2320 2320 connp->conn_g_fanout = NULL;
2321 2321 }
2322 2322
/*
 * Walk the list of all conn_t's in the system, calling the function provided
 * with the specified argument for each.
 * Applies to both IPv4 and IPv6.
 *
 * CONNs may hold pointers to ills (conn_dhcpinit_ill and
 * conn_oper_pending_ill). To guard against stale pointers
 * ipcl_walk() is called to clean up the conn_t's, typically when an interface
 * is unplumbed or removed. New conn_t's that are created while we are walking
 * may be missed by this walk, because they are not necessarily inserted
 * at the tail of the list. They are new conn_t's and thus don't have any
 * stale pointers. The CONN_CLOSING flag ensures that no new reference
 * is created to the struct that is going away.
 */
void
ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
{
	int	i;
	conn_t	*connp;
	conn_t	*prev_connp;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		prev_connp = NULL;
		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
		while (connp != NULL) {
			mutex_enter(&connp->conn_lock);
			/* Skip conns that walkers must not look at. */
			if (connp->conn_state_flags &
			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
				mutex_exit(&connp->conn_lock);
				connp = connp->conn_g_next;
				continue;
			}
			/*
			 * Hold a reference so connp stays valid across the
			 * bucket lock drop while the callback runs.
			 */
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&connp->conn_lock);
			mutex_exit(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			/* Callback is invoked with no fanout lock held. */
			(*func)(connp, arg);
			/*
			 * Release the previous conn only now: keeping its
			 * reference until here kept its conn_g_next linkage
			 * (which we already followed) valid.
			 */
			if (prev_connp != NULL)
				CONN_DEC_REF(prev_connp);
			mutex_enter(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			prev_connp = connp;
			/*
			 * Safe to follow conn_g_next: we still hold a
			 * reference on connp, so it was not freed while the
			 * bucket lock was dropped.
			 */
			connp = connp->conn_g_next;
		}
		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		/* Drop the reference on the last conn visited, if any. */
		if (prev_connp != NULL)
			CONN_DEC_REF(prev_connp);
	}
}
2373 2373
/*
 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
 * held; caller must call CONN_DEC_REF. Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t	*connfp;
	conn_t	*tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone. Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones.
	 */
	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));

	/* Reversed lookup: the peer's foreign port is our local port. */
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	/* Hash on the packet's destination, which is the peer's local addr. */
	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			/* The peer can never be the conn we started from. */
			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2423 2423
/*
 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
 * held; caller must call CONN_DEC_REF. Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t	*connfp;
	conn_t	*tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone. Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones. We
	 * don't do Zone check for link local address(es) because the
	 * current Zone implementation treats each link local address as
	 * being unique per system node, i.e. they belong to global Zone.
	 */
	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));

	/* Reversed lookup: the peer's foreign port is our local port. */
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	/* Hash on the packet's destination, which is the peer's local addr. */
	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* We skip conn_bound_if check here as this is loopback tcp */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			/* The peer can never be the conn we started from. */
			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2477 2477
2478 2478 /*
2479 2479 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2480 2480 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2481 2481 * Only checks for connected entries i.e. no INADDR_ANY checks.
2482 2482 */
2483 2483 conn_t *
2484 2484 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2485 2485 ip_stack_t *ipst)
2486 2486 {
2487 2487 uint32_t ports;
2488 2488 uint16_t *pports;
2489 2489 connf_t *connfp;
2490 2490 conn_t *tconnp;
2491 2491
2492 2492 pports = (uint16_t *)&ports;
2493 2493 pports[0] = tcpha->tha_fport;
2494 2494 pports[1] = tcpha->tha_lport;
2495 2495
2496 2496 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2497 2497 ports, ipst)];
2498 2498
2499 2499 mutex_enter(&connfp->connf_lock);
2500 2500 for (tconnp = connfp->connf_head; tconnp != NULL;
2501 2501 tconnp = tconnp->conn_next) {
2502 2502
2503 2503 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2504 2504 ipha->ipha_dst, ipha->ipha_src, ports) &&
2505 2505 tconnp->conn_tcp->tcp_state >= min_state) {
2506 2506
2507 2507 CONN_INC_REF(tconnp);
2508 2508 mutex_exit(&connfp->connf_lock);
2509 2509 return (tconnp);
2510 2510 }
2511 2511 }
2512 2512 mutex_exit(&connfp->connf_lock);
2513 2513 return (NULL);
2514 2514 }
2515 2515
2516 2516 /*
2517 2517 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2518 2518 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2519 2519 * Only checks for connected entries i.e. no INADDR_ANY checks.
2520 2520 * Match on ifindex in addition to addresses.
2521 2521 */
2522 2522 conn_t *
2523 2523 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2524 2524 uint_t ifindex, ip_stack_t *ipst)
2525 2525 {
2526 2526 tcp_t *tcp;
2527 2527 uint32_t ports;
2528 2528 uint16_t *pports;
2529 2529 connf_t *connfp;
2530 2530 conn_t *tconnp;
2531 2531
2532 2532 pports = (uint16_t *)&ports;
2533 2533 pports[0] = tcpha->tha_fport;
2534 2534 pports[1] = tcpha->tha_lport;
2535 2535
2536 2536 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2537 2537 ports, ipst)];
2538 2538
2539 2539 mutex_enter(&connfp->connf_lock);
2540 2540 for (tconnp = connfp->connf_head; tconnp != NULL;
2541 2541 tconnp = tconnp->conn_next) {
2542 2542
2543 2543 tcp = tconnp->conn_tcp;
2544 2544 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2545 2545 ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2546 2546 tcp->tcp_state >= min_state &&
2547 2547 (tconnp->conn_bound_if == 0 ||
2548 2548 tconnp->conn_bound_if == ifindex)) {
2549 2549
2550 2550 CONN_INC_REF(tconnp);
2551 2551 mutex_exit(&connfp->connf_lock);
2552 2552 return (tconnp);
2553 2553 }
2554 2554 }
2555 2555 mutex_exit(&connfp->connf_lock);
2556 2556 return (NULL);
2557 2557 }
2558 2558
2559 2559 /*
2560 2560 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2561 2561 * a listener when changing state.
2562 2562 */
2563 2563 conn_t *
2564 2564 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2565 2565 ip_stack_t *ipst)
2566 2566 {
2567 2567 connf_t *bind_connfp;
2568 2568 conn_t *connp;
2569 2569 tcp_t *tcp;
2570 2570
2571 2571 /*
2572 2572 * Avoid false matches for packets sent to an IP destination of
2573 2573 * all zeros.
2574 2574 */
2575 2575 if (laddr == 0)
2576 2576 return (NULL);
2577 2577
2578 2578 ASSERT(zoneid != ALL_ZONES);
2579 2579
2580 2580 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2581 2581 mutex_enter(&bind_connfp->connf_lock);
2582 2582 for (connp = bind_connfp->connf_head; connp != NULL;
2583 2583 connp = connp->conn_next) {
2584 2584 tcp = connp->conn_tcp;
2585 2585 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2586 2586 IPCL_ZONE_MATCH(connp, zoneid) &&
2587 2587 (tcp->tcp_listener == NULL)) {
2588 2588 CONN_INC_REF(connp);
2589 2589 mutex_exit(&bind_connfp->connf_lock);
2590 2590 return (connp);
2591 2591 }
2592 2592 }
2593 2593 mutex_exit(&bind_connfp->connf_lock);
2594 2594 return (NULL);
2595 2595 }
2596 2596
2597 2597 /*
2598 2598 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2599 2599 * a listener when changing state.
2600 2600 */
2601 2601 conn_t *
2602 2602 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2603 2603 zoneid_t zoneid, ip_stack_t *ipst)
2604 2604 {
2605 2605 connf_t *bind_connfp;
2606 2606 conn_t *connp = NULL;
2607 2607 tcp_t *tcp;
2608 2608
2609 2609 /*
2610 2610 * Avoid false matches for packets sent to an IP destination of
2611 2611 * all zeros.
2612 2612 */
2613 2613 if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2614 2614 return (NULL);
2615 2615
2616 2616 ASSERT(zoneid != ALL_ZONES);
2617 2617
2618 2618 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2619 2619 mutex_enter(&bind_connfp->connf_lock);
2620 2620 for (connp = bind_connfp->connf_head; connp != NULL;
2621 2621 connp = connp->conn_next) {
2622 2622 tcp = connp->conn_tcp;
2623 2623 if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2624 2624 IPCL_ZONE_MATCH(connp, zoneid) &&
2625 2625 (connp->conn_bound_if == 0 ||
2626 2626 connp->conn_bound_if == ifindex) &&
2627 2627 tcp->tcp_listener == NULL) {
2628 2628 CONN_INC_REF(connp);
2629 2629 mutex_exit(&bind_connfp->connf_lock);
2630 2630 return (connp);
2631 2631 }
2632 2632 }
2633 2633 mutex_exit(&bind_connfp->connf_lock);
2634 2634 return (NULL);
2635 2635 }
2636 2636
/*
 * ipcl_get_next_conn
 *	get the next entry in the conn global list
 *	and put a reference on the next_conn.
 *	decrement the reference on the current conn.
 *
 * This is an iterator based walker function that also provides for
 * some selection by the caller. It walks through the conn_hash bucket
 * searching for the next valid connp in the list, and selects connections
 * that are neither closed nor condemned. It also REFHOLDS the conn
 * thus ensuring that the conn exists when the caller uses the conn.
 *
 * Passing connp == NULL starts the walk at the head of connfp's list;
 * passing back the previously-returned conn continues it (and releases
 * the hold taken on that conn by the prior call).
 */
conn_t *
ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
{
	conn_t	*next_connp;

	if (connfp == NULL)
		return (NULL);

	mutex_enter(&connfp->connf_lock);

	/* NULL connp means begin at the head of this bucket. */
	next_connp = (connp == NULL) ?
	    connfp->connf_head : connp->conn_g_next;

	while (next_connp != NULL) {
		/*
		 * conn_lock is needed to examine conn_state_flags and to
		 * take the reference atomically with that check.
		 */
		mutex_enter(&next_connp->conn_lock);
		if (!(next_connp->conn_flags & conn_flags) ||
		    (next_connp->conn_state_flags &
		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
			/*
			 * This conn has been condemned or
			 * is closing, or the flags don't match
			 */
			mutex_exit(&next_connp->conn_lock);
			next_connp = next_connp->conn_g_next;
			continue;
		}
		/*
		 * Hold the conn while still under conn_lock so it cannot
		 * be freed before the caller gets to use it.
		 */
		CONN_INC_REF_LOCKED(next_connp);
		mutex_exit(&next_connp->conn_lock);
		break;
	}

	mutex_exit(&connfp->connf_lock);

	/* Drop the hold taken on the previous iteration's conn. */
	if (connp != NULL)
		CONN_DEC_REF(connp);

	return (next_connp);
}
2687 2687
2688 2688 #ifdef CONN_DEBUG
2689 2689 /*
2690 2690 * Trace of the last NBUF refhold/refrele
2691 2691 */
2692 2692 int
2693 2693 conn_trace_ref(conn_t *connp)
2694 2694 {
2695 2695 int last;
2696 2696 conn_trace_t *ctb;
2697 2697
2698 2698 ASSERT(MUTEX_HELD(&connp->conn_lock));
2699 2699 last = connp->conn_trace_last;
2700 2700 last++;
2701 2701 if (last == CONN_TRACE_MAX)
2702 2702 last = 0;
2703 2703
2704 2704 ctb = &connp->conn_trace_buf[last];
2705 2705 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2706 2706 connp->conn_trace_last = last;
2707 2707 return (1);
2708 2708 }
2709 2709
2710 2710 int
2711 2711 conn_untrace_ref(conn_t *connp)
2712 2712 {
2713 2713 int last;
2714 2714 conn_trace_t *ctb;
2715 2715
2716 2716 ASSERT(MUTEX_HELD(&connp->conn_lock));
2717 2717 last = connp->conn_trace_last;
2718 2718 last++;
2719 2719 if (last == CONN_TRACE_MAX)
2720 2720 last = 0;
2721 2721
2722 2722 ctb = &connp->conn_trace_buf[last];
2723 2723 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2724 2724 connp->conn_trace_last = last;
|
↓ open down ↓ |
2690 lines elided |
↑ open up ↑ |
2725 2725 return (1);
2726 2726 }
2727 2727 #endif
2728 2728
/*
 * Fill in 'sie' (caller-supplied storage) with socket information for
 * 'connp': the inode and device of the underlying vnode, plus flags
 * (MIB2_SOCKINFO_STREAM for a STREAMS-based socket).  Returns 'sie' on
 * success, or NULL if the connection is closing, has no vnode, or
 * VOP_GETATTR() fails.  Callers must already hold a reference on connp.
 */
mib2_socketInfoEntry_t *
conn_get_socket_info(conn_t *connp, mib2_socketInfoEntry_t *sie)
{
	vnode_t *vn = NULL;
	vattr_t attr;
	uint64_t flags = 0;
	sock_upcalls_t *upcalls;
	sock_upper_handle_t upper_handle;

	/*
	 * If the connection is closing, it is not safe to make an upcall or
	 * access the stream associated with the connection.
	 * The callers of this function have a reference on connp itself
	 * so, as long as it is not closing, it's safe to continue.
	 */
	mutex_enter(&connp->conn_lock);

	if ((connp->conn_state_flags & CONN_CLOSING)) {
		mutex_exit(&connp->conn_lock);
		return (NULL);
	}

	/*
	 * Continue to hold conn_lock because we don't want to race with an
	 * in-progress close, which will have set-to-NULL (and destroyed
	 * upper_handle, aka sonode (and vnode)) BEFORE setting CONN_CLOSING.
	 *
	 * There is still a race with an in-progress OPEN, however, where
	 * conn_upper_handle and conn_upcalls are being assigned (in multiple
	 * codepaths) WITHOUT conn_lock being held. We address that race
	 * HERE, however, given that both are going from NULL to non-NULL,
	 * if we lose the race, we don't get any data for the in-progress-OPEN
	 * socket.
	 */

	/* Snapshot both pointers once; they may be mid-assignment (see above). */
	upcalls = connp->conn_upcalls;
	upper_handle = connp->conn_upper_handle;
	/* Check BOTH for non-NULL before attempting an upcall. */
	if (upper_handle != NULL && upcalls != NULL) {
		/* su_get_vnode() returns one with VN_HOLD() already done. */
		vn = upcalls->su_get_vnode(upper_handle);
	} else if (!IPCL_IS_NONSTR(connp) && connp->conn_rq != NULL) {
		/* STREAMS socket: take our own hold on the stream vnode. */
		vn = STREAM(connp->conn_rq)->sd_pvnode;
		if (vn != NULL)
			VN_HOLD(vn);
		flags |= MIB2_SOCKINFO_STREAM;
	}

	mutex_exit(&connp->conn_lock);

	/* No vnode, or attribute lookup failed: nothing to report. */
	if (vn == NULL || VOP_GETATTR(vn, &attr, 0, CRED(), NULL) != 0) {
		if (vn != NULL)
			VN_RELE(vn);
		return (NULL);
	}

	VN_RELE(vn);

	bzero(sie, sizeof (*sie));

	sie->sie_flags = flags;
	sie->sie_inode = attr.va_nodeid;
	sie->sie_dev = attr.va_rdev;

	return (sie);
}
|
↓ open down ↓ |
15 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX