Print this page
12976 system panics with error in IP module
Reviewed by: Andy Fiddaman <andy@omniosce.org>
Reviewed by: Paul Winder <p.winder@me.com>
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/inet/ip/ipclassifier.c
+++ new/usr/src/uts/common/inet/ip/ipclassifier.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
|
↓ open down ↓ |
13 lines elided |
↑ open up ↑ |
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
24 + * Copyright 2020 Joyent, Inc.
24 25 */
25 26
26 27 /*
27 28 * IP PACKET CLASSIFIER
28 29 *
29 30 * The IP packet classifier provides mapping between IP packets and persistent
30 31 * connection state for connection-oriented protocols. It also provides
31 32 * interface for managing connection states.
32 33 *
33 34 * The connection state is kept in conn_t data structure and contains, among
34 35 * other things:
35 36 *
36 37 * o local/remote address and ports
37 38 * o Transport protocol
38 39 * o squeue for the connection (for TCP only)
39 40 * o reference counter
40 41 * o Connection state
41 42 * o hash table linkage
42 43 * o interface/ire information
43 44 * o credentials
44 45 * o ipsec policy
45 46 * o send and receive functions.
46 47 * o mutex lock.
47 48 *
48 49 * Connections use a reference counting scheme. They are freed when the
49 50 * reference counter drops to zero. A reference is incremented when connection
50 51 * is placed in a list or table, when incoming packet for the connection arrives
51 52 * and when connection is processed via squeue (squeue processing may be
52 53 * asynchronous and the reference protects the connection from being destroyed
53 54 * before its processing is finished).
54 55 *
55 56 * conn_recv is used to pass up packets to the ULP.
56 57 * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
57 58 * a listener, and changes to tcp_input_listener as the listener has picked a
58 59 * good squeue. For other cases it is set to tcp_input_data.
59 60 *
60 61 * conn_recvicmp is used to pass up ICMP errors to the ULP.
61 62 *
62 63 * Classifier uses several hash tables:
63 64 *
64 65 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state
65 66 * ipcl_bind_fanout: contains all connections in BOUND state
66 67 * ipcl_proto_fanout: IPv4 protocol fanout
67 68 * ipcl_proto_fanout_v6: IPv6 protocol fanout
68 69 * ipcl_udp_fanout: contains all UDP connections
69 70 * ipcl_iptun_fanout: contains all IP tunnel connections
70 71 * ipcl_globalhash_fanout: contains all connections
71 72 *
72 73 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
73 74 * which need to view all existing connections.
74 75 *
75 76 * All tables are protected by per-bucket locks. When both per-bucket lock and
76 77 * connection lock need to be held, the per-bucket lock should be acquired
77 78 * first, followed by the connection lock.
78 79 *
79 80 * All functions doing search in one of these tables increment a reference
80 81 * counter on the connection found (if any). This reference should be dropped
81 82 * when the caller has finished processing the connection.
82 83 *
83 84 *
84 85 * INTERFACES:
85 86 * ===========
86 87 *
87 88 * Connection Lookup:
88 89 * ------------------
89 90 *
90 91 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
91 92 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
92 93 *
93 94 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
94 95 * it can't find any associated connection. If the connection is found, its
95 96 * reference counter is incremented.
96 97 *
97 98 * mp: mblock, containing packet header. The full header should fit
98 99 * into a single mblock. It should also contain at least full IP
99 100 * and TCP or UDP header.
100 101 *
101 102 * protocol: Either IPPROTO_TCP or IPPROTO_UDP.
102 103 *
103 104 * hdr_len: The size of IP header. It is used to find TCP or UDP header in
104 105 * the packet.
105 106 *
106 107 * ira->ira_zoneid: The zone in which the returned connection must be; the
107 108 * zoneid corresponding to the ire_zoneid on the IRE located for
108 109 * the packet's destination address.
109 110 *
110 111 * ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
111 112 * IRAF_TX_SHARED_ADDR flags
112 113 *
113 114 * For TCP connections, the lookup order is as follows:
114 115 * 5-tuple {src, dst, protocol, local port, remote port}
115 116 * lookup in ipcl_conn_fanout table.
116 117 * 3-tuple {dst, remote port, protocol} lookup in
117 118 * ipcl_bind_fanout table.
118 119 *
119 120 * For UDP connections, a 5-tuple {src, dst, protocol, local port,
120 121 * remote port} lookup is done on ipcl_udp_fanout. Note that,
121 122 * these interfaces do not handle cases where a packets belongs
122 123 * to multiple UDP clients, which is handled in IP itself.
123 124 *
124 125 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
125 126 * determine which actual zone gets the segment. This is used only in a
126 127 * labeled environment. The matching rules are:
127 128 *
128 129 * - If it's not a multilevel port, then the label on the packet selects
129 130 * the zone. Unlabeled packets are delivered to the global zone.
130 131 *
131 132 * - If it's a multilevel port, then only the zone registered to receive
132 133 * packets on that port matches.
133 134 *
134 135 * Also, in a labeled environment, packet labels need to be checked. For fully
135 136 * bound TCP connections, we can assume that the packet label was checked
136 137 * during connection establishment, and doesn't need to be checked on each
137 138 * packet. For others, though, we need to check for strict equality or, for
138 139 * multilevel ports, membership in the range or set. This part currently does
139 140 * a tnrh lookup on each packet, but could be optimized to use cached results
140 141 * if that were necessary. (SCTP doesn't come through here, but if it did,
141 142 * we would apply the same rules as TCP.)
142 143 *
143 144 * An implication of the above is that fully-bound TCP sockets must always use
144 145 * distinct 4-tuples; they can't be discriminated by label alone.
145 146 *
146 147 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
147 148 * as there's no connection set-up handshake and no shared state.
148 149 *
149 150 * Labels on looped-back packets within a single zone do not need to be
150 151 * checked, as all processes in the same zone have the same label.
151 152 *
152 153 * Finally, for unlabeled packets received by a labeled system, special rules
153 154 * apply. We consider only the MLP if there is one. Otherwise, we prefer a
154 155 * socket in the zone whose label matches the default label of the sender, if
155 156 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the
156 157 * receiver's label must dominate the sender's default label.
157 158 *
158 159 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
159 160 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
160 161 * ip_stack);
161 162 *
162 163 * Lookup routine to find a exact match for {src, dst, local port,
163 164 * remote port) for TCP connections in ipcl_conn_fanout. The address and
164 165 * ports are read from the IP and TCP header respectively.
165 166 *
166 167 * conn_t *ipcl_lookup_listener_v4(lport, laddr, protocol,
167 168 * zoneid, ip_stack);
168 169 * conn_t *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
169 170 * zoneid, ip_stack);
170 171 *
171 172 * Lookup routine to find a listener with the tuple {lport, laddr,
172 173 * protocol} in the ipcl_bind_fanout table. For IPv6, an additional
173 174 * parameter interface index is also compared.
174 175 *
175 176 * void ipcl_walk(func, arg, ip_stack)
176 177 *
177 178 * Apply 'func' to every connection available. The 'func' is called as
178 179 * (*func)(connp, arg). The walk is non-atomic so connections may be
179 180 * created and destroyed during the walk. The CONN_CONDEMNED and
180 181 * CONN_INCIPIENT flags ensure that connections which are newly created
181 182 * or being destroyed are not selected by the walker.
182 183 *
183 184 * Table Updates
184 185 * -------------
185 186 *
186 187 * int ipcl_conn_insert(connp);
187 188 * int ipcl_conn_insert_v4(connp);
188 189 * int ipcl_conn_insert_v6(connp);
189 190 *
190 191 * Insert 'connp' in the ipcl_conn_fanout.
191 192 * Arguments :
192 193 * connp conn_t to be inserted
193 194 *
194 195 * Return value :
195 196 * 0 if connp was inserted
196 197 * EADDRINUSE if the connection with the same tuple
197 198 * already exists.
198 199 *
199 200 * int ipcl_bind_insert(connp);
200 201 * int ipcl_bind_insert_v4(connp);
201 202 * int ipcl_bind_insert_v6(connp);
202 203 *
203 204 * Insert 'connp' in ipcl_bind_fanout.
204 205 * Arguments :
205 206 * connp conn_t to be inserted
206 207 *
207 208 *
208 209 * void ipcl_hash_remove(connp);
209 210 *
210 211 * Removes the 'connp' from the connection fanout table.
211 212 *
212 213 * Connection Creation/Destruction
213 214 * -------------------------------
214 215 *
215 216 * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
216 217 *
217 218 * Creates a new conn based on the type flag, inserts it into
218 219 * globalhash table.
219 220 *
220 221 * type: This flag determines the type of conn_t which needs to be
221 222 * created i.e., which kmem_cache it comes from.
222 223 * IPCL_TCPCONN indicates a TCP connection
223 224 * IPCL_SCTPCONN indicates a SCTP connection
224 225 * IPCL_UDPCONN indicates a UDP conn_t.
225 226 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t.
226 227 * IPCL_RTSCONN indicates a RTS conn_t.
227 228 * IPCL_IPCCONN indicates all other connections.
228 229 *
229 230 * void ipcl_conn_destroy(connp)
230 231 *
231 232 * Destroys the connection state, removes it from the global
232 233 * connection hash table and frees its memory.
233 234 */
234 235
235 236 #include <sys/types.h>
236 237 #include <sys/stream.h>
237 238 #include <sys/stropts.h>
238 239 #include <sys/sysmacros.h>
239 240 #include <sys/strsubr.h>
240 241 #include <sys/strsun.h>
241 242 #define _SUN_TPI_VERSION 2
242 243 #include <sys/ddi.h>
243 244 #include <sys/cmn_err.h>
244 245 #include <sys/debug.h>
245 246
246 247 #include <sys/systm.h>
247 248 #include <sys/param.h>
248 249 #include <sys/kmem.h>
249 250 #include <sys/isa_defs.h>
250 251 #include <inet/common.h>
251 252 #include <netinet/ip6.h>
252 253 #include <netinet/icmp6.h>
253 254
254 255 #include <inet/ip.h>
255 256 #include <inet/ip_if.h>
256 257 #include <inet/ip_ire.h>
257 258 #include <inet/ip6.h>
258 259 #include <inet/ip_ndp.h>
259 260 #include <inet/ip_impl.h>
260 261 #include <inet/udp_impl.h>
261 262 #include <inet/sctp_ip.h>
262 263 #include <inet/sctp/sctp_impl.h>
263 264 #include <inet/rawip_impl.h>
264 265 #include <inet/rts_impl.h>
265 266 #include <inet/iptun/iptun_impl.h>
266 267
267 268 #include <sys/cpuvar.h>
268 269
269 270 #include <inet/ipclassifier.h>
270 271 #include <inet/tcp.h>
271 272 #include <inet/ipsec_impl.h>
272 273
273 274 #include <sys/tsol/tnet.h>
274 275 #include <sys/sockio.h>
275 276
276 277 /* Old value for compatibility. Setable in /etc/system */
277 278 uint_t tcp_conn_hash_size = 0;
278 279
279 280 /* New value. Zero means choose automatically. Setable in /etc/system */
280 281 uint_t ipcl_conn_hash_size = 0;
281 282 uint_t ipcl_conn_hash_memfactor = 8192;
282 283 uint_t ipcl_conn_hash_maxsize = 82500;
283 284
284 285 /* bind/udp fanout table size */
285 286 uint_t ipcl_bind_fanout_size = 512;
286 287 uint_t ipcl_udp_fanout_size = 16384;
287 288
288 289 /* Raw socket fanout size. Must be a power of 2. */
289 290 uint_t ipcl_raw_fanout_size = 256;
290 291
291 292 /*
292 293 * The IPCL_IPTUN_HASH() function works best with a prime table size. We
293 294 * expect that most large deployments would have hundreds of tunnels, and
294 295 * thousands in the extreme case.
295 296 */
296 297 uint_t ipcl_iptun_fanout_size = 6143;
297 298
298 299 /*
299 300 * Power of 2^N Primes useful for hashing for N of 0-28,
300 301 * these primes are the nearest prime <= 2^N - 2^(N-2).
301 302 */
302 303
303 304 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \
304 305 6143, 12281, 24571, 49139, 98299, 196597, 393209, \
305 306 786431, 1572853, 3145721, 6291449, 12582893, 25165813, \
306 307 50331599, 100663291, 201326557, 0}
307 308
308 309 /*
309 310 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
310 311 * are aligned on cache lines.
311 312 */
312 313 typedef union itc_s {
313 314 conn_t itc_conn;
/* filler member rounds the union up to a whole number of cache lines */
314 315 char itcu_filler[CACHE_ALIGN(conn_s)];
315 316 } itc_t;
316 317
317 318 struct kmem_cache *tcp_conn_cache;
318 319 struct kmem_cache *ip_conn_cache;
319 320 extern struct kmem_cache *sctp_conn_cache;
320 321 struct kmem_cache *udp_conn_cache;
321 322 struct kmem_cache *rawip_conn_cache;
322 323 struct kmem_cache *rts_conn_cache;
323 324
324 325 extern void tcp_timermp_free(tcp_t *);
325 326 extern mblk_t *tcp_timermp_alloc(int);
326 327
327 328 static int ip_conn_constructor(void *, void *, int);
328 329 static void ip_conn_destructor(void *, void *);
329 330
330 331 static int tcp_conn_constructor(void *, void *, int);
331 332 static void tcp_conn_destructor(void *, void *);
332 333
333 334 static int udp_conn_constructor(void *, void *, int);
334 335 static void udp_conn_destructor(void *, void *);
335 336
336 337 static int rawip_conn_constructor(void *, void *, int);
337 338 static void rawip_conn_destructor(void *, void *);
338 339
339 340 static int rts_conn_constructor(void *, void *, int);
340 341 static void rts_conn_destructor(void *, void *);
341 342
342 343 /*
343 344 * Global (for all stack instances) init routine
344 345 */
345 346 void
346 347 ipcl_g_init(void)
347 348 {
/*
 * One kmem cache per conn_t flavor.  All are CACHE_ALIGN_SIZE aligned so
 * the conn_t and the transport state that follows it (tcp_t, udp_t, ...)
 * start on a cache-line boundary (see the itc_t wrapper above).
 */
348 349 ip_conn_cache = kmem_cache_create("ip_conn_cache",
349 350 sizeof (conn_t), CACHE_ALIGN_SIZE,
350 351 ip_conn_constructor, ip_conn_destructor,
351 352 NULL, NULL, NULL, 0);
352 353
/* Only the TCP cache registers a reclaim callback (tcp_conn_reclaim). */
353 354 tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
354 355 sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
355 356 tcp_conn_constructor, tcp_conn_destructor,
356 357 tcp_conn_reclaim, NULL, NULL, 0);
357 358
358 359 udp_conn_cache = kmem_cache_create("udp_conn_cache",
359 360 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
360 361 udp_conn_constructor, udp_conn_destructor,
361 362 NULL, NULL, NULL, 0);
362 363
363 364 rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
364 365 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
365 366 rawip_conn_constructor, rawip_conn_destructor,
366 367 NULL, NULL, NULL, 0);
367 368
368 369 rts_conn_cache = kmem_cache_create("rts_conn_cache",
369 370 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
370 371 rts_conn_constructor, rts_conn_destructor,
371 372 NULL, NULL, NULL, 0);
372 373 }
373 374
374 375 /*
375 376 * ipclassifier intialization routine, sets up hash tables.
376 377 */
377 378 void
378 379 ipcl_init(ip_stack_t *ipst)
379 380 {
380 381 int i;
381 382 int sizes[] = P2Ps();
382 383
383 384 /*
384 385 * Calculate size of conn fanout table from /etc/system settings
385 386 */
386 387 if (ipcl_conn_hash_size != 0) {
387 388 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
388 389 } else if (tcp_conn_hash_size != 0) {
389 390 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
390 391 } else {
/* no tunable set: scale the table with physical memory */
391 392 extern pgcnt_t freemem;
392 393
393 394 ipst->ips_ipcl_conn_fanout_size =
394 395 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
395 396
396 397 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
397 398 ipst->ips_ipcl_conn_fanout_size =
398 399 ipcl_conn_hash_maxsize;
399 400 }
400 401 }
401 402
/*
 * Round the requested size up to the next prime in the P2Ps() table.
 * Starting at index 9 enforces a minimum of sizes[9] (383) buckets;
 * the table's trailing 0 sentinel triggers the fallback below.
 */
402 403 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
403 404 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
404 405 break;
405 406 }
406 407 }
407 408 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
408 409 /* Out of range, use the 2^16 value */
409 410 ipst->ips_ipcl_conn_fanout_size = sizes[16];
410 411 }
411 412
412 413 /* Take values from /etc/system */
413 414 ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
414 415 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
415 416 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
416 417 ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
417 418
418 419 ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
419 420
/*
 * Allocate each fanout table zero-filled and give every bucket its own
 * lock (per-bucket locking; see the lock-ordering note in the block
 * comment at the top of this file).
 */
420 421 ipst->ips_ipcl_conn_fanout = kmem_zalloc(
421 422 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
422 423
423 424 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
424 425 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
425 426 MUTEX_DEFAULT, NULL);
426 427 }
427 428
428 429 ipst->ips_ipcl_bind_fanout = kmem_zalloc(
429 430 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
430 431
431 432 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
432 433 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
433 434 MUTEX_DEFAULT, NULL);
434 435 }
435 436
/* protocol fanouts are indexed directly by IP protocol number */
436 437 ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
437 438 sizeof (connf_t), KM_SLEEP);
438 439 for (i = 0; i < IPPROTO_MAX; i++) {
439 440 mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
440 441 MUTEX_DEFAULT, NULL);
441 442 }
442 443
443 444 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
444 445 sizeof (connf_t), KM_SLEEP);
445 446 for (i = 0; i < IPPROTO_MAX; i++) {
446 447 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
447 448 MUTEX_DEFAULT, NULL);
448 449 }
449 450
/* routing-socket clients share a single bucket */
450 451 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
451 452 mutex_init(&ipst->ips_rts_clients->connf_lock,
452 453 NULL, MUTEX_DEFAULT, NULL);
453 454
454 455 ipst->ips_ipcl_udp_fanout = kmem_zalloc(
455 456 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
456 457 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
457 458 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
458 459 MUTEX_DEFAULT, NULL);
459 460 }
460 461
461 462 ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
462 463 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
463 464 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
464 465 mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
465 466 MUTEX_DEFAULT, NULL);
466 467 }
467 468
468 469 ipst->ips_ipcl_raw_fanout = kmem_zalloc(
469 470 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
470 471 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
471 472 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
472 473 MUTEX_DEFAULT, NULL);
473 474 }
474 475
475 476 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
476 477 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
477 478 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
478 479 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
479 480 NULL, MUTEX_DEFAULT, NULL);
480 481 }
481 482 }
482 483
483 484 void
484 485 ipcl_g_destroy(void)
485 486 {
/*
 * Tear down the caches created in ipcl_g_init().  sctp_conn_cache is
 * declared extern above and is not destroyed here; presumably the SCTP
 * module owns its lifecycle — confirm against the SCTP code.
 */
486 487 kmem_cache_destroy(ip_conn_cache);
487 488 kmem_cache_destroy(tcp_conn_cache);
488 489 kmem_cache_destroy(udp_conn_cache);
489 490 kmem_cache_destroy(rawip_conn_cache);
490 491 kmem_cache_destroy(rts_conn_cache);
491 492 }
492 493
493 494 /*
494 495 * All user-level and kernel use of the stack must be gone
495 496 * by now.
496 497 */
497 498 void
498 499 ipcl_destroy(ip_stack_t *ipst)
499 500 {
500 501 int i;
501 502
/*
 * For each fanout table: assert every bucket is already empty (all
 * conns must be gone by now — see the comment above), destroy the
 * per-bucket locks, free the table and NULL the stack pointer so a
 * stale reference trips immediately.
 */
502 503 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
503 504 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
504 505 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
505 506 }
506 507 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
507 508 sizeof (connf_t));
508 509 ipst->ips_ipcl_conn_fanout = NULL;
509 510
510 511 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
511 512 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
512 513 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
513 514 }
514 515 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
515 516 sizeof (connf_t));
516 517 ipst->ips_ipcl_bind_fanout = NULL;
517 518
518 519 for (i = 0; i < IPPROTO_MAX; i++) {
519 520 ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
520 521 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
521 522 }
522 523 kmem_free(ipst->ips_ipcl_proto_fanout_v4,
523 524 IPPROTO_MAX * sizeof (connf_t));
524 525 ipst->ips_ipcl_proto_fanout_v4 = NULL;
525 526
526 527 for (i = 0; i < IPPROTO_MAX; i++) {
527 528 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
528 529 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
529 530 }
530 531 kmem_free(ipst->ips_ipcl_proto_fanout_v6,
531 532 IPPROTO_MAX * sizeof (connf_t));
532 533 ipst->ips_ipcl_proto_fanout_v6 = NULL;
533 534
534 535 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
535 536 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
536 537 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
537 538 }
538 539 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
539 540 sizeof (connf_t));
540 541 ipst->ips_ipcl_udp_fanout = NULL;
541 542
542 543 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
543 544 ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
544 545 mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
545 546 }
546 547 kmem_free(ipst->ips_ipcl_iptun_fanout,
547 548 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
548 549 ipst->ips_ipcl_iptun_fanout = NULL;
549 550
550 551 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
551 552 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
552 553 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
553 554 }
554 555 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
555 556 sizeof (connf_t));
556 557 ipst->ips_ipcl_raw_fanout = NULL;
557 558
558 559 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
559 560 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
560 561 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
561 562 }
562 563 kmem_free(ipst->ips_ipcl_globalhash_fanout,
563 564 sizeof (connf_t) * CONN_G_HASH_SIZE);
564 565 ipst->ips_ipcl_globalhash_fanout = NULL;
565 566
/* single routing-socket bucket allocated in ipcl_init() */
566 567 ASSERT(ipst->ips_rts_clients->connf_head == NULL);
567 568 mutex_destroy(&ipst->ips_rts_clients->connf_lock);
568 569 kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
569 570 ipst->ips_rts_clients = NULL;
570 571 }
571 572
572 573 /*
573 574 * conn creation routine. initialize the conn, sets the reference
574 575 * and inserts it in the global hash table.
575 576 */
576 577 conn_t *
577 578 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
578 579 {
579 580 conn_t *connp;
580 581 struct kmem_cache *conn_cache;
581 582
582 583 switch (type) {
583 584 case IPCL_SCTPCONN:
/*
 * SCTP is special-cased: its conn comes from sctp_conn_cache and is
 * initialized by sctp_conn_init() rather than the common path below.
 * NOTE(review): the common path sets conn_ref = 1 but this path does
 * not — presumably sctp_conn_init() establishes the initial reference;
 * confirm against the SCTP code.
 */
584 585 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
585 586 return (NULL);
586 587 sctp_conn_init(connp);
587 588 netstack_hold(ns);
588 589 connp->conn_netstack = ns;
589 590 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
590 591 connp->conn_ixa->ixa_conn_id = (long)connp;
591 592 ipcl_globalhash_insert(connp);
592 593 return (connp);
593 594
594 595 case IPCL_TCPCONN:
595 596 conn_cache = tcp_conn_cache;
596 597 break;
597 598
598 599 case IPCL_UDPCONN:
599 600 conn_cache = udp_conn_cache;
600 601 break;
601 602
602 603 case IPCL_RAWIPCONN:
603 604 conn_cache = rawip_conn_cache;
604 605 break;
605 606
606 607 case IPCL_RTSCONN:
607 608 conn_cache = rts_conn_cache;
608 609 break;
609 610
610 611 case IPCL_IPCCONN:
611 612 conn_cache = ip_conn_cache;
612 613 break;
613 614
614 615 default:
/*
 * Unreachable by contract (callers pass a known IPCL_* type); on a
 * non-DEBUG build conn_cache would be NULL here and the alloc below
 * would fault — the ASSERT catches this on DEBUG kernels.
 */
615 616 conn_cache = NULL;
616 617 connp = NULL;
617 618 ASSERT(0);
618 619 }
619 620
620 621 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
621 622 return (NULL);
622 623
/* caller owns the initial reference; conn enters the global hash only */
623 624 connp->conn_ref = 1;
624 625 netstack_hold(ns);
625 626 connp->conn_netstack = ns;
626 627 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
627 628 connp->conn_ixa->ixa_conn_id = (long)connp;
628 629 ipcl_globalhash_insert(connp);
629 630 return (connp);
630 631 }
631 632
/*
 * Final destruction of a conn_t once its reference count has reached
 * zero: release credentials, cached headers, IPsec latch/policy state,
 * remove it from the global hash, then return it to the appropriate
 * kmem cache (with type-specific teardown for TCP and SCTP).
 * Caller must not hold conn_lock; conn_ref and conn_ioctlref must be 0.
 */
632 633 void
633 634 ipcl_conn_destroy(conn_t *connp)
634 635 {
635 636 mblk_t *mp;
636 637 netstack_t *ns = connp->conn_netstack;
637 638
638 639 ASSERT(!MUTEX_HELD(&connp->conn_lock));
639 640 ASSERT(connp->conn_ref == 0);
640 641 ASSERT(connp->conn_ioctlref == 0);
641 642
642 643 DTRACE_PROBE1(conn__destroy, conn_t *, connp);
643 644
644 645 if (connp->conn_cred != NULL) {
645 646 crfree(connp->conn_cred);
646 647 connp->conn_cred = NULL;
647 648 /* ixa_cred done in ipcl_conn_cleanup below */
648 649 }
649 650
650 651 if (connp->conn_ht_iphc != NULL) {
651 652 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
652 653 connp->conn_ht_iphc = NULL;
653 654 connp->conn_ht_iphc_allocated = 0;
654 655 connp->conn_ht_iphc_len = 0;
655 656 connp->conn_ht_ulp = NULL;
656 657 connp->conn_ht_ulp_len = 0;
657 658 }
658 659 ip_pkt_free(&connp->conn_xmit_ipp);
659 660
660 661 ipcl_globalhash_remove(connp);
661 662
/* drop any latched/attached IPsec state */
662 663 if (connp->conn_latch != NULL) {
663 664 IPLATCH_REFRELE(connp->conn_latch);
664 665 connp->conn_latch = NULL;
665 666 }
666 667 if (connp->conn_latch_in_policy != NULL) {
667 668 IPPOL_REFRELE(connp->conn_latch_in_policy);
668 669 connp->conn_latch_in_policy = NULL;
669 670 }
670 671 if (connp->conn_latch_in_action != NULL) {
671 672 IPACT_REFRELE(connp->conn_latch_in_action);
672 673 connp->conn_latch_in_action = NULL;
673 674 }
674 675 if (connp->conn_policy != NULL) {
675 676 IPPH_REFRELE(connp->conn_policy, ns);
676 677 connp->conn_policy = NULL;
677 678 }
678 679
679 680 if (connp->conn_ipsec_opt_mp != NULL) {
680 681 freemsg(connp->conn_ipsec_opt_mp);
681 682 connp->conn_ipsec_opt_mp = NULL;
682 683 }
683 684
684 685 if (connp->conn_flags & IPCL_TCPCONN) {
685 686 tcp_t *tcp = connp->conn_tcp;
686 687
687 688 tcp_free(tcp);
/* the timer mblk survives the bzero below; save it aside */
688 689 mp = tcp->tcp_timercache;
689 690
690 691 tcp->tcp_tcps = NULL;
691 692
692 693 /*
693 694 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
694 695 * the mblk.
695 696 */
696 697 if (tcp->tcp_rsrv_mp != NULL) {
697 698 freeb(tcp->tcp_rsrv_mp);
698 699 tcp->tcp_rsrv_mp = NULL;
699 700 mutex_destroy(&tcp->tcp_rsrv_mp_lock);
700 701 }
701 702
702 703 ipcl_conn_cleanup(connp);
/* reset flags to the bare cache type before returning to the cache */
703 704 connp->conn_flags = IPCL_TCPCONN;
704 705 if (ns != NULL) {
705 706 ASSERT(tcp->tcp_tcps == NULL);
706 707 connp->conn_netstack = NULL;
707 708 connp->conn_ixa->ixa_ipst = NULL;
708 709 netstack_rele(ns);
709 710 }
710 711
/* wipe the tcp_t, then restore the two fields that persist in the cache */
711 712 bzero(tcp, sizeof (tcp_t));
712 713
713 714 tcp->tcp_timercache = mp;
714 715 tcp->tcp_connp = connp;
715 716 kmem_cache_free(tcp_conn_cache, connp);
716 717 return;
717 718 }
718 719
719 720 if (connp->conn_flags & IPCL_SCTPCONN) {
/* SCTP does its own teardown and cache free */
720 721 ASSERT(ns != NULL);
721 722 sctp_free(connp);
722 723 return;
723 724 }
724 725
725 726 ipcl_conn_cleanup(connp);
726 727 if (ns != NULL) {
727 728 connp->conn_netstack = NULL;
728 729 connp->conn_ixa->ixa_ipst = NULL;
729 730 netstack_rele(ns);
730 731 }
731 732
732 733 /* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
733 734 if (connp->conn_flags & IPCL_UDPCONN) {
734 735 connp->conn_flags = IPCL_UDPCONN;
735 736 kmem_cache_free(udp_conn_cache, connp);
736 737 } else if (connp->conn_flags & IPCL_RAWIPCONN) {
737 738 connp->conn_flags = IPCL_RAWIPCONN;
/* rawip conns go back to the cache in their constructed (ICMP) state */
738 739 connp->conn_proto = IPPROTO_ICMP;
739 740 connp->conn_ixa->ixa_protocol = connp->conn_proto;
740 741 kmem_cache_free(rawip_conn_cache, connp);
741 742 } else if (connp->conn_flags & IPCL_RTSCONN) {
742 743 connp->conn_flags = IPCL_RTSCONN;
743 744 kmem_cache_free(rts_conn_cache, connp);
744 745 } else {
745 746 connp->conn_flags = IPCL_IPCCONN;
746 747 ASSERT(connp->conn_flags & IPCL_IPCCONN);
747 748 ASSERT(connp->conn_priv == NULL);
748 749 kmem_cache_free(ip_conn_cache, connp);
749 750 }
750 751 }
751 752
752 753 /*
753 754 * Running in cluster mode - deregister listener information
754 755 */
755 756 static void
756 757 ipcl_conn_unlisten(conn_t *connp)
757 758 {
758 759 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
759 760 ASSERT(connp->conn_lport != 0);
760 761
/* cl_inet_unlisten is the cluster hook; NULL when not running clustered */
761 762 if (cl_inet_unlisten != NULL) {
762 763 sa_family_t addr_family;
763 764 uint8_t *laddrp;
764 765
765 766 if (connp->conn_ipversion == IPV6_VERSION) {
766 767 addr_family = AF_INET6;
767 768 laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
768 769 } else {
769 770 addr_family = AF_INET;
770 771 laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
771 772 }
772 773 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
773 774 IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
774 775 }
/* cleared unconditionally, even when no cluster hook is registered */
775 776 connp->conn_flags &= ~IPCL_CL_LISTENER;
776 777 }
777 778
778 779 /*
779 780 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
780 781 * which table the conn belonged to). So for debugging we can see which hash
781 782 * table this connection was in.
782 783 */
/*
 * Unlink connp from its fanout bucket (if any) under the bucket lock,
 * deregister cluster listeners, and drop the hash-list reference.
 * conn_fanout is NULLed so a second invocation is a no-op.
 */
783 784 #define IPCL_HASH_REMOVE(connp) { \
784 785 connf_t *connfp = (connp)->conn_fanout; \
785 786 ASSERT(!MUTEX_HELD(&((connp)->conn_lock))); \
786 787 if (connfp != NULL) { \
787 788 mutex_enter(&connfp->connf_lock); \
788 789 if ((connp)->conn_next != NULL) \
789 790 (connp)->conn_next->conn_prev = \
790 791 (connp)->conn_prev; \
791 792 if ((connp)->conn_prev != NULL) \
792 793 (connp)->conn_prev->conn_next = \
793 794 (connp)->conn_next; \
794 795 else \
795 796 connfp->connf_head = (connp)->conn_next; \
796 797 (connp)->conn_fanout = NULL; \
797 798 (connp)->conn_next = NULL; \
798 799 (connp)->conn_prev = NULL; \
799 800 (connp)->conn_flags |= IPCL_REMOVED; \
800 801 if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0) \
801 802 ipcl_conn_unlisten((connp)); \
/* hash-list reference dropped while still under the bucket lock */ \
802 803 CONN_DEC_REF((connp)); \
803 804 mutex_exit(&connfp->connf_lock); \
804 805 } \
805 806 }
806 807
807 808 void
808 809 ipcl_hash_remove(conn_t *connp)
809 810 {
/* snapshot the protocol before the remove, which may drop a reference */
810 811 uint8_t protocol = connp->conn_proto;
811 812
812 813 IPCL_HASH_REMOVE(connp);
/*
 * NOTE(review): with an RSVP conn gone, the ill input functions are
 * recomputed stack-wide — presumably to stop diverting RSVP traffic;
 * confirm against ill_set_inputfn_all().
 */
813 814 if (protocol == IPPROTO_RSVP)
814 815 ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
815 816 }
816 817
817 818 /*
818 819 * The whole purpose of this function is allow removal of
819 820 * a conn_t from the connected hash for timewait reclaim.
820 821 * This is essentially a TW reclaim fastpath where timewait
821 822 * collector checks under fanout lock (so no one else can
822 823 * get access to the conn_t) that refcnt is 2 i.e. one for
823 824 * TCP and one for the classifier hash list. If ref count
824 825 * is indeed 2, we can just remove the conn under lock and
825 826 * avoid cleaning up the conn under squeue. This gives us
826 827 * improved performance.
827 828 */
828 829 void
829 830 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp)
830 831 {
831 832 ASSERT(MUTEX_HELD(&connfp->connf_lock));
832 833 ASSERT(MUTEX_HELD(&connp->conn_lock));
833 834 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
834 835
/* unlink from the doubly-linked bucket list */
835 836 if ((connp)->conn_next != NULL) {
836 837 (connp)->conn_next->conn_prev = (connp)->conn_prev;
837 838 }
838 839 if ((connp)->conn_prev != NULL) {
839 840 (connp)->conn_prev->conn_next = (connp)->conn_next;
840 841 } else {
841 842 connfp->connf_head = (connp)->conn_next;
842 843 }
843 844 (connp)->conn_fanout = NULL;
844 845 (connp)->conn_next = NULL;
845 846 (connp)->conn_prev = NULL;
846 847 (connp)->conn_flags |= IPCL_REMOVED;
/*
 * Drop the hash-list reference directly (2 -> 1) instead of going
 * through CONN_DEC_REF: both locks are held and the TW-reclaim caller
 * has verified no one else holds a reference (see comment above).
 */
847 848 ASSERT((connp)->conn_ref == 2);
848 849 (connp)->conn_ref--;
849 850 }
850 851
/*
 * Insert (connp), which must not currently be on any fanout, at the head
 * of (connfp)'s chain and mark it IPCL_CONNECTED.  Takes a classifier
 * reference.  Caller holds (connfp)->connf_lock.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}
865 866
/*
 * Unlocked variant: removes (connp) from any fanout it is currently on,
 * then inserts it into (connfp) as a connected conn under connf_lock.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}
872 873
/*
 * Insert (connp), bound to a specific local address, into (connfp) and
 * mark it IPCL_BOUND.  The walk skips past all other specifically-bound
 * entries so the new conn lands just BEFORE the first wildcard
 * (_IPCL_V4_MATCH_ANY) entry: lookups therefore prefer specific binds
 * over wildcard binds.  Takes a classifier reference.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp)	{			\
	conn_t	*pconnp = NULL, *nconnp;				\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {		\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}
899 900
/*
 * Insert a wildcard-bound (connp) into (connfp) and mark it IPCL_BOUND.
 * A v4-mapped wildcard bind is placed just before the first unspecified
 * (::) wildcard entry of the same zone, so AF_INET wildcards are found
 * ahead of AF_INET6 ones; otherwise the conn is appended at the tail.
 * Takes a classifier reference.
 *
 * NOTE(review): the "prev = next->conn_prev" assignment below looks like
 * a no-op, since prev already equals next->conn_prev at that point in
 * the walk — confirm against history before changing it.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&	\
		    connp->conn_zoneid == next->conn_zoneid) {		\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
929 930
/*
 * Function form of IPCL_HASH_INSERT_WILDCARD(), for callers that cannot
 * use the macro directly.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
935 936
936 937 /*
937 938 * Because the classifier is used to classify inbound packets, the destination
938 939 * address is meant to be our local tunnel address (tunnel source), and the
939 940 * source the remote tunnel address (tunnel destination).
940 941 *
941 942 * Note that conn_proto can't be used for fanout since the upper protocol
942 943 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
943 944 */
944 945 conn_t *
945 946 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
946 947 {
947 948 connf_t *connfp;
948 949 conn_t *connp;
949 950
950 951 /* first look for IPv4 tunnel links */
951 952 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
952 953 mutex_enter(&connfp->connf_lock);
953 954 for (connp = connfp->connf_head; connp != NULL;
954 955 connp = connp->conn_next) {
955 956 if (IPCL_IPTUN_MATCH(connp, *dst, *src))
956 957 break;
957 958 }
958 959 if (connp != NULL)
959 960 goto done;
960 961
961 962 mutex_exit(&connfp->connf_lock);
962 963
963 964 /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
964 965 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
965 966 INADDR_ANY)];
966 967 mutex_enter(&connfp->connf_lock);
967 968 for (connp = connfp->connf_head; connp != NULL;
968 969 connp = connp->conn_next) {
969 970 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
970 971 break;
971 972 }
972 973 done:
973 974 if (connp != NULL)
974 975 CONN_INC_REF(connp);
975 976 mutex_exit(&connfp->connf_lock);
976 977 return (connp);
977 978 }
978 979
979 980 conn_t *
980 981 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
981 982 {
982 983 connf_t *connfp;
983 984 conn_t *connp;
984 985
985 986 /* Look for an IPv6 tunnel link */
986 987 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
987 988 mutex_enter(&connfp->connf_lock);
988 989 for (connp = connfp->connf_head; connp != NULL;
989 990 connp = connp->conn_next) {
990 991 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
991 992 CONN_INC_REF(connp);
992 993 break;
993 994 }
994 995 }
995 996 mutex_exit(&connfp->connf_lock);
996 997 return (connp);
997 998 }
998 999
/*
 * This function is used only for inserting SCTP raw socket now.
 * This may change later.
 *
 * Note that only one raw socket can be bound to a port. The param
 * lport is in network byte order.
 *
 * Returns 0 on success, EADDRNOTAVAIL if another raw socket already
 * holds a conflicting binding on the port.
 */
static int
ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
{
	connf_t	*connfp;
	conn_t	*oconnp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* Check for existing raw socket already bound to the port. */
	mutex_enter(&connfp->connf_lock);
	for (oconnp = connfp->connf_head; oconnp != NULL;
	    oconnp = oconnp->conn_next) {
		/*
		 * Conflict: same port, zone and family, and the local
		 * addresses overlap — either one side is a wildcard
		 * (unspecified or v4-mapped-any) or they are equal.
		 */
		if (oconnp->conn_lport == lport &&
		    oconnp->conn_zoneid == connp->conn_zoneid &&
		    oconnp->conn_family == connp->conn_family &&
		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
		    &connp->conn_laddr_v6))) {
			break;
		}
	}
	mutex_exit(&connfp->connf_lock);
	if (oconnp != NULL)
		return (EADDRNOTAVAIL);

	/*
	 * Insert according to how fully the conn is specified: connected
	 * (remote address known), bound (specific local address only),
	 * or wildcard.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		} else {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		}
	} else {
		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
	}
	return (0);
}
1048 1049
1049 1050 static int
1050 1051 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1051 1052 {
1052 1053 connf_t *connfp;
1053 1054 conn_t *tconnp;
1054 1055 ipaddr_t laddr = connp->conn_laddr_v4;
1055 1056 ipaddr_t faddr = connp->conn_faddr_v4;
1056 1057
1057 1058 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1058 1059 mutex_enter(&connfp->connf_lock);
1059 1060 for (tconnp = connfp->connf_head; tconnp != NULL;
1060 1061 tconnp = tconnp->conn_next) {
1061 1062 if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1062 1063 /* A tunnel is already bound to these addresses. */
1063 1064 mutex_exit(&connfp->connf_lock);
1064 1065 return (EADDRINUSE);
1065 1066 }
1066 1067 }
1067 1068 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1068 1069 mutex_exit(&connfp->connf_lock);
1069 1070 return (0);
1070 1071 }
1071 1072
1072 1073 static int
1073 1074 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1074 1075 {
1075 1076 connf_t *connfp;
1076 1077 conn_t *tconnp;
1077 1078 in6_addr_t *laddr = &connp->conn_laddr_v6;
1078 1079 in6_addr_t *faddr = &connp->conn_faddr_v6;
1079 1080
1080 1081 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1081 1082 mutex_enter(&connfp->connf_lock);
1082 1083 for (tconnp = connfp->connf_head; tconnp != NULL;
1083 1084 tconnp = tconnp->conn_next) {
1084 1085 if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1085 1086 /* A tunnel is already bound to these addresses. */
1086 1087 mutex_exit(&connfp->connf_lock);
1087 1088 return (EADDRINUSE);
1088 1089 }
1089 1090 }
1090 1091 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1091 1092 mutex_exit(&connfp->connf_lock);
1092 1093 return (0);
1093 1094 }
1094 1095
1095 1096 /*
1096 1097 * Check for a MAC exemption conflict on a labeled system. Note that for
1097 1098 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1098 1099 * transport layer. This check is for binding all other protocols.
1099 1100 *
1100 1101 * Returns true if there's a conflict.
1101 1102 */
1102 1103 static boolean_t
1103 1104 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1104 1105 {
1105 1106 connf_t *connfp;
1106 1107 conn_t *tconn;
1107 1108
1108 1109 connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1109 1110 mutex_enter(&connfp->connf_lock);
1110 1111 for (tconn = connfp->connf_head; tconn != NULL;
1111 1112 tconn = tconn->conn_next) {
1112 1113 /* We don't allow v4 fallback for v6 raw socket */
1113 1114 if (connp->conn_family != tconn->conn_family)
1114 1115 continue;
1115 1116 /* If neither is exempt, then there's no conflict */
1116 1117 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1117 1118 (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1118 1119 continue;
1119 1120 /* We are only concerned about sockets for a different zone */
1120 1121 if (connp->conn_zoneid == tconn->conn_zoneid)
1121 1122 continue;
1122 1123 /* If both are bound to different specific addrs, ok */
1123 1124 if (connp->conn_laddr_v4 != INADDR_ANY &&
1124 1125 tconn->conn_laddr_v4 != INADDR_ANY &&
1125 1126 connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1126 1127 continue;
1127 1128 /* These two conflict; fail */
1128 1129 break;
1129 1130 }
1130 1131 mutex_exit(&connfp->connf_lock);
1131 1132 return (tconn != NULL);
1132 1133 }
1133 1134
1134 1135 static boolean_t
1135 1136 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1136 1137 {
1137 1138 connf_t *connfp;
1138 1139 conn_t *tconn;
1139 1140
1140 1141 connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1141 1142 mutex_enter(&connfp->connf_lock);
1142 1143 for (tconn = connfp->connf_head; tconn != NULL;
1143 1144 tconn = tconn->conn_next) {
1144 1145 /* We don't allow v4 fallback for v6 raw socket */
1145 1146 if (connp->conn_family != tconn->conn_family)
1146 1147 continue;
1147 1148 /* If neither is exempt, then there's no conflict */
1148 1149 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1149 1150 (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1150 1151 continue;
1151 1152 /* We are only concerned about sockets for a different zone */
1152 1153 if (connp->conn_zoneid == tconn->conn_zoneid)
1153 1154 continue;
1154 1155 /* If both are bound to different addrs, ok */
1155 1156 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1156 1157 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1157 1158 !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1158 1159 &tconn->conn_laddr_v6))
1159 1160 continue;
1160 1161 /* These two conflict; fail */
1161 1162 break;
1162 1163 }
1163 1164 mutex_exit(&connfp->connf_lock);
1164 1165 return (tconn != NULL);
1165 1166 }
1166 1167
1167 1168 /*
1168 1169 * (v4, v6) bind hash insertion routines
1169 1170 * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1170 1171 */
1171 1172
1172 1173 int
1173 1174 ipcl_bind_insert(conn_t *connp)
1174 1175 {
1175 1176 if (connp->conn_ipversion == IPV6_VERSION)
1176 1177 return (ipcl_bind_insert_v6(connp));
1177 1178 else
1178 1179 return (ipcl_bind_insert_v4(connp));
1179 1180 }
1180 1181
/*
 * Insert an IPv4 conn into the appropriate fanout for its protocol after
 * a bind: UDP fanout, TCP bind fanout, SCTP raw fanout, or the per-
 * protocol fanout for everything else.  Returns 0 or an errno.
 */
int
ipcl_bind_insert_v4(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	default:
		/* Non-port protocols: check labeled-system conflicts. */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		/*
		 * Shared with the default case above; the protocol test
		 * below picks the UDP fanout vs. the per-protocol one.
		 */
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* RSVP binds affect the ill input functions; refresh. */
		if (protocol == IPPROTO_RSVP)
			ill_set_inputfn_all(ipst);
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Notify the cluster listen hook, if one is registered. */
		if (cl_inet_listen != NULL) {
			ASSERT(connp->conn_ipversion == IPV4_VERSION);
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, AF_INET,
			    (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1245 1246
/*
 * IPv6 counterpart of ipcl_bind_insert_v4(): insert a conn into the
 * appropriate fanout for its protocol after a bind.  Returns 0 or an
 * errno.
 */
int
ipcl_bind_insert_v6(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp)) {
		return (ipcl_iptun_hash_insert_v6(connp, ipst));
	}

	switch (protocol) {
	default:
		/* Non-port protocols: check labeled-system conflicts. */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		/*
		 * Shared with the default case above; the protocol test
		 * below picks the UDP fanout vs. the per-protocol one.
		 */
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/*
		 * Notify the cluster listen hook; an AF_INET6 socket may
		 * still carry an IPv4 conn, so pick the family from the
		 * conn's actual IP version.
		 */
		if (cl_inet_listen != NULL) {
			sa_family_t	addr_family;
			uint8_t		*laddrp;

			if (connp->conn_ipversion == IPV6_VERSION) {
				addr_family = AF_INET6;
				laddrp =
				    (uint8_t *)&connp->conn_bound_addr_v6;
			} else {
				addr_family = AF_INET;
				laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
			}
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1318 1319
1319 1320 /*
1320 1321 * ipcl_conn_hash insertion routines.
1321 1322 * The caller has already set conn_proto and the addresses/ports in the conn_t.
1322 1323 */
1323 1324
1324 1325 int
1325 1326 ipcl_conn_insert(conn_t *connp)
1326 1327 {
1327 1328 if (connp->conn_ipversion == IPV6_VERSION)
1328 1329 return (ipcl_conn_insert_v6(connp));
1329 1330 else
1330 1331 return (ipcl_conn_insert_v4(connp));
1331 1332 }
1332 1333
/*
 * Insert a fully-specified (connected) IPv4 conn into its fanout.
 * For TCP the connection tuple must be unique (per zone); returns
 * EADDRINUSE when it is not.  Returns 0 or an errno.
 */
int
ipcl_conn_insert_v4(conn_t *connp)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:
		/*
		 * For TCP, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * INADDR_LOOPBACK as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_faddr_v4,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
			    connp->conn_faddr_v4, connp->conn_laddr_v4,
			    connp->conn_ports) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * The fanout lock must be dropped before
			 * IPCL_HASH_REMOVE (which takes the old bucket's
			 * lock) and retaken for the insertion below.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);
		ASSERT(connp->conn_recvicmp != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings. For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic. For all others, it's
		 * done here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		/* Shared with the default case above. */
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1430 1431
/*
 * IPv6 counterpart of ipcl_conn_insert_v4(); additionally honors
 * conn_bound_if so conns bound to different interfaces may share a
 * tuple.  Returns 0 or an errno.
 */
int
ipcl_conn_insert_v6(conn_t *connp)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;
	uint_t	ifindex = connp->conn_bound_if;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert_v6(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:

		/*
		 * For tcp, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * ipv6_loopback as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			/* NOTE: need to match zoneid. Bug in onnv-gate */
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
			    connp->conn_faddr_v6, connp->conn_laddr_v6,
			    connp->conn_ports) &&
			    (tconnp->conn_bound_if == 0 ||
			    tconnp->conn_bound_if == ifindex) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * Drop the fanout lock around IPCL_HASH_REMOVE,
			 * which takes the old bucket's lock itself.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/* The raw socket may already be bound; unhash it first. */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/* Non-port protocols: check labeled-system conflicts. */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		/* Shared with the default case above. */
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1518 1519
/*
 * v4 packet classifying function. looks up the fanout table to
 * find the conn, the packet belongs to. returns the conn with
 * the reference held, null otherwise.
 *
 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
 * Lookup" comment block are applied. Labels are also checked as described
 * above. If the packet is from the inside (looped back), and is from the same
 * zone, then label checks are omitted.
 */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	uint32_t ports;
	conn_t	*connp;
	uint16_t  *up;
	zoneid_t	zoneid = ira->ira_zoneid;

	ipha = (ipha_t *)mp->b_rptr;
	/* up points at the (source, dest) port pair in the transport hdr. */
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		/* First try the connected (fully-bound) fanout. */
		ports = *(uint32_t *)up;
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			/*
			 * Match tuple, then zone: same zone, all-zones, or
			 * a MAC-exempt conn receiving on a shared address.
			 */
			if (IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/* No connected match; fall back to the bind (listener) hash. */
		mutex_exit(&connfp->connf_lock);
		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_local.
		 *
		 * Note that we don't check tsol_receive_local for
		 * the connected case.
		 */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
				break;
		}

		/* On labeled systems the packet label must permit delivery. */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);

		break;

	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		/* Tunneled IP-in-IP / IPv6-in-IPv4: use the iptun fanout. */
		return (ipcl_iptun_classify_v4(&ipha->ipha_src,
		    &ipha->ipha_dst, ipst));
	}

	return (NULL);
}
1668 1669
1669 1670 conn_t *
1670 1671 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1671 1672 ip_recv_attr_t *ira, ip_stack_t *ipst)
1672 1673 {
1673 1674 ip6_t *ip6h;
1674 1675 connf_t *connfp, *bind_connfp;
1675 1676 uint16_t lport;
1676 1677 uint16_t fport;
1677 1678 tcpha_t *tcpha;
1678 1679 uint32_t ports;
1679 1680 conn_t *connp;
1680 1681 uint16_t *up;
1681 1682 zoneid_t zoneid = ira->ira_zoneid;
1682 1683
1683 1684 ip6h = (ip6_t *)mp->b_rptr;
1684 1685
1685 1686 switch (protocol) {
1686 1687 case IPPROTO_TCP:
1687 1688 tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
1688 1689 up = &tcpha->tha_lport;
1689 1690 ports = *(uint32_t *)up;
1690 1691
1691 1692 connfp =
1692 1693 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
1693 1694 ports, ipst)];
1694 1695 mutex_enter(&connfp->connf_lock);
1695 1696 for (connp = connfp->connf_head; connp != NULL;
1696 1697 connp = connp->conn_next) {
1697 1698 if (IPCL_CONN_MATCH_V6(connp, protocol,
1698 1699 ip6h->ip6_src, ip6h->ip6_dst, ports) &&
1699 1700 (connp->conn_zoneid == zoneid ||
1700 1701 connp->conn_allzones ||
1701 1702 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1702 1703 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1703 1704 (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1704 1705 break;
1705 1706 }
1706 1707
1707 1708 if (connp != NULL) {
1708 1709 /*
1709 1710 * We have a fully-bound TCP connection.
1710 1711 *
1711 1712 * For labeled systems, there's no need to check the
1712 1713 * label here. It's known to be good as we checked
1713 1714 * before allowing the connection to become bound.
1714 1715 */
1715 1716 CONN_INC_REF(connp);
1716 1717 mutex_exit(&connfp->connf_lock);
1717 1718 return (connp);
1718 1719 }
1719 1720
1720 1721 mutex_exit(&connfp->connf_lock);
1721 1722
1722 1723 lport = up[1];
1723 1724 bind_connfp =
1724 1725 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1725 1726 mutex_enter(&bind_connfp->connf_lock);
1726 1727 for (connp = bind_connfp->connf_head; connp != NULL;
1727 1728 connp = connp->conn_next) {
1728 1729 if (IPCL_BIND_MATCH_V6(connp, protocol,
1729 1730 ip6h->ip6_dst, lport) &&
1730 1731 (connp->conn_zoneid == zoneid ||
1731 1732 connp->conn_allzones ||
1732 1733 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1733 1734 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1734 1735 (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1735 1736 break;
1736 1737 }
1737 1738
1738 1739 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1739 1740 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1740 1741 ira, connp)) {
1741 1742 DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
1742 1743 char *, "connp(1) could not receive mp(2)",
1743 1744 conn_t *, connp, mblk_t *, mp);
1744 1745 connp = NULL;
1745 1746 }
1746 1747
1747 1748 if (connp != NULL) {
1748 1749 /* Have a listner at least */
1749 1750 CONN_INC_REF(connp);
1750 1751 mutex_exit(&bind_connfp->connf_lock);
1751 1752 return (connp);
1752 1753 }
1753 1754
1754 1755 mutex_exit(&bind_connfp->connf_lock);
1755 1756 break;
1756 1757
1757 1758 case IPPROTO_UDP:
1758 1759 up = (uint16_t *)&mp->b_rptr[hdr_len];
1759 1760 lport = up[1];
1760 1761 fport = up[0];
1761 1762 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1762 1763 mutex_enter(&connfp->connf_lock);
1763 1764 for (connp = connfp->connf_head; connp != NULL;
1764 1765 connp = connp->conn_next) {
1765 1766 if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1766 1767 fport, ip6h->ip6_src) &&
1767 1768 (connp->conn_zoneid == zoneid ||
1768 1769 connp->conn_allzones ||
1769 1770 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1770 1771 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1771 1772 (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1772 1773 break;
1773 1774 }
1774 1775
1775 1776 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1776 1777 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1777 1778 ira, connp)) {
1778 1779 DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
1779 1780 char *, "connp(1) could not receive mp(2)",
1780 1781 conn_t *, connp, mblk_t *, mp);
1781 1782 connp = NULL;
1782 1783 }
1783 1784
1784 1785 if (connp != NULL) {
1785 1786 CONN_INC_REF(connp);
1786 1787 mutex_exit(&connfp->connf_lock);
1787 1788 return (connp);
1788 1789 }
1789 1790
1790 1791 /*
1791 1792 * We shouldn't come here for multicast/broadcast packets
1792 1793 */
1793 1794 mutex_exit(&connfp->connf_lock);
1794 1795 break;
1795 1796 case IPPROTO_ENCAP:
1796 1797 case IPPROTO_IPV6:
1797 1798 return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
1798 1799 &ip6h->ip6_dst, ipst));
1799 1800 }
1800 1801
1801 1802 return (NULL);
1802 1803 }
1803 1804
1804 1805 /*
1805 1806 * wrapper around ipcl_classify_(v4,v6) routines.
1806 1807 */
1807 1808 conn_t *
1808 1809 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1809 1810 {
1810 1811 if (ira->ira_flags & IRAF_IS_IPV4) {
1811 1812 return (ipcl_classify_v4(mp, ira->ira_protocol,
1812 1813 ira->ira_ip_hdr_length, ira, ipst));
1813 1814 } else {
1814 1815 return (ipcl_classify_v6(mp, ira->ira_protocol,
1815 1816 ira->ira_ip_hdr_length, ira, ipst));
1816 1817 }
1817 1818 }
1818 1819
1819 1820 /*
1820 1821 * Only used to classify SCTP RAW sockets
1821 1822 */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
    ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	connf_t *connfp;
	conn_t *connp;
	in_port_t lport;
	int ipversion;
	const void *dst;
	zoneid_t zoneid = ira->ira_zoneid;

	/* The second 16-bit half of 'ports' is the local (dst) port. */
	lport = ((uint16_t *)&ports)[1];
	if (ira->ira_flags & IRAF_IS_IPV4) {
		dst = (const void *)&ipha->ipha_dst;
		ipversion = IPV4_VERSION;
	} else {
		dst = (const void *)&ip6h->ip6_dst;
		ipversion = IPV6_VERSION;
	}

	/* First pass: search the fanout bucket for this local port. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
			/* Foreign address set: match the full 4-tuple. */
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    ipha->ipha_src, ipha->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ip6h->ip6_src, ip6h->ip6_dst, ports))
					continue;
			}
		} else {
			/* Unconnected: match local address and port only. */
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    ipha->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ip6h->ip6_dst, lport))
					continue;
			}
		}

		/*
		 * Addresses/ports matched; accept if the zone matches,
		 * the conn is bound in all zones, or it is MAC-exempt
		 * on a shared address (Trusted Extensions).
		 */
		if (connp->conn_zoneid == zoneid ||
		    connp->conn_allzones ||
		    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
		    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
		    (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
			break;
	}

	/* On labeled systems the label must also permit delivery. */
	if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
	    !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	if (connp != NULL)
		goto found;
	mutex_exit(&connfp->connf_lock);

	/* Try to look for a wildcard SCTP RAW socket match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IPCL_ZONE_MATCH(connp, zoneid))
			continue;

		if (ipversion == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	ASSERT(connp != NULL);
	/* Return with a reference held; caller must CONN_DEC_REF. */
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}
1925 1926
/*
 * kmem cache constructor for TCP connections.  The buffer is an itc_t:
 * a conn_t followed immediately in memory by its tcp_t.  Initializes
 * the locks/CVs, the TCP timer mblk and the transmit attributes (ixa).
 * Returns ENOMEM if an allocation fails.
 */
/* ARGSUSED */
static int
tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t *itc = (itc_t *)buf;
	conn_t *connp = &itc->itc_conn;
	tcp_t *tcp = (tcp_t *)&itc[1];	/* tcp_t is co-located after conn_t */

	bzero(connp, sizeof (conn_t));
	bzero(tcp, sizeof (tcp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
	tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
	if (tcp->tcp_timercache == NULL)
		return (ENOMEM);
	/* Cross-link the conn_t and tcp_t halves of the buffer. */
	connp->conn_tcp = tcp;
	connp->conn_flags = IPCL_TCPCONN;
	connp->conn_proto = IPPROTO_TCP;
	tcp->tcp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);

	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL) {
		/* Undo the timer mblk allocation before failing. */
		tcp_timermp_free(tcp);
		return (ENOMEM);
	}
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
1959 1960
/*
 * kmem cache destructor for TCP connections; asserts that the buffer
 * is still in its constructed state, then tears down everything the
 * constructor set up.
 */
/* ARGSUSED */
static void
tcp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t *itc = (itc_t *)buf;
	conn_t *connp = &itc->itc_conn;
	tcp_t *tcp = (tcp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_TCPCONN);
	ASSERT(tcp->tcp_connp == connp);
	ASSERT(connp->conn_tcp == tcp);
	tcp_timermp_free(tcp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	cv_destroy(&connp->conn_sq_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		/* No cached routing state may remain on the ixa. */
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
1985 1986
/*
 * kmem cache constructor for plain IP conn_t's (IPCL_IPCCONN): no
 * transport-specific state follows the conn_t in the buffer.
 */
/* ARGSUSED */
static int
ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t *itc = (itc_t *)buf;
	conn_t *connp = &itc->itc_conn;

	bzero(connp, sizeof (conn_t));
	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_flags = IPCL_IPCCONN;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);

	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2006 2007
/*
 * kmem cache destructor for plain IP conn_t's; mirrors
 * ip_conn_constructor().
 */
/* ARGSUSED */
static void
ip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t *itc = (itc_t *)buf;
	conn_t *connp = &itc->itc_conn;

	ASSERT(connp->conn_flags & IPCL_IPCCONN);
	ASSERT(connp->conn_priv == NULL);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		/* No cached routing state may remain on the ixa. */
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2028 2029
/*
 * kmem cache constructor for UDP connections: a conn_t followed
 * immediately in the buffer by its udp_t (itc_t layout).
 */
/* ARGSUSED */
static int
udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t *itc = (itc_t *)buf;
	conn_t *connp = &itc->itc_conn;
	udp_t *udp = (udp_t *)&itc[1];	/* udp_t is co-located after conn_t */

	bzero(connp, sizeof (conn_t));
	bzero(udp, sizeof (udp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	/* Cross-link the conn_t and udp_t halves of the buffer. */
	connp->conn_udp = udp;
	connp->conn_flags = IPCL_UDPCONN;
	connp->conn_proto = IPPROTO_UDP;
	udp->udp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2055 2056
/*
 * kmem cache destructor for UDP connections; mirrors
 * udp_conn_constructor().
 */
/* ARGSUSED */
static void
udp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t *itc = (itc_t *)buf;
	conn_t *connp = &itc->itc_conn;
	udp_t *udp = (udp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_UDPCONN);
	ASSERT(udp->udp_connp == connp);
	ASSERT(connp->conn_udp == udp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		/* No cached routing state may remain on the ixa. */
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2079 2080
/*
 * kmem cache constructor for raw IP (ICMP) connections: a conn_t
 * followed immediately in the buffer by its icmp_t (itc_t layout).
 * conn_proto defaults to IPPROTO_ICMP.
 */
/* ARGSUSED */
static int
rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t *itc = (itc_t *)buf;
	conn_t *connp = &itc->itc_conn;
	icmp_t *icmp = (icmp_t *)&itc[1];	/* co-located after conn_t */

	bzero(connp, sizeof (conn_t));
	bzero(icmp, sizeof (icmp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	/* Cross-link the conn_t and icmp_t halves of the buffer. */
	connp->conn_icmp = icmp;
	connp->conn_flags = IPCL_RAWIPCONN;
	connp->conn_proto = IPPROTO_ICMP;
	icmp->icmp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2106 2107
/*
 * kmem cache destructor for raw IP connections; mirrors
 * rawip_conn_constructor().
 */
/* ARGSUSED */
static void
rawip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t *itc = (itc_t *)buf;
	conn_t *connp = &itc->itc_conn;
	icmp_t *icmp = (icmp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
	ASSERT(icmp->icmp_connp == connp);
	ASSERT(connp->conn_icmp == icmp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		/* No cached routing state may remain on the ixa. */
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2130 2131
/*
 * kmem cache constructor for routing-socket connections: a conn_t
 * followed immediately in the buffer by its rts_t (itc_t layout).
 */
/* ARGSUSED */
static int
rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t *itc = (itc_t *)buf;
	conn_t *connp = &itc->itc_conn;
	rts_t *rts = (rts_t *)&itc[1];	/* rts_t is co-located after conn_t */

	bzero(connp, sizeof (conn_t));
	bzero(rts, sizeof (rts_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	/* Cross-link the conn_t and rts_t halves of the buffer. */
	connp->conn_rts = rts;
	connp->conn_flags = IPCL_RTSCONN;
	rts->rts_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2155 2156
/*
 * kmem cache destructor for routing-socket connections; mirrors
 * rts_conn_constructor().
 */
/* ARGSUSED */
static void
rts_conn_destructor(void *buf, void *cdrarg)
{
	itc_t *itc = (itc_t *)buf;
	conn_t *connp = &itc->itc_conn;
	rts_t *rts = (rts_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_RTSCONN);
	ASSERT(rts->rts_connp == connp);
	ASSERT(connp->conn_rts == rts);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		/* No cached routing state may remain on the ixa. */
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2179 2180
2180 2181 /*
2181 2182 * Called as part of ipcl_conn_destroy to assert and clear any pointers
2182 2183 * in the conn_t.
2183 2184 *
2184 2185 * Below we list all the pointers in the conn_t as a documentation aid.
2185 2186 * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
2186 2187 * If you add any pointers to the conn_t please add an ASSERT here
2187 2188 * and #ifdef it out if it can't be actually asserted to be NULL.
2188 2189 * In any case, we bzero most of the conn_t at the end of the function.
2189 2190 */
void
ipcl_conn_cleanup(conn_t *connp)
{
	ip_xmit_attr_t *ixa;

	/* All of these pointers must already have been released. */
	ASSERT(connp->conn_latch == NULL);
	ASSERT(connp->conn_latch_in_policy == NULL);
	ASSERT(connp->conn_latch_in_action == NULL);
#ifdef notdef
	ASSERT(connp->conn_rq == NULL);
	ASSERT(connp->conn_wq == NULL);
#endif
	ASSERT(connp->conn_cred == NULL);
	ASSERT(connp->conn_g_fanout == NULL);
	ASSERT(connp->conn_g_next == NULL);
	ASSERT(connp->conn_g_prev == NULL);
	ASSERT(connp->conn_policy == NULL);
	ASSERT(connp->conn_fanout == NULL);
	ASSERT(connp->conn_next == NULL);
	ASSERT(connp->conn_prev == NULL);
	ASSERT(connp->conn_oper_pending_ill == NULL);
	ASSERT(connp->conn_ilg == NULL);
	ASSERT(connp->conn_drain_next == NULL);
	ASSERT(connp->conn_drain_prev == NULL);
#ifdef notdef
	/* conn_idl is not cleared when removed from idl list */
	ASSERT(connp->conn_idl == NULL);
#endif
	ASSERT(connp->conn_ipsec_opt_mp == NULL);
#ifdef notdef
	/* conn_netstack is cleared by the caller; needed by ixa_cleanup */
	ASSERT(connp->conn_netstack == NULL);
#endif

	ASSERT(connp->conn_helper_info == NULL);
	ASSERT(connp->conn_ixa != NULL);
	ixa = connp->conn_ixa;
	ASSERT(ixa->ixa_refcnt == 1);
	/* Need to preserve ixa_protocol */
	ixa_cleanup(ixa);
	/* Reset for reuse of the cached conn/ixa. */
	ixa->ixa_flags = 0;

	/*
	 * Clear out the conn_t fields that are not preserved; everything
	 * from conn_start_clr to the end of the structure is zeroed.
	 */
	bzero(&connp->conn_start_clr,
	    sizeof (conn_t) -
	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
}
2237 2238
2238 2239 /*
2239 2240 * All conns are inserted in a global multi-list for the benefit of
2240 2241 * walkers. The walk is guaranteed to walk all open conns at the time
2241 2242 * of the start of the walk exactly once. This property is needed to
2242 2243 * achieve some cleanups during unplumb of interfaces. This is achieved
2243 2244 * as follows.
2244 2245 *
2245 2246 * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2246 2247 * call the insert and delete functions below at creation and deletion
2247 2248 * time respectively. The conn never moves or changes its position in this
2248 2249 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2249 2250 * won't increase due to walkers, once the conn deletion has started. Note
2250 2251 * that we can't remove the conn from the global list and then wait for
2251 2252 * the refcnt to drop to zero, since walkers would then see a truncated
2252 2253 * list. CONN_INCIPIENT ensures that walkers don't start looking at
2253 2254 * conns until ip_open is ready to make them globally visible.
2254 2255 * The global round robin multi-list locks are held only to get the
2255 2256 * next member/insertion/deletion and contention should be negligible
2256 2257 * if the multi-list is much greater than the number of cpus.
2257 2258 */
/*
 * Insert connp into the global multi-list used by ipcl_walk()
 * (see the block comment above for the list's guarantees).
 */
void
ipcl_globalhash_insert(conn_t *connp)
{
	int index;
	struct connf_s *connfp;
	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;

	/*
	 * No need for atomic here. Approximate even distribution
	 * in the global lists is sufficient.
	 */
	ipst->ips_conn_g_index++;
	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);

	connp->conn_g_prev = NULL;
	/*
	 * Mark as INCIPIENT, so that walkers will ignore this
	 * for now, till ip_open is ready to make it visible globally.
	 */
	connp->conn_state_flags |= CONN_INCIPIENT;

	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
	/* Insert at the head of the list */
	mutex_enter(&connfp->connf_lock);
	connp->conn_g_next = connfp->connf_head;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp;
	connfp->connf_head = connp;

	/* The fanout bucket this conn points to */
	connp->conn_g_fanout = connfp;

	mutex_exit(&connfp->connf_lock);
}
2292 2293
/*
 * Remove connp from the global multi-list.  A no-op if the conn was
 * never inserted (conn_g_fanout == NULL).
 */
void
ipcl_globalhash_remove(conn_t *connp)
{
	struct connf_s *connfp;

	/*
	 * We were never inserted in the global multi list.
	 * IPCL_NONE variety is never inserted in the global multilist
	 * since it is presumed to not need any cleanup and is transient.
	 */
	if (connp->conn_g_fanout == NULL)
		return;

	connfp = connp->conn_g_fanout;
	mutex_enter(&connfp->connf_lock);
	/* Unlink from the doubly-linked bucket list under its lock. */
	if (connp->conn_g_prev != NULL)
		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
	else
		connfp->connf_head = connp->conn_g_next;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
	mutex_exit(&connfp->connf_lock);

	/* Better to stumble on a null pointer than to corrupt memory */
	connp->conn_g_next = NULL;
	connp->conn_g_prev = NULL;
	connp->conn_g_fanout = NULL;
}
2321 2322
2322 2323 /*
2323 2324 * Walk the list of all conn_t's in the system, calling the function provided
2324 2325 * With the specified argument for each.
2325 2326 * Applies to both IPv4 and IPv6.
2326 2327 *
2327 2328 * CONNs may hold pointers to ills (conn_dhcpinit_ill and
2328 2329 * conn_oper_pending_ill). To guard against stale pointers
2329 2330 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2330 2331 * unplumbed or removed. New conn_t's that are created while we are walking
2331 2332 * may be missed by this walk, because they are not necessarily inserted
2332 2333 * at the tail of the list. They are new conn_t's and thus don't have any
2333 2334 * stale pointers. The CONN_CLOSING flag ensures that no new reference
2334 2335 * is created to the struct that is going away.
2335 2336 */
void
ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
{
	int i;
	conn_t *connp;
	conn_t *prev_connp;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		prev_connp = NULL;
		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
		while (connp != NULL) {
			mutex_enter(&connp->conn_lock);
			/* Skip conns being destroyed or not yet visible. */
			if (connp->conn_state_flags &
			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
				mutex_exit(&connp->conn_lock);
				connp = connp->conn_g_next;
				continue;
			}
			/*
			 * Hold a reference before dropping the bucket lock so
			 * this conn -- our cursor in the list -- stays valid
			 * while func runs without the lock held.
			 */
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&connp->conn_lock);
			mutex_exit(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			(*func)(connp, arg);
			/*
			 * The previous conn's reference is released only after
			 * the current conn is held, so our position in the
			 * list is never unanchored.
			 */
			if (prev_connp != NULL)
				CONN_DEC_REF(prev_connp);
			mutex_enter(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			prev_connp = connp;
			connp = connp->conn_g_next;
		}
		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		/* Release the last conn visited in this bucket. */
		if (prev_connp != NULL)
			CONN_DEC_REF(prev_connp);
	}
}
2372 2373
2373 2374 /*
2374 2375 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2375 2376 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
2376 2377 * held; caller must call CONN_DEC_REF. Only checks for connected entries
2377 2378 * (peer tcp in ESTABLISHED state).
2378 2379 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t *connfp;
	conn_t *tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone. Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones.
	 */
	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));

	/* Reversed port pair: the packet's foreign port is our local port. */
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			ASSERT(tconnp != connp);
			/* Return held; caller must CONN_DEC_REF. */
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2422 2423
2423 2424 /*
2424 2425 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2425 2426 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
2426 2427 * held; caller must call CONN_DEC_REF. Only checks for connected entries
2427 2428 * (peer tcp in ESTABLISHED state).
2428 2429 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t *connfp;
	conn_t *tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone. Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones. We
	 * don't do Zone check for link local address(es) because the
	 * current Zone implementation treats each link local address as
	 * being unique per system node, i.e. they belong to global Zone.
	 */
	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));

	/* Reversed port pair: the packet's foreign port is our local port. */
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* We skip conn_bound_if check here as this is loopback tcp */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			ASSERT(tconnp != connp);
			/* Return held; caller must CONN_DEC_REF. */
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2476 2477
2477 2478 /*
2478 2479 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2479 2480 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2480 2481 * Only checks for connected entries i.e. no INADDR_ANY checks.
2481 2482 */
2482 2483 conn_t *
2483 2484 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2484 2485 ip_stack_t *ipst)
2485 2486 {
2486 2487 uint32_t ports;
2487 2488 uint16_t *pports;
2488 2489 connf_t *connfp;
2489 2490 conn_t *tconnp;
2490 2491
2491 2492 pports = (uint16_t *)&ports;
2492 2493 pports[0] = tcpha->tha_fport;
2493 2494 pports[1] = tcpha->tha_lport;
2494 2495
2495 2496 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2496 2497 ports, ipst)];
2497 2498
2498 2499 mutex_enter(&connfp->connf_lock);
2499 2500 for (tconnp = connfp->connf_head; tconnp != NULL;
2500 2501 tconnp = tconnp->conn_next) {
2501 2502
2502 2503 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2503 2504 ipha->ipha_dst, ipha->ipha_src, ports) &&
2504 2505 tconnp->conn_tcp->tcp_state >= min_state) {
2505 2506
2506 2507 CONN_INC_REF(tconnp);
2507 2508 mutex_exit(&connfp->connf_lock);
2508 2509 return (tconnp);
2509 2510 }
2510 2511 }
2511 2512 mutex_exit(&connfp->connf_lock);
2512 2513 return (NULL);
2513 2514 }
2514 2515
2515 2516 /*
2516 2517 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2517 2518 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2518 2519 * Only checks for connected entries i.e. no INADDR_ANY checks.
2519 2520 * Match on ifindex in addition to addresses.
2520 2521 */
conn_t *
ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
    uint_t ifindex, ip_stack_t *ipst)
{
	tcp_t *tcp;
	uint32_t ports;
	uint16_t *pports;
	connf_t *connfp;
	conn_t *tconnp;

	/* Reversed port pair: the packet's foreign port is our local port. */
	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		tcp = tconnp->conn_tcp;
		/*
		 * Require at least min_state; a conn bound to a specific
		 * interface must also match ifindex.
		 */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tcp->tcp_state >= min_state &&
		    (tconnp->conn_bound_if == 0 ||
		    tconnp->conn_bound_if == ifindex)) {

			/* Return held; caller must CONN_DEC_REF. */
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2557 2558
2558 2559 /*
2559 2560 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2560 2561 * a listener when changing state.
2561 2562 */
2562 2563 conn_t *
2563 2564 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2564 2565 ip_stack_t *ipst)
2565 2566 {
2566 2567 connf_t *bind_connfp;
2567 2568 conn_t *connp;
2568 2569 tcp_t *tcp;
2569 2570
2570 2571 /*
2571 2572 * Avoid false matches for packets sent to an IP destination of
2572 2573 * all zeros.
2573 2574 */
2574 2575 if (laddr == 0)
2575 2576 return (NULL);
2576 2577
2577 2578 ASSERT(zoneid != ALL_ZONES);
2578 2579
2579 2580 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2580 2581 mutex_enter(&bind_connfp->connf_lock);
2581 2582 for (connp = bind_connfp->connf_head; connp != NULL;
2582 2583 connp = connp->conn_next) {
2583 2584 tcp = connp->conn_tcp;
2584 2585 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2585 2586 IPCL_ZONE_MATCH(connp, zoneid) &&
2586 2587 (tcp->tcp_listener == NULL)) {
2587 2588 CONN_INC_REF(connp);
2588 2589 mutex_exit(&bind_connfp->connf_lock);
2589 2590 return (connp);
2590 2591 }
2591 2592 }
2592 2593 mutex_exit(&bind_connfp->connf_lock);
2593 2594 return (NULL);
2594 2595 }
2595 2596
2596 2597 /*
2597 2598 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2598 2599 * a listener when changing state.
2599 2600 */
2600 2601 conn_t *
2601 2602 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2602 2603 zoneid_t zoneid, ip_stack_t *ipst)
2603 2604 {
2604 2605 connf_t *bind_connfp;
2605 2606 conn_t *connp = NULL;
2606 2607 tcp_t *tcp;
2607 2608
2608 2609 /*
2609 2610 * Avoid false matches for packets sent to an IP destination of
2610 2611 * all zeros.
2611 2612 */
2612 2613 if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2613 2614 return (NULL);
2614 2615
2615 2616 ASSERT(zoneid != ALL_ZONES);
2616 2617
2617 2618 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2618 2619 mutex_enter(&bind_connfp->connf_lock);
2619 2620 for (connp = bind_connfp->connf_head; connp != NULL;
2620 2621 connp = connp->conn_next) {
2621 2622 tcp = connp->conn_tcp;
2622 2623 if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2623 2624 IPCL_ZONE_MATCH(connp, zoneid) &&
2624 2625 (connp->conn_bound_if == 0 ||
2625 2626 connp->conn_bound_if == ifindex) &&
2626 2627 tcp->tcp_listener == NULL) {
2627 2628 CONN_INC_REF(connp);
2628 2629 mutex_exit(&bind_connfp->connf_lock);
2629 2630 return (connp);
2630 2631 }
2631 2632 }
2632 2633 mutex_exit(&bind_connfp->connf_lock);
2633 2634 return (NULL);
2634 2635 }
2635 2636
2636 2637 /*
2637 2638 * ipcl_get_next_conn
2638 2639 * get the next entry in the conn global list
2639 2640 * and put a reference on the next_conn.
2640 2641 * decrement the reference on the current conn.
2641 2642 *
2642 2643 * This is an iterator based walker function that also provides for
2643 2644 * some selection by the caller. It walks through the conn_hash bucket
2644 2645 * searching for the next valid connp in the list, and selects connections
2645 2646 * that are neither closed nor condemned. It also REFHOLDS the conn
2646 2647 * thus ensuring that the conn exists when the caller uses the conn.
2647 2648 */
2648 2649 conn_t *
2649 2650 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2650 2651 {
2651 2652 conn_t *next_connp;
2652 2653
2653 2654 if (connfp == NULL)
2654 2655 return (NULL);
2655 2656
2656 2657 mutex_enter(&connfp->connf_lock);
2657 2658
2658 2659 next_connp = (connp == NULL) ?
2659 2660 connfp->connf_head : connp->conn_g_next;
2660 2661
2661 2662 while (next_connp != NULL) {
2662 2663 mutex_enter(&next_connp->conn_lock);
2663 2664 if (!(next_connp->conn_flags & conn_flags) ||
2664 2665 (next_connp->conn_state_flags &
2665 2666 (CONN_CONDEMNED | CONN_INCIPIENT))) {
2666 2667 /*
2667 2668 * This conn has been condemned or
2668 2669 * is closing, or the flags don't match
2669 2670 */
2670 2671 mutex_exit(&next_connp->conn_lock);
2671 2672 next_connp = next_connp->conn_g_next;
2672 2673 continue;
2673 2674 }
2674 2675 CONN_INC_REF_LOCKED(next_connp);
2675 2676 mutex_exit(&next_connp->conn_lock);
2676 2677 break;
2677 2678 }
2678 2679
2679 2680 mutex_exit(&connfp->connf_lock);
2680 2681
2681 2682 if (connp != NULL)
2682 2683 CONN_DEC_REF(connp);
2683 2684
2684 2685 return (next_connp);
2685 2686 }
2686 2687
#ifdef CONN_DEBUG
/*
 * Trace of the last NBUF refhold/refrele
 *
 * Both routines append the current call stack to the conn's circular
 * trace buffer (CONN_TRACE_MAX slots), advancing conn_trace_last.
 * Callers must hold conn_lock. The return value of 1 lets these be
 * used inside the CONN_INC_REF/CONN_DEC_REF assertion macros.
 */
int
conn_trace_ref(conn_t *connp)
{
	conn_trace_t	*slot;
	int		idx;

	ASSERT(MUTEX_HELD(&connp->conn_lock));

	/* Advance to the next slot, wrapping at the buffer end. */
	idx = connp->conn_trace_last + 1;
	if (idx == CONN_TRACE_MAX)
		idx = 0;

	slot = &connp->conn_trace_buf[idx];
	slot->ctb_depth = getpcstack(slot->ctb_stack, CONN_STACK_DEPTH);
	connp->conn_trace_last = idx;
	return (1);
}

int
conn_untrace_ref(conn_t *connp)
{
	conn_trace_t	*slot;
	int		idx;

	ASSERT(MUTEX_HELD(&connp->conn_lock));

	/* Advance to the next slot, wrapping at the buffer end. */
	idx = connp->conn_trace_last + 1;
	if (idx == CONN_TRACE_MAX)
		idx = 0;

	slot = &connp->conn_trace_buf[idx];
	slot->ctb_depth = getpcstack(slot->ctb_stack, CONN_STACK_DEPTH);
	connp->conn_trace_last = idx;
	return (1);
}
#endif
2727 2728
2728 2729 mib2_socketInfoEntry_t *
2729 2730 conn_get_socket_info(conn_t *connp, mib2_socketInfoEntry_t *sie)
2730 2731 {
2731 2732 vnode_t *vn = NULL;
2732 2733 vattr_t attr;
2733 2734 uint64_t flags = 0;
2734 2735
2735 2736 /*
2736 2737 * If the connection is closing, it is not safe to make an upcall or
2737 2738 * access the stream associated with the connection.
|
↓ open down ↓ |
2704 lines elided |
↑ open up ↑ |
2738 2739 * The callers of this function have a reference on connp itself
2739 2740 * so, as long as it is not closing, it's safe to continue.
2740 2741 */
2741 2742 mutex_enter(&connp->conn_lock);
2742 2743
2743 2744 if ((connp->conn_state_flags & CONN_CLOSING)) {
2744 2745 mutex_exit(&connp->conn_lock);
2745 2746 return (NULL);
2746 2747 }
2747 2748
2748 - mutex_exit(&connp->conn_lock);
2749 + /*
2750 + * Continue to hold conn_lock because we don't want to race with an
2751 + * in-progress close, which will have set-to-NULL (and destroyed
2752 + * upper_handle, aka sonode (and vnode)) BEFORE setting CONN_CLOSING.
2753 + */
2749 2754
2750 2755 if (connp->conn_upper_handle != NULL) {
2751 2756 vn = (*connp->conn_upcalls->su_get_vnode)
2752 2757 (connp->conn_upper_handle);
2753 2758 } else if (!IPCL_IS_NONSTR(connp) && connp->conn_rq != NULL) {
2754 2759 vn = STREAM(connp->conn_rq)->sd_pvnode;
2755 2760 if (vn != NULL)
2756 2761 VN_HOLD(vn);
2757 2762 flags |= MIB2_SOCKINFO_STREAM;
2758 2763 }
2759 2764
2765 + mutex_exit(&connp->conn_lock);
2766 +
2760 2767 if (vn == NULL || VOP_GETATTR(vn, &attr, 0, CRED(), NULL) != 0) {
2761 2768 if (vn != NULL)
2762 2769 VN_RELE(vn);
2763 2770 return (NULL);
2764 2771 }
2765 2772
2766 2773 VN_RELE(vn);
2767 2774
2768 2775 bzero(sie, sizeof (*sie));
2769 2776
2770 2777 sie->sie_flags = flags;
2771 2778 sie->sie_inode = attr.va_nodeid;
2772 2779 sie->sie_dev = attr.va_rdev;
2773 2780
2774 2781 return (sie);
2775 2782 }
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX