Print this page
re #13613 rb4516 Tunables need the volatile keyword
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/inet/ip/ipclassifier.c
+++ new/usr/src/uts/common/inet/ip/ipclassifier.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
|
↓ open down ↓ |
12 lines elided |
↑ open up ↑ |
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
23 + * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
23 24 */
24 25
25 26 /*
26 27 * IP PACKET CLASSIFIER
27 28 *
28 29 * The IP packet classifier provides mapping between IP packets and persistent
29 30 * connection state for connection-oriented protocols. It also provides
30 31 * interface for managing connection states.
31 32 *
32 33 * The connection state is kept in conn_t data structure and contains, among
33 34 * other things:
34 35 *
35 36 * o local/remote address and ports
36 37 * o Transport protocol
37 38 * o squeue for the connection (for TCP only)
38 39 * o reference counter
39 40 * o Connection state
40 41 * o hash table linkage
41 42 * o interface/ire information
42 43 * o credentials
43 44 * o ipsec policy
44 45 * o send and receive functions.
45 46 * o mutex lock.
46 47 *
47 48 * Connections use a reference counting scheme. They are freed when the
48 49 * reference counter drops to zero. A reference is incremented when connection
49 50 * is placed in a list or table, when incoming packet for the connection arrives
50 51 * and when connection is processed via squeue (squeue processing may be
51 52 * asynchronous and the reference protects the connection from being destroyed
52 53 * before its processing is finished).
53 54 *
54 55 * conn_recv is used to pass up packets to the ULP.
55 56 * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
56 57 * a listener, and changes to tcp_input_listener as the listener has picked a
57 58 * good squeue. For other cases it is set to tcp_input_data.
58 59 *
59 60 * conn_recvicmp is used to pass up ICMP errors to the ULP.
60 61 *
61 62 * Classifier uses several hash tables:
62 63 *
63 64 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state
64 65 * ipcl_bind_fanout: contains all connections in BOUND state
65 66 * ipcl_proto_fanout: IPv4 protocol fanout
66 67 * ipcl_proto_fanout_v6: IPv6 protocol fanout
67 68 * ipcl_udp_fanout: contains all UDP connections
68 69 * ipcl_iptun_fanout: contains all IP tunnel connections
69 70 * ipcl_globalhash_fanout: contains all connections
70 71 *
71 72 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
72 73 * which need to view all existing connections.
73 74 *
74 75 * All tables are protected by per-bucket locks. When both per-bucket lock and
75 76 * connection lock need to be held, the per-bucket lock should be acquired
76 77 * first, followed by the connection lock.
77 78 *
78 79 * All functions doing search in one of these tables increment a reference
79 80 * counter on the connection found (if any). This reference should be dropped
80 81 * when the caller has finished processing the connection.
81 82 *
82 83 *
83 84 * INTERFACES:
84 85 * ===========
85 86 *
86 87 * Connection Lookup:
87 88 * ------------------
88 89 *
89 90 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
90 91 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
91 92 *
92 93 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
93 94 * it can't find any associated connection. If the connection is found, its
94 95 * reference counter is incremented.
95 96 *
96 97 * mp: mblock, containing packet header. The full header should fit
97 98 * into a single mblock. It should also contain at least full IP
98 99 * and TCP or UDP header.
99 100 *
100 101 * protocol: Either IPPROTO_TCP or IPPROTO_UDP.
101 102 *
102 103 * hdr_len: The size of IP header. It is used to find TCP or UDP header in
103 104 * the packet.
104 105 *
105 106 * ira->ira_zoneid: The zone in which the returned connection must be; the
106 107 * zoneid corresponding to the ire_zoneid on the IRE located for
107 108 * the packet's destination address.
108 109 *
109 110 * ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
110 111 * IRAF_TX_SHARED_ADDR flags
111 112 *
112 113 * For TCP connections, the lookup order is as follows:
113 114 * 5-tuple {src, dst, protocol, local port, remote port}
114 115 * lookup in ipcl_conn_fanout table.
115 116 * 3-tuple {dst, remote port, protocol} lookup in
116 117 * ipcl_bind_fanout table.
117 118 *
118 119 * For UDP connections, a 5-tuple {src, dst, protocol, local port,
119 120 * remote port} lookup is done on ipcl_udp_fanout. Note that,
120 121 * these interfaces do not handle cases where a packets belongs
121 122 * to multiple UDP clients, which is handled in IP itself.
122 123 *
123 124 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
124 125 * determine which actual zone gets the segment. This is used only in a
125 126 * labeled environment. The matching rules are:
126 127 *
127 128 * - If it's not a multilevel port, then the label on the packet selects
128 129 * the zone. Unlabeled packets are delivered to the global zone.
129 130 *
130 131 * - If it's a multilevel port, then only the zone registered to receive
131 132 * packets on that port matches.
132 133 *
133 134 * Also, in a labeled environment, packet labels need to be checked. For fully
134 135 * bound TCP connections, we can assume that the packet label was checked
135 136 * during connection establishment, and doesn't need to be checked on each
136 137 * packet. For others, though, we need to check for strict equality or, for
137 138 * multilevel ports, membership in the range or set. This part currently does
138 139 * a tnrh lookup on each packet, but could be optimized to use cached results
139 140 * if that were necessary. (SCTP doesn't come through here, but if it did,
140 141 * we would apply the same rules as TCP.)
141 142 *
142 143 * An implication of the above is that fully-bound TCP sockets must always use
143 144 * distinct 4-tuples; they can't be discriminated by label alone.
144 145 *
145 146 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
146 147 * as there's no connection set-up handshake and no shared state.
147 148 *
148 149 * Labels on looped-back packets within a single zone do not need to be
149 150 * checked, as all processes in the same zone have the same label.
150 151 *
151 152 * Finally, for unlabeled packets received by a labeled system, special rules
152 153 * apply. We consider only the MLP if there is one. Otherwise, we prefer a
153 154 * socket in the zone whose label matches the default label of the sender, if
154 155 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the
155 156 * receiver's label must dominate the sender's default label.
156 157 *
157 158 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
158 159 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
159 160 * ip_stack);
160 161 *
161 162  *		Lookup routine to find an exact match for {src, dst, local port,
162 163  *		remote port} for TCP connections in ipcl_conn_fanout. The address and
163 164 * ports are read from the IP and TCP header respectively.
164 165 *
165 166 * conn_t *ipcl_lookup_listener_v4(lport, laddr, protocol,
166 167 * zoneid, ip_stack);
167 168 * conn_t *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
168 169 * zoneid, ip_stack);
169 170 *
170 171 * Lookup routine to find a listener with the tuple {lport, laddr,
171 172 * protocol} in the ipcl_bind_fanout table. For IPv6, an additional
172 173 * parameter interface index is also compared.
173 174 *
174 175 * void ipcl_walk(func, arg, ip_stack)
175 176 *
176 177 * Apply 'func' to every connection available. The 'func' is called as
177 178 * (*func)(connp, arg). The walk is non-atomic so connections may be
178 179 * created and destroyed during the walk. The CONN_CONDEMNED and
179 180 * CONN_INCIPIENT flags ensure that connections which are newly created
180 181 * or being destroyed are not selected by the walker.
181 182 *
182 183 * Table Updates
183 184 * -------------
184 185 *
185 186 * int ipcl_conn_insert(connp);
186 187 * int ipcl_conn_insert_v4(connp);
187 188 * int ipcl_conn_insert_v6(connp);
188 189 *
189 190 * Insert 'connp' in the ipcl_conn_fanout.
190 191  *	Arguments :
191 192 * connp conn_t to be inserted
192 193 *
193 194 * Return value :
194 195 * 0 if connp was inserted
195 196 * EADDRINUSE if the connection with the same tuple
196 197 * already exists.
197 198 *
198 199 * int ipcl_bind_insert(connp);
199 200 * int ipcl_bind_insert_v4(connp);
200 201 * int ipcl_bind_insert_v6(connp);
201 202 *
202 203 * Insert 'connp' in ipcl_bind_fanout.
203 204  *	Arguments :
204 205 * connp conn_t to be inserted
205 206 *
206 207 *
207 208 * void ipcl_hash_remove(connp);
208 209 *
209 210 * Removes the 'connp' from the connection fanout table.
210 211 *
211 212 * Connection Creation/Destruction
212 213 * -------------------------------
213 214 *
214 215 * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
215 216 *
216 217 * Creates a new conn based on the type flag, inserts it into
217 218 * globalhash table.
218 219 *
219 220 * type: This flag determines the type of conn_t which needs to be
220 221 * created i.e., which kmem_cache it comes from.
221 222 * IPCL_TCPCONN indicates a TCP connection
222 223 * IPCL_SCTPCONN indicates a SCTP connection
223 224 * IPCL_UDPCONN indicates a UDP conn_t.
224 225 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t.
225 226 * IPCL_RTSCONN indicates a RTS conn_t.
226 227 * IPCL_IPCCONN indicates all other connections.
227 228 *
228 229 * void ipcl_conn_destroy(connp)
229 230 *
230 231 * Destroys the connection state, removes it from the global
231 232 * connection hash table and frees its memory.
232 233 */
233 234
234 235 #include <sys/types.h>
235 236 #include <sys/stream.h>
236 237 #include <sys/stropts.h>
237 238 #include <sys/sysmacros.h>
238 239 #include <sys/strsubr.h>
239 240 #include <sys/strsun.h>
240 241 #define _SUN_TPI_VERSION 2
241 242 #include <sys/ddi.h>
242 243 #include <sys/cmn_err.h>
243 244 #include <sys/debug.h>
244 245
245 246 #include <sys/systm.h>
246 247 #include <sys/param.h>
247 248 #include <sys/kmem.h>
248 249 #include <sys/isa_defs.h>
249 250 #include <inet/common.h>
250 251 #include <netinet/ip6.h>
251 252 #include <netinet/icmp6.h>
252 253
253 254 #include <inet/ip.h>
254 255 #include <inet/ip_if.h>
255 256 #include <inet/ip_ire.h>
256 257 #include <inet/ip6.h>
257 258 #include <inet/ip_ndp.h>
258 259 #include <inet/ip_impl.h>
259 260 #include <inet/udp_impl.h>
260 261 #include <inet/sctp_ip.h>
261 262 #include <inet/sctp/sctp_impl.h>
262 263 #include <inet/rawip_impl.h>
263 264 #include <inet/rts_impl.h>
264 265 #include <inet/iptun/iptun_impl.h>
265 266
266 267 #include <sys/cpuvar.h>
267 268
268 269 #include <inet/ipclassifier.h>
|
↓ open down ↓ |
236 lines elided |
↑ open up ↑ |
269 270 #include <inet/tcp.h>
270 271 #include <inet/ipsec_impl.h>
271 272
272 273 #include <sys/tsol/tnet.h>
273 274 #include <sys/sockio.h>
274 275
/*
 * Classifier sizing tunables.  All of these may be overridden in
 * /etc/system; they are consumed once per stack instance in ipcl_init().
 */

/* Old value for compatibility. Setable in /etc/system */
uint_t tcp_conn_hash_size = 0;

/*
 * New value. Zero means choose automatically. Setable in /etc/system.
 * Marked volatile because it can be patched at run time (see #13613);
 * readers must not assume two reads return the same value.
 */
volatile uint_t ipcl_conn_hash_size = 0;
/* Bytes of physical memory per conn-fanout bucket when auto-sizing */
uint_t ipcl_conn_hash_memfactor = 8192;
/* Cap on the auto-computed conn fanout size (before prime rounding) */
uint_t ipcl_conn_hash_maxsize = 82500;
/*
 * NOTE(review): the tunables above/below are also settable from /etc/system
 * but were not made volatile by this change; confirm whether they need the
 * same treatment (their extern declarations would have to match).
 */

/* bind/udp fanout table size */
uint_t ipcl_bind_fanout_size = 512;
uint_t ipcl_udp_fanout_size = 16384;

/* Raw socket fanout size.  Must be a power of 2. */
uint_t ipcl_raw_fanout_size = 256;

/*
 * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
 * expect that most large deployments would have hundreds of tunnels, and
 * thousands in the extreme case.
 */
uint_t ipcl_iptun_fanout_size = 6143;
296 297
/*
 * Power of 2^N Primes useful for hashing for N of 0-28,
 * these primes are the nearest prime <= 2^N - 2^(N-2).
 * The trailing 0 acts as an out-of-range sentinel checked in ipcl_init().
 */

#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \
		6143, 12281, 24571, 49139, 98299, 196597, 393209, \
		786431, 1572853, 3145721, 6291449, 12582893, 25165813, \
		50331599, 100663291, 201326557, 0}

/*
 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 * are aligned on cache lines.
 */
typedef union itc_s {
	conn_t	itc_conn;
	char	itcu_filler[CACHE_ALIGN(conn_s)];
} itc_t;
315 316
/*
 * Per-protocol conn_t kmem caches, created in ipcl_g_init().
 * sctp_conn_cache is owned by the SCTP module (hence extern) and is
 * neither created nor destroyed here.
 */
struct kmem_cache  *tcp_conn_cache;
struct kmem_cache  *ip_conn_cache;
extern struct kmem_cache  *sctp_conn_cache;
struct kmem_cache  *udp_conn_cache;
struct kmem_cache  *rawip_conn_cache;
struct kmem_cache  *rts_conn_cache;

extern void	tcp_timermp_free(tcp_t *);
extern mblk_t	*tcp_timermp_alloc(int);

/* kmem cache constructor/destructor pairs for each conn flavor */
static int	ip_conn_constructor(void *, void *, int);
static void	ip_conn_destructor(void *, void *);

static int	tcp_conn_constructor(void *, void *, int);
static void	tcp_conn_destructor(void *, void *);

static int	udp_conn_constructor(void *, void *, int);
static void	udp_conn_destructor(void *, void *);

static int	rawip_conn_constructor(void *, void *, int);
static void	rawip_conn_destructor(void *, void *);

static int	rts_conn_constructor(void *, void *, int);
static void	rts_conn_destructor(void *, void *);
340 341
/*
 * Global (for all stack instances) init routine.
 * Creates the conn_t kmem caches; each cache holds a conn_t followed by
 * its protocol-private state (tcp_t, udp_t, ...), padded to a cache line
 * via itc_t.  Only the TCP cache registers a reclaim callback.
 */
void
ipcl_g_init(void)
{
	ip_conn_cache = kmem_cache_create("ip_conn_cache",
	    sizeof (conn_t), CACHE_ALIGN_SIZE,
	    ip_conn_constructor, ip_conn_destructor,
	    NULL, NULL, NULL, 0);

	/* tcp_conn_reclaim lets the allocator shed idle TCP conns */
	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
	    tcp_conn_constructor, tcp_conn_destructor,
	    tcp_conn_reclaim, NULL, NULL, 0);

	udp_conn_cache = kmem_cache_create("udp_conn_cache",
	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
	    udp_conn_constructor, udp_conn_destructor,
	    NULL, NULL, NULL, 0);

	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
	    rawip_conn_constructor, rawip_conn_destructor,
	    NULL, NULL, NULL, 0);

	rts_conn_cache = kmem_cache_create("rts_conn_cache",
	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
	    rts_conn_constructor, rts_conn_destructor,
	    NULL, NULL, NULL, 0);
}
372 373
373 374 /*
374 375 * ipclassifier intialization routine, sets up hash tables.
375 376 */
376 377 void
377 378 ipcl_init(ip_stack_t *ipst)
378 379 {
379 380 int i;
380 381 int sizes[] = P2Ps();
381 382
382 383 /*
383 384 * Calculate size of conn fanout table from /etc/system settings
384 385 */
385 386 if (ipcl_conn_hash_size != 0) {
386 387 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
387 388 } else if (tcp_conn_hash_size != 0) {
388 389 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
389 390 } else {
390 391 extern pgcnt_t freemem;
391 392
392 393 ipst->ips_ipcl_conn_fanout_size =
393 394 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
394 395
395 396 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
396 397 ipst->ips_ipcl_conn_fanout_size =
397 398 ipcl_conn_hash_maxsize;
398 399 }
399 400 }
400 401
401 402 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
402 403 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
403 404 break;
404 405 }
405 406 }
406 407 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
407 408 /* Out of range, use the 2^16 value */
408 409 ipst->ips_ipcl_conn_fanout_size = sizes[16];
409 410 }
410 411
411 412 /* Take values from /etc/system */
412 413 ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
413 414 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
414 415 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
415 416 ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
416 417
417 418 ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
418 419
419 420 ipst->ips_ipcl_conn_fanout = kmem_zalloc(
420 421 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
421 422
422 423 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
423 424 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
424 425 MUTEX_DEFAULT, NULL);
425 426 }
426 427
427 428 ipst->ips_ipcl_bind_fanout = kmem_zalloc(
428 429 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
429 430
430 431 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
431 432 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
432 433 MUTEX_DEFAULT, NULL);
433 434 }
434 435
435 436 ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
436 437 sizeof (connf_t), KM_SLEEP);
437 438 for (i = 0; i < IPPROTO_MAX; i++) {
438 439 mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
439 440 MUTEX_DEFAULT, NULL);
440 441 }
441 442
442 443 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
443 444 sizeof (connf_t), KM_SLEEP);
444 445 for (i = 0; i < IPPROTO_MAX; i++) {
445 446 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
446 447 MUTEX_DEFAULT, NULL);
447 448 }
448 449
449 450 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
450 451 mutex_init(&ipst->ips_rts_clients->connf_lock,
451 452 NULL, MUTEX_DEFAULT, NULL);
452 453
453 454 ipst->ips_ipcl_udp_fanout = kmem_zalloc(
454 455 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
455 456 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
456 457 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
457 458 MUTEX_DEFAULT, NULL);
458 459 }
459 460
460 461 ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
461 462 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
462 463 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
463 464 mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
464 465 MUTEX_DEFAULT, NULL);
465 466 }
466 467
467 468 ipst->ips_ipcl_raw_fanout = kmem_zalloc(
468 469 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
469 470 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
470 471 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
471 472 MUTEX_DEFAULT, NULL);
472 473 }
473 474
474 475 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
475 476 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
476 477 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
477 478 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
478 479 NULL, MUTEX_DEFAULT, NULL);
479 480 }
480 481 }
481 482
/*
 * Global teardown: destroy the conn caches created by ipcl_g_init().
 * sctp_conn_cache is owned by SCTP and is intentionally not destroyed here.
 */
void
ipcl_g_destroy(void)
{
	kmem_cache_destroy(ip_conn_cache);
	kmem_cache_destroy(tcp_conn_cache);
	kmem_cache_destroy(udp_conn_cache);
	kmem_cache_destroy(rawip_conn_cache);
	kmem_cache_destroy(rts_conn_cache);
}
491 492
/*
 * All user-level and kernel use of the stack must be gone
 * by now.
 *
 * Per-stack teardown: for each fanout table, assert every bucket is
 * empty, destroy the bucket locks, free the table, and NULL the pointer
 * so stale references are caught.
 */
void
ipcl_destroy(ip_stack_t *ipst)
{
	int i;

	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_conn_fanout = NULL;

	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_bind_fanout = NULL;

	for (i = 0; i < IPPROTO_MAX; i++) {
		ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_proto_fanout_v4,
	    IPPROTO_MAX * sizeof (connf_t));
	ipst->ips_ipcl_proto_fanout_v4 = NULL;

	for (i = 0; i < IPPROTO_MAX; i++) {
		ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_proto_fanout_v6,
	    IPPROTO_MAX * sizeof (connf_t));
	ipst->ips_ipcl_proto_fanout_v6 = NULL;

	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_udp_fanout = NULL;

	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_iptun_fanout,
	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
	ipst->ips_ipcl_iptun_fanout = NULL;

	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_raw_fanout = NULL;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_globalhash_fanout,
	    sizeof (connf_t) * CONN_G_HASH_SIZE);
	ipst->ips_ipcl_globalhash_fanout = NULL;

	/* The single routing-socket client bucket */
	ASSERT(ipst->ips_rts_clients->connf_head == NULL);
	mutex_destroy(&ipst->ips_rts_clients->connf_lock);
	kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
	ipst->ips_rts_clients = NULL;
}
570 571
571 572 /*
572 573 * conn creation routine. initialize the conn, sets the reference
573 574 * and inserts it in the global hash table.
574 575 */
575 576 conn_t *
576 577 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
577 578 {
578 579 conn_t *connp;
579 580 struct kmem_cache *conn_cache;
580 581
581 582 switch (type) {
582 583 case IPCL_SCTPCONN:
583 584 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
584 585 return (NULL);
585 586 sctp_conn_init(connp);
586 587 netstack_hold(ns);
587 588 connp->conn_netstack = ns;
588 589 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
589 590 connp->conn_ixa->ixa_conn_id = (long)connp;
590 591 ipcl_globalhash_insert(connp);
591 592 return (connp);
592 593
593 594 case IPCL_TCPCONN:
594 595 conn_cache = tcp_conn_cache;
595 596 break;
596 597
597 598 case IPCL_UDPCONN:
598 599 conn_cache = udp_conn_cache;
599 600 break;
600 601
601 602 case IPCL_RAWIPCONN:
602 603 conn_cache = rawip_conn_cache;
603 604 break;
604 605
605 606 case IPCL_RTSCONN:
606 607 conn_cache = rts_conn_cache;
607 608 break;
608 609
609 610 case IPCL_IPCCONN:
610 611 conn_cache = ip_conn_cache;
611 612 break;
612 613
613 614 default:
614 615 connp = NULL;
615 616 ASSERT(0);
616 617 }
617 618
618 619 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
619 620 return (NULL);
620 621
621 622 connp->conn_ref = 1;
622 623 netstack_hold(ns);
623 624 connp->conn_netstack = ns;
624 625 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
625 626 connp->conn_ixa->ixa_conn_id = (long)connp;
626 627 ipcl_globalhash_insert(connp);
627 628 return (connp);
628 629 }
629 630
/*
 * Final teardown of a conn_t once its reference count has reached zero:
 * release credentials, headers, IPsec state, remove it from the global
 * hash, then return it to the cache appropriate for its IPCL_* type.
 * TCP and SCTP conns need protocol-specific cleanup first.
 */
void
ipcl_conn_destroy(conn_t *connp)
{
	mblk_t	*mp;
	netstack_t	*ns = connp->conn_netstack;

	/* Caller must not hold conn_lock; conn must be fully quiesced */
	ASSERT(!MUTEX_HELD(&connp->conn_lock));
	ASSERT(connp->conn_ref == 0);
	ASSERT(connp->conn_ioctlref == 0);

	DTRACE_PROBE1(conn__destroy, conn_t *, connp);

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
		/* ixa_cred done in ipcl_conn_cleanup below */
	}

	/* Free the cached IP + ULP header template, if any */
	if (connp->conn_ht_iphc != NULL) {
		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
		connp->conn_ht_iphc = NULL;
		connp->conn_ht_iphc_allocated = 0;
		connp->conn_ht_iphc_len = 0;
		connp->conn_ht_ulp = NULL;
		connp->conn_ht_ulp_len = 0;
	}
	ip_pkt_free(&connp->conn_xmit_ipp);

	ipcl_globalhash_remove(connp);

	/* Drop all IPsec latch/policy references held by this conn */
	if (connp->conn_latch != NULL) {
		IPLATCH_REFRELE(connp->conn_latch);
		connp->conn_latch = NULL;
	}
	if (connp->conn_latch_in_policy != NULL) {
		IPPOL_REFRELE(connp->conn_latch_in_policy);
		connp->conn_latch_in_policy = NULL;
	}
	if (connp->conn_latch_in_action != NULL) {
		IPACT_REFRELE(connp->conn_latch_in_action);
		connp->conn_latch_in_action = NULL;
	}
	if (connp->conn_policy != NULL) {
		IPPH_REFRELE(connp->conn_policy, ns);
		connp->conn_policy = NULL;
	}

	if (connp->conn_ipsec_opt_mp != NULL) {
		freemsg(connp->conn_ipsec_opt_mp);
		connp->conn_ipsec_opt_mp = NULL;
	}

	if (connp->conn_flags & IPCL_TCPCONN) {
		tcp_t	*tcp = connp->conn_tcp;

		tcp_free(tcp);
		/* Preserve the timer mblk cache across the bzero below */
		mp = tcp->tcp_timercache;

		tcp->tcp_tcps = NULL;

		/*
		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
		 * the mblk.
		 */
		if (tcp->tcp_rsrv_mp != NULL) {
			freeb(tcp->tcp_rsrv_mp);
			tcp->tcp_rsrv_mp = NULL;
			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
		}

		ipcl_conn_cleanup(connp);
		connp->conn_flags = IPCL_TCPCONN;
		if (ns != NULL) {
			ASSERT(tcp->tcp_tcps == NULL);
			connp->conn_netstack = NULL;
			connp->conn_ixa->ixa_ipst = NULL;
			netstack_rele(ns);
		}

		/*
		 * Scrub the tcp_t, then restore the fields the cached
		 * object must keep (timer mblk and back-pointer to conn).
		 */
		bzero(tcp, sizeof (tcp_t));

		tcp->tcp_timercache = mp;
		tcp->tcp_connp = connp;
		kmem_cache_free(tcp_conn_cache, connp);
		return;
	}

	if (connp->conn_flags & IPCL_SCTPCONN) {
		/* SCTP owns its own cache and free path */
		ASSERT(ns != NULL);
		sctp_free(connp);
		return;
	}

	ipcl_conn_cleanup(connp);
	if (ns != NULL) {
		connp->conn_netstack = NULL;
		connp->conn_ixa->ixa_ipst = NULL;
		netstack_rele(ns);
	}

	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
	if (connp->conn_flags & IPCL_UDPCONN) {
		connp->conn_flags = IPCL_UDPCONN;
		kmem_cache_free(udp_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
		/* Reset to the constructor's defaults before caching */
		connp->conn_flags = IPCL_RAWIPCONN;
		connp->conn_proto = IPPROTO_ICMP;
		connp->conn_ixa->ixa_protocol = connp->conn_proto;
		kmem_cache_free(rawip_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RTSCONN) {
		connp->conn_flags = IPCL_RTSCONN;
		kmem_cache_free(rts_conn_cache, connp);
	} else {
		connp->conn_flags = IPCL_IPCCONN;
		ASSERT(connp->conn_flags & IPCL_IPCCONN);
		ASSERT(connp->conn_priv == NULL);
		kmem_cache_free(ip_conn_cache, connp);
	}
}
749 750
750 751 /*
751 752 * Running in cluster mode - deregister listener information
752 753 */
753 754 static void
754 755 ipcl_conn_unlisten(conn_t *connp)
755 756 {
756 757 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
757 758 ASSERT(connp->conn_lport != 0);
758 759
759 760 if (cl_inet_unlisten != NULL) {
760 761 sa_family_t addr_family;
761 762 uint8_t *laddrp;
762 763
763 764 if (connp->conn_ipversion == IPV6_VERSION) {
764 765 addr_family = AF_INET6;
765 766 laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
766 767 } else {
767 768 addr_family = AF_INET;
768 769 laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
769 770 }
770 771 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
771 772 IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
772 773 }
773 774 connp->conn_flags &= ~IPCL_CL_LISTENER;
774 775 }
775 776
/*
 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 * which table the conn belonged to). So for debugging we can see which hash
 * table this connection was in.
 *
 * Under the bucket's connf_lock: unlink connp from its fanout chain,
 * clear its fanout linkage, deregister cluster listeners, and drop the
 * hash table's reference.  No-op when the conn is in no fanout.  The
 * caller must not hold conn_lock (asserted).
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}
804 805
805 806 void
806 807 ipcl_hash_remove(conn_t *connp)
807 808 {
808 809 uint8_t protocol = connp->conn_proto;
809 810
810 811 IPCL_HASH_REMOVE(connp);
811 812 if (protocol == IPPROTO_RSVP)
812 813 ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
813 814 }
814 815
815 816 /*
816 817 * The whole purpose of this function is allow removal of
817 818 * a conn_t from the connected hash for timewait reclaim.
818 819 * This is essentially a TW reclaim fastpath where timewait
819 820 * collector checks under fanout lock (so no one else can
820 821 * get access to the conn_t) that refcnt is 2 i.e. one for
821 822 * TCP and one for the classifier hash list. If ref count
822 823 * is indeed 2, we can just remove the conn under lock and
823 824 * avoid cleaning up the conn under squeue. This gives us
824 825 * improved performance.
825 826 */
826 827 void
827 828 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp)
828 829 {
829 830 ASSERT(MUTEX_HELD(&connfp->connf_lock));
830 831 ASSERT(MUTEX_HELD(&connp->conn_lock));
831 832 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
832 833
833 834 if ((connp)->conn_next != NULL) {
834 835 (connp)->conn_next->conn_prev = (connp)->conn_prev;
835 836 }
836 837 if ((connp)->conn_prev != NULL) {
837 838 (connp)->conn_prev->conn_next = (connp)->conn_next;
838 839 } else {
839 840 connfp->connf_head = (connp)->conn_next;
840 841 }
841 842 (connp)->conn_fanout = NULL;
842 843 (connp)->conn_next = NULL;
843 844 (connp)->conn_prev = NULL;
844 845 (connp)->conn_flags |= IPCL_REMOVED;
845 846 ASSERT((connp)->conn_ref == 2);
846 847 (connp)->conn_ref--;
847 848 }
848 849
/*
 * Insert connp at the head of connfp's list; caller holds connf_lock.
 * Clears IPCL_REMOVED, sets IPCL_CONNECTED, and takes a hash-table
 * reference on the conn.  Wrapped in do/while (0) so the multi-statement
 * macro expands to a single statement and cannot mis-parse if used in an
 * unbraced if/else body (CERT PRE10-C); existing call sites, which all
 * supply a trailing semicolon, are unaffected.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) do {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
} while (0)
863 864
/*
 * Re-home connp onto connfp as a fully-connected entry, taking the
 * fanout lock itself; connp is first removed from any previous fanout.
 * do/while (0) keeps the macro a single statement so it is safe in an
 * unbraced if/else body (CERT PRE10-C).
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) do {			\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
} while (0)
870 871
/*
 * Insert a conn bound to a specific local address into the fanout.
 * Bound entries are kept ahead of any wildcard (v4 "match any") entries
 * so classification prefers the most specific match: the walk below
 * finds the first _IPCL_V4_MATCH_ANY entry and inserts connp just
 * before it (or at the tail if there is none).  Takes connf_lock
 * itself; connp is first removed from any previous fanout.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp)	{			\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {		\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}
897 898
/*
 * Insert a conn with an unspecified local address into the fanout.
 * A v4-mapped wildcard conn is placed just ahead of the first IPv6
 * unspecified (::) entry belonging to the same zone, so v4 traffic
 * prefers it; otherwise connp is appended at the tail.  Takes
 * connf_lock itself; connp is first removed from any previous fanout.
 *
 * NOTE(review): the "prev = next->conn_prev" assignment in the match
 * case appears to be a no-op — when prev != NULL, next->conn_prev
 * already equals prev at that point.  Confirm before relying on it.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&	\
		    connp->conn_zoneid == next->conn_zoneid) {		\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
927 928
/*
 * Function form of IPCL_HASH_INSERT_WILDCARD for callers outside this
 * file (the macro itself is file-local).
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
933 934
934 935 /*
935 936 * Because the classifier is used to classify inbound packets, the destination
936 937 * address is meant to be our local tunnel address (tunnel source), and the
937 938 * source the remote tunnel address (tunnel destination).
938 939 *
939 940 * Note that conn_proto can't be used for fanout since the upper protocol
940 941 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
941 942 */
942 943 conn_t *
943 944 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
944 945 {
945 946 connf_t *connfp;
946 947 conn_t *connp;
947 948
948 949 /* first look for IPv4 tunnel links */
949 950 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
950 951 mutex_enter(&connfp->connf_lock);
951 952 for (connp = connfp->connf_head; connp != NULL;
952 953 connp = connp->conn_next) {
953 954 if (IPCL_IPTUN_MATCH(connp, *dst, *src))
954 955 break;
955 956 }
956 957 if (connp != NULL)
957 958 goto done;
958 959
959 960 mutex_exit(&connfp->connf_lock);
960 961
961 962 /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
962 963 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
963 964 INADDR_ANY)];
964 965 mutex_enter(&connfp->connf_lock);
965 966 for (connp = connfp->connf_head; connp != NULL;
966 967 connp = connp->conn_next) {
967 968 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
968 969 break;
969 970 }
970 971 done:
971 972 if (connp != NULL)
972 973 CONN_INC_REF(connp);
973 974 mutex_exit(&connfp->connf_lock);
974 975 return (connp);
975 976 }
976 977
977 978 conn_t *
978 979 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
979 980 {
980 981 connf_t *connfp;
981 982 conn_t *connp;
982 983
983 984 /* Look for an IPv6 tunnel link */
984 985 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
985 986 mutex_enter(&connfp->connf_lock);
986 987 for (connp = connfp->connf_head; connp != NULL;
987 988 connp = connp->conn_next) {
988 989 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
989 990 CONN_INC_REF(connp);
990 991 break;
991 992 }
992 993 }
993 994 mutex_exit(&connfp->connf_lock);
994 995 return (connp);
995 996 }
996 997
997 998 /*
998 999 * This function is used only for inserting SCTP raw socket now.
999 1000 * This may change later.
1000 1001 *
1001 1002 * Note that only one raw socket can be bound to a port. The param
1002 1003 * lport is in network byte order.
1003 1004 */
static int
ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
{
	connf_t	*connfp;
	conn_t	*oconnp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* Check for existing raw socket already bound to the port. */
	mutex_enter(&connfp->connf_lock);
	for (oconnp = connfp->connf_head; oconnp != NULL;
	    oconnp = oconnp->conn_next) {
		/*
		 * A conflict needs the same port, zone, and family, and
		 * local addresses that can overlap: either side being
		 * unspecified or v4-mapped-any, or both bound to the
		 * same address.
		 */
		if (oconnp->conn_lport == lport &&
		    oconnp->conn_zoneid == connp->conn_zoneid &&
		    oconnp->conn_family == connp->conn_family &&
		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
		    &connp->conn_laddr_v6))) {
			break;
		}
	}
	mutex_exit(&connfp->connf_lock);
	if (oconnp != NULL)
		return (EADDRNOTAVAIL);

	/* Insert by specificity: connected > bound > wildcard. */
	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		} else {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		}
	} else {
		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
	}
	return (0);
}
1046 1047
1047 1048 static int
1048 1049 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1049 1050 {
1050 1051 connf_t *connfp;
1051 1052 conn_t *tconnp;
1052 1053 ipaddr_t laddr = connp->conn_laddr_v4;
1053 1054 ipaddr_t faddr = connp->conn_faddr_v4;
1054 1055
1055 1056 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1056 1057 mutex_enter(&connfp->connf_lock);
1057 1058 for (tconnp = connfp->connf_head; tconnp != NULL;
1058 1059 tconnp = tconnp->conn_next) {
1059 1060 if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1060 1061 /* A tunnel is already bound to these addresses. */
1061 1062 mutex_exit(&connfp->connf_lock);
1062 1063 return (EADDRINUSE);
1063 1064 }
1064 1065 }
1065 1066 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1066 1067 mutex_exit(&connfp->connf_lock);
1067 1068 return (0);
1068 1069 }
1069 1070
1070 1071 static int
1071 1072 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1072 1073 {
1073 1074 connf_t *connfp;
1074 1075 conn_t *tconnp;
1075 1076 in6_addr_t *laddr = &connp->conn_laddr_v6;
1076 1077 in6_addr_t *faddr = &connp->conn_faddr_v6;
1077 1078
1078 1079 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1079 1080 mutex_enter(&connfp->connf_lock);
1080 1081 for (tconnp = connfp->connf_head; tconnp != NULL;
1081 1082 tconnp = tconnp->conn_next) {
1082 1083 if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1083 1084 /* A tunnel is already bound to these addresses. */
1084 1085 mutex_exit(&connfp->connf_lock);
1085 1086 return (EADDRINUSE);
1086 1087 }
1087 1088 }
1088 1089 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1089 1090 mutex_exit(&connfp->connf_lock);
1090 1091 return (0);
1091 1092 }
1092 1093
1093 1094 /*
1094 1095 * Check for a MAC exemption conflict on a labeled system. Note that for
1095 1096 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1096 1097 * transport layer. This check is for binding all other protocols.
1097 1098 *
1098 1099 * Returns true if there's a conflict.
1099 1100 */
1100 1101 static boolean_t
1101 1102 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1102 1103 {
1103 1104 connf_t *connfp;
1104 1105 conn_t *tconn;
1105 1106
1106 1107 connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1107 1108 mutex_enter(&connfp->connf_lock);
1108 1109 for (tconn = connfp->connf_head; tconn != NULL;
1109 1110 tconn = tconn->conn_next) {
1110 1111 /* We don't allow v4 fallback for v6 raw socket */
1111 1112 if (connp->conn_family != tconn->conn_family)
1112 1113 continue;
1113 1114 /* If neither is exempt, then there's no conflict */
1114 1115 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1115 1116 (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1116 1117 continue;
1117 1118 /* We are only concerned about sockets for a different zone */
1118 1119 if (connp->conn_zoneid == tconn->conn_zoneid)
1119 1120 continue;
1120 1121 /* If both are bound to different specific addrs, ok */
1121 1122 if (connp->conn_laddr_v4 != INADDR_ANY &&
1122 1123 tconn->conn_laddr_v4 != INADDR_ANY &&
1123 1124 connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1124 1125 continue;
1125 1126 /* These two conflict; fail */
1126 1127 break;
1127 1128 }
1128 1129 mutex_exit(&connfp->connf_lock);
1129 1130 return (tconn != NULL);
1130 1131 }
1131 1132
1132 1133 static boolean_t
1133 1134 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1134 1135 {
1135 1136 connf_t *connfp;
1136 1137 conn_t *tconn;
1137 1138
1138 1139 connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1139 1140 mutex_enter(&connfp->connf_lock);
1140 1141 for (tconn = connfp->connf_head; tconn != NULL;
1141 1142 tconn = tconn->conn_next) {
1142 1143 /* We don't allow v4 fallback for v6 raw socket */
1143 1144 if (connp->conn_family != tconn->conn_family)
1144 1145 continue;
1145 1146 /* If neither is exempt, then there's no conflict */
1146 1147 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1147 1148 (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1148 1149 continue;
1149 1150 /* We are only concerned about sockets for a different zone */
1150 1151 if (connp->conn_zoneid == tconn->conn_zoneid)
1151 1152 continue;
1152 1153 /* If both are bound to different addrs, ok */
1153 1154 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1154 1155 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1155 1156 !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1156 1157 &tconn->conn_laddr_v6))
1157 1158 continue;
1158 1159 /* These two conflict; fail */
1159 1160 break;
1160 1161 }
1161 1162 mutex_exit(&connfp->connf_lock);
1162 1163 return (tconn != NULL);
1163 1164 }
1164 1165
1165 1166 /*
1166 1167 * (v4, v6) bind hash insertion routines
1167 1168 * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1168 1169 */
1169 1170
1170 1171 int
1171 1172 ipcl_bind_insert(conn_t *connp)
1172 1173 {
1173 1174 if (connp->conn_ipversion == IPV6_VERSION)
1174 1175 return (ipcl_bind_insert_v6(connp));
1175 1176 else
1176 1177 return (ipcl_bind_insert_v4(connp));
1177 1178 }
1178 1179
/*
 * v4 bind hash insertion.  The fanout is chosen by protocol: UDP and
 * TCP use per-port fanouts, SCTP goes through the raw-socket insert,
 * and all other protocols land in the per-protocol fanout.  Returns 0
 * or an errno from the helpers (EADDRINUSE/EADDRNOTAVAIL).
 */
int
ipcl_bind_insert_v4(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	/* IP tunnels are classified by address pair, not by port. */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	default:
		/*
		 * Portless protocols have no upper-layer bind logic, so
		 * MAC-exempt conflicts must be checked here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		/* Insert by specificity: connected > bound > wildcard. */
		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* A new RSVP listener changes how ills steer input. */
		if (protocol == IPPROTO_RSVP)
			ill_set_inputfn_all(ipst);
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Notify cluster software of the new TCP listener. */
		if (cl_inet_listen != NULL) {
			ASSERT(connp->conn_ipversion == IPV4_VERSION);
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, AF_INET,
			    (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1243 1244
/*
 * v6 bind hash insertion.  Same structure as ipcl_bind_insert_v4(),
 * using IN6 address tests; the cluster-listener callback passes either
 * the v4 or v6 bound address depending on conn_ipversion.
 */
int
ipcl_bind_insert_v6(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	/* IP tunnels are classified by address pair, not by port. */
	if (IPCL_IS_IPTUN(connp)) {
		return (ipcl_iptun_hash_insert_v6(connp, ipst));
	}

	switch (protocol) {
	default:
		/*
		 * Portless protocols have no upper-layer bind logic, so
		 * MAC-exempt conflicts must be checked here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/* Insert by specificity: connected > bound > wildcard. */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Notify cluster software of the new TCP listener. */
		if (cl_inet_listen != NULL) {
			sa_family_t	addr_family;
			uint8_t		*laddrp;

			if (connp->conn_ipversion == IPV6_VERSION) {
				addr_family = AF_INET6;
				laddrp =
				    (uint8_t *)&connp->conn_bound_addr_v6;
			} else {
				addr_family = AF_INET;
				laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
			}
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1316 1317
1317 1318 /*
1318 1319 * ipcl_conn_hash insertion routines.
1319 1320 * The caller has already set conn_proto and the addresses/ports in the conn_t.
1320 1321 */
1321 1322
1322 1323 int
1323 1324 ipcl_conn_insert(conn_t *connp)
1324 1325 {
1325 1326 if (connp->conn_ipversion == IPV6_VERSION)
1326 1327 return (ipcl_conn_insert_v6(connp));
1327 1328 else
1328 1329 return (ipcl_conn_insert_v4(connp));
1329 1330 }
1330 1331
/*
 * v4 conn hash insertion.  TCP entries enter the conn fanout after a
 * duplicate-tuple check; SCTP is handed to the raw-socket insert; all
 * other protocols use the bound/wildcard insertion rules.  Returns 0
 * or EADDRINUSE (duplicate tuple or MAC-exempt conflict).
 */
int
ipcl_conn_insert_v4(conn_t *connp)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	/* IP tunnels are classified by address pair, not by port. */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:
		/*
		 * For TCP, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * INADDR_LOOPBACK as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_faddr_v4,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
			    connp->conn_faddr_v4, connp->conn_laddr_v4,
			    connp->conn_ports) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);
		ASSERT(connp->conn_recvicmp != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings. For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic. For all others, it's
		 * done here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		/* Insert by specificity: connected > bound > wildcard. */
		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1428 1429
/*
 * v6 conn hash insertion.  Like ipcl_conn_insert_v4(), but the TCP
 * duplicate-tuple check also honors conn_bound_if: an existing conn
 * only conflicts if it is unbound or bound to the same ifindex.
 */
int
ipcl_conn_insert_v6(conn_t *connp)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;
	uint_t	ifindex = connp->conn_bound_if;

	/* IP tunnels are classified by address pair, not by port. */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert_v6(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:

		/*
		 * For tcp, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * ipv6_loopback as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			/* NOTE: need to match zoneid. Bug in onnv-gate */
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
			    connp->conn_faddr_v6, connp->conn_laddr_v6,
			    connp->conn_ports) &&
			    (tconnp->conn_bound_if == 0 ||
			    tconnp->conn_bound_if == ifindex) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/* The raw socket may already be bound; remove it first. */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/* Portless protocols must check MAC-exempt conflicts here. */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/* Insert by specificity: connected > bound > wildcard. */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1516 1517
1517 1518 /*
1518 1519 * v4 packet classifying function. looks up the fanout table to
1519 1520 * find the conn, the packet belongs to. returns the conn with
1520 1521 * the reference held, null otherwise.
1521 1522 *
1522 1523 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1523 1524 * Lookup" comment block are applied. Labels are also checked as described
1524 1525 * above. If the packet is from the inside (looped back), and is from the same
1525 1526 * zone, then label checks are omitted.
1526 1527 */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	uint32_t ports;
	conn_t	*connp;
	uint16_t  *up;
	zoneid_t	zoneid = ira->ira_zoneid;

	ipha = (ipha_t *)mp->b_rptr;
	/* up[0] is the packet's source port, up[1] its destination port. */
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		/* Both ports at once, as IPCL_CONN_MATCH expects. */
		ports = *(uint32_t *)up;
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			/*
			 * Match the tuple and zone; a MAC-exempt conn on
			 * a shared address may match across zones.
			 */
			if (IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);
		/* No established conn; fall back to the listener hash. */
		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_local.
		 *
		 * Note that we don't check tsol_receive_local for
		 * the connected case.
		 */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
				break;
		}

		/* On labeled systems the packet's label must permit receipt. */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);

		break;

	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		/* IP-in-IP tunnels are classified by address pair. */
		return (ipcl_iptun_classify_v4(&ipha->ipha_src,
		    &ipha->ipha_dst, ipst));
	}

	return (NULL);
}
1666 1667
1667 1668 conn_t *
1668 1669 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1669 1670 ip_recv_attr_t *ira, ip_stack_t *ipst)
1670 1671 {
1671 1672 ip6_t *ip6h;
1672 1673 connf_t *connfp, *bind_connfp;
1673 1674 uint16_t lport;
1674 1675 uint16_t fport;
1675 1676 tcpha_t *tcpha;
1676 1677 uint32_t ports;
1677 1678 conn_t *connp;
1678 1679 uint16_t *up;
1679 1680 zoneid_t zoneid = ira->ira_zoneid;
1680 1681
1681 1682 ip6h = (ip6_t *)mp->b_rptr;
1682 1683
1683 1684 switch (protocol) {
1684 1685 case IPPROTO_TCP:
1685 1686 tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
1686 1687 up = &tcpha->tha_lport;
1687 1688 ports = *(uint32_t *)up;
1688 1689
1689 1690 connfp =
1690 1691 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
1691 1692 ports, ipst)];
1692 1693 mutex_enter(&connfp->connf_lock);
1693 1694 for (connp = connfp->connf_head; connp != NULL;
1694 1695 connp = connp->conn_next) {
1695 1696 if (IPCL_CONN_MATCH_V6(connp, protocol,
1696 1697 ip6h->ip6_src, ip6h->ip6_dst, ports) &&
1697 1698 (connp->conn_zoneid == zoneid ||
1698 1699 connp->conn_allzones ||
1699 1700 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1700 1701 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1701 1702 (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1702 1703 break;
1703 1704 }
1704 1705
1705 1706 if (connp != NULL) {
1706 1707 /*
1707 1708 * We have a fully-bound TCP connection.
1708 1709 *
1709 1710 * For labeled systems, there's no need to check the
1710 1711 * label here. It's known to be good as we checked
1711 1712 * before allowing the connection to become bound.
1712 1713 */
1713 1714 CONN_INC_REF(connp);
1714 1715 mutex_exit(&connfp->connf_lock);
1715 1716 return (connp);
1716 1717 }
1717 1718
1718 1719 mutex_exit(&connfp->connf_lock);
1719 1720
1720 1721 lport = up[1];
1721 1722 bind_connfp =
1722 1723 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1723 1724 mutex_enter(&bind_connfp->connf_lock);
1724 1725 for (connp = bind_connfp->connf_head; connp != NULL;
1725 1726 connp = connp->conn_next) {
1726 1727 if (IPCL_BIND_MATCH_V6(connp, protocol,
1727 1728 ip6h->ip6_dst, lport) &&
1728 1729 (connp->conn_zoneid == zoneid ||
1729 1730 connp->conn_allzones ||
1730 1731 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1731 1732 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1732 1733 (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1733 1734 break;
1734 1735 }
1735 1736
1736 1737 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1737 1738 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1738 1739 ira, connp)) {
1739 1740 DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
1740 1741 char *, "connp(1) could not receive mp(2)",
1741 1742 conn_t *, connp, mblk_t *, mp);
1742 1743 connp = NULL;
1743 1744 }
1744 1745
1745 1746 if (connp != NULL) {
1746 1747 /* Have a listner at least */
1747 1748 CONN_INC_REF(connp);
1748 1749 mutex_exit(&bind_connfp->connf_lock);
1749 1750 return (connp);
1750 1751 }
1751 1752
1752 1753 mutex_exit(&bind_connfp->connf_lock);
1753 1754 break;
1754 1755
1755 1756 case IPPROTO_UDP:
1756 1757 up = (uint16_t *)&mp->b_rptr[hdr_len];
1757 1758 lport = up[1];
1758 1759 fport = up[0];
1759 1760 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1760 1761 mutex_enter(&connfp->connf_lock);
1761 1762 for (connp = connfp->connf_head; connp != NULL;
1762 1763 connp = connp->conn_next) {
1763 1764 if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1764 1765 fport, ip6h->ip6_src) &&
1765 1766 (connp->conn_zoneid == zoneid ||
1766 1767 connp->conn_allzones ||
1767 1768 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1768 1769 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1769 1770 (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1770 1771 break;
1771 1772 }
1772 1773
1773 1774 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1774 1775 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1775 1776 ira, connp)) {
1776 1777 DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
1777 1778 char *, "connp(1) could not receive mp(2)",
1778 1779 conn_t *, connp, mblk_t *, mp);
1779 1780 connp = NULL;
1780 1781 }
1781 1782
1782 1783 if (connp != NULL) {
1783 1784 CONN_INC_REF(connp);
1784 1785 mutex_exit(&connfp->connf_lock);
1785 1786 return (connp);
1786 1787 }
1787 1788
1788 1789 /*
1789 1790 * We shouldn't come here for multicast/broadcast packets
1790 1791 */
1791 1792 mutex_exit(&connfp->connf_lock);
1792 1793 break;
1793 1794 case IPPROTO_ENCAP:
1794 1795 case IPPROTO_IPV6:
1795 1796 return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
1796 1797 &ip6h->ip6_dst, ipst));
1797 1798 }
1798 1799
1799 1800 return (NULL);
1800 1801 }
1801 1802
1802 1803 /*
1803 1804 * wrapper around ipcl_classify_(v4,v6) routines.
1804 1805 */
1805 1806 conn_t *
1806 1807 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1807 1808 {
1808 1809 if (ira->ira_flags & IRAF_IS_IPV4) {
1809 1810 return (ipcl_classify_v4(mp, ira->ira_protocol,
1810 1811 ira->ira_ip_hdr_length, ira, ipst));
1811 1812 } else {
1812 1813 return (ipcl_classify_v6(mp, ira->ira_protocol,
1813 1814 ira->ira_ip_hdr_length, ira, ipst));
1814 1815 }
1815 1816 }
1816 1817
1817 1818 /*
1818 1819 * Only used to classify SCTP RAW sockets
1819 1820 */
1820 1821 conn_t *
1821 1822 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
1822 1823 ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
1823 1824 {
1824 1825 connf_t *connfp;
1825 1826 conn_t *connp;
1826 1827 in_port_t lport;
1827 1828 int ipversion;
1828 1829 const void *dst;
1829 1830 zoneid_t zoneid = ira->ira_zoneid;
1830 1831
1831 1832 lport = ((uint16_t *)&ports)[1];
1832 1833 if (ira->ira_flags & IRAF_IS_IPV4) {
1833 1834 dst = (const void *)&ipha->ipha_dst;
1834 1835 ipversion = IPV4_VERSION;
1835 1836 } else {
1836 1837 dst = (const void *)&ip6h->ip6_dst;
1837 1838 ipversion = IPV6_VERSION;
1838 1839 }
1839 1840
1840 1841 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1841 1842 mutex_enter(&connfp->connf_lock);
1842 1843 for (connp = connfp->connf_head; connp != NULL;
1843 1844 connp = connp->conn_next) {
1844 1845 /* We don't allow v4 fallback for v6 raw socket. */
1845 1846 if (ipversion != connp->conn_ipversion)
1846 1847 continue;
1847 1848 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1848 1849 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1849 1850 if (ipversion == IPV4_VERSION) {
1850 1851 if (!IPCL_CONN_MATCH(connp, protocol,
1851 1852 ipha->ipha_src, ipha->ipha_dst, ports))
1852 1853 continue;
1853 1854 } else {
1854 1855 if (!IPCL_CONN_MATCH_V6(connp, protocol,
1855 1856 ip6h->ip6_src, ip6h->ip6_dst, ports))
1856 1857 continue;
1857 1858 }
1858 1859 } else {
1859 1860 if (ipversion == IPV4_VERSION) {
1860 1861 if (!IPCL_BIND_MATCH(connp, protocol,
1861 1862 ipha->ipha_dst, lport))
1862 1863 continue;
1863 1864 } else {
1864 1865 if (!IPCL_BIND_MATCH_V6(connp, protocol,
1865 1866 ip6h->ip6_dst, lport))
1866 1867 continue;
1867 1868 }
1868 1869 }
1869 1870
1870 1871 if (connp->conn_zoneid == zoneid ||
1871 1872 connp->conn_allzones ||
1872 1873 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1873 1874 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1874 1875 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
1875 1876 break;
1876 1877 }
1877 1878
1878 1879 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1879 1880 !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
1880 1881 DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
1881 1882 char *, "connp(1) could not receive mp(2)",
1882 1883 conn_t *, connp, mblk_t *, mp);
1883 1884 connp = NULL;
1884 1885 }
1885 1886
1886 1887 if (connp != NULL)
1887 1888 goto found;
1888 1889 mutex_exit(&connfp->connf_lock);
1889 1890
1890 1891 /* Try to look for a wildcard SCTP RAW socket match. */
1891 1892 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
1892 1893 mutex_enter(&connfp->connf_lock);
1893 1894 for (connp = connfp->connf_head; connp != NULL;
1894 1895 connp = connp->conn_next) {
1895 1896 /* We don't allow v4 fallback for v6 raw socket. */
1896 1897 if (ipversion != connp->conn_ipversion)
1897 1898 continue;
1898 1899 if (!IPCL_ZONE_MATCH(connp, zoneid))
1899 1900 continue;
1900 1901
1901 1902 if (ipversion == IPV4_VERSION) {
1902 1903 if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
1903 1904 break;
1904 1905 } else {
1905 1906 if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
1906 1907 break;
1907 1908 }
1908 1909 }
1909 1910 }
1910 1911
1911 1912 if (connp != NULL)
1912 1913 goto found;
1913 1914
1914 1915 mutex_exit(&connfp->connf_lock);
1915 1916 return (NULL);
1916 1917
1917 1918 found:
1918 1919 ASSERT(connp != NULL);
1919 1920 CONN_INC_REF(connp);
1920 1921 mutex_exit(&connfp->connf_lock);
1921 1922 return (connp);
1922 1923 }
1923 1924
1924 1925 /* ARGSUSED */
1925 1926 static int
1926 1927 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
1927 1928 {
1928 1929 itc_t *itc = (itc_t *)buf;
1929 1930 conn_t *connp = &itc->itc_conn;
1930 1931 tcp_t *tcp = (tcp_t *)&itc[1];
1931 1932
1932 1933 bzero(connp, sizeof (conn_t));
1933 1934 bzero(tcp, sizeof (tcp_t));
1934 1935
1935 1936 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1936 1937 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1937 1938 cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
1938 1939 tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
1939 1940 if (tcp->tcp_timercache == NULL)
1940 1941 return (ENOMEM);
1941 1942 connp->conn_tcp = tcp;
1942 1943 connp->conn_flags = IPCL_TCPCONN;
1943 1944 connp->conn_proto = IPPROTO_TCP;
1944 1945 tcp->tcp_connp = connp;
1945 1946 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1946 1947
1947 1948 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1948 1949 if (connp->conn_ixa == NULL) {
1949 1950 tcp_timermp_free(tcp);
1950 1951 return (ENOMEM);
1951 1952 }
1952 1953 connp->conn_ixa->ixa_refcnt = 1;
1953 1954 connp->conn_ixa->ixa_protocol = connp->conn_proto;
1954 1955 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
1955 1956 return (0);
1956 1957 }
1957 1958
1958 1959 /* ARGSUSED */
1959 1960 static void
1960 1961 tcp_conn_destructor(void *buf, void *cdrarg)
1961 1962 {
1962 1963 itc_t *itc = (itc_t *)buf;
1963 1964 conn_t *connp = &itc->itc_conn;
1964 1965 tcp_t *tcp = (tcp_t *)&itc[1];
1965 1966
1966 1967 ASSERT(connp->conn_flags & IPCL_TCPCONN);
1967 1968 ASSERT(tcp->tcp_connp == connp);
1968 1969 ASSERT(connp->conn_tcp == tcp);
1969 1970 tcp_timermp_free(tcp);
1970 1971 mutex_destroy(&connp->conn_lock);
1971 1972 cv_destroy(&connp->conn_cv);
1972 1973 cv_destroy(&connp->conn_sq_cv);
1973 1974 rw_destroy(&connp->conn_ilg_lock);
1974 1975
1975 1976 /* Can be NULL if constructor failed */
1976 1977 if (connp->conn_ixa != NULL) {
1977 1978 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
1978 1979 ASSERT(connp->conn_ixa->ixa_ire == NULL);
1979 1980 ASSERT(connp->conn_ixa->ixa_nce == NULL);
1980 1981 ixa_refrele(connp->conn_ixa);
1981 1982 }
1982 1983 }
1983 1984
1984 1985 /* ARGSUSED */
1985 1986 static int
1986 1987 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
1987 1988 {
1988 1989 itc_t *itc = (itc_t *)buf;
1989 1990 conn_t *connp = &itc->itc_conn;
1990 1991
1991 1992 bzero(connp, sizeof (conn_t));
1992 1993 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1993 1994 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1994 1995 connp->conn_flags = IPCL_IPCCONN;
1995 1996 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1996 1997
1997 1998 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1998 1999 if (connp->conn_ixa == NULL)
1999 2000 return (ENOMEM);
2000 2001 connp->conn_ixa->ixa_refcnt = 1;
2001 2002 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2002 2003 return (0);
2003 2004 }
2004 2005
2005 2006 /* ARGSUSED */
2006 2007 static void
2007 2008 ip_conn_destructor(void *buf, void *cdrarg)
2008 2009 {
2009 2010 itc_t *itc = (itc_t *)buf;
2010 2011 conn_t *connp = &itc->itc_conn;
2011 2012
2012 2013 ASSERT(connp->conn_flags & IPCL_IPCCONN);
2013 2014 ASSERT(connp->conn_priv == NULL);
2014 2015 mutex_destroy(&connp->conn_lock);
2015 2016 cv_destroy(&connp->conn_cv);
2016 2017 rw_destroy(&connp->conn_ilg_lock);
2017 2018
2018 2019 /* Can be NULL if constructor failed */
2019 2020 if (connp->conn_ixa != NULL) {
2020 2021 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2021 2022 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2022 2023 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2023 2024 ixa_refrele(connp->conn_ixa);
2024 2025 }
2025 2026 }
2026 2027
2027 2028 /* ARGSUSED */
2028 2029 static int
2029 2030 udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2030 2031 {
2031 2032 itc_t *itc = (itc_t *)buf;
2032 2033 conn_t *connp = &itc->itc_conn;
2033 2034 udp_t *udp = (udp_t *)&itc[1];
2034 2035
2035 2036 bzero(connp, sizeof (conn_t));
2036 2037 bzero(udp, sizeof (udp_t));
2037 2038
2038 2039 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2039 2040 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2040 2041 connp->conn_udp = udp;
2041 2042 connp->conn_flags = IPCL_UDPCONN;
2042 2043 connp->conn_proto = IPPROTO_UDP;
2043 2044 udp->udp_connp = connp;
2044 2045 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2045 2046 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2046 2047 if (connp->conn_ixa == NULL)
2047 2048 return (ENOMEM);
2048 2049 connp->conn_ixa->ixa_refcnt = 1;
2049 2050 connp->conn_ixa->ixa_protocol = connp->conn_proto;
2050 2051 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2051 2052 return (0);
2052 2053 }
2053 2054
2054 2055 /* ARGSUSED */
2055 2056 static void
2056 2057 udp_conn_destructor(void *buf, void *cdrarg)
2057 2058 {
2058 2059 itc_t *itc = (itc_t *)buf;
2059 2060 conn_t *connp = &itc->itc_conn;
2060 2061 udp_t *udp = (udp_t *)&itc[1];
2061 2062
2062 2063 ASSERT(connp->conn_flags & IPCL_UDPCONN);
2063 2064 ASSERT(udp->udp_connp == connp);
2064 2065 ASSERT(connp->conn_udp == udp);
2065 2066 mutex_destroy(&connp->conn_lock);
2066 2067 cv_destroy(&connp->conn_cv);
2067 2068 rw_destroy(&connp->conn_ilg_lock);
2068 2069
2069 2070 /* Can be NULL if constructor failed */
2070 2071 if (connp->conn_ixa != NULL) {
2071 2072 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2072 2073 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2073 2074 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2074 2075 ixa_refrele(connp->conn_ixa);
2075 2076 }
2076 2077 }
2077 2078
2078 2079 /* ARGSUSED */
2079 2080 static int
2080 2081 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2081 2082 {
2082 2083 itc_t *itc = (itc_t *)buf;
2083 2084 conn_t *connp = &itc->itc_conn;
2084 2085 icmp_t *icmp = (icmp_t *)&itc[1];
2085 2086
2086 2087 bzero(connp, sizeof (conn_t));
2087 2088 bzero(icmp, sizeof (icmp_t));
2088 2089
2089 2090 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2090 2091 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2091 2092 connp->conn_icmp = icmp;
2092 2093 connp->conn_flags = IPCL_RAWIPCONN;
2093 2094 connp->conn_proto = IPPROTO_ICMP;
2094 2095 icmp->icmp_connp = connp;
2095 2096 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2096 2097 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2097 2098 if (connp->conn_ixa == NULL)
2098 2099 return (ENOMEM);
2099 2100 connp->conn_ixa->ixa_refcnt = 1;
2100 2101 connp->conn_ixa->ixa_protocol = connp->conn_proto;
2101 2102 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2102 2103 return (0);
2103 2104 }
2104 2105
2105 2106 /* ARGSUSED */
2106 2107 static void
2107 2108 rawip_conn_destructor(void *buf, void *cdrarg)
2108 2109 {
2109 2110 itc_t *itc = (itc_t *)buf;
2110 2111 conn_t *connp = &itc->itc_conn;
2111 2112 icmp_t *icmp = (icmp_t *)&itc[1];
2112 2113
2113 2114 ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2114 2115 ASSERT(icmp->icmp_connp == connp);
2115 2116 ASSERT(connp->conn_icmp == icmp);
2116 2117 mutex_destroy(&connp->conn_lock);
2117 2118 cv_destroy(&connp->conn_cv);
2118 2119 rw_destroy(&connp->conn_ilg_lock);
2119 2120
2120 2121 /* Can be NULL if constructor failed */
2121 2122 if (connp->conn_ixa != NULL) {
2122 2123 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2123 2124 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2124 2125 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2125 2126 ixa_refrele(connp->conn_ixa);
2126 2127 }
2127 2128 }
2128 2129
2129 2130 /* ARGSUSED */
2130 2131 static int
2131 2132 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2132 2133 {
2133 2134 itc_t *itc = (itc_t *)buf;
2134 2135 conn_t *connp = &itc->itc_conn;
2135 2136 rts_t *rts = (rts_t *)&itc[1];
2136 2137
2137 2138 bzero(connp, sizeof (conn_t));
2138 2139 bzero(rts, sizeof (rts_t));
2139 2140
2140 2141 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2141 2142 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2142 2143 connp->conn_rts = rts;
2143 2144 connp->conn_flags = IPCL_RTSCONN;
2144 2145 rts->rts_connp = connp;
2145 2146 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2146 2147 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2147 2148 if (connp->conn_ixa == NULL)
2148 2149 return (ENOMEM);
2149 2150 connp->conn_ixa->ixa_refcnt = 1;
2150 2151 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2151 2152 return (0);
2152 2153 }
2153 2154
2154 2155 /* ARGSUSED */
2155 2156 static void
2156 2157 rts_conn_destructor(void *buf, void *cdrarg)
2157 2158 {
2158 2159 itc_t *itc = (itc_t *)buf;
2159 2160 conn_t *connp = &itc->itc_conn;
2160 2161 rts_t *rts = (rts_t *)&itc[1];
2161 2162
2162 2163 ASSERT(connp->conn_flags & IPCL_RTSCONN);
2163 2164 ASSERT(rts->rts_connp == connp);
2164 2165 ASSERT(connp->conn_rts == rts);
2165 2166 mutex_destroy(&connp->conn_lock);
2166 2167 cv_destroy(&connp->conn_cv);
2167 2168 rw_destroy(&connp->conn_ilg_lock);
2168 2169
2169 2170 /* Can be NULL if constructor failed */
2170 2171 if (connp->conn_ixa != NULL) {
2171 2172 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2172 2173 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2173 2174 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2174 2175 ixa_refrele(connp->conn_ixa);
2175 2176 }
2176 2177 }
2177 2178
2178 2179 /*
2179 2180 * Called as part of ipcl_conn_destroy to assert and clear any pointers
2180 2181 * in the conn_t.
2181 2182 *
2182 2183 * Below we list all the pointers in the conn_t as a documentation aid.
2183 2184 * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
2184 2185 * If you add any pointers to the conn_t please add an ASSERT here
2185 2186 * and #ifdef it out if it can't be actually asserted to be NULL.
2186 2187 * In any case, we bzero most of the conn_t at the end of the function.
2187 2188 */
2188 2189 void
2189 2190 ipcl_conn_cleanup(conn_t *connp)
2190 2191 {
2191 2192 ip_xmit_attr_t *ixa;
2192 2193
2193 2194 ASSERT(connp->conn_latch == NULL);
2194 2195 ASSERT(connp->conn_latch_in_policy == NULL);
2195 2196 ASSERT(connp->conn_latch_in_action == NULL);
2196 2197 #ifdef notdef
2197 2198 ASSERT(connp->conn_rq == NULL);
2198 2199 ASSERT(connp->conn_wq == NULL);
2199 2200 #endif
2200 2201 ASSERT(connp->conn_cred == NULL);
2201 2202 ASSERT(connp->conn_g_fanout == NULL);
2202 2203 ASSERT(connp->conn_g_next == NULL);
2203 2204 ASSERT(connp->conn_g_prev == NULL);
2204 2205 ASSERT(connp->conn_policy == NULL);
2205 2206 ASSERT(connp->conn_fanout == NULL);
2206 2207 ASSERT(connp->conn_next == NULL);
2207 2208 ASSERT(connp->conn_prev == NULL);
2208 2209 ASSERT(connp->conn_oper_pending_ill == NULL);
2209 2210 ASSERT(connp->conn_ilg == NULL);
2210 2211 ASSERT(connp->conn_drain_next == NULL);
2211 2212 ASSERT(connp->conn_drain_prev == NULL);
2212 2213 #ifdef notdef
2213 2214 /* conn_idl is not cleared when removed from idl list */
2214 2215 ASSERT(connp->conn_idl == NULL);
2215 2216 #endif
2216 2217 ASSERT(connp->conn_ipsec_opt_mp == NULL);
2217 2218 #ifdef notdef
2218 2219 /* conn_netstack is cleared by the caller; needed by ixa_cleanup */
2219 2220 ASSERT(connp->conn_netstack == NULL);
2220 2221 #endif
2221 2222
2222 2223 ASSERT(connp->conn_helper_info == NULL);
2223 2224 ASSERT(connp->conn_ixa != NULL);
2224 2225 ixa = connp->conn_ixa;
2225 2226 ASSERT(ixa->ixa_refcnt == 1);
2226 2227 /* Need to preserve ixa_protocol */
2227 2228 ixa_cleanup(ixa);
2228 2229 ixa->ixa_flags = 0;
2229 2230
2230 2231 /* Clear out the conn_t fields that are not preserved */
2231 2232 bzero(&connp->conn_start_clr,
2232 2233 sizeof (conn_t) -
2233 2234 ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
2234 2235 }
2235 2236
2236 2237 /*
2237 2238 * All conns are inserted in a global multi-list for the benefit of
2238 2239 * walkers. The walk is guaranteed to walk all open conns at the time
2239 2240 * of the start of the walk exactly once. This property is needed to
2240 2241 * achieve some cleanups during unplumb of interfaces. This is achieved
2241 2242 * as follows.
2242 2243 *
2243 2244 * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2244 2245 * call the insert and delete functions below at creation and deletion
2245 2246 * time respectively. The conn never moves or changes its position in this
2246 2247 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2247 2248 * won't increase due to walkers, once the conn deletion has started. Note
2248 2249 * that we can't remove the conn from the global list and then wait for
2249 2250 * the refcnt to drop to zero, since walkers would then see a truncated
2250 2251 * list. CONN_INCIPIENT ensures that walkers don't start looking at
2251 2252 * conns until ip_open is ready to make them globally visible.
2252 2253 * The global round robin multi-list locks are held only to get the
2253 2254 * next member/insertion/deletion and contention should be negligible
2254 2255 * if the multi-list is much greater than the number of cpus.
2255 2256 */
2256 2257 void
2257 2258 ipcl_globalhash_insert(conn_t *connp)
2258 2259 {
2259 2260 int index;
2260 2261 struct connf_s *connfp;
2261 2262 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
2262 2263
2263 2264 /*
2264 2265 * No need for atomic here. Approximate even distribution
2265 2266 * in the global lists is sufficient.
2266 2267 */
2267 2268 ipst->ips_conn_g_index++;
2268 2269 index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
2269 2270
2270 2271 connp->conn_g_prev = NULL;
2271 2272 /*
2272 2273 * Mark as INCIPIENT, so that walkers will ignore this
2273 2274 * for now, till ip_open is ready to make it visible globally.
2274 2275 */
2275 2276 connp->conn_state_flags |= CONN_INCIPIENT;
2276 2277
2277 2278 connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2278 2279 /* Insert at the head of the list */
2279 2280 mutex_enter(&connfp->connf_lock);
2280 2281 connp->conn_g_next = connfp->connf_head;
2281 2282 if (connp->conn_g_next != NULL)
2282 2283 connp->conn_g_next->conn_g_prev = connp;
2283 2284 connfp->connf_head = connp;
2284 2285
2285 2286 /* The fanout bucket this conn points to */
2286 2287 connp->conn_g_fanout = connfp;
2287 2288
2288 2289 mutex_exit(&connfp->connf_lock);
2289 2290 }
2290 2291
2291 2292 void
2292 2293 ipcl_globalhash_remove(conn_t *connp)
2293 2294 {
2294 2295 struct connf_s *connfp;
2295 2296
2296 2297 /*
2297 2298 * We were never inserted in the global multi list.
2298 2299 * IPCL_NONE variety is never inserted in the global multilist
2299 2300 * since it is presumed to not need any cleanup and is transient.
2300 2301 */
2301 2302 if (connp->conn_g_fanout == NULL)
2302 2303 return;
2303 2304
2304 2305 connfp = connp->conn_g_fanout;
2305 2306 mutex_enter(&connfp->connf_lock);
2306 2307 if (connp->conn_g_prev != NULL)
2307 2308 connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2308 2309 else
2309 2310 connfp->connf_head = connp->conn_g_next;
2310 2311 if (connp->conn_g_next != NULL)
2311 2312 connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2312 2313 mutex_exit(&connfp->connf_lock);
2313 2314
2314 2315 /* Better to stumble on a null pointer than to corrupt memory */
2315 2316 connp->conn_g_next = NULL;
2316 2317 connp->conn_g_prev = NULL;
2317 2318 connp->conn_g_fanout = NULL;
2318 2319 }
2319 2320
2320 2321 /*
2321 2322 * Walk the list of all conn_t's in the system, calling the function provided
2322 2323 * With the specified argument for each.
2323 2324 * Applies to both IPv4 and IPv6.
2324 2325 *
2325 2326 * CONNs may hold pointers to ills (conn_dhcpinit_ill and
2326 2327 * conn_oper_pending_ill). To guard against stale pointers
2327 2328 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2328 2329 * unplumbed or removed. New conn_t's that are created while we are walking
2329 2330 * may be missed by this walk, because they are not necessarily inserted
2330 2331 * at the tail of the list. They are new conn_t's and thus don't have any
2331 2332 * stale pointers. The CONN_CLOSING flag ensures that no new reference
2332 2333 * is created to the struct that is going away.
2333 2334 */
2334 2335 void
2335 2336 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
2336 2337 {
2337 2338 int i;
2338 2339 conn_t *connp;
2339 2340 conn_t *prev_connp;
2340 2341
2341 2342 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
2342 2343 mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2343 2344 prev_connp = NULL;
2344 2345 connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
2345 2346 while (connp != NULL) {
2346 2347 mutex_enter(&connp->conn_lock);
2347 2348 if (connp->conn_state_flags &
2348 2349 (CONN_CONDEMNED | CONN_INCIPIENT)) {
2349 2350 mutex_exit(&connp->conn_lock);
2350 2351 connp = connp->conn_g_next;
2351 2352 continue;
2352 2353 }
2353 2354 CONN_INC_REF_LOCKED(connp);
2354 2355 mutex_exit(&connp->conn_lock);
2355 2356 mutex_exit(
2356 2357 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2357 2358 (*func)(connp, arg);
2358 2359 if (prev_connp != NULL)
2359 2360 CONN_DEC_REF(prev_connp);
2360 2361 mutex_enter(
2361 2362 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2362 2363 prev_connp = connp;
2363 2364 connp = connp->conn_g_next;
2364 2365 }
2365 2366 mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2366 2367 if (prev_connp != NULL)
2367 2368 CONN_DEC_REF(prev_connp);
2368 2369 }
2369 2370 }
2370 2371
2371 2372 /*
2372 2373 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2373 2374 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
2374 2375 * held; caller must call CONN_DEC_REF. Only checks for connected entries
2375 2376 * (peer tcp in ESTABLISHED state).
2376 2377 */
2377 2378 conn_t *
2378 2379 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
2379 2380 ip_stack_t *ipst)
2380 2381 {
2381 2382 uint32_t ports;
2382 2383 uint16_t *pports = (uint16_t *)&ports;
2383 2384 connf_t *connfp;
2384 2385 conn_t *tconnp;
2385 2386 boolean_t zone_chk;
2386 2387
2387 2388 /*
2388 2389 * If either the source of destination address is loopback, then
2389 2390 * both endpoints must be in the same Zone. Otherwise, both of
2390 2391 * the addresses are system-wide unique (tcp is in ESTABLISHED
2391 2392 * state) and the endpoints may reside in different Zones.
2392 2393 */
2393 2394 zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2394 2395 ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2395 2396
2396 2397 pports[0] = tcpha->tha_fport;
2397 2398 pports[1] = tcpha->tha_lport;
2398 2399
2399 2400 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2400 2401 ports, ipst)];
2401 2402
2402 2403 mutex_enter(&connfp->connf_lock);
2403 2404 for (tconnp = connfp->connf_head; tconnp != NULL;
2404 2405 tconnp = tconnp->conn_next) {
2405 2406
2406 2407 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2407 2408 ipha->ipha_dst, ipha->ipha_src, ports) &&
2408 2409 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2409 2410 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2410 2411
2411 2412 ASSERT(tconnp != connp);
2412 2413 CONN_INC_REF(tconnp);
2413 2414 mutex_exit(&connfp->connf_lock);
2414 2415 return (tconnp);
2415 2416 }
2416 2417 }
2417 2418 mutex_exit(&connfp->connf_lock);
2418 2419 return (NULL);
2419 2420 }
2420 2421
2421 2422 /*
2422 2423 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2423 2424 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
2424 2425 * held; caller must call CONN_DEC_REF. Only checks for connected entries
2425 2426 * (peer tcp in ESTABLISHED state).
2426 2427 */
2427 2428 conn_t *
2428 2429 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
2429 2430 ip_stack_t *ipst)
2430 2431 {
2431 2432 uint32_t ports;
2432 2433 uint16_t *pports = (uint16_t *)&ports;
2433 2434 connf_t *connfp;
2434 2435 conn_t *tconnp;
2435 2436 boolean_t zone_chk;
2436 2437
2437 2438 /*
2438 2439 * If either the source of destination address is loopback, then
2439 2440 * both endpoints must be in the same Zone. Otherwise, both of
2440 2441 * the addresses are system-wide unique (tcp is in ESTABLISHED
2441 2442 * state) and the endpoints may reside in different Zones. We
2442 2443 * don't do Zone check for link local address(es) because the
2443 2444 * current Zone implementation treats each link local address as
2444 2445 * being unique per system node, i.e. they belong to global Zone.
2445 2446 */
2446 2447 zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2447 2448 IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2448 2449
2449 2450 pports[0] = tcpha->tha_fport;
2450 2451 pports[1] = tcpha->tha_lport;
2451 2452
2452 2453 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2453 2454 ports, ipst)];
2454 2455
2455 2456 mutex_enter(&connfp->connf_lock);
2456 2457 for (tconnp = connfp->connf_head; tconnp != NULL;
2457 2458 tconnp = tconnp->conn_next) {
2458 2459
2459 2460 /* We skip conn_bound_if check here as this is loopback tcp */
2460 2461 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2461 2462 ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2462 2463 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2463 2464 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2464 2465
2465 2466 ASSERT(tconnp != connp);
2466 2467 CONN_INC_REF(tconnp);
2467 2468 mutex_exit(&connfp->connf_lock);
2468 2469 return (tconnp);
2469 2470 }
2470 2471 }
2471 2472 mutex_exit(&connfp->connf_lock);
2472 2473 return (NULL);
2473 2474 }
2474 2475
2475 2476 /*
2476 2477 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2477 2478 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2478 2479 * Only checks for connected entries i.e. no INADDR_ANY checks.
2479 2480 */
2480 2481 conn_t *
2481 2482 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2482 2483 ip_stack_t *ipst)
2483 2484 {
2484 2485 uint32_t ports;
2485 2486 uint16_t *pports;
2486 2487 connf_t *connfp;
2487 2488 conn_t *tconnp;
2488 2489
2489 2490 pports = (uint16_t *)&ports;
2490 2491 pports[0] = tcpha->tha_fport;
2491 2492 pports[1] = tcpha->tha_lport;
2492 2493
2493 2494 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2494 2495 ports, ipst)];
2495 2496
2496 2497 mutex_enter(&connfp->connf_lock);
2497 2498 for (tconnp = connfp->connf_head; tconnp != NULL;
2498 2499 tconnp = tconnp->conn_next) {
2499 2500
2500 2501 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2501 2502 ipha->ipha_dst, ipha->ipha_src, ports) &&
2502 2503 tconnp->conn_tcp->tcp_state >= min_state) {
2503 2504
2504 2505 CONN_INC_REF(tconnp);
2505 2506 mutex_exit(&connfp->connf_lock);
2506 2507 return (tconnp);
2507 2508 }
2508 2509 }
2509 2510 mutex_exit(&connfp->connf_lock);
2510 2511 return (NULL);
2511 2512 }
2512 2513
2513 2514 /*
2514 2515 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2515 2516 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2516 2517 * Only checks for connected entries i.e. no INADDR_ANY checks.
2517 2518 * Match on ifindex in addition to addresses.
2518 2519 */
2519 2520 conn_t *
2520 2521 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2521 2522 uint_t ifindex, ip_stack_t *ipst)
2522 2523 {
2523 2524 tcp_t *tcp;
2524 2525 uint32_t ports;
2525 2526 uint16_t *pports;
2526 2527 connf_t *connfp;
2527 2528 conn_t *tconnp;
2528 2529
2529 2530 pports = (uint16_t *)&ports;
2530 2531 pports[0] = tcpha->tha_fport;
2531 2532 pports[1] = tcpha->tha_lport;
2532 2533
2533 2534 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2534 2535 ports, ipst)];
2535 2536
2536 2537 mutex_enter(&connfp->connf_lock);
2537 2538 for (tconnp = connfp->connf_head; tconnp != NULL;
2538 2539 tconnp = tconnp->conn_next) {
2539 2540
2540 2541 tcp = tconnp->conn_tcp;
2541 2542 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2542 2543 ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2543 2544 tcp->tcp_state >= min_state &&
2544 2545 (tconnp->conn_bound_if == 0 ||
2545 2546 tconnp->conn_bound_if == ifindex)) {
2546 2547
2547 2548 CONN_INC_REF(tconnp);
2548 2549 mutex_exit(&connfp->connf_lock);
2549 2550 return (tconnp);
2550 2551 }
2551 2552 }
2552 2553 mutex_exit(&connfp->connf_lock);
2553 2554 return (NULL);
2554 2555 }
2555 2556
2556 2557 /*
2557 2558 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2558 2559 * a listener when changing state.
2559 2560 */
2560 2561 conn_t *
2561 2562 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2562 2563 ip_stack_t *ipst)
2563 2564 {
2564 2565 connf_t *bind_connfp;
2565 2566 conn_t *connp;
2566 2567 tcp_t *tcp;
2567 2568
2568 2569 /*
2569 2570 * Avoid false matches for packets sent to an IP destination of
2570 2571 * all zeros.
2571 2572 */
2572 2573 if (laddr == 0)
2573 2574 return (NULL);
2574 2575
2575 2576 ASSERT(zoneid != ALL_ZONES);
2576 2577
2577 2578 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2578 2579 mutex_enter(&bind_connfp->connf_lock);
2579 2580 for (connp = bind_connfp->connf_head; connp != NULL;
2580 2581 connp = connp->conn_next) {
2581 2582 tcp = connp->conn_tcp;
2582 2583 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2583 2584 IPCL_ZONE_MATCH(connp, zoneid) &&
2584 2585 (tcp->tcp_listener == NULL)) {
2585 2586 CONN_INC_REF(connp);
2586 2587 mutex_exit(&bind_connfp->connf_lock);
2587 2588 return (connp);
2588 2589 }
2589 2590 }
2590 2591 mutex_exit(&bind_connfp->connf_lock);
2591 2592 return (NULL);
2592 2593 }
2593 2594
2594 2595 /*
2595 2596 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2596 2597 * a listener when changing state.
2597 2598 */
2598 2599 conn_t *
2599 2600 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2600 2601 zoneid_t zoneid, ip_stack_t *ipst)
2601 2602 {
2602 2603 connf_t *bind_connfp;
2603 2604 conn_t *connp = NULL;
2604 2605 tcp_t *tcp;
2605 2606
2606 2607 /*
2607 2608 * Avoid false matches for packets sent to an IP destination of
2608 2609 * all zeros.
2609 2610 */
2610 2611 if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2611 2612 return (NULL);
2612 2613
2613 2614 ASSERT(zoneid != ALL_ZONES);
2614 2615
2615 2616 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2616 2617 mutex_enter(&bind_connfp->connf_lock);
2617 2618 for (connp = bind_connfp->connf_head; connp != NULL;
2618 2619 connp = connp->conn_next) {
2619 2620 tcp = connp->conn_tcp;
2620 2621 if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2621 2622 IPCL_ZONE_MATCH(connp, zoneid) &&
2622 2623 (connp->conn_bound_if == 0 ||
2623 2624 connp->conn_bound_if == ifindex) &&
2624 2625 tcp->tcp_listener == NULL) {
2625 2626 CONN_INC_REF(connp);
2626 2627 mutex_exit(&bind_connfp->connf_lock);
2627 2628 return (connp);
2628 2629 }
2629 2630 }
2630 2631 mutex_exit(&bind_connfp->connf_lock);
2631 2632 return (NULL);
2632 2633 }
2633 2634
2634 2635 /*
2635 2636 * ipcl_get_next_conn
2636 2637 * get the next entry in the conn global list
2637 2638 * and put a reference on the next_conn.
2638 2639 * decrement the reference on the current conn.
2639 2640 *
2640 2641 * This is an iterator based walker function that also provides for
2641 2642 * some selection by the caller. It walks through the conn_hash bucket
2642 2643 * searching for the next valid connp in the list, and selects connections
2643 2644 * that are neither closed nor condemned. It also REFHOLDS the conn
2644 2645 * thus ensuring that the conn exists when the caller uses the conn.
2645 2646 */
2646 2647 conn_t *
2647 2648 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2648 2649 {
2649 2650 conn_t *next_connp;
2650 2651
2651 2652 if (connfp == NULL)
2652 2653 return (NULL);
2653 2654
2654 2655 mutex_enter(&connfp->connf_lock);
2655 2656
2656 2657 next_connp = (connp == NULL) ?
2657 2658 connfp->connf_head : connp->conn_g_next;
2658 2659
2659 2660 while (next_connp != NULL) {
2660 2661 mutex_enter(&next_connp->conn_lock);
2661 2662 if (!(next_connp->conn_flags & conn_flags) ||
2662 2663 (next_connp->conn_state_flags &
2663 2664 (CONN_CONDEMNED | CONN_INCIPIENT))) {
2664 2665 /*
2665 2666 * This conn has been condemned or
2666 2667 * is closing, or the flags don't match
2667 2668 */
2668 2669 mutex_exit(&next_connp->conn_lock);
2669 2670 next_connp = next_connp->conn_g_next;
2670 2671 continue;
2671 2672 }
2672 2673 CONN_INC_REF_LOCKED(next_connp);
2673 2674 mutex_exit(&next_connp->conn_lock);
2674 2675 break;
2675 2676 }
2676 2677
2677 2678 mutex_exit(&connfp->connf_lock);
2678 2679
2679 2680 if (connp != NULL)
2680 2681 CONN_DEC_REF(connp);
2681 2682
2682 2683 return (next_connp);
2683 2684 }
2684 2685
#ifdef CONN_DEBUG
/*
 * Trace of the last NBUF refhold/refrele
 */

/*
 * Record the current call stack in the next slot of connp's circular
 * trace buffer.  Common body for conn_trace_ref() and conn_untrace_ref(),
 * which were previously byte-for-byte duplicates.  Caller must hold
 * conn_lock, which serializes updates to conn_trace_last.
 */
static void
conn_trace_record(conn_t *connp)
{
	int last;
	conn_trace_t *ctb;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	/* Advance to the next slot, wrapping at CONN_TRACE_MAX. */
	last = connp->conn_trace_last;
	last++;
	if (last == CONN_TRACE_MAX)
		last = 0;

	ctb = &connp->conn_trace_buf[last];
	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
	connp->conn_trace_last = last;
}

/*
 * Log a refhold; always returns 1 so it can be used inside the
 * CONN_INC_REF tracing macros' expression context.
 */
int
conn_trace_ref(conn_t *connp)
{
	conn_trace_record(connp);
	return (1);
}

/*
 * Log a refrele; always returns 1 so it can be used inside the
 * CONN_DEC_REF tracing macros' expression context.
 */
int
conn_untrace_ref(conn_t *connp)
{
	conn_trace_record(connp);
	return (1);
}
#endif
|
↓ open down ↓ |
2435 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX