Print this page
Reduce lint
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/inet/ip/ipclassifier.c
+++ new/usr/src/uts/common/inet/ip/ipclassifier.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2016 Joyent, Inc.
24 24 */
25 25
26 26 /*
27 27 * IP PACKET CLASSIFIER
28 28 *
29 29 * The IP packet classifier provides mapping between IP packets and persistent
30 30 * connection state for connection-oriented protocols. It also provides
31 31 * interface for managing connection states.
32 32 *
33 33 * The connection state is kept in conn_t data structure and contains, among
34 34 * other things:
35 35 *
36 36 * o local/remote address and ports
37 37 * o Transport protocol
38 38 * o squeue for the connection (for TCP only)
39 39 * o reference counter
40 40 * o Connection state
41 41 * o hash table linkage
42 42 * o interface/ire information
43 43 * o credentials
44 44 * o ipsec policy
45 45 * o send and receive functions.
46 46 * o mutex lock.
47 47 *
48 48 * Connections use a reference counting scheme. They are freed when the
49 49 * reference counter drops to zero. A reference is incremented when connection
50 50 * is placed in a list or table, when incoming packet for the connection arrives
51 51 * and when connection is processed via squeue (squeue processing may be
52 52 * asynchronous and the reference protects the connection from being destroyed
53 53 * before its processing is finished).
54 54 *
55 55 * conn_recv is used to pass up packets to the ULP.
56 56 * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
57 57 * a listener, and changes to tcp_input_listener as the listener has picked a
58 58 * good squeue. For other cases it is set to tcp_input_data.
59 59 *
60 60 * conn_recvicmp is used to pass up ICMP errors to the ULP.
61 61 *
62 62 * Classifier uses several hash tables:
63 63 *
64 64 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state
65 65 * ipcl_bind_fanout: contains all connections in BOUND state
66 66 * ipcl_proto_fanout: IPv4 protocol fanout
67 67 * ipcl_proto_fanout_v6: IPv6 protocol fanout
68 68 * ipcl_udp_fanout: contains all UDP connections
69 69 * ipcl_iptun_fanout: contains all IP tunnel connections
70 70 * ipcl_globalhash_fanout: contains all connections
71 71 *
72 72 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
73 73 * which need to view all existing connections.
74 74 *
75 75 * All tables are protected by per-bucket locks. When both per-bucket lock and
76 76 * connection lock need to be held, the per-bucket lock should be acquired
77 77 * first, followed by the connection lock.
78 78 *
79 79 * All functions doing search in one of these tables increment a reference
80 80 * counter on the connection found (if any). This reference should be dropped
81 81 * when the caller has finished processing the connection.
82 82 *
83 83 *
84 84 * INTERFACES:
85 85 * ===========
86 86 *
87 87 * Connection Lookup:
88 88 * ------------------
89 89 *
90 90 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
91 91 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
92 92 *
93 93 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
94 94 * it can't find any associated connection. If the connection is found, its
95 95 * reference counter is incremented.
96 96 *
97 97 * mp: mblock, containing packet header. The full header should fit
98 98 * into a single mblock. It should also contain at least full IP
99 99 * and TCP or UDP header.
100 100 *
101 101 * protocol: Either IPPROTO_TCP or IPPROTO_UDP.
102 102 *
103 103 * hdr_len: The size of IP header. It is used to find TCP or UDP header in
104 104 * the packet.
105 105 *
106 106 * ira->ira_zoneid: The zone in which the returned connection must be; the
107 107 * zoneid corresponding to the ire_zoneid on the IRE located for
108 108 * the packet's destination address.
109 109 *
110 110 * ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
111 111 * IRAF_TX_SHARED_ADDR flags
112 112 *
113 113 * For TCP connections, the lookup order is as follows:
114 114 * 5-tuple {src, dst, protocol, local port, remote port}
115 115 * lookup in ipcl_conn_fanout table.
116 116 * 3-tuple {dst, remote port, protocol} lookup in
117 117 * ipcl_bind_fanout table.
118 118 *
119 119 * For UDP connections, a 5-tuple {src, dst, protocol, local port,
120 120 * remote port} lookup is done on ipcl_udp_fanout. Note that,
 *		these interfaces do not handle cases where a packet belongs
122 122 * to multiple UDP clients, which is handled in IP itself.
123 123 *
124 124 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
125 125 * determine which actual zone gets the segment. This is used only in a
126 126 * labeled environment. The matching rules are:
127 127 *
128 128 * - If it's not a multilevel port, then the label on the packet selects
129 129 * the zone. Unlabeled packets are delivered to the global zone.
130 130 *
131 131 * - If it's a multilevel port, then only the zone registered to receive
132 132 * packets on that port matches.
133 133 *
134 134 * Also, in a labeled environment, packet labels need to be checked. For fully
135 135 * bound TCP connections, we can assume that the packet label was checked
136 136 * during connection establishment, and doesn't need to be checked on each
137 137 * packet. For others, though, we need to check for strict equality or, for
138 138 * multilevel ports, membership in the range or set. This part currently does
139 139 * a tnrh lookup on each packet, but could be optimized to use cached results
140 140 * if that were necessary. (SCTP doesn't come through here, but if it did,
141 141 * we would apply the same rules as TCP.)
142 142 *
143 143 * An implication of the above is that fully-bound TCP sockets must always use
144 144 * distinct 4-tuples; they can't be discriminated by label alone.
145 145 *
146 146 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
147 147 * as there's no connection set-up handshake and no shared state.
148 148 *
149 149 * Labels on looped-back packets within a single zone do not need to be
150 150 * checked, as all processes in the same zone have the same label.
151 151 *
152 152 * Finally, for unlabeled packets received by a labeled system, special rules
153 153 * apply. We consider only the MLP if there is one. Otherwise, we prefer a
154 154 * socket in the zone whose label matches the default label of the sender, if
155 155 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the
156 156 * receiver's label must dominate the sender's default label.
157 157 *
158 158 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
159 159 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
160 160 * ip_stack);
161 161 *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
164 164 * ports are read from the IP and TCP header respectively.
165 165 *
166 166 * conn_t *ipcl_lookup_listener_v4(lport, laddr, protocol,
167 167 * zoneid, ip_stack);
168 168 * conn_t *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
169 169 * zoneid, ip_stack);
170 170 *
171 171 * Lookup routine to find a listener with the tuple {lport, laddr,
172 172 * protocol} in the ipcl_bind_fanout table. For IPv6, an additional
173 173 * parameter interface index is also compared.
174 174 *
175 175 * void ipcl_walk(func, arg, ip_stack)
176 176 *
177 177 * Apply 'func' to every connection available. The 'func' is called as
178 178 * (*func)(connp, arg). The walk is non-atomic so connections may be
179 179 * created and destroyed during the walk. The CONN_CONDEMNED and
180 180 * CONN_INCIPIENT flags ensure that connections which are newly created
181 181 * or being destroyed are not selected by the walker.
182 182 *
183 183 * Table Updates
184 184 * -------------
185 185 *
186 186 * int ipcl_conn_insert(connp);
187 187 * int ipcl_conn_insert_v4(connp);
188 188 * int ipcl_conn_insert_v6(connp);
189 189 *
190 190 * Insert 'connp' in the ipcl_conn_fanout.
 *	Arguments :
192 192 * connp conn_t to be inserted
193 193 *
194 194 * Return value :
195 195 * 0 if connp was inserted
196 196 * EADDRINUSE if the connection with the same tuple
197 197 * already exists.
198 198 *
199 199 * int ipcl_bind_insert(connp);
200 200 * int ipcl_bind_insert_v4(connp);
201 201 * int ipcl_bind_insert_v6(connp);
202 202 *
203 203 * Insert 'connp' in ipcl_bind_fanout.
 *	Arguments :
205 205 * connp conn_t to be inserted
206 206 *
207 207 *
208 208 * void ipcl_hash_remove(connp);
209 209 *
210 210 * Removes the 'connp' from the connection fanout table.
211 211 *
212 212 * Connection Creation/Destruction
213 213 * -------------------------------
214 214 *
215 215 * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
216 216 *
217 217 * Creates a new conn based on the type flag, inserts it into
218 218 * globalhash table.
219 219 *
220 220 * type: This flag determines the type of conn_t which needs to be
221 221 * created i.e., which kmem_cache it comes from.
222 222 * IPCL_TCPCONN indicates a TCP connection
223 223 * IPCL_SCTPCONN indicates a SCTP connection
224 224 * IPCL_UDPCONN indicates a UDP conn_t.
225 225 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t.
226 226 * IPCL_RTSCONN indicates a RTS conn_t.
227 227 * IPCL_IPCCONN indicates all other connections.
228 228 *
229 229 * void ipcl_conn_destroy(connp)
230 230 *
231 231 * Destroys the connection state, removes it from the global
232 232 * connection hash table and frees its memory.
233 233 */
234 234
235 235 #include <sys/types.h>
236 236 #include <sys/stream.h>
237 237 #include <sys/stropts.h>
238 238 #include <sys/sysmacros.h>
239 239 #include <sys/strsubr.h>
240 240 #include <sys/strsun.h>
241 241 #define _SUN_TPI_VERSION 2
242 242 #include <sys/ddi.h>
243 243 #include <sys/cmn_err.h>
244 244 #include <sys/debug.h>
245 245
246 246 #include <sys/systm.h>
247 247 #include <sys/param.h>
248 248 #include <sys/kmem.h>
249 249 #include <sys/isa_defs.h>
250 250 #include <inet/common.h>
251 251 #include <netinet/ip6.h>
252 252 #include <netinet/icmp6.h>
253 253
254 254 #include <inet/ip.h>
255 255 #include <inet/ip_if.h>
256 256 #include <inet/ip_ire.h>
257 257 #include <inet/ip6.h>
258 258 #include <inet/ip_ndp.h>
259 259 #include <inet/ip_impl.h>
260 260 #include <inet/udp_impl.h>
261 261 #include <inet/sctp_ip.h>
262 262 #include <inet/sctp/sctp_impl.h>
263 263 #include <inet/rawip_impl.h>
264 264 #include <inet/rts_impl.h>
265 265 #include <inet/iptun/iptun_impl.h>
266 266
267 267 #include <sys/cpuvar.h>
268 268
269 269 #include <inet/ipclassifier.h>
270 270 #include <inet/tcp.h>
271 271 #include <inet/ipsec_impl.h>
272 272
273 273 #include <sys/tsol/tnet.h>
274 274 #include <sys/sockio.h>
275 275
276 276 /* Old value for compatibility. Setable in /etc/system */
277 277 uint_t tcp_conn_hash_size = 0;
278 278
279 279 /* New value. Zero means choose automatically. Setable in /etc/system */
280 280 uint_t ipcl_conn_hash_size = 0;
281 281 uint_t ipcl_conn_hash_memfactor = 8192;
282 282 uint_t ipcl_conn_hash_maxsize = 82500;
283 283
284 284 /* bind/udp fanout table size */
285 285 uint_t ipcl_bind_fanout_size = 512;
286 286 uint_t ipcl_udp_fanout_size = 16384;
287 287
288 288 /* Raw socket fanout size. Must be a power of 2. */
289 289 uint_t ipcl_raw_fanout_size = 256;
290 290
291 291 /*
292 292 * The IPCL_IPTUN_HASH() function works best with a prime table size. We
293 293 * expect that most large deployments would have hundreds of tunnels, and
294 294 * thousands in the extreme case.
295 295 */
296 296 uint_t ipcl_iptun_fanout_size = 6143;
297 297
298 298 /*
299 299 * Power of 2^N Primes useful for hashing for N of 0-28,
300 300 * these primes are the nearest prime <= 2^N - 2^(N-2).
301 301 */
302 302
303 303 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \
304 304 6143, 12281, 24571, 49139, 98299, 196597, 393209, \
305 305 786431, 1572853, 3145721, 6291449, 12582893, 25165813, \
306 306 50331599, 100663291, 201326557, 0}
307 307
308 308 /*
309 309 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
310 310 * are aligned on cache lines.
311 311 */
typedef union itc_s {
	conn_t	itc_conn;			/* the conn_t proper */
	/* pad so the conn (and trailing tcp_t etc.) is cache-line aligned */
	char	itcu_filler[CACHE_ALIGN(conn_s)];
} itc_t;
316 316
317 317 struct kmem_cache *tcp_conn_cache;
318 318 struct kmem_cache *ip_conn_cache;
319 319 extern struct kmem_cache *sctp_conn_cache;
320 320 struct kmem_cache *udp_conn_cache;
321 321 struct kmem_cache *rawip_conn_cache;
322 322 struct kmem_cache *rts_conn_cache;
323 323
324 324 extern void tcp_timermp_free(tcp_t *);
325 325 extern mblk_t *tcp_timermp_alloc(int);
326 326
327 327 static int ip_conn_constructor(void *, void *, int);
328 328 static void ip_conn_destructor(void *, void *);
329 329
330 330 static int tcp_conn_constructor(void *, void *, int);
331 331 static void tcp_conn_destructor(void *, void *);
332 332
333 333 static int udp_conn_constructor(void *, void *, int);
334 334 static void udp_conn_destructor(void *, void *);
335 335
336 336 static int rawip_conn_constructor(void *, void *, int);
337 337 static void rawip_conn_destructor(void *, void *);
338 338
339 339 static int rts_conn_constructor(void *, void *, int);
340 340 static void rts_conn_destructor(void *, void *);
341 341
342 342 /*
343 343 * Global (for all stack instances) init routine
344 344 */
345 345 void
346 346 ipcl_g_init(void)
347 347 {
348 348 ip_conn_cache = kmem_cache_create("ip_conn_cache",
349 349 sizeof (conn_t), CACHE_ALIGN_SIZE,
350 350 ip_conn_constructor, ip_conn_destructor,
351 351 NULL, NULL, NULL, 0);
352 352
353 353 tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
354 354 sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
355 355 tcp_conn_constructor, tcp_conn_destructor,
356 356 tcp_conn_reclaim, NULL, NULL, 0);
357 357
358 358 udp_conn_cache = kmem_cache_create("udp_conn_cache",
359 359 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
360 360 udp_conn_constructor, udp_conn_destructor,
361 361 NULL, NULL, NULL, 0);
362 362
363 363 rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
364 364 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
365 365 rawip_conn_constructor, rawip_conn_destructor,
366 366 NULL, NULL, NULL, 0);
367 367
368 368 rts_conn_cache = kmem_cache_create("rts_conn_cache",
369 369 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
370 370 rts_conn_constructor, rts_conn_destructor,
371 371 NULL, NULL, NULL, 0);
372 372 }
373 373
374 374 /*
375 375 * ipclassifier intialization routine, sets up hash tables.
376 376 */
377 377 void
378 378 ipcl_init(ip_stack_t *ipst)
379 379 {
380 380 int i;
381 381 int sizes[] = P2Ps();
382 382
383 383 /*
384 384 * Calculate size of conn fanout table from /etc/system settings
385 385 */
386 386 if (ipcl_conn_hash_size != 0) {
387 387 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
388 388 } else if (tcp_conn_hash_size != 0) {
389 389 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
390 390 } else {
391 391 extern pgcnt_t freemem;
392 392
393 393 ipst->ips_ipcl_conn_fanout_size =
394 394 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
395 395
396 396 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
397 397 ipst->ips_ipcl_conn_fanout_size =
398 398 ipcl_conn_hash_maxsize;
399 399 }
400 400 }
401 401
402 402 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
403 403 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
404 404 break;
405 405 }
406 406 }
407 407 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
408 408 /* Out of range, use the 2^16 value */
409 409 ipst->ips_ipcl_conn_fanout_size = sizes[16];
410 410 }
411 411
412 412 /* Take values from /etc/system */
413 413 ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
414 414 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
415 415 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
416 416 ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
417 417
418 418 ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
419 419
420 420 ipst->ips_ipcl_conn_fanout = kmem_zalloc(
421 421 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
422 422
423 423 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
424 424 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
425 425 MUTEX_DEFAULT, NULL);
426 426 }
427 427
428 428 ipst->ips_ipcl_bind_fanout = kmem_zalloc(
429 429 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
430 430
431 431 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
432 432 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
433 433 MUTEX_DEFAULT, NULL);
434 434 }
435 435
436 436 ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
437 437 sizeof (connf_t), KM_SLEEP);
438 438 for (i = 0; i < IPPROTO_MAX; i++) {
439 439 mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
440 440 MUTEX_DEFAULT, NULL);
441 441 }
442 442
443 443 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
444 444 sizeof (connf_t), KM_SLEEP);
445 445 for (i = 0; i < IPPROTO_MAX; i++) {
446 446 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
447 447 MUTEX_DEFAULT, NULL);
448 448 }
449 449
450 450 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
451 451 mutex_init(&ipst->ips_rts_clients->connf_lock,
452 452 NULL, MUTEX_DEFAULT, NULL);
453 453
454 454 ipst->ips_ipcl_udp_fanout = kmem_zalloc(
455 455 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
456 456 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
457 457 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
458 458 MUTEX_DEFAULT, NULL);
459 459 }
460 460
461 461 ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
462 462 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
463 463 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
464 464 mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
465 465 MUTEX_DEFAULT, NULL);
466 466 }
467 467
468 468 ipst->ips_ipcl_raw_fanout = kmem_zalloc(
469 469 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
470 470 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
471 471 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
472 472 MUTEX_DEFAULT, NULL);
473 473 }
474 474
475 475 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
476 476 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
477 477 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
478 478 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
479 479 NULL, MUTEX_DEFAULT, NULL);
480 480 }
481 481 }
482 482
483 483 void
484 484 ipcl_g_destroy(void)
485 485 {
486 486 kmem_cache_destroy(ip_conn_cache);
487 487 kmem_cache_destroy(tcp_conn_cache);
488 488 kmem_cache_destroy(udp_conn_cache);
489 489 kmem_cache_destroy(rawip_conn_cache);
490 490 kmem_cache_destroy(rts_conn_cache);
491 491 }
492 492
493 493 /*
494 494 * All user-level and kernel use of the stack must be gone
495 495 * by now.
496 496 */
497 497 void
498 498 ipcl_destroy(ip_stack_t *ipst)
499 499 {
500 500 int i;
501 501
502 502 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
503 503 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
504 504 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
505 505 }
506 506 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
507 507 sizeof (connf_t));
508 508 ipst->ips_ipcl_conn_fanout = NULL;
509 509
510 510 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
511 511 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
512 512 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
513 513 }
514 514 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
515 515 sizeof (connf_t));
516 516 ipst->ips_ipcl_bind_fanout = NULL;
517 517
518 518 for (i = 0; i < IPPROTO_MAX; i++) {
519 519 ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
520 520 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
521 521 }
522 522 kmem_free(ipst->ips_ipcl_proto_fanout_v4,
523 523 IPPROTO_MAX * sizeof (connf_t));
524 524 ipst->ips_ipcl_proto_fanout_v4 = NULL;
525 525
526 526 for (i = 0; i < IPPROTO_MAX; i++) {
527 527 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
528 528 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
529 529 }
530 530 kmem_free(ipst->ips_ipcl_proto_fanout_v6,
531 531 IPPROTO_MAX * sizeof (connf_t));
532 532 ipst->ips_ipcl_proto_fanout_v6 = NULL;
533 533
534 534 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
535 535 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
536 536 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
537 537 }
538 538 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
539 539 sizeof (connf_t));
540 540 ipst->ips_ipcl_udp_fanout = NULL;
541 541
542 542 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
543 543 ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
544 544 mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
545 545 }
546 546 kmem_free(ipst->ips_ipcl_iptun_fanout,
547 547 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
548 548 ipst->ips_ipcl_iptun_fanout = NULL;
549 549
550 550 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
551 551 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
552 552 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
553 553 }
554 554 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
555 555 sizeof (connf_t));
556 556 ipst->ips_ipcl_raw_fanout = NULL;
557 557
558 558 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
559 559 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
560 560 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
561 561 }
562 562 kmem_free(ipst->ips_ipcl_globalhash_fanout,
563 563 sizeof (connf_t) * CONN_G_HASH_SIZE);
564 564 ipst->ips_ipcl_globalhash_fanout = NULL;
565 565
566 566 ASSERT(ipst->ips_rts_clients->connf_head == NULL);
567 567 mutex_destroy(&ipst->ips_rts_clients->connf_lock);
568 568 kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
569 569 ipst->ips_rts_clients = NULL;
570 570 }
571 571
572 572 /*
573 573 * conn creation routine. initialize the conn, sets the reference
574 574 * and inserts it in the global hash table.
575 575 */
576 576 conn_t *
577 577 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
578 578 {
579 579 conn_t *connp;
580 580 struct kmem_cache *conn_cache;
581 581
582 582 switch (type) {
583 583 case IPCL_SCTPCONN:
584 584 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
585 585 return (NULL);
586 586 sctp_conn_init(connp);
587 587 netstack_hold(ns);
588 588 connp->conn_netstack = ns;
589 589 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
590 590 connp->conn_ixa->ixa_conn_id = (long)connp;
591 591 ipcl_globalhash_insert(connp);
592 592 return (connp);
593 593
594 594 case IPCL_TCPCONN:
595 595 conn_cache = tcp_conn_cache;
596 596 break;
597 597
598 598 case IPCL_UDPCONN:
599 599 conn_cache = udp_conn_cache;
600 600 break;
601 601
602 602 case IPCL_RAWIPCONN:
603 603 conn_cache = rawip_conn_cache;
604 604 break;
605 605
606 606 case IPCL_RTSCONN:
607 607 conn_cache = rts_conn_cache;
608 608 break;
609 609
610 610 case IPCL_IPCCONN:
611 611 conn_cache = ip_conn_cache;
612 612 break;
613 613
614 614 default:
615 615 connp = NULL;
616 616 ASSERT(0);
617 617 }
618 618
619 619 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
620 620 return (NULL);
621 621
622 622 connp->conn_ref = 1;
623 623 netstack_hold(ns);
624 624 connp->conn_netstack = ns;
625 625 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
626 626 connp->conn_ixa->ixa_conn_id = (long)connp;
627 627 ipcl_globalhash_insert(connp);
628 628 return (connp);
629 629 }
630 630
/*
 * Final destruction of a conn_t once its reference count has dropped to
 * zero: release credentials, cached headers, IPsec state and the netstack
 * hold, then return the conn to the kmem cache it was allocated from.
 */
void
ipcl_conn_destroy(conn_t *connp)
{
	mblk_t	*mp;
	netstack_t	*ns = connp->conn_netstack;

	ASSERT(!MUTEX_HELD(&connp->conn_lock));
	ASSERT(connp->conn_ref == 0);
	ASSERT(connp->conn_ioctlref == 0);

	DTRACE_PROBE1(conn__destroy, conn_t *, connp);

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
		/* ixa_cred done in ipcl_conn_cleanup below */
	}

	/* Free any cached transport/IP header template. */
	if (connp->conn_ht_iphc != NULL) {
		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
		connp->conn_ht_iphc = NULL;
		connp->conn_ht_iphc_allocated = 0;
		connp->conn_ht_iphc_len = 0;
		connp->conn_ht_ulp = NULL;
		connp->conn_ht_ulp_len = 0;
	}
	ip_pkt_free(&connp->conn_xmit_ipp);

	ipcl_globalhash_remove(connp);

	/* Drop any latched IPsec state and policy references. */
	if (connp->conn_latch != NULL) {
		IPLATCH_REFRELE(connp->conn_latch);
		connp->conn_latch = NULL;
	}
	if (connp->conn_latch_in_policy != NULL) {
		IPPOL_REFRELE(connp->conn_latch_in_policy);
		connp->conn_latch_in_policy = NULL;
	}
	if (connp->conn_latch_in_action != NULL) {
		IPACT_REFRELE(connp->conn_latch_in_action);
		connp->conn_latch_in_action = NULL;
	}
	if (connp->conn_policy != NULL) {
		IPPH_REFRELE(connp->conn_policy, ns);
		connp->conn_policy = NULL;
	}

	if (connp->conn_ipsec_opt_mp != NULL) {
		freemsg(connp->conn_ipsec_opt_mp);
		connp->conn_ipsec_opt_mp = NULL;
	}

	if (connp->conn_flags & IPCL_TCPCONN) {
		tcp_t	*tcp = connp->conn_tcp;

		tcp_free(tcp);
		/*
		 * Save the timer mblk across the bzero of the tcp_t below;
		 * it is restored so the cached conn can reuse it.
		 */
		mp = tcp->tcp_timercache;

		tcp->tcp_tcps = NULL;

		/*
		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
		 * the mblk.
		 */
		if (tcp->tcp_rsrv_mp != NULL) {
			freeb(tcp->tcp_rsrv_mp);
			tcp->tcp_rsrv_mp = NULL;
			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
		}

		ipcl_conn_cleanup(connp);
		/* Reset flags so the cached conn is recognizably TCP. */
		connp->conn_flags = IPCL_TCPCONN;
		if (ns != NULL) {
			ASSERT(tcp->tcp_tcps == NULL);
			connp->conn_netstack = NULL;
			connp->conn_ixa->ixa_ipst = NULL;
			netstack_rele(ns);
		}

		bzero(tcp, sizeof (tcp_t));

		/* Restore the fields that survive in the cache. */
		tcp->tcp_timercache = mp;
		tcp->tcp_connp = connp;
		kmem_cache_free(tcp_conn_cache, connp);
		return;
	}

	/* SCTP performs its own conn teardown and cache free. */
	if (connp->conn_flags & IPCL_SCTPCONN) {
		ASSERT(ns != NULL);
		sctp_free(connp);
		return;
	}

	ipcl_conn_cleanup(connp);
	if (ns != NULL) {
		connp->conn_netstack = NULL;
		connp->conn_ixa->ixa_ipst = NULL;
		netstack_rele(ns);
	}

	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
	if (connp->conn_flags & IPCL_UDPCONN) {
		connp->conn_flags = IPCL_UDPCONN;
		kmem_cache_free(udp_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
		connp->conn_flags = IPCL_RAWIPCONN;
		connp->conn_proto = IPPROTO_ICMP;
		connp->conn_ixa->ixa_protocol = connp->conn_proto;
		kmem_cache_free(rawip_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RTSCONN) {
		connp->conn_flags = IPCL_RTSCONN;
		kmem_cache_free(rts_conn_cache, connp);
	} else {
		connp->conn_flags = IPCL_IPCCONN;
		ASSERT(connp->conn_flags & IPCL_IPCCONN);
		ASSERT(connp->conn_priv == NULL);
		kmem_cache_free(ip_conn_cache, connp);
	}
}
750 750
751 751 /*
752 752 * Running in cluster mode - deregister listener information
753 753 */
754 754 static void
755 755 ipcl_conn_unlisten(conn_t *connp)
756 756 {
757 757 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
758 758 ASSERT(connp->conn_lport != 0);
759 759
760 760 if (cl_inet_unlisten != NULL) {
761 761 sa_family_t addr_family;
762 762 uint8_t *laddrp;
763 763
764 764 if (connp->conn_ipversion == IPV6_VERSION) {
765 765 addr_family = AF_INET6;
766 766 laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
767 767 } else {
768 768 addr_family = AF_INET;
769 769 laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
770 770 }
771 771 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
772 772 IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
773 773 }
774 774 connp->conn_flags &= ~IPCL_CL_LISTENER;
775 775 }
776 776
777 777 /*
778 778 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
779 779 * which table the conn belonged to). So for debugging we can see which hash
780 780 * table this connection was in.
781 781 */
/*
 * NOTE: the caller must not hold conn_lock (asserted below); the fanout
 * bucket lock is taken here.  The reference the fanout list held on the
 * conn is dropped once it is unlinked.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}
805 805
806 806 void
807 807 ipcl_hash_remove(conn_t *connp)
808 808 {
809 809 uint8_t protocol = connp->conn_proto;
810 810
811 811 IPCL_HASH_REMOVE(connp);
812 812 if (protocol == IPPROTO_RSVP)
813 813 ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
814 814 }
815 815
816 816 /*
817 817 * The whole purpose of this function is allow removal of
818 818 * a conn_t from the connected hash for timewait reclaim.
819 819 * This is essentially a TW reclaim fastpath where timewait
820 820 * collector checks under fanout lock (so no one else can
821 821 * get access to the conn_t) that refcnt is 2 i.e. one for
822 822 * TCP and one for the classifier hash list. If ref count
823 823 * is indeed 2, we can just remove the conn under lock and
824 824 * avoid cleaning up the conn under squeue. This gives us
825 825 * improved performance.
826 826 */
void
ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp)
{
	ASSERT(MUTEX_HELD(&connfp->connf_lock));
	ASSERT(MUTEX_HELD(&connp->conn_lock));
	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);

	/* Unlink connp from the doubly-linked fanout list. */
	if ((connp)->conn_next != NULL) {
		(connp)->conn_next->conn_prev = (connp)->conn_prev;
	}
	if ((connp)->conn_prev != NULL) {
		(connp)->conn_prev->conn_next = (connp)->conn_next;
	} else {
		connfp->connf_head = (connp)->conn_next;
	}
	(connp)->conn_fanout = NULL;
	(connp)->conn_next = NULL;
	(connp)->conn_prev = NULL;
	(connp)->conn_flags |= IPCL_REMOVED;
	/*
	 * The caller has verified under the fanout lock that only TCP and
	 * the hash list hold references (see block comment above), so the
	 * hash list's reference is dropped directly instead of going
	 * through CONN_DEC_REF.
	 */
	ASSERT((connp)->conn_ref == 2);
	(connp)->conn_ref--;
}
849 849
/*
 * Insert connp at the head of connfp's list and mark it IPCL_CONNECTED.
 * The caller must hold connfp->connf_lock, and connp must not currently
 * be on any fanout (asserted).  A reference is taken on behalf of the
 * hash list.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}
864 864
/*
 * Remove connp from whatever fanout it is currently on, then insert it
 * into connfp as a connected entry; connf_lock is acquired internally.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}
871 871
872 872 /*
873 873 * When inserting bound or wildcard entries into the hash, ordering rules are
874 874 * used to facilitate timely and correct lookups. The order is as follows:
875 875 * 1. Entries bound to a specific address
876 876 * 2. Entries bound to INADDR_ANY
877 877 * 3. Entries bound to ADDR_UNSPECIFIED
878 878 * Entries in a category which share conn_lport (such as those using
879 879 * SO_REUSEPORT) will be ordered such that the newest inserted is first.
880 880 */
881 881
882 882 void
883 883 ipcl_hash_insert_bound(connf_t *connfp, conn_t *connp)
884 884 {
885 885 conn_t *pconnp, *nconnp;
886 886
887 887 IPCL_HASH_REMOVE(connp);
888 888 mutex_enter(&connfp->connf_lock);
889 889 nconnp = connfp->connf_head;
890 890 pconnp = NULL;
891 891 while (nconnp != NULL) {
892 892 /*
893 893 * Walk though entries associated with the fanout until one is
894 894 * found which fulfills any of these conditions:
895 895 * 1. Listen address of ADDR_ANY/ADDR_UNSPECIFIED
896 896 * 2. Listen port the same as connp
897 897 */
898 898 if (_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6) ||
899 899 connp->conn_lport == nconnp->conn_lport)
900 900 break;
901 901 pconnp = nconnp;
902 902 nconnp = nconnp->conn_next;
903 903 }
904 904 if (pconnp != NULL) {
905 905 pconnp->conn_next = connp;
906 906 connp->conn_prev = pconnp;
907 907 } else {
908 908 connfp->connf_head = connp;
909 909 }
910 910 if (nconnp != NULL) {
911 911 connp->conn_next = nconnp;
912 912 nconnp->conn_prev = connp;
|
↓ open down ↓ |
912 lines elided |
↑ open up ↑ |
913 913 }
914 914 connp->conn_fanout = connfp;
915 915 connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND;
916 916 CONN_INC_REF(connp);
917 917 mutex_exit(&connfp->connf_lock);
918 918 }
919 919
920 920 void
921 921 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
922 922 {
923 - conn_t **list, *prev, *next;
924 923 conn_t *pconnp = NULL, *nconnp;
925 924 boolean_t isv4mapped = IN6_IS_ADDR_V4MAPPED(&connp->conn_laddr_v6);
926 925
927 926 IPCL_HASH_REMOVE(connp);
928 927 mutex_enter(&connfp->connf_lock);
929 928 nconnp = connfp->connf_head;
930 929 pconnp = NULL;
931 930 while (nconnp != NULL) {
932 931 if (IN6_IS_ADDR_V4MAPPED_ANY(&nconnp->conn_laddr_v6) &&
933 932 isv4mapped && connp->conn_lport == nconnp->conn_lport)
934 933 break;
935 934 if (IN6_IS_ADDR_UNSPECIFIED(&nconnp->conn_laddr_v6) &&
936 935 (isv4mapped ||
937 936 connp->conn_lport == nconnp->conn_lport))
938 937 break;
939 938
940 939 pconnp = nconnp;
941 940 nconnp = nconnp->conn_next;
942 941 }
943 942 if (pconnp != NULL) {
944 943 pconnp->conn_next = connp;
945 944 connp->conn_prev = pconnp;
946 945 } else {
947 946 connfp->connf_head = connp;
948 947 }
949 948 if (nconnp != NULL) {
950 949 connp->conn_next = nconnp;
951 950 nconnp->conn_prev = connp;
952 951 }
953 952 connp->conn_fanout = connfp;
954 953 connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND;
955 954 CONN_INC_REF(connp);
956 955 mutex_exit(&connfp->connf_lock);
957 956 }
958 957
959 958 /*
960 959 * Because the classifier is used to classify inbound packets, the destination
961 960 * address is meant to be our local tunnel address (tunnel source), and the
962 961 * source the remote tunnel address (tunnel destination).
963 962 *
964 963 * Note that conn_proto can't be used for fanout since the upper protocol
965 964 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
966 965 */
967 966 conn_t *
968 967 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
969 968 {
970 969 connf_t *connfp;
971 970 conn_t *connp;
972 971
973 972 /* first look for IPv4 tunnel links */
974 973 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
975 974 mutex_enter(&connfp->connf_lock);
976 975 for (connp = connfp->connf_head; connp != NULL;
977 976 connp = connp->conn_next) {
978 977 if (IPCL_IPTUN_MATCH(connp, *dst, *src))
979 978 break;
980 979 }
981 980 if (connp != NULL)
982 981 goto done;
983 982
984 983 mutex_exit(&connfp->connf_lock);
985 984
986 985 /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
987 986 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
988 987 INADDR_ANY)];
989 988 mutex_enter(&connfp->connf_lock);
990 989 for (connp = connfp->connf_head; connp != NULL;
991 990 connp = connp->conn_next) {
992 991 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
993 992 break;
994 993 }
995 994 done:
996 995 if (connp != NULL)
997 996 CONN_INC_REF(connp);
998 997 mutex_exit(&connfp->connf_lock);
999 998 return (connp);
1000 999 }
1001 1000
1002 1001 conn_t *
1003 1002 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
1004 1003 {
1005 1004 connf_t *connfp;
1006 1005 conn_t *connp;
1007 1006
1008 1007 /* Look for an IPv6 tunnel link */
1009 1008 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
1010 1009 mutex_enter(&connfp->connf_lock);
1011 1010 for (connp = connfp->connf_head; connp != NULL;
1012 1011 connp = connp->conn_next) {
1013 1012 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
1014 1013 CONN_INC_REF(connp);
1015 1014 break;
1016 1015 }
1017 1016 }
1018 1017 mutex_exit(&connfp->connf_lock);
1019 1018 return (connp);
1020 1019 }
1021 1020
1022 1021 /*
1023 1022 * This function is used only for inserting SCTP raw socket now.
1024 1023 * This may change later.
1025 1024 *
1026 1025 * Note that only one raw socket can be bound to a port. The param
1027 1026 * lport is in network byte order.
1028 1027 */
static int
ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
{
	connf_t *connfp;
	conn_t *oconnp;
	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* Check for existing raw socket already bound to the port. */
	mutex_enter(&connfp->connf_lock);
	for (oconnp = connfp->connf_head; oconnp != NULL;
	    oconnp = oconnp->conn_next) {
		/*
		 * Conflict when port, zone and family all match and the
		 * local addresses overlap: either side being a wildcard
		 * (unspecified or v4-mapped-any) overlaps everything,
		 * otherwise the addresses must be equal.
		 */
		if (oconnp->conn_lport == lport &&
		    oconnp->conn_zoneid == connp->conn_zoneid &&
		    oconnp->conn_family == connp->conn_family &&
		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
		    &connp->conn_laddr_v6))) {
			break;
		}
	}
	mutex_exit(&connfp->connf_lock);
	if (oconnp != NULL)
		return (EADDRNOTAVAIL);

	/* Place the conn according to how specific its addresses are. */
	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
			ipcl_hash_insert_wildcard(connfp, connp);
		} else {
			ipcl_hash_insert_bound(connfp, connp);
		}
	} else {
		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
	}
	return (0);
}
1071 1070
1072 1071 static int
1073 1072 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1074 1073 {
1075 1074 connf_t *connfp;
1076 1075 conn_t *tconnp;
1077 1076 ipaddr_t laddr = connp->conn_laddr_v4;
1078 1077 ipaddr_t faddr = connp->conn_faddr_v4;
1079 1078
1080 1079 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1081 1080 mutex_enter(&connfp->connf_lock);
1082 1081 for (tconnp = connfp->connf_head; tconnp != NULL;
1083 1082 tconnp = tconnp->conn_next) {
1084 1083 if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1085 1084 /* A tunnel is already bound to these addresses. */
1086 1085 mutex_exit(&connfp->connf_lock);
1087 1086 return (EADDRINUSE);
1088 1087 }
1089 1088 }
1090 1089 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1091 1090 mutex_exit(&connfp->connf_lock);
1092 1091 return (0);
1093 1092 }
1094 1093
1095 1094 static int
1096 1095 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1097 1096 {
1098 1097 connf_t *connfp;
1099 1098 conn_t *tconnp;
1100 1099 in6_addr_t *laddr = &connp->conn_laddr_v6;
1101 1100 in6_addr_t *faddr = &connp->conn_faddr_v6;
1102 1101
1103 1102 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1104 1103 mutex_enter(&connfp->connf_lock);
1105 1104 for (tconnp = connfp->connf_head; tconnp != NULL;
1106 1105 tconnp = tconnp->conn_next) {
1107 1106 if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1108 1107 /* A tunnel is already bound to these addresses. */
1109 1108 mutex_exit(&connfp->connf_lock);
1110 1109 return (EADDRINUSE);
1111 1110 }
1112 1111 }
1113 1112 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1114 1113 mutex_exit(&connfp->connf_lock);
1115 1114 return (0);
1116 1115 }
1117 1116
1118 1117 /*
1119 1118 * Check for a MAC exemption conflict on a labeled system. Note that for
1120 1119 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1121 1120 * transport layer. This check is for binding all other protocols.
1122 1121 *
1123 1122 * Returns true if there's a conflict.
1124 1123 */
1125 1124 static boolean_t
1126 1125 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1127 1126 {
1128 1127 connf_t *connfp;
1129 1128 conn_t *tconn;
1130 1129
1131 1130 connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1132 1131 mutex_enter(&connfp->connf_lock);
1133 1132 for (tconn = connfp->connf_head; tconn != NULL;
1134 1133 tconn = tconn->conn_next) {
1135 1134 /* We don't allow v4 fallback for v6 raw socket */
1136 1135 if (connp->conn_family != tconn->conn_family)
1137 1136 continue;
1138 1137 /* If neither is exempt, then there's no conflict */
1139 1138 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1140 1139 (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1141 1140 continue;
1142 1141 /* We are only concerned about sockets for a different zone */
1143 1142 if (connp->conn_zoneid == tconn->conn_zoneid)
1144 1143 continue;
1145 1144 /* If both are bound to different specific addrs, ok */
1146 1145 if (connp->conn_laddr_v4 != INADDR_ANY &&
1147 1146 tconn->conn_laddr_v4 != INADDR_ANY &&
1148 1147 connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1149 1148 continue;
1150 1149 /* These two conflict; fail */
1151 1150 break;
1152 1151 }
1153 1152 mutex_exit(&connfp->connf_lock);
1154 1153 return (tconn != NULL);
1155 1154 }
1156 1155
1157 1156 static boolean_t
1158 1157 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1159 1158 {
1160 1159 connf_t *connfp;
1161 1160 conn_t *tconn;
1162 1161
1163 1162 connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1164 1163 mutex_enter(&connfp->connf_lock);
1165 1164 for (tconn = connfp->connf_head; tconn != NULL;
1166 1165 tconn = tconn->conn_next) {
1167 1166 /* We don't allow v4 fallback for v6 raw socket */
1168 1167 if (connp->conn_family != tconn->conn_family)
1169 1168 continue;
1170 1169 /* If neither is exempt, then there's no conflict */
1171 1170 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1172 1171 (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1173 1172 continue;
1174 1173 /* We are only concerned about sockets for a different zone */
1175 1174 if (connp->conn_zoneid == tconn->conn_zoneid)
1176 1175 continue;
1177 1176 /* If both are bound to different addrs, ok */
1178 1177 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1179 1178 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1180 1179 !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1181 1180 &tconn->conn_laddr_v6))
1182 1181 continue;
1183 1182 /* These two conflict; fail */
1184 1183 break;
1185 1184 }
1186 1185 mutex_exit(&connfp->connf_lock);
1187 1186 return (tconn != NULL);
1188 1187 }
1189 1188
1190 1189 /*
1191 1190 * (v4, v6) bind hash insertion routines
1192 1191 * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1193 1192 */
1194 1193
1195 1194 int
1196 1195 ipcl_bind_insert(conn_t *connp)
1197 1196 {
1198 1197 if (connp->conn_ipversion == IPV6_VERSION)
1199 1198 return (ipcl_bind_insert_v6(connp));
1200 1199 else
1201 1200 return (ipcl_bind_insert_v4(connp));
1202 1201 }
1203 1202
/*
 * Insert a conn_t into the proper v4 fanout for its protocol after a
 * local bind.  Returns 0 or an errno (EADDRINUSE on a labeled-system
 * MAC-exempt conflict, or whatever the SCTP path reports).
 */
int
ipcl_bind_insert_v4(conn_t *connp)
{
	connf_t *connfp;
	int ret = 0;
	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
	uint16_t lport = connp->conn_lport;
	uint8_t protocol = connp->conn_proto;

	/* IP tunnels are classified by address pair, not by port. */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	default:
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		/*
		 * Both UDP and "other protocol" binds land here; the
		 * protocol test below selects the appropriate fanout.
		 */
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		/* Placement depends on how specific the bind is. */
		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			ipcl_hash_insert_bound(connfp, connp);
		} else {
			ipcl_hash_insert_wildcard(connfp, connp);
		}
		/* A new RSVP listener requires refreshing the ill inputfns. */
		if (protocol == IPPROTO_RSVP)
			ill_set_inputfn_all(ipst);
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (connp->conn_laddr_v4 != INADDR_ANY) {
			ipcl_hash_insert_bound(connfp, connp);
		} else {
			ipcl_hash_insert_wildcard(connfp, connp);
		}
		/* Notify the clustering module, if loaded, of the listener. */
		if (cl_inet_listen != NULL) {
			ASSERT(connp->conn_ipversion == IPV4_VERSION);
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, AF_INET,
			    (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1268 1267
/*
 * IPv6 counterpart of ipcl_bind_insert_v4(): insert a conn_t into the
 * proper v6 fanout for its protocol after a local bind.  Returns 0 or
 * an errno.
 */
int
ipcl_bind_insert_v6(conn_t *connp)
{
	connf_t *connfp;
	int ret = 0;
	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
	uint16_t lport = connp->conn_lport;
	uint8_t protocol = connp->conn_proto;

	/* IP tunnels are classified by address pair, not by port. */
	if (IPCL_IS_IPTUN(connp)) {
		return (ipcl_iptun_hash_insert_v6(connp, ipst));
	}

	switch (protocol) {
	default:
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		/*
		 * Both UDP and "other protocol" binds land here; the
		 * protocol test below selects the appropriate fanout.
		 */
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/* Placement depends on how specific the bind is. */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			ipcl_hash_insert_bound(connfp, connp);
		} else {
			ipcl_hash_insert_wildcard(connfp, connp);
		}
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			ipcl_hash_insert_bound(connfp, connp);
		} else {
			ipcl_hash_insert_wildcard(connfp, connp);
		}
		/* Notify the clustering module, if loaded, of the listener. */
		if (cl_inet_listen != NULL) {
			sa_family_t addr_family;
			uint8_t *laddrp;

			/*
			 * A v6 socket may carry a v4 (mapped) bound address;
			 * report the actual family to the cluster hook.
			 */
			if (connp->conn_ipversion == IPV6_VERSION) {
				addr_family = AF_INET6;
				laddrp =
				    (uint8_t *)&connp->conn_bound_addr_v6;
			} else {
				addr_family = AF_INET;
				laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
			}
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1341 1340
1342 1341 /*
1343 1342 * ipcl_conn_hash insertion routines.
1344 1343 * The caller has already set conn_proto and the addresses/ports in the conn_t.
1345 1344 */
1346 1345
1347 1346 int
1348 1347 ipcl_conn_insert(conn_t *connp)
1349 1348 {
1350 1349 if (connp->conn_ipversion == IPV6_VERSION)
1351 1350 return (ipcl_conn_insert_v6(connp));
1352 1351 else
1353 1352 return (ipcl_conn_insert_v4(connp));
1354 1353 }
1355 1354
/*
 * Insert a fully-specified v4 conn into the connection fanout.
 * Returns 0, or EADDRINUSE if the 4-tuple already exists (TCP) or a
 * MAC-exempt binding conflicts (labeled systems, other protocols).
 */
int
ipcl_conn_insert_v4(conn_t *connp)
{
	connf_t *connfp;
	conn_t *tconnp;
	int ret = 0;
	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
	uint16_t lport = connp->conn_lport;
	uint8_t protocol = connp->conn_proto;

	/* IP tunnels are classified by address pair, not by port. */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:
		/*
		 * For TCP, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * INADDR_LOOPBACK as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_faddr_v4,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
			    connp->conn_faddr_v4, connp->conn_laddr_v4,
			    connp->conn_ports) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 */
			/*
			 * Must drop the bucket lock before removing from
			 * the old fanout (IPCL_HASH_REMOVE takes the old
			 * fanout's lock), then re-acquire for the insert.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);
		ASSERT(connp->conn_recvicmp != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings. For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic. For all others, it's
		 * done here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		/* Placement depends on how specific the addresses are. */
		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			ipcl_hash_insert_bound(connfp, connp);
		} else {
			ipcl_hash_insert_wildcard(connfp, connp);
		}
		break;
	}

	return (ret);
}
1453 1452
/*
 * IPv6 counterpart of ipcl_conn_insert_v4().  In addition to the
 * 4-tuple and zone, TCP duplicates are also disambiguated by the bound
 * interface index (conn_bound_if), so two conns may share a tuple if
 * bound to different interfaces.
 */
int
ipcl_conn_insert_v6(conn_t *connp)
{
	connf_t *connfp;
	conn_t *tconnp;
	int ret = 0;
	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
	uint16_t lport = connp->conn_lport;
	uint8_t protocol = connp->conn_proto;
	uint_t ifindex = connp->conn_bound_if;

	/* IP tunnels are classified by address pair, not by port. */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert_v6(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:

		/*
		 * For tcp, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * ipv6_loopback as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			/* NOTE: need to match zoneid. Bug in onnv-gate */
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
			    connp->conn_faddr_v6, connp->conn_laddr_v6,
			    connp->conn_ports) &&
			    (tconnp->conn_bound_if == 0 ||
			    tconnp->conn_bound_if == ifindex) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 */
			/*
			 * Drop the bucket lock before removing from the old
			 * fanout (IPCL_HASH_REMOVE takes the old fanout's
			 * lock), then re-acquire for the insert.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/* The raw socket may already be bound; remove it first. */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/* Placement depends on how specific the addresses are. */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			ipcl_hash_insert_bound(connfp, connp);
		} else {
			ipcl_hash_insert_wildcard(connfp, connp);
		}
		break;
	}

	return (ret);
}
1541 1540
1542 1541 /*
1543 1542 * v4 packet classifying function. looks up the fanout table to
1544 1543 * find the conn, the packet belongs to. returns the conn with
1545 1544 * the reference held, null otherwise.
1546 1545 *
1547 1546 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1548 1547 * Lookup" comment block are applied. Labels are also checked as described
1549 1548 * above. If the packet is from the inside (looped back), and is from the same
1550 1549 * zone, then label checks are omitted.
1551 1550 */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ipha_t *ipha;
	connf_t *connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	uint32_t ports;
	conn_t *connp;
	uint16_t *up;
	zoneid_t zoneid = ira->ira_zoneid;

	ipha = (ipha_t *)mp->b_rptr;
	/* Ports immediately follow the IP header for both TCP and UDP. */
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		/* Look for a fully-connected conn first. */
		ports = *(uint32_t *)up;
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			/*
			 * Tuple must match, and the conn must be in the
			 * packet's zone, in all zones, or reachable via a
			 * shared-address MAC exemption.
			 */
			if (IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/* No connected conn; fall back to the bind (listener) hash. */
		mutex_exit(&connfp->connf_lock);
		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_local.
		 *
		 * Note that we don't check tsol_receive_local for
		 * the connected case.
		 */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
				break;
		}

		/* Labeled systems: verify the packet may reach this conn. */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);

		break;

	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		/* Tunneled traffic: classify by address pair only. */
		return (ipcl_iptun_classify_v4(&ipha->ipha_src,
		    &ipha->ipha_dst, ipst));
	}

	/* No match; caller handles the unclassified packet. */
	return (NULL);
}
1691 1690
/*
 * Classify an inbound IPv6 packet: find the conn_t that should receive it
 * based on protocol, addresses and ports.  `hdr_len' is the offset of the
 * ULP (TCP/UDP) header within mp.  On a match the conn_t is returned with
 * a reference held (caller must CONN_DEC_REF); NULL is returned when no
 * conn matches or a labeled-system check rejects the candidate.
 */
conn_t *
ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ip6_t		*ip6h;
	connf_t		*connfp, *bind_connfp;
	uint16_t	lport;
	uint16_t	fport;
	tcpha_t		*tcpha;
	uint32_t	ports;
	conn_t		*connp;
	uint16_t	*up;
	zoneid_t	zoneid = ira->ira_zoneid;

	ip6h = (ip6_t *)mp->b_rptr;

	switch (protocol) {
	case IPPROTO_TCP:
		/* Both TCP ports, read as one uint32_t, key the conn hash. */
		tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
		up = &tcpha->tha_lport;
		ports = *(uint32_t *)up;

		/*
		 * First look for a fully-bound (connected) match in the
		 * conn fanout.  A conn matches if the 5-tuple agrees and
		 * it is in the packet's zone, is an all-zones conn, or
		 * qualifies for the MAC-exempt shared-address case.
		 */
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH_V6(connp, protocol,
			    ip6h->ip6_src, ip6h->ip6_dst, ports) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		/*
		 * No connected match; fall back to the bind fanout to find
		 * a listener bound to the local port/address.
		 */
		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH_V6(connp, protocol,
			    ip6h->ip6_dst, lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/*
		 * On labeled systems the candidate must additionally be
		 * allowed to receive this packet.
		 */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		/* up[0] is the remote (foreign) port, up[1] the local port. */
		up = (uint16_t *)&mp->b_rptr[hdr_len];
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
			    fport, ip6h->ip6_src) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		break;
	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		/* IP-in-IP tunnel traffic is handed to the tunnel classifier */
		return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
		    &ip6h->ip6_dst, ipst));
	}

	return (NULL);
}
1826 1825
1827 1826 /*
1828 1827 * wrapper around ipcl_classify_(v4,v6) routines.
1829 1828 */
1830 1829 conn_t *
1831 1830 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1832 1831 {
1833 1832 if (ira->ira_flags & IRAF_IS_IPV4) {
1834 1833 return (ipcl_classify_v4(mp, ira->ira_protocol,
1835 1834 ira->ira_ip_hdr_length, ira, ipst));
1836 1835 } else {
1837 1836 return (ipcl_classify_v6(mp, ira->ira_protocol,
1838 1837 ira->ira_ip_hdr_length, ira, ipst));
1839 1838 }
1840 1839 }
1841 1840
/*
 * Only used to classify SCTP RAW sockets.
 *
 * Two passes over the raw fanout: first the bucket hashed on the packet's
 * local port, where a conn with a foreign address set requires a full
 * 5-tuple match and an unconnected conn only a local address/port match;
 * then the wildcard (port 0) bucket.  Returns the matched conn_t with a
 * reference held (caller must CONN_DEC_REF), or NULL.
 */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
    ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	connf_t		*connfp;
	conn_t		*connp;
	in_port_t	lport;
	int		ipversion;
	const void	*dst;
	zoneid_t	zoneid = ira->ira_zoneid;

	/* The low half of `ports' holds the local port. */
	lport = ((uint16_t *)&ports)[1];
	if (ira->ira_flags & IRAF_IS_IPV4) {
		dst = (const void *)&ipha->ipha_dst;
		ipversion = IPV4_VERSION;
	} else {
		dst = (const void *)&ip6h->ip6_dst;
		ipversion = IPV6_VERSION;
	}

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
			/* Conn has a foreign address: full 5-tuple match. */
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    ipha->ipha_src, ipha->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ip6h->ip6_src, ip6h->ip6_dst, ports))
					continue;
			}
		} else {
			/* Unconnected: match on local address/port only. */
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    ipha->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ip6h->ip6_dst, lport))
					continue;
			}
		}

		/* Addresses/ports match; check zone or MAC exemption. */
		if (connp->conn_zoneid == zoneid ||
		    connp->conn_allzones ||
		    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
		    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
		    (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
			break;
	}

	/* On labeled systems the conn must be allowed to receive the packet. */
	if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
	    !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	if (connp != NULL)
		goto found;
	mutex_exit(&connfp->connf_lock);

	/* Try to look for a wildcard SCTP RAW socket match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IPCL_ZONE_MATCH(connp, zoneid))
			continue;

		if (ipversion == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	/* connfp->connf_lock is still held here; released after the ref. */
	ASSERT(connp != NULL);
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}
1948 1947
/*
 * kmem cache constructor for TCP conn_t's.  The cache object (itc_t)
 * holds the conn_t immediately followed by the tcp_t; the two are
 * cross-linked here.  Also preallocates the TCP timer mblk and the
 * transmit attributes (conn_ixa).  Returns ENOMEM if either allocation
 * fails, freeing what was already allocated.
 */
/* ARGSUSED */
static int
tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(tcp, sizeof (tcp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
	tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
	if (tcp->tcp_timercache == NULL)
		return (ENOMEM);
	connp->conn_tcp = tcp;
	connp->conn_flags = IPCL_TCPCONN;
	connp->conn_proto = IPPROTO_TCP;
	tcp->tcp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);

	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL) {
		/* Undo the timer mblk allocation before failing. */
		tcp_timermp_free(tcp);
		return (ENOMEM);
	}
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
1982 1981
/*
 * kmem cache destructor for TCP conn_t's: asserts the object is still in
 * its constructed state and releases what tcp_conn_constructor set up
 * (timer mblk, locks/CVs, transmit attributes).
 */
/* ARGSUSED */
static void
tcp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_TCPCONN);
	ASSERT(tcp->tcp_connp == connp);
	ASSERT(connp->conn_tcp == tcp);
	tcp_timermp_free(tcp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	cv_destroy(&connp->conn_sq_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2008 2007
/*
 * kmem cache constructor for plain IP conn_t's (IPCL_IPCCONN; no attached
 * ULP state).  Initializes locks/CVs and preallocates the transmit
 * attributes.  Returns ENOMEM if the ixa allocation fails.
 */
/* ARGSUSED */
static int
ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;

	bzero(connp, sizeof (conn_t));
	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_flags = IPCL_IPCCONN;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);

	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2029 2028
/*
 * kmem cache destructor for plain IP conn_t's: mirror image of
 * ip_conn_constructor.
 */
/* ARGSUSED */
static void
ip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;

	ASSERT(connp->conn_flags & IPCL_IPCCONN);
	ASSERT(connp->conn_priv == NULL);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2051 2050
/*
 * kmem cache constructor for UDP conn_t's.  The itc_t holds the conn_t
 * immediately followed by the udp_t; the two are cross-linked here.
 * Returns ENOMEM if the transmit-attribute allocation fails.
 */
/* ARGSUSED */
static int
udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(udp, sizeof (udp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_udp = udp;
	connp->conn_flags = IPCL_UDPCONN;
	connp->conn_proto = IPPROTO_UDP;
	udp->udp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2078 2077
/*
 * kmem cache destructor for UDP conn_t's: mirror image of
 * udp_conn_constructor.
 */
/* ARGSUSED */
static void
udp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_UDPCONN);
	ASSERT(udp->udp_connp == connp);
	ASSERT(connp->conn_udp == udp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2102 2101
/*
 * kmem cache constructor for raw IP (ICMP) conn_t's.  The itc_t holds the
 * conn_t immediately followed by the icmp_t.  The default protocol is
 * IPPROTO_ICMP.  Returns ENOMEM if the transmit-attribute allocation
 * fails.
 */
/* ARGSUSED */
static int
rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(icmp, sizeof (icmp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_icmp = icmp;
	connp->conn_flags = IPCL_RAWIPCONN;
	connp->conn_proto = IPPROTO_ICMP;
	icmp->icmp_connp = connp;
	rw_init(&icmp->icmp_bpf_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2130 2129
/*
 * kmem cache destructor for raw IP (ICMP) conn_t's: mirror image of
 * rawip_conn_constructor, including the per-icmp BPF lock.
 */
/* ARGSUSED */
static void
rawip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
	ASSERT(icmp->icmp_connp == connp);
	ASSERT(connp->conn_icmp == icmp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);
	rw_destroy(&icmp->icmp_bpf_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2155 2154
/*
 * kmem cache constructor for routing-socket conn_t's.  The itc_t holds
 * the conn_t immediately followed by the rts_t.  Returns ENOMEM if the
 * transmit-attribute allocation fails.
 */
/* ARGSUSED */
static int
rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	rts_t	*rts = (rts_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(rts, sizeof (rts_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_rts = rts;
	connp->conn_flags = IPCL_RTSCONN;
	rts->rts_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2180 2179
/*
 * kmem cache destructor for routing-socket conn_t's: mirror image of
 * rts_conn_constructor.
 */
/* ARGSUSED */
static void
rts_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	rts_t	*rts = (rts_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_RTSCONN);
	ASSERT(rts->rts_connp == connp);
	ASSERT(connp->conn_rts == rts);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2204 2203
2205 2204 /*
2206 2205 * Called as part of ipcl_conn_destroy to assert and clear any pointers
2207 2206 * in the conn_t.
2208 2207 *
2209 2208 * Below we list all the pointers in the conn_t as a documentation aid.
2210 2209 * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
2211 2210 * If you add any pointers to the conn_t please add an ASSERT here
2212 2211 * and #ifdef it out if it can't be actually asserted to be NULL.
2213 2212 * In any case, we bzero most of the conn_t at the end of the function.
2214 2213 */
void
ipcl_conn_cleanup(conn_t *connp)
{
	ip_xmit_attr_t	*ixa;

	ASSERT(connp->conn_latch == NULL);
	ASSERT(connp->conn_latch_in_policy == NULL);
	ASSERT(connp->conn_latch_in_action == NULL);
#ifdef notdef
	ASSERT(connp->conn_rq == NULL);
	ASSERT(connp->conn_wq == NULL);
#endif
	ASSERT(connp->conn_cred == NULL);
	ASSERT(connp->conn_g_fanout == NULL);
	ASSERT(connp->conn_g_next == NULL);
	ASSERT(connp->conn_g_prev == NULL);
	ASSERT(connp->conn_policy == NULL);
	ASSERT(connp->conn_fanout == NULL);
	ASSERT(connp->conn_next == NULL);
	ASSERT(connp->conn_prev == NULL);
	ASSERT(connp->conn_oper_pending_ill == NULL);
	ASSERT(connp->conn_ilg == NULL);
	ASSERT(connp->conn_drain_next == NULL);
	ASSERT(connp->conn_drain_prev == NULL);
#ifdef notdef
	/* conn_idl is not cleared when removed from idl list */
	ASSERT(connp->conn_idl == NULL);
#endif
	ASSERT(connp->conn_ipsec_opt_mp == NULL);
#ifdef notdef
	/* conn_netstack is cleared by the caller; needed by ixa_cleanup */
	ASSERT(connp->conn_netstack == NULL);
#endif

	ASSERT(connp->conn_helper_info == NULL);
	ASSERT(connp->conn_ixa != NULL);
	ixa = connp->conn_ixa;
	ASSERT(ixa->ixa_refcnt == 1);
	/* Need to preserve ixa_protocol */
	ixa_cleanup(ixa);
	ixa->ixa_flags = 0;

	/*
	 * Clear out the conn_t fields that are not preserved.  Everything
	 * before conn_start_clr (locks, CVs, the ixa pointer, conn_flags)
	 * survives for reuse by the kmem cache.
	 */
	bzero(&connp->conn_start_clr,
	    sizeof (conn_t) -
	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
}
2262 2261
2263 2262 /*
2264 2263 * All conns are inserted in a global multi-list for the benefit of
2265 2264 * walkers. The walk is guaranteed to walk all open conns at the time
2266 2265 * of the start of the walk exactly once. This property is needed to
2267 2266 * achieve some cleanups during unplumb of interfaces. This is achieved
2268 2267 * as follows.
2269 2268 *
2270 2269 * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2271 2270 * call the insert and delete functions below at creation and deletion
2272 2271 * time respectively. The conn never moves or changes its position in this
2273 2272 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2274 2273 * won't increase due to walkers, once the conn deletion has started. Note
2275 2274 * that we can't remove the conn from the global list and then wait for
2276 2275 * the refcnt to drop to zero, since walkers would then see a truncated
2277 2276 * list. CONN_INCIPIENT ensures that walkers don't start looking at
2278 2277 * conns until ip_open is ready to make them globally visible.
2279 2278 * The global round robin multi-list locks are held only to get the
2280 2279 * next member/insertion/deletion and contention should be negligible
2281 2280 * if the multi-list is much greater than the number of cpus.
2282 2281 */
/*
 * Insert connp at the head of one of the global multi-list buckets,
 * chosen round-robin; see the block comment above for the guarantees
 * this list provides to walkers.
 */
void
ipcl_globalhash_insert(conn_t *connp)
{
	int	index;
	struct connf_s	*connfp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	/*
	 * No need for atomic here. Approximate even distribution
	 * in the global lists is sufficient.
	 */
	ipst->ips_conn_g_index++;
	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);

	connp->conn_g_prev = NULL;
	/*
	 * Mark as INCIPIENT, so that walkers will ignore this
	 * for now, till ip_open is ready to make it visible globally.
	 */
	connp->conn_state_flags |= CONN_INCIPIENT;

	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
	/* Insert at the head of the list */
	mutex_enter(&connfp->connf_lock);
	connp->conn_g_next = connfp->connf_head;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp;
	connfp->connf_head = connp;

	/* The fanout bucket this conn points to */
	connp->conn_g_fanout = connfp;

	mutex_exit(&connfp->connf_lock);
}
2317 2316
/*
 * Unlink connp from its global multi-list bucket, if it was ever
 * inserted.  conn_g_fanout records which bucket to lock.
 */
void
ipcl_globalhash_remove(conn_t *connp)
{
	struct connf_s	*connfp;

	/*
	 * We were never inserted in the global multi list.
	 * IPCL_NONE variety is never inserted in the global multilist
	 * since it is presumed to not need any cleanup and is transient.
	 */
	if (connp->conn_g_fanout == NULL)
		return;

	connfp = connp->conn_g_fanout;
	mutex_enter(&connfp->connf_lock);
	if (connp->conn_g_prev != NULL)
		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
	else
		connfp->connf_head = connp->conn_g_next;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
	mutex_exit(&connfp->connf_lock);

	/* Better to stumble on a null pointer than to corrupt memory */
	connp->conn_g_next = NULL;
	connp->conn_g_prev = NULL;
	connp->conn_g_fanout = NULL;
}
2346 2345
2347 2346 /*
2348 2347 * Walk the list of all conn_t's in the system, calling the function provided
2349 2348 * With the specified argument for each.
2350 2349 * Applies to both IPv4 and IPv6.
2351 2350 *
2352 2351 * CONNs may hold pointers to ills (conn_dhcpinit_ill and
2353 2352 * conn_oper_pending_ill). To guard against stale pointers
2354 2353 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2355 2354 * unplumbed or removed. New conn_t's that are created while we are walking
2356 2355 * may be missed by this walk, because they are not necessarily inserted
2357 2356 * at the tail of the list. They are new conn_t's and thus don't have any
2358 2357 * stale pointers. The CONN_CLOSING flag ensures that no new reference
2359 2358 * is created to the struct that is going away.
2360 2359 */
void
ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
{
	int	i;
	conn_t	*connp;
	conn_t	*prev_connp;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		prev_connp = NULL;
		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
		while (connp != NULL) {
			mutex_enter(&connp->conn_lock);
			/* Skip conns that are being born or torn down. */
			if (connp->conn_state_flags &
			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
				mutex_exit(&connp->conn_lock);
				connp = connp->conn_g_next;
				continue;
			}
			/*
			 * Hold a reference so connp stays valid while we
			 * drop the bucket lock to call func.  The previous
			 * conn's reference is only released after we have
			 * advanced past it, keeping our position in the
			 * list anchored.
			 */
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&connp->conn_lock);
			mutex_exit(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			(*func)(connp, arg);
			if (prev_connp != NULL)
				CONN_DEC_REF(prev_connp);
			mutex_enter(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			prev_connp = connp;
			connp = connp->conn_g_next;
		}
		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		/* Release the reference on the last conn of this bucket. */
		if (prev_connp != NULL)
			CONN_DEC_REF(prev_connp);
	}
}
2397 2396
2398 2397 /*
2399 2398 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2400 2399 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
2401 2400 * held; caller must call CONN_DEC_REF. Only checks for connected entries
2402 2401 * (peer tcp in ESTABLISHED state).
2403 2402 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t	*connfp;
	conn_t	*tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone. Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones.
	 */
	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));

	/*
	 * Reversed lookup: the packet's ports are used as-is but the
	 * addresses below are swapped (dst where src normally goes) to
	 * find the peer rather than the packet's own conn.
	 */
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2447 2446
2448 2447 /*
2449 2448 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2450 2449 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
2451 2450 * held; caller must call CONN_DEC_REF. Only checks for connected entries
2452 2451 * (peer tcp in ESTABLISHED state).
2453 2452 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t	*connfp;
	conn_t	*tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone. Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones. We
	 * don't do Zone check for link local address(es) because the
	 * current Zone implementation treats each link local address as
	 * being unique per system node, i.e. they belong to global Zone.
	 */
	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));

	/*
	 * Reversed lookup: addresses are swapped below (dst where src
	 * normally goes) so we find the peer conn, not the packet's own.
	 */
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* We skip conn_bound_if check here as this is loopback tcp */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2501 2500
2502 2501 /*
2503 2502 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2504 2503 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2505 2504 * Only checks for connected entries i.e. no INADDR_ANY checks.
2506 2505 */
2507 2506 conn_t *
2508 2507 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2509 2508 ip_stack_t *ipst)
2510 2509 {
2511 2510 uint32_t ports;
2512 2511 uint16_t *pports;
2513 2512 connf_t *connfp;
2514 2513 conn_t *tconnp;
2515 2514
2516 2515 pports = (uint16_t *)&ports;
2517 2516 pports[0] = tcpha->tha_fport;
2518 2517 pports[1] = tcpha->tha_lport;
2519 2518
2520 2519 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2521 2520 ports, ipst)];
2522 2521
2523 2522 mutex_enter(&connfp->connf_lock);
2524 2523 for (tconnp = connfp->connf_head; tconnp != NULL;
2525 2524 tconnp = tconnp->conn_next) {
2526 2525
2527 2526 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2528 2527 ipha->ipha_dst, ipha->ipha_src, ports) &&
2529 2528 tconnp->conn_tcp->tcp_state >= min_state) {
2530 2529
2531 2530 CONN_INC_REF(tconnp);
2532 2531 mutex_exit(&connfp->connf_lock);
2533 2532 return (tconnp);
2534 2533 }
2535 2534 }
2536 2535 mutex_exit(&connfp->connf_lock);
2537 2536 return (NULL);
2538 2537 }
2539 2538
2540 2539 /*
2541 2540 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2542 2541 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2543 2542 * Only checks for connected entries i.e. no INADDR_ANY checks.
2544 2543 * Match on ifindex in addition to addresses.
2545 2544 */
conn_t *
ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
    uint_t ifindex, ip_stack_t *ipst)
{
	tcp_t	*tcp;
	uint32_t ports;
	uint16_t *pports;
	connf_t	*connfp;
	conn_t	*tconnp;

	/*
	 * Reversed lookup: addresses are swapped below (dst where src
	 * normally goes) so we match the peer of the bounced datagram.
	 */
	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		tcp = tconnp->conn_tcp;
		/*
		 * conn_bound_if == 0 means not bound to an interface, so
		 * any ifindex is acceptable for that conn.
		 */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tcp->tcp_state >= min_state &&
		    (tconnp->conn_bound_if == 0 ||
		    tconnp->conn_bound_if == ifindex)) {

			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2582 2581
2583 2582 /*
2584 2583 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2585 2584 * a listener when changing state.
2586 2585 */
conn_t *
ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	connf_t	*bind_connfp;
	conn_t	*connp;
	tcp_t	*tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (laddr == 0)
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		/*
		 * tcp_listener == NULL: presumably this excludes conns
		 * that hang off a listener (eagers), so only the listener
		 * itself matches -- confirm against tcp_disconnect usage.
		 */
		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (tcp->tcp_listener == NULL)) {
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}
2620 2619
2621 2620 /*
2622 2621 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2623 2622 * a listener when changing state.
2624 2623 */
2625 2624 conn_t *
2626 2625 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2627 2626 zoneid_t zoneid, ip_stack_t *ipst)
2628 2627 {
2629 2628 connf_t *bind_connfp;
2630 2629 conn_t *connp = NULL;
2631 2630 tcp_t *tcp;
2632 2631
2633 2632 /*
2634 2633 * Avoid false matches for packets sent to an IP destination of
2635 2634 * all zeros.
2636 2635 */
2637 2636 if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2638 2637 return (NULL);
2639 2638
2640 2639 ASSERT(zoneid != ALL_ZONES);
2641 2640
2642 2641 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2643 2642 mutex_enter(&bind_connfp->connf_lock);
2644 2643 for (connp = bind_connfp->connf_head; connp != NULL;
2645 2644 connp = connp->conn_next) {
2646 2645 tcp = connp->conn_tcp;
2647 2646 if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2648 2647 IPCL_ZONE_MATCH(connp, zoneid) &&
2649 2648 (connp->conn_bound_if == 0 ||
2650 2649 connp->conn_bound_if == ifindex) &&
2651 2650 tcp->tcp_listener == NULL) {
2652 2651 CONN_INC_REF(connp);
2653 2652 mutex_exit(&bind_connfp->connf_lock);
2654 2653 return (connp);
2655 2654 }
2656 2655 }
2657 2656 mutex_exit(&bind_connfp->connf_lock);
2658 2657 return (NULL);
2659 2658 }
2660 2659
/*
 * ipcl_get_next_conn
 * get the next entry in the conn global list
 * and put a reference on the next_conn.
 * decrement the reference on the current conn.
 *
 * This is an iterator based walker function that also provides for
 * some selection by the caller. It walks through the conn_hash bucket
 * searching for the next valid connp in the list, and selects connections
 * that are neither closed nor condemned. It also REFHOLDS the conn
 * thus ensuring that the conn exists when the caller uses the conn.
 */
conn_t *
ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
{
	conn_t	*next_connp;

	if (connfp == NULL)
		return (NULL);

	mutex_enter(&connfp->connf_lock);

	/* A NULL connp means start the walk at the head of the bucket. */
	next_connp = (connp == NULL) ?
	    connfp->connf_head : connp->conn_g_next;

	while (next_connp != NULL) {
		/* Check state under conn_lock so it can't change under us. */
		mutex_enter(&next_connp->conn_lock);
		if (!(next_connp->conn_flags & conn_flags) ||
		    (next_connp->conn_state_flags &
		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
			/*
			 * This conn has been condemned or
			 * is closing, or the flags don't match
			 */
			mutex_exit(&next_connp->conn_lock);
			next_connp = next_connp->conn_g_next;
			continue;
		}
		/* Take the hold while conn_lock is held, then stop. */
		CONN_INC_REF_LOCKED(next_connp);
		mutex_exit(&next_connp->conn_lock);
		break;
	}

	mutex_exit(&connfp->connf_lock);

	/* Drop the caller's reference on the conn we just advanced past. */
	if (connp != NULL)
		CONN_DEC_REF(connp);

	return (next_connp);
}
2711 2710
2712 2711 #ifdef CONN_DEBUG
2713 2712 /*
2714 2713 * Trace of the last NBUF refhold/refrele
2715 2714 */
2716 2715 int
2717 2716 conn_trace_ref(conn_t *connp)
2718 2717 {
2719 2718 int last;
2720 2719 conn_trace_t *ctb;
2721 2720
2722 2721 ASSERT(MUTEX_HELD(&connp->conn_lock));
2723 2722 last = connp->conn_trace_last;
2724 2723 last++;
2725 2724 if (last == CONN_TRACE_MAX)
2726 2725 last = 0;
2727 2726
2728 2727 ctb = &connp->conn_trace_buf[last];
2729 2728 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2730 2729 connp->conn_trace_last = last;
2731 2730 return (1);
2732 2731 }
2733 2732
2734 2733 int
2735 2734 conn_untrace_ref(conn_t *connp)
2736 2735 {
2737 2736 int last;
2738 2737 conn_trace_t *ctb;
2739 2738
2740 2739 ASSERT(MUTEX_HELD(&connp->conn_lock));
2741 2740 last = connp->conn_trace_last;
2742 2741 last++;
2743 2742 if (last == CONN_TRACE_MAX)
2744 2743 last = 0;
2745 2744
2746 2745 ctb = &connp->conn_trace_buf[last];
2747 2746 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2748 2747 connp->conn_trace_last = last;
2749 2748 return (1);
2750 2749 }
2751 2750 #endif
|
↓ open down ↓ |
1818 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX