1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2016 Joyent, Inc.
24 */
25
26 /*
27 * IP PACKET CLASSIFIER
28 *
29 * The IP packet classifier provides mapping between IP packets and persistent
30 * connection state for connection-oriented protocols. It also provides
31 * interface for managing connection states.
32 *
33 * The connection state is kept in conn_t data structure and contains, among
34 * other things:
35 *
36 * o local/remote address and ports
37 * o Transport protocol
38 * o squeue for the connection (for TCP only)
39 * o reference counter
40 * o Connection state
41 * o hash table linkage
42 * o interface/ire information
43 * o credentials
44 * o ipsec policy
45 * o send and receive functions.
46 * o mutex lock.
47 *
48 * Connections use a reference counting scheme. They are freed when the
49 * reference counter drops to zero. A reference is incremented when connection
50 * is placed in a list or table, when incoming packet for the connection arrives
51 * and when connection is processed via squeue (squeue processing may be
52 * asynchronous and the reference protects the connection from being destroyed
53 * before its processing is finished).
54 *
55 * conn_recv is used to pass up packets to the ULP.
56 * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
57 * a listener, and changes to tcp_input_listener as the listener has picked a
58 * good squeue. For other cases it is set to tcp_input_data.
59 *
60 * conn_recvicmp is used to pass up ICMP errors to the ULP.
61 *
62 * Classifier uses several hash tables:
63 *
64 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state
65 * ipcl_bind_fanout: contains all connections in BOUND state
66 * ipcl_proto_fanout: IPv4 protocol fanout
67 * ipcl_proto_fanout_v6: IPv6 protocol fanout
68 * ipcl_udp_fanout: contains all UDP connections
69 * ipcl_iptun_fanout: contains all IP tunnel connections
70 * ipcl_globalhash_fanout: contains all connections
71 *
72 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
73 * which need to view all existing connections.
74 *
75 * All tables are protected by per-bucket locks. When both per-bucket lock and
76 * connection lock need to be held, the per-bucket lock should be acquired
77 * first, followed by the connection lock.
78 *
79 * All functions doing search in one of these tables increment a reference
80 * counter on the connection found (if any). This reference should be dropped
81 * when the caller has finished processing the connection.
82 *
83 *
84 * INTERFACES:
85 * ===========
86 *
87 * Connection Lookup:
88 * ------------------
89 *
90 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
91 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
92 *
93 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
94 * it can't find any associated connection. If the connection is found, its
95 * reference counter is incremented.
96 *
97 * mp: mblock, containing packet header. The full header should fit
98 * into a single mblock. It should also contain at least full IP
99 * and TCP or UDP header.
100 *
101 * protocol: Either IPPROTO_TCP or IPPROTO_UDP.
102 *
103 * hdr_len: The size of IP header. It is used to find TCP or UDP header in
104 * the packet.
105 *
106 * ira->ira_zoneid: The zone in which the returned connection must be; the
107 * zoneid corresponding to the ire_zoneid on the IRE located for
108 * the packet's destination address.
109 *
110 * ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
111 * IRAF_TX_SHARED_ADDR flags
112 *
113 * For TCP connections, the lookup order is as follows:
114 * 5-tuple {src, dst, protocol, local port, remote port}
115 * lookup in ipcl_conn_fanout table.
116 * 3-tuple {dst, remote port, protocol} lookup in
117 * ipcl_bind_fanout table.
118 *
119 * For UDP connections, a 5-tuple {src, dst, protocol, local port,
120 * remote port} lookup is done on ipcl_udp_fanout. Note that,
 *		these interfaces do not handle cases where a packet belongs
122 * to multiple UDP clients, which is handled in IP itself.
123 *
124 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
125 * determine which actual zone gets the segment. This is used only in a
126 * labeled environment. The matching rules are:
127 *
128 * - If it's not a multilevel port, then the label on the packet selects
129 * the zone. Unlabeled packets are delivered to the global zone.
130 *
131 * - If it's a multilevel port, then only the zone registered to receive
132 * packets on that port matches.
133 *
134 * Also, in a labeled environment, packet labels need to be checked. For fully
135 * bound TCP connections, we can assume that the packet label was checked
136 * during connection establishment, and doesn't need to be checked on each
137 * packet. For others, though, we need to check for strict equality or, for
138 * multilevel ports, membership in the range or set. This part currently does
139 * a tnrh lookup on each packet, but could be optimized to use cached results
140 * if that were necessary. (SCTP doesn't come through here, but if it did,
141 * we would apply the same rules as TCP.)
142 *
143 * An implication of the above is that fully-bound TCP sockets must always use
144 * distinct 4-tuples; they can't be discriminated by label alone.
145 *
146 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
147 * as there's no connection set-up handshake and no shared state.
148 *
149 * Labels on looped-back packets within a single zone do not need to be
150 * checked, as all processes in the same zone have the same label.
151 *
152 * Finally, for unlabeled packets received by a labeled system, special rules
153 * apply. We consider only the MLP if there is one. Otherwise, we prefer a
154 * socket in the zone whose label matches the default label of the sender, if
155 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the
156 * receiver's label must dominate the sender's default label.
157 *
158 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
159 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
160 * ip_stack);
161 *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
164 * ports are read from the IP and TCP header respectively.
165 *
166 * conn_t *ipcl_lookup_listener_v4(lport, laddr, protocol,
167 * zoneid, ip_stack);
168 * conn_t *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
169 * zoneid, ip_stack);
170 *
171 * Lookup routine to find a listener with the tuple {lport, laddr,
172 * protocol} in the ipcl_bind_fanout table. For IPv6, an additional
173 * parameter interface index is also compared.
174 *
175 * void ipcl_walk(func, arg, ip_stack)
176 *
177 * Apply 'func' to every connection available. The 'func' is called as
178 * (*func)(connp, arg). The walk is non-atomic so connections may be
179 * created and destroyed during the walk. The CONN_CONDEMNED and
180 * CONN_INCIPIENT flags ensure that connections which are newly created
181 * or being destroyed are not selected by the walker.
182 *
183 * Table Updates
184 * -------------
185 *
186 * int ipcl_conn_insert(connp);
187 * int ipcl_conn_insert_v4(connp);
188 * int ipcl_conn_insert_v6(connp);
189 *
190 * Insert 'connp' in the ipcl_conn_fanout.
 *	Arguments :
192 * connp conn_t to be inserted
193 *
194 * Return value :
195 * 0 if connp was inserted
196 * EADDRINUSE if the connection with the same tuple
197 * already exists.
198 *
199 * int ipcl_bind_insert(connp);
200 * int ipcl_bind_insert_v4(connp);
201 * int ipcl_bind_insert_v6(connp);
202 *
203 * Insert 'connp' in ipcl_bind_fanout.
 *	Arguments :
205 * connp conn_t to be inserted
206 *
207 *
208 * void ipcl_hash_remove(connp);
209 *
210 * Removes the 'connp' from the connection fanout table.
211 *
212 * Connection Creation/Destruction
213 * -------------------------------
214 *
215 * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
216 *
217 * Creates a new conn based on the type flag, inserts it into
218 * globalhash table.
219 *
220 * type: This flag determines the type of conn_t which needs to be
221 * created i.e., which kmem_cache it comes from.
222 * IPCL_TCPCONN indicates a TCP connection
223 * IPCL_SCTPCONN indicates a SCTP connection
224 * IPCL_UDPCONN indicates a UDP conn_t.
225 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t.
226 * IPCL_RTSCONN indicates a RTS conn_t.
227 * IPCL_IPCCONN indicates all other connections.
228 *
229 * void ipcl_conn_destroy(connp)
230 *
231 * Destroys the connection state, removes it from the global
232 * connection hash table and frees its memory.
233 */
234
235 #include <sys/types.h>
236 #include <sys/stream.h>
237 #include <sys/stropts.h>
238 #include <sys/sysmacros.h>
239 #include <sys/strsubr.h>
240 #include <sys/strsun.h>
241 #define _SUN_TPI_VERSION 2
242 #include <sys/ddi.h>
243 #include <sys/cmn_err.h>
244 #include <sys/debug.h>
245
246 #include <sys/systm.h>
247 #include <sys/param.h>
248 #include <sys/kmem.h>
249 #include <sys/isa_defs.h>
250 #include <inet/common.h>
251 #include <netinet/ip6.h>
252 #include <netinet/icmp6.h>
253
254 #include <inet/ip.h>
255 #include <inet/ip_if.h>
256 #include <inet/ip_ire.h>
257 #include <inet/ip6.h>
258 #include <inet/ip_ndp.h>
259 #include <inet/ip_impl.h>
260 #include <inet/udp_impl.h>
261 #include <inet/sctp_ip.h>
262 #include <inet/sctp/sctp_impl.h>
263 #include <inet/rawip_impl.h>
264 #include <inet/rts_impl.h>
265 #include <inet/iptun/iptun_impl.h>
266
267 #include <sys/cpuvar.h>
268
269 #include <inet/ipclassifier.h>
270 #include <inet/tcp.h>
271 #include <inet/ipsec_impl.h>
272
273 #include <sys/tsol/tnet.h>
274 #include <sys/sockio.h>
275
/* Old value for compatibility. Settable in /etc/system */
uint_t tcp_conn_hash_size = 0;

/* New value. Zero means choose automatically. Settable in /etc/system */
uint_t ipcl_conn_hash_size = 0;
/* When auto-sizing: bytes of physical memory per conn-fanout bucket */
uint_t ipcl_conn_hash_memfactor = 8192;
/* Upper bound on the auto-sized conn fanout (before prime rounding) */
uint_t ipcl_conn_hash_maxsize = 82500;

/* bind/udp fanout table size */
uint_t ipcl_bind_fanout_size = 512;
uint_t ipcl_udp_fanout_size = 16384;

/* Raw socket fanout size. Must be a power of 2. */
uint_t ipcl_raw_fanout_size = 256;

/*
 * The IPCL_IPTUN_HASH() function works best with a prime table size. We
 * expect that most large deployments would have hundreds of tunnels, and
 * thousands in the extreme case.
 */
uint_t ipcl_iptun_fanout_size = 6143;

/*
 * Power of 2^N Primes useful for hashing for N of 0-28,
 * these primes are the nearest prime <= 2^N - 2^(N-2).
 * The table is 0-terminated so that out-of-range lookups can be
 * detected by ipcl_init(); indices 0-2 are also 0 (no prime fits).
 */

#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \
		6143, 12281, 24571, 49139, 98299, 196597, 393209, \
		786431, 1572853, 3145721, 6291449, 12582893, 25165813, \
		50331599, 100663291, 201326557, 0}
307
308 /*
309 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
310 * are aligned on cache lines.
311 */
312 typedef union itc_s {
313 conn_t itc_conn;
314 char itcu_filler[CACHE_ALIGN(conn_s)];
315 } itc_t;
316
/*
 * kmem caches for the various conn_t flavors, created in ipcl_g_init().
 * sctp_conn_cache is owned by the SCTP module.
 */
struct kmem_cache *tcp_conn_cache;
struct kmem_cache *ip_conn_cache;
extern struct kmem_cache *sctp_conn_cache;
struct kmem_cache *udp_conn_cache;
struct kmem_cache *rawip_conn_cache;
struct kmem_cache *rts_conn_cache;

/* TCP timer mblk allocator, used by the tcp conn constructor/destructor */
extern void tcp_timermp_free(tcp_t *);
extern mblk_t *tcp_timermp_alloc(int);

/* per-cache kmem constructor/destructor callbacks */
static int ip_conn_constructor(void *, void *, int);
static void ip_conn_destructor(void *, void *);

static int tcp_conn_constructor(void *, void *, int);
static void tcp_conn_destructor(void *, void *);

static int udp_conn_constructor(void *, void *, int);
static void udp_conn_destructor(void *, void *);

static int rawip_conn_constructor(void *, void *, int);
static void rawip_conn_destructor(void *, void *);

static int rts_conn_constructor(void *, void *, int);
static void rts_conn_destructor(void *, void *);
341
342 /*
343 * Global (for all stack instances) init routine
344 */
345 void
346 ipcl_g_init(void)
347 {
348 ip_conn_cache = kmem_cache_create("ip_conn_cache",
349 sizeof (conn_t), CACHE_ALIGN_SIZE,
350 ip_conn_constructor, ip_conn_destructor,
351 NULL, NULL, NULL, 0);
352
353 tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
354 sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
355 tcp_conn_constructor, tcp_conn_destructor,
356 tcp_conn_reclaim, NULL, NULL, 0);
357
358 udp_conn_cache = kmem_cache_create("udp_conn_cache",
359 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
360 udp_conn_constructor, udp_conn_destructor,
361 NULL, NULL, NULL, 0);
362
363 rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
364 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
365 rawip_conn_constructor, rawip_conn_destructor,
366 NULL, NULL, NULL, 0);
367
368 rts_conn_cache = kmem_cache_create("rts_conn_cache",
369 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
370 rts_conn_constructor, rts_conn_destructor,
371 NULL, NULL, NULL, 0);
372 }
373
374 /*
375 * ipclassifier intialization routine, sets up hash tables.
376 */
377 void
378 ipcl_init(ip_stack_t *ipst)
379 {
380 int i;
381 int sizes[] = P2Ps();
382
383 /*
384 * Calculate size of conn fanout table from /etc/system settings
385 */
386 if (ipcl_conn_hash_size != 0) {
387 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
388 } else if (tcp_conn_hash_size != 0) {
389 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
390 } else {
391 extern pgcnt_t freemem;
392
393 ipst->ips_ipcl_conn_fanout_size =
394 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
395
396 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
397 ipst->ips_ipcl_conn_fanout_size =
398 ipcl_conn_hash_maxsize;
399 }
400 }
401
402 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
403 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
404 break;
405 }
406 }
407 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
408 /* Out of range, use the 2^16 value */
409 ipst->ips_ipcl_conn_fanout_size = sizes[16];
410 }
411
412 /* Take values from /etc/system */
413 ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
414 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
415 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
416 ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
417
418 ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
419
420 ipst->ips_ipcl_conn_fanout = kmem_zalloc(
421 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
422
423 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
424 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
425 MUTEX_DEFAULT, NULL);
426 }
427
428 ipst->ips_ipcl_bind_fanout = kmem_zalloc(
429 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
430
431 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
432 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
433 MUTEX_DEFAULT, NULL);
434 }
435
436 ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
437 sizeof (connf_t), KM_SLEEP);
438 for (i = 0; i < IPPROTO_MAX; i++) {
439 mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
440 MUTEX_DEFAULT, NULL);
441 }
442
443 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
444 sizeof (connf_t), KM_SLEEP);
445 for (i = 0; i < IPPROTO_MAX; i++) {
446 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
447 MUTEX_DEFAULT, NULL);
448 }
449
450 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
451 mutex_init(&ipst->ips_rts_clients->connf_lock,
452 NULL, MUTEX_DEFAULT, NULL);
453
454 ipst->ips_ipcl_udp_fanout = kmem_zalloc(
455 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
456 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
457 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
458 MUTEX_DEFAULT, NULL);
459 }
460
461 ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
462 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
463 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
464 mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
465 MUTEX_DEFAULT, NULL);
466 }
467
468 ipst->ips_ipcl_raw_fanout = kmem_zalloc(
469 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
470 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
471 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
472 MUTEX_DEFAULT, NULL);
473 }
474
475 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
476 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
477 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
478 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
479 NULL, MUTEX_DEFAULT, NULL);
480 }
481 }
482
483 void
484 ipcl_g_destroy(void)
485 {
486 kmem_cache_destroy(ip_conn_cache);
487 kmem_cache_destroy(tcp_conn_cache);
488 kmem_cache_destroy(udp_conn_cache);
489 kmem_cache_destroy(rawip_conn_cache);
490 kmem_cache_destroy(rts_conn_cache);
491 }
492
493 /*
494 * All user-level and kernel use of the stack must be gone
495 * by now.
496 */
497 void
498 ipcl_destroy(ip_stack_t *ipst)
499 {
500 int i;
501
502 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
503 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
504 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
505 }
506 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
507 sizeof (connf_t));
508 ipst->ips_ipcl_conn_fanout = NULL;
509
510 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
511 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
512 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
513 }
514 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
515 sizeof (connf_t));
516 ipst->ips_ipcl_bind_fanout = NULL;
517
518 for (i = 0; i < IPPROTO_MAX; i++) {
519 ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
520 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
521 }
522 kmem_free(ipst->ips_ipcl_proto_fanout_v4,
523 IPPROTO_MAX * sizeof (connf_t));
524 ipst->ips_ipcl_proto_fanout_v4 = NULL;
525
526 for (i = 0; i < IPPROTO_MAX; i++) {
527 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
528 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
529 }
530 kmem_free(ipst->ips_ipcl_proto_fanout_v6,
531 IPPROTO_MAX * sizeof (connf_t));
532 ipst->ips_ipcl_proto_fanout_v6 = NULL;
533
534 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
535 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
536 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
537 }
538 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
539 sizeof (connf_t));
540 ipst->ips_ipcl_udp_fanout = NULL;
541
542 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
543 ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
544 mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
545 }
546 kmem_free(ipst->ips_ipcl_iptun_fanout,
547 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
548 ipst->ips_ipcl_iptun_fanout = NULL;
549
550 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
551 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
552 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
553 }
554 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
555 sizeof (connf_t));
556 ipst->ips_ipcl_raw_fanout = NULL;
557
558 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
559 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
560 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
561 }
562 kmem_free(ipst->ips_ipcl_globalhash_fanout,
563 sizeof (connf_t) * CONN_G_HASH_SIZE);
564 ipst->ips_ipcl_globalhash_fanout = NULL;
565
566 ASSERT(ipst->ips_rts_clients->connf_head == NULL);
567 mutex_destroy(&ipst->ips_rts_clients->connf_lock);
568 kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
569 ipst->ips_rts_clients = NULL;
570 }
571
572 /*
573 * conn creation routine. initialize the conn, sets the reference
574 * and inserts it in the global hash table.
575 */
576 conn_t *
577 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
578 {
579 conn_t *connp;
580 struct kmem_cache *conn_cache;
581
582 switch (type) {
583 case IPCL_SCTPCONN:
584 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
585 return (NULL);
586 sctp_conn_init(connp);
587 netstack_hold(ns);
588 connp->conn_netstack = ns;
589 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
590 connp->conn_ixa->ixa_conn_id = (long)connp;
591 ipcl_globalhash_insert(connp);
592 return (connp);
593
594 case IPCL_TCPCONN:
595 conn_cache = tcp_conn_cache;
596 break;
597
598 case IPCL_UDPCONN:
599 conn_cache = udp_conn_cache;
600 break;
601
602 case IPCL_RAWIPCONN:
603 conn_cache = rawip_conn_cache;
604 break;
605
606 case IPCL_RTSCONN:
607 conn_cache = rts_conn_cache;
608 break;
609
610 case IPCL_IPCCONN:
611 conn_cache = ip_conn_cache;
612 break;
613
614 default:
615 connp = NULL;
616 ASSERT(0);
617 }
618
619 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
620 return (NULL);
621
622 connp->conn_ref = 1;
623 netstack_hold(ns);
624 connp->conn_netstack = ns;
625 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
626 connp->conn_ixa->ixa_conn_id = (long)connp;
627 ipcl_globalhash_insert(connp);
628 return (connp);
629 }
630
/*
 * Final destruction of a conn_t, called once conn_ref has dropped to
 * zero.  Releases credentials, header buffers, IPsec latch/policy
 * state, removes the conn from the global hash, drops the netstack
 * hold, and returns the conn to the kmem cache it was allocated from.
 */
void
ipcl_conn_destroy(conn_t *connp)
{
	mblk_t	*mp;
	netstack_t	*ns = connp->conn_netstack;

	/* Must be fully quiesced: unlocked, unreferenced, no ioctls */
	ASSERT(!MUTEX_HELD(&connp->conn_lock));
	ASSERT(connp->conn_ref == 0);
	ASSERT(connp->conn_ioctlref == 0);

	DTRACE_PROBE1(conn__destroy, conn_t *, connp);

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
		/* ixa_cred done in ipcl_conn_cleanup below */
	}

	/* Free the cached IP+ULP header template, if any */
	if (connp->conn_ht_iphc != NULL) {
		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
		connp->conn_ht_iphc = NULL;
		connp->conn_ht_iphc_allocated = 0;
		connp->conn_ht_iphc_len = 0;
		connp->conn_ht_ulp = NULL;
		connp->conn_ht_ulp_len = 0;
	}
	ip_pkt_free(&connp->conn_xmit_ipp);

	ipcl_globalhash_remove(connp);

	/* Release IPsec latch and policy references */
	if (connp->conn_latch != NULL) {
		IPLATCH_REFRELE(connp->conn_latch);
		connp->conn_latch = NULL;
	}
	if (connp->conn_latch_in_policy != NULL) {
		IPPOL_REFRELE(connp->conn_latch_in_policy);
		connp->conn_latch_in_policy = NULL;
	}
	if (connp->conn_latch_in_action != NULL) {
		IPACT_REFRELE(connp->conn_latch_in_action);
		connp->conn_latch_in_action = NULL;
	}
	if (connp->conn_policy != NULL) {
		IPPH_REFRELE(connp->conn_policy, ns);
		connp->conn_policy = NULL;
	}

	if (connp->conn_ipsec_opt_mp != NULL) {
		freemsg(connp->conn_ipsec_opt_mp);
		connp->conn_ipsec_opt_mp = NULL;
	}

	if (connp->conn_flags & IPCL_TCPCONN) {
		tcp_t	*tcp = connp->conn_tcp;

		tcp_free(tcp);
		/*
		 * Save the timer mblk across the bzero of the tcp_t below
		 * so the cached allocation survives for reuse.
		 */
		mp = tcp->tcp_timercache;

		tcp->tcp_tcps = NULL;

		/*
		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
		 * the mblk.
		 */
		if (tcp->tcp_rsrv_mp != NULL) {
			freeb(tcp->tcp_rsrv_mp);
			tcp->tcp_rsrv_mp = NULL;
			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
		}

		ipcl_conn_cleanup(connp);
		/* Keep only the type flag so the cache free path is known */
		connp->conn_flags = IPCL_TCPCONN;
		if (ns != NULL) {
			ASSERT(tcp->tcp_tcps == NULL);
			connp->conn_netstack = NULL;
			connp->conn_ixa->ixa_ipst = NULL;
			netstack_rele(ns);
		}

		/* Reset the tcp_t for the next user of this cache object */
		bzero(tcp, sizeof (tcp_t));

		tcp->tcp_timercache = mp;
		tcp->tcp_connp = connp;
		kmem_cache_free(tcp_conn_cache, connp);
		return;
	}

	if (connp->conn_flags & IPCL_SCTPCONN) {
		/* SCTP performs its own cleanup and cache free */
		ASSERT(ns != NULL);
		sctp_free(connp);
		return;
	}

	ipcl_conn_cleanup(connp);
	if (ns != NULL) {
		connp->conn_netstack = NULL;
		connp->conn_ixa->ixa_ipst = NULL;
		netstack_rele(ns);
	}

	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
	if (connp->conn_flags & IPCL_UDPCONN) {
		connp->conn_flags = IPCL_UDPCONN;
		kmem_cache_free(udp_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
		connp->conn_flags = IPCL_RAWIPCONN;
		/* RAWIP conns default back to ICMP for their next use */
		connp->conn_proto = IPPROTO_ICMP;
		connp->conn_ixa->ixa_protocol = connp->conn_proto;
		kmem_cache_free(rawip_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RTSCONN) {
		connp->conn_flags = IPCL_RTSCONN;
		kmem_cache_free(rts_conn_cache, connp);
	} else {
		connp->conn_flags = IPCL_IPCCONN;
		ASSERT(connp->conn_flags & IPCL_IPCCONN);
		ASSERT(connp->conn_priv == NULL);
		kmem_cache_free(ip_conn_cache, connp);
	}
}
750
751 /*
752 * Running in cluster mode - deregister listener information
753 */
754 static void
755 ipcl_conn_unlisten(conn_t *connp)
756 {
757 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
758 ASSERT(connp->conn_lport != 0);
759
760 if (cl_inet_unlisten != NULL) {
761 sa_family_t addr_family;
762 uint8_t *laddrp;
763
764 if (connp->conn_ipversion == IPV6_VERSION) {
765 addr_family = AF_INET6;
766 laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
767 } else {
768 addr_family = AF_INET;
769 laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
770 }
771 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
772 IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
773 }
774 connp->conn_flags &= ~IPCL_CL_LISTENER;
775 }
776
777 /*
778 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
779 * which table the conn belonged to). So for debugging we can see which hash
780 * table this connection was in.
781 */
782 #define IPCL_HASH_REMOVE(connp) { \
783 connf_t *connfp = (connp)->conn_fanout; \
784 ASSERT(!MUTEX_HELD(&((connp)->conn_lock))); \
785 if (connfp != NULL) { \
786 mutex_enter(&connfp->connf_lock); \
787 if ((connp)->conn_next != NULL) \
788 (connp)->conn_next->conn_prev = \
789 (connp)->conn_prev; \
790 if ((connp)->conn_prev != NULL) \
791 (connp)->conn_prev->conn_next = \
792 (connp)->conn_next; \
793 else \
794 connfp->connf_head = (connp)->conn_next; \
795 (connp)->conn_fanout = NULL; \
796 (connp)->conn_next = NULL; \
797 (connp)->conn_prev = NULL; \
798 (connp)->conn_flags |= IPCL_REMOVED; \
799 if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0) \
800 ipcl_conn_unlisten((connp)); \
801 CONN_DEC_REF((connp)); \
802 mutex_exit(&connfp->connf_lock); \
803 } \
804 }
805
806 void
807 ipcl_hash_remove(conn_t *connp)
808 {
809 uint8_t protocol = connp->conn_proto;
810
811 IPCL_HASH_REMOVE(connp);
812 if (protocol == IPPROTO_RSVP)
813 ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
814 }
815
816 /*
817 * The whole purpose of this function is allow removal of
818 * a conn_t from the connected hash for timewait reclaim.
819 * This is essentially a TW reclaim fastpath where timewait
820 * collector checks under fanout lock (so no one else can
821 * get access to the conn_t) that refcnt is 2 i.e. one for
822 * TCP and one for the classifier hash list. If ref count
823 * is indeed 2, we can just remove the conn under lock and
824 * avoid cleaning up the conn under squeue. This gives us
825 * improved performance.
826 */
827 void
828 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp)
829 {
830 ASSERT(MUTEX_HELD(&connfp->connf_lock));
831 ASSERT(MUTEX_HELD(&connp->conn_lock));
832 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
833
834 if ((connp)->conn_next != NULL) {
835 (connp)->conn_next->conn_prev = (connp)->conn_prev;
836 }
837 if ((connp)->conn_prev != NULL) {
838 (connp)->conn_prev->conn_next = (connp)->conn_next;
839 } else {
840 connfp->connf_head = (connp)->conn_next;
841 }
842 (connp)->conn_fanout = NULL;
843 (connp)->conn_next = NULL;
844 (connp)->conn_prev = NULL;
845 (connp)->conn_flags |= IPCL_REMOVED;
846 ASSERT((connp)->conn_ref == 2);
847 (connp)->conn_ref--;
848 }
849
/*
 * Insert connp at the head of the connected-hash bucket connfp.  The
 * caller holds connfp->connf_lock.  A fanout reference is taken on the
 * conn and its flags switch from IPCL_REMOVED to IPCL_CONNECTED.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}
864
/*
 * Remove connp from its current bucket (if any), then insert it at the
 * head of connfp, taking the destination bucket lock for the insert.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}
871
872 /*
873 * When inserting bound or wildcard entries into the hash, ordering rules are
874 * used to facilitate timely and correct lookups. The order is as follows:
875 * 1. Entries bound to a specific address
876 * 2. Entries bound to INADDR_ANY
877 * 3. Entries bound to ADDR_UNSPECIFIED
878 * Entries in a category which share conn_lport (such as those using
879 * SO_REUSEPORT) will be ordered such that the newest inserted is first.
880 */
881
882 void
883 ipcl_hash_insert_bound(connf_t *connfp, conn_t *connp)
884 {
885 conn_t *pconnp, *nconnp;
886
887 IPCL_HASH_REMOVE(connp);
888 mutex_enter(&connfp->connf_lock);
889 nconnp = connfp->connf_head;
890 pconnp = NULL;
891 while (nconnp != NULL) {
892 /*
893 * Walk though entries associated with the fanout until one is
894 * found which fulfills any of these conditions:
895 * 1. Listen address of ADDR_ANY/ADDR_UNSPECIFIED
896 * 2. Listen port the same as connp
897 */
898 if (_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6) ||
899 connp->conn_lport == nconnp->conn_lport)
900 break;
901 pconnp = nconnp;
902 nconnp = nconnp->conn_next;
903 }
904 if (pconnp != NULL) {
905 pconnp->conn_next = connp;
906 connp->conn_prev = pconnp;
907 } else {
908 connfp->connf_head = connp;
909 }
910 if (nconnp != NULL) {
911 connp->conn_next = nconnp;
912 nconnp->conn_prev = connp;
913 }
914 connp->conn_fanout = connfp;
915 connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND;
916 CONN_INC_REF(connp);
917 mutex_exit(&connfp->connf_lock);
918 }
919
920 void
921 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
922 {
923 conn_t **list, *prev, *next;
924 conn_t *pconnp = NULL, *nconnp;
925 boolean_t isv4mapped = IN6_IS_ADDR_V4MAPPED(&connp->conn_laddr_v6);
926
927 IPCL_HASH_REMOVE(connp);
928 mutex_enter(&connfp->connf_lock);
929 nconnp = connfp->connf_head;
930 pconnp = NULL;
931 while (nconnp != NULL) {
932 if (IN6_IS_ADDR_V4MAPPED_ANY(&nconnp->conn_laddr_v6) &&
933 isv4mapped && connp->conn_lport == nconnp->conn_lport)
934 break;
935 if (IN6_IS_ADDR_UNSPECIFIED(&nconnp->conn_laddr_v6) &&
936 (isv4mapped ||
937 connp->conn_lport == nconnp->conn_lport))
938 break;
939
940 pconnp = nconnp;
941 nconnp = nconnp->conn_next;
942 }
943 if (pconnp != NULL) {
944 pconnp->conn_next = connp;
945 connp->conn_prev = pconnp;
946 } else {
947 connfp->connf_head = connp;
948 }
949 if (nconnp != NULL) {
950 connp->conn_next = nconnp;
951 nconnp->conn_prev = connp;
952 }
953 connp->conn_fanout = connfp;
954 connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND;
955 CONN_INC_REF(connp);
956 mutex_exit(&connfp->connf_lock);
957 }
958
959 /*
960 * Because the classifier is used to classify inbound packets, the destination
961 * address is meant to be our local tunnel address (tunnel source), and the
962 * source the remote tunnel address (tunnel destination).
963 *
964 * Note that conn_proto can't be used for fanout since the upper protocol
965 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
966 */
conn_t *
ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
{
	connf_t *connfp;
	conn_t *connp;

	/* first look for IPv4 tunnel links */
	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		if (IPCL_IPTUN_MATCH(connp, *dst, *src))
			break;
	}
	if (connp != NULL)
		goto done;	/* connf_lock still held; dropped at done */

	mutex_exit(&connfp->connf_lock);

	/* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
	    INADDR_ANY)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
			break;
	}
done:
	/*
	 * On both paths connfp->connf_lock is held here, so the
	 * reference is taken before the conn can disappear from the
	 * list.  Returns the conn with a reference held, or NULL.
	 */
	if (connp != NULL)
		CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}
1001
1002 conn_t *
1003 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
1004 {
1005 connf_t *connfp;
1006 conn_t *connp;
1007
1008 /* Look for an IPv6 tunnel link */
1009 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
1010 mutex_enter(&connfp->connf_lock);
1011 for (connp = connfp->connf_head; connp != NULL;
1012 connp = connp->conn_next) {
1013 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
1014 CONN_INC_REF(connp);
1015 break;
1016 }
1017 }
1018 mutex_exit(&connfp->connf_lock);
1019 return (connp);
1020 }
1021
1022 /*
1023 * This function is used only for inserting SCTP raw socket now.
1024 * This may change later.
1025 *
1026 * Note that only one raw socket can be bound to a port. The param
1027 * lport is in network byte order.
1028 */
static int
ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
{
	connf_t *connfp;
	conn_t *oconnp;
	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* Check for existing raw socket already bound to the port. */
	mutex_enter(&connfp->connf_lock);
	for (oconnp = connfp->connf_head; oconnp != NULL;
	    oconnp = oconnp->conn_next) {
		/*
		 * An existing entry conflicts when it has the same port,
		 * zone and address family, and either side's local
		 * address is a wildcard (unspecified or v4-mapped-any),
		 * or the two local addresses are identical.
		 */
		if (oconnp->conn_lport == lport &&
		    oconnp->conn_zoneid == connp->conn_zoneid &&
		    oconnp->conn_family == connp->conn_family &&
		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
		    &connp->conn_laddr_v6))) {
			break;
		}
	}
	mutex_exit(&connfp->connf_lock);
	if (oconnp != NULL)
		return (EADDRNOTAVAIL);

	/*
	 * Insert on the connected, bound or wildcard list of this raw
	 * fanout depending on how fully the conn's addresses are
	 * specified (same scheme as the bind-insert routines).
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
			ipcl_hash_insert_wildcard(connfp, connp);
		} else {
			ipcl_hash_insert_bound(connfp, connp);
		}
	} else {
		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
	}
	return (0);
}
1071
1072 static int
1073 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1074 {
1075 connf_t *connfp;
1076 conn_t *tconnp;
1077 ipaddr_t laddr = connp->conn_laddr_v4;
1078 ipaddr_t faddr = connp->conn_faddr_v4;
1079
1080 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1081 mutex_enter(&connfp->connf_lock);
1082 for (tconnp = connfp->connf_head; tconnp != NULL;
1083 tconnp = tconnp->conn_next) {
1084 if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1085 /* A tunnel is already bound to these addresses. */
1086 mutex_exit(&connfp->connf_lock);
1087 return (EADDRINUSE);
1088 }
1089 }
1090 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1091 mutex_exit(&connfp->connf_lock);
1092 return (0);
1093 }
1094
1095 static int
1096 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1097 {
1098 connf_t *connfp;
1099 conn_t *tconnp;
1100 in6_addr_t *laddr = &connp->conn_laddr_v6;
1101 in6_addr_t *faddr = &connp->conn_faddr_v6;
1102
1103 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1104 mutex_enter(&connfp->connf_lock);
1105 for (tconnp = connfp->connf_head; tconnp != NULL;
1106 tconnp = tconnp->conn_next) {
1107 if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1108 /* A tunnel is already bound to these addresses. */
1109 mutex_exit(&connfp->connf_lock);
1110 return (EADDRINUSE);
1111 }
1112 }
1113 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1114 mutex_exit(&connfp->connf_lock);
1115 return (0);
1116 }
1117
1118 /*
1119 * Check for a MAC exemption conflict on a labeled system. Note that for
1120 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1121 * transport layer. This check is for binding all other protocols.
1122 *
1123 * Returns true if there's a conflict.
1124 */
1125 static boolean_t
1126 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1127 {
1128 connf_t *connfp;
1129 conn_t *tconn;
1130
1131 connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1132 mutex_enter(&connfp->connf_lock);
1133 for (tconn = connfp->connf_head; tconn != NULL;
1134 tconn = tconn->conn_next) {
1135 /* We don't allow v4 fallback for v6 raw socket */
1136 if (connp->conn_family != tconn->conn_family)
1137 continue;
1138 /* If neither is exempt, then there's no conflict */
1139 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1140 (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1141 continue;
1142 /* We are only concerned about sockets for a different zone */
1143 if (connp->conn_zoneid == tconn->conn_zoneid)
1144 continue;
1145 /* If both are bound to different specific addrs, ok */
1146 if (connp->conn_laddr_v4 != INADDR_ANY &&
1147 tconn->conn_laddr_v4 != INADDR_ANY &&
1148 connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1149 continue;
1150 /* These two conflict; fail */
1151 break;
1152 }
1153 mutex_exit(&connfp->connf_lock);
1154 return (tconn != NULL);
1155 }
1156
1157 static boolean_t
1158 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1159 {
1160 connf_t *connfp;
1161 conn_t *tconn;
1162
1163 connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1164 mutex_enter(&connfp->connf_lock);
1165 for (tconn = connfp->connf_head; tconn != NULL;
1166 tconn = tconn->conn_next) {
1167 /* We don't allow v4 fallback for v6 raw socket */
1168 if (connp->conn_family != tconn->conn_family)
1169 continue;
1170 /* If neither is exempt, then there's no conflict */
1171 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1172 (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1173 continue;
1174 /* We are only concerned about sockets for a different zone */
1175 if (connp->conn_zoneid == tconn->conn_zoneid)
1176 continue;
1177 /* If both are bound to different addrs, ok */
1178 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1179 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1180 !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1181 &tconn->conn_laddr_v6))
1182 continue;
1183 /* These two conflict; fail */
1184 break;
1185 }
1186 mutex_exit(&connfp->connf_lock);
1187 return (tconn != NULL);
1188 }
1189
1190 /*
1191 * (v4, v6) bind hash insertion routines
1192 * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1193 */
1194
1195 int
1196 ipcl_bind_insert(conn_t *connp)
1197 {
1198 if (connp->conn_ipversion == IPV6_VERSION)
1199 return (ipcl_bind_insert_v6(connp));
1200 else
1201 return (ipcl_bind_insert_v4(connp));
1202 }
1203
/*
 * Insert an IPv4 conn into the appropriate bind-time fanout table
 * based on its protocol.  The caller has already set conn_proto,
 * conn_laddr_v4/conn_laddr_v6 and conn_lport.  Returns 0 or an errno.
 */
int
ipcl_bind_insert_v4(conn_t *connp)
{
	connf_t *connfp;
	int ret = 0;
	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
	uint16_t lport = connp->conn_lport;
	uint8_t protocol = connp->conn_proto;

	/* IP tunnels are hashed by address pair only; no ports. */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	default:
		/*
		 * Portless protocols check MAC-exemption conflicts here;
		 * UDP/TCP/SCTP do that up in the transport layer.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		/*
		 * UDP hashes on local port; every other protocol shares
		 * a single per-protocol fanout bucket.
		 */
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		/*
		 * Fully-specified conns go on the connected list,
		 * specific local addresses on the bound list, and
		 * INADDR_ANY binds on the wildcard list.
		 */
		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			ipcl_hash_insert_bound(connfp, connp);
		} else {
			ipcl_hash_insert_wildcard(connfp, connp);
		}
		/* An RSVP bind changes the input function on all ills. */
		if (protocol == IPPROTO_RSVP)
			ill_set_inputfn_all(ipst);
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (connp->conn_laddr_v4 != INADDR_ANY) {
			ipcl_hash_insert_bound(connfp, connp);
		} else {
			ipcl_hash_insert_wildcard(connfp, connp);
		}
		/* If a cluster listen hook is registered, notify it. */
		if (cl_inet_listen != NULL) {
			ASSERT(connp->conn_ipversion == IPV4_VERSION);
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, AF_INET,
			    (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		/* SCTP raw sockets use their own fanout and rules. */
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1268
/*
 * IPv6 counterpart of ipcl_bind_insert_v4(): insert an IPv6 conn into
 * the appropriate bind-time fanout table based on its protocol.
 * Returns 0 or an errno.
 */
int
ipcl_bind_insert_v6(conn_t *connp)
{
	connf_t *connfp;
	int ret = 0;
	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
	uint16_t lport = connp->conn_lport;
	uint8_t protocol = connp->conn_proto;

	/* IP tunnels are hashed by address pair only; no ports. */
	if (IPCL_IS_IPTUN(connp)) {
		return (ipcl_iptun_hash_insert_v6(connp, ipst));
	}

	switch (protocol) {
	default:
		/*
		 * Portless protocols check MAC-exemption conflicts here;
		 * UDP/TCP/SCTP do that up in the transport layer.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		/* UDP hashes on local port; others share a proto bucket. */
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/*
		 * Connected / bound / wildcard list selection, as in
		 * the v4 routine, but using the v6 address tests.
		 */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			ipcl_hash_insert_bound(connfp, connp);
		} else {
			ipcl_hash_insert_wildcard(connfp, connp);
		}
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			ipcl_hash_insert_bound(connfp, connp);
		} else {
			ipcl_hash_insert_wildcard(connfp, connp);
		}
		/*
		 * If a cluster listen hook is registered, notify it with
		 * the address family matching the conn's IP version (an
		 * AF_INET6 socket may be bound to a v4 address).
		 */
		if (cl_inet_listen != NULL) {
			sa_family_t addr_family;
			uint8_t *laddrp;

			if (connp->conn_ipversion == IPV6_VERSION) {
				addr_family = AF_INET6;
				laddrp =
				    (uint8_t *)&connp->conn_bound_addr_v6;
			} else {
				addr_family = AF_INET;
				laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
			}
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		/* SCTP raw sockets use their own fanout and rules. */
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1341
1342 /*
1343 * ipcl_conn_hash insertion routines.
1344 * The caller has already set conn_proto and the addresses/ports in the conn_t.
1345 */
1346
1347 int
1348 ipcl_conn_insert(conn_t *connp)
1349 {
1350 if (connp->conn_ipversion == IPV6_VERSION)
1351 return (ipcl_conn_insert_v6(connp));
1352 else
1353 return (ipcl_conn_insert_v4(connp));
1354 }
1355
/*
 * Insert a fully-specified IPv4 conn into the connection fanout,
 * checking for a duplicate 5-tuple first.  Returns 0 or an errno
 * (EADDRINUSE on a duplicate).
 */
int
ipcl_conn_insert_v4(conn_t *connp)
{
	connf_t *connfp;
	conn_t *tconnp;
	int ret = 0;
	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
	uint16_t lport = connp->conn_lport;
	uint8_t protocol = connp->conn_proto;

	/* IP tunnels are hashed by address pair only; no ports. */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:
		/*
		 * For TCP, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * INADDR_LOOPBACK as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_faddr_v4,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
			    connp->conn_faddr_v4, connp->conn_laddr_v4,
			    connp->conn_ports) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * connf_lock must be dropped around the removal
			 * since IPCL_HASH_REMOVE takes the lock of the
			 * fanout connp currently belongs to.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);
		ASSERT(connp->conn_recvicmp != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings. For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic. For all others, it's
		 * done here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		/* UDP hashes on local port; others share a proto bucket. */
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		/* Connected / bound / wildcard list selection. */
		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			ipcl_hash_insert_bound(connfp, connp);
		} else {
			ipcl_hash_insert_wildcard(connfp, connp);
		}
		break;
	}

	return (ret);
}
1453
/*
 * IPv6 counterpart of ipcl_conn_insert_v4().  In addition to the
 * 5-tuple, TCP duplicate detection also considers conn_bound_if so
 * that conns bound to different interfaces can share a tuple.
 * Returns 0 or an errno (EADDRINUSE on a duplicate).
 */
int
ipcl_conn_insert_v6(conn_t *connp)
{
	connf_t *connfp;
	conn_t *tconnp;
	int ret = 0;
	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
	uint16_t lport = connp->conn_lport;
	uint8_t protocol = connp->conn_proto;
	uint_t ifindex = connp->conn_bound_if;

	/* IP tunnels are hashed by address pair only; no ports. */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert_v6(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:

		/*
		 * For tcp, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * ipv6_loopback as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			/* NOTE: need to match zoneid. Bug in onnv-gate */
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
			    connp->conn_faddr_v6, connp->conn_laddr_v6,
			    connp->conn_ports) &&
			    (tconnp->conn_bound_if == 0 ||
			    tconnp->conn_bound_if == ifindex) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * connf_lock must be dropped around the removal
			 * since IPCL_HASH_REMOVE takes the lock of the
			 * fanout connp currently belongs to.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/* The raw socket may already be bound; unhash it first. */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Portless protocols check MAC-exemption conflicts here;
		 * ported transports do it in their own binding logic.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		/* UDP hashes on local port; others share a proto bucket. */
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/* Connected / bound / wildcard list selection. */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			ipcl_hash_insert_bound(connfp, connp);
		} else {
			ipcl_hash_insert_wildcard(connfp, connp);
		}
		break;
	}

	return (ret);
}
1541
1542 /*
1543 * v4 packet classifying function. looks up the fanout table to
1544 * find the conn, the packet belongs to. returns the conn with
1545 * the reference held, null otherwise.
1546 *
1547 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1548 * Lookup" comment block are applied. Labels are also checked as described
1549 * above. If the packet is from the inside (looped back), and is from the same
1550 * zone, then label checks are omitted.
1551 */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ipha_t *ipha;
	connf_t *connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	uint32_t ports;
	conn_t *connp;
	uint16_t *up;
	zoneid_t zoneid = ira->ira_zoneid;

	ipha = (ipha_t *)mp->b_rptr;
	/*
	 * Point at the port pair following the IP header; the same
	 * pointer is used by both the TCP and UDP cases below.
	 */
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		/* Both ports combined form the conn-hash key. */
		ports = *(uint32_t *)up;
		/*
		 * First pass: look for a fully-bound (connected) TCP
		 * conn.  A conn matches if it is in the packet's zone,
		 * is an all-zones conn, or is a MAC-exempt conn when
		 * the packet allows exemption on a shared address.
		 */
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);
		/*
		 * Second pass: no connected conn, so look for a
		 * listener in the bind fanout keyed by local port.
		 */
		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_local.
		 *
		 * Note that we don't check tsol_receive_local for
		 * the connected case.
		 */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		/* Single pass over the UDP fanout keyed by local port. */
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
				break;
		}

		/* On labeled systems the packet's label must be acceptable. */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);

		break;

	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		/* Tunneled packets: classify against tunnel links. */
		return (ipcl_iptun_classify_v4(&ipha->ipha_src,
		    &ipha->ipha_dst, ipst));
	}

	/* No match: NULL tells the caller to take the no-conn path. */
	return (NULL);
}
1691
/*
 * IPv6 counterpart of ipcl_classify_v4(): map an inbound IPv6 packet
 * to its conn.  Returns the conn with a reference held, or NULL.
 */
conn_t *
ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ip6_t *ip6h;
	connf_t *connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	tcpha_t *tcpha;
	uint32_t ports;
	conn_t *connp;
	uint16_t *up;
	zoneid_t zoneid = ira->ira_zoneid;

	ip6h = (ip6_t *)mp->b_rptr;

	switch (protocol) {
	case IPPROTO_TCP:
		/* Both ports combined form the conn-hash key. */
		tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
		up = &tcpha->tha_lport;
		ports = *(uint32_t *)up;

		/*
		 * First pass: look for a fully-bound (connected) TCP
		 * conn that matches the packet's zone, is all-zones,
		 * or is MAC exempt when the packet allows it.
		 */
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH_V6(connp, protocol,
			    ip6h->ip6_src, ip6h->ip6_dst, ports) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		/* Second pass: look for a listener in the bind fanout. */
		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH_V6(connp, protocol,
			    ip6h->ip6_dst, lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/* On labeled systems the packet's label must be acceptable. */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		/* Single pass over the UDP fanout keyed by local port. */
		up = (uint16_t *)&mp->b_rptr[hdr_len];
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
			    fport, ip6h->ip6_src) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/* On labeled systems the packet's label must be acceptable. */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		break;
	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		/* Tunneled packets: classify against tunnel links. */
		return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
		    &ip6h->ip6_dst, ipst));
	}

	/* No match: NULL tells the caller to take the no-conn path. */
	return (NULL);
}
1826
1827 /*
1828 * wrapper around ipcl_classify_(v4,v6) routines.
1829 */
1830 conn_t *
1831 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1832 {
1833 if (ira->ira_flags & IRAF_IS_IPV4) {
1834 return (ipcl_classify_v4(mp, ira->ira_protocol,
1835 ira->ira_ip_hdr_length, ira, ipst));
1836 } else {
1837 return (ipcl_classify_v6(mp, ira->ira_protocol,
1838 ira->ira_ip_hdr_length, ira, ipst));
1839 }
1840 }
1841
1842 /*
1843 * Only used to classify SCTP RAW sockets
1844 */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
    ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	connf_t *connfp;
	conn_t *connp;
	in_port_t lport;
	int ipversion;
	const void *dst;
	zoneid_t zoneid = ira->ira_zoneid;

	/* Local port is the second half of the combined ports word. */
	lport = ((uint16_t *)&ports)[1];
	if (ira->ira_flags & IRAF_IS_IPV4) {
		dst = (const void *)&ipha->ipha_dst;
		ipversion = IPV4_VERSION;
	} else {
		dst = (const void *)&ip6h->ip6_dst;
		ipversion = IPV6_VERSION;
	}

	/*
	 * First pass: raw fanout bucket for this port.  A conn with a
	 * specific foreign address must match the full tuple; otherwise
	 * a bind match (local address + port) suffices.
	 */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    ipha->ipha_src, ipha->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ip6h->ip6_src, ip6h->ip6_dst, ports))
					continue;
			}
		} else {
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    ipha->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ip6h->ip6_dst, lport))
					continue;
			}
		}

		/* Zone check: same zone, all-zones, or MAC exempt. */
		if (connp->conn_zoneid == zoneid ||
		    connp->conn_allzones ||
		    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
		    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
		    (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
			break;
	}

	/* On labeled systems the packet's label must be acceptable. */
	if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
	    !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	if (connp != NULL)
		goto found;	/* connf_lock still held; dropped at found */
	mutex_exit(&connfp->connf_lock);

	/* Try to look for a wildcard SCTP RAW socket match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IPCL_ZONE_MATCH(connp, zoneid))
			continue;

		if (ipversion == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	/*
	 * Both paths arrive here with connfp->connf_lock held, so the
	 * reference is taken before the conn can leave the list.
	 */
	ASSERT(connp != NULL);
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}
1948
1949 /* ARGSUSED */
static int
tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t *itc = (itc_t *)buf;
	conn_t *connp = &itc->itc_conn;
	tcp_t *tcp = (tcp_t *)&itc[1];	/* tcp_t is co-allocated after itc_t */

	bzero(connp, sizeof (conn_t));
	bzero(tcp, sizeof (tcp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
	tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
	if (tcp->tcp_timercache == NULL)
		return (ENOMEM);
	connp->conn_tcp = tcp;
	connp->conn_flags = IPCL_TCPCONN;
	connp->conn_proto = IPPROTO_TCP;
	tcp->tcp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);

	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL) {
		/* Release the timer allocated above before failing. */
		tcp_timermp_free(tcp);
		return (ENOMEM);
	}
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
1982
1983 /* ARGSUSED */
static void
tcp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t *itc = (itc_t *)buf;
	conn_t *connp = &itc->itc_conn;
	tcp_t *tcp = (tcp_t *)&itc[1];

	/* Sanity: the conn/tcp cross-links set by the constructor hold. */
	ASSERT(connp->conn_flags & IPCL_TCPCONN);
	ASSERT(tcp->tcp_connp == connp);
	ASSERT(connp->conn_tcp == tcp);
	tcp_timermp_free(tcp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	cv_destroy(&connp->conn_sq_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2008
2009 /* ARGSUSED */
2010 static int
2011 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2012 {
2013 itc_t *itc = (itc_t *)buf;
2014 conn_t *connp = &itc->itc_conn;
2015
2016 bzero(connp, sizeof (conn_t));
2017 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2018 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2019 connp->conn_flags = IPCL_IPCCONN;
2020 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2021
2022 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2023 if (connp->conn_ixa == NULL)
2024 return (ENOMEM);
2025 connp->conn_ixa->ixa_refcnt = 1;
2026 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2027 return (0);
2028 }
2029
2030 /* ARGSUSED */
2031 static void
2032 ip_conn_destructor(void *buf, void *cdrarg)
2033 {
2034 itc_t *itc = (itc_t *)buf;
2035 conn_t *connp = &itc->itc_conn;
2036
2037 ASSERT(connp->conn_flags & IPCL_IPCCONN);
2038 ASSERT(connp->conn_priv == NULL);
2039 mutex_destroy(&connp->conn_lock);
2040 cv_destroy(&connp->conn_cv);
2041 rw_destroy(&connp->conn_ilg_lock);
2042
2043 /* Can be NULL if constructor failed */
2044 if (connp->conn_ixa != NULL) {
2045 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2046 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2047 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2048 ixa_refrele(connp->conn_ixa);
2049 }
2050 }
2051
2052 /* ARGSUSED */
2053 static int
2054 udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2055 {
2056 itc_t *itc = (itc_t *)buf;
2057 conn_t *connp = &itc->itc_conn;
2058 udp_t *udp = (udp_t *)&itc[1];
2059
2060 bzero(connp, sizeof (conn_t));
2061 bzero(udp, sizeof (udp_t));
2062
2063 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2064 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2065 connp->conn_udp = udp;
2066 connp->conn_flags = IPCL_UDPCONN;
2067 connp->conn_proto = IPPROTO_UDP;
2068 udp->udp_connp = connp;
2069 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2070 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2071 if (connp->conn_ixa == NULL)
2072 return (ENOMEM);
2073 connp->conn_ixa->ixa_refcnt = 1;
2074 connp->conn_ixa->ixa_protocol = connp->conn_proto;
2075 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2076 return (0);
2077 }
2078
2079 /* ARGSUSED */
2080 static void
2081 udp_conn_destructor(void *buf, void *cdrarg)
2082 {
2083 itc_t *itc = (itc_t *)buf;
2084 conn_t *connp = &itc->itc_conn;
2085 udp_t *udp = (udp_t *)&itc[1];
2086
2087 ASSERT(connp->conn_flags & IPCL_UDPCONN);
2088 ASSERT(udp->udp_connp == connp);
2089 ASSERT(connp->conn_udp == udp);
2090 mutex_destroy(&connp->conn_lock);
2091 cv_destroy(&connp->conn_cv);
2092 rw_destroy(&connp->conn_ilg_lock);
2093
2094 /* Can be NULL if constructor failed */
2095 if (connp->conn_ixa != NULL) {
2096 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2097 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2098 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2099 ixa_refrele(connp->conn_ixa);
2100 }
2101 }
2102
2103 /* ARGSUSED */
2104 static int
2105 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2106 {
2107 itc_t *itc = (itc_t *)buf;
2108 conn_t *connp = &itc->itc_conn;
2109 icmp_t *icmp = (icmp_t *)&itc[1];
2110
2111 bzero(connp, sizeof (conn_t));
2112 bzero(icmp, sizeof (icmp_t));
2113
2114 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2115 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2116 connp->conn_icmp = icmp;
2117 connp->conn_flags = IPCL_RAWIPCONN;
2118 connp->conn_proto = IPPROTO_ICMP;
2119 icmp->icmp_connp = connp;
2120 rw_init(&icmp->icmp_bpf_lock, NULL, RW_DEFAULT, NULL);
2121 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2122 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2123 if (connp->conn_ixa == NULL)
2124 return (ENOMEM);
2125 connp->conn_ixa->ixa_refcnt = 1;
2126 connp->conn_ixa->ixa_protocol = connp->conn_proto;
2127 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2128 return (0);
2129 }
2130
2131 /* ARGSUSED */
2132 static void
2133 rawip_conn_destructor(void *buf, void *cdrarg)
2134 {
2135 itc_t *itc = (itc_t *)buf;
2136 conn_t *connp = &itc->itc_conn;
2137 icmp_t *icmp = (icmp_t *)&itc[1];
2138
2139 ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2140 ASSERT(icmp->icmp_connp == connp);
2141 ASSERT(connp->conn_icmp == icmp);
2142 mutex_destroy(&connp->conn_lock);
2143 cv_destroy(&connp->conn_cv);
2144 rw_destroy(&connp->conn_ilg_lock);
2145 rw_destroy(&icmp->icmp_bpf_lock);
2146
2147 /* Can be NULL if constructor failed */
2148 if (connp->conn_ixa != NULL) {
2149 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2150 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2151 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2152 ixa_refrele(connp->conn_ixa);
2153 }
2154 }
2155
2156 /* ARGSUSED */
2157 static int
2158 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2159 {
2160 itc_t *itc = (itc_t *)buf;
2161 conn_t *connp = &itc->itc_conn;
2162 rts_t *rts = (rts_t *)&itc[1];
2163
2164 bzero(connp, sizeof (conn_t));
2165 bzero(rts, sizeof (rts_t));
2166
2167 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2168 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2169 connp->conn_rts = rts;
2170 connp->conn_flags = IPCL_RTSCONN;
2171 rts->rts_connp = connp;
2172 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2173 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2174 if (connp->conn_ixa == NULL)
2175 return (ENOMEM);
2176 connp->conn_ixa->ixa_refcnt = 1;
2177 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2178 return (0);
2179 }
2180
2181 /* ARGSUSED */
2182 static void
2183 rts_conn_destructor(void *buf, void *cdrarg)
2184 {
2185 itc_t *itc = (itc_t *)buf;
2186 conn_t *connp = &itc->itc_conn;
2187 rts_t *rts = (rts_t *)&itc[1];
2188
2189 ASSERT(connp->conn_flags & IPCL_RTSCONN);
2190 ASSERT(rts->rts_connp == connp);
2191 ASSERT(connp->conn_rts == rts);
2192 mutex_destroy(&connp->conn_lock);
2193 cv_destroy(&connp->conn_cv);
2194 rw_destroy(&connp->conn_ilg_lock);
2195
2196 /* Can be NULL if constructor failed */
2197 if (connp->conn_ixa != NULL) {
2198 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2199 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2200 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2201 ixa_refrele(connp->conn_ixa);
2202 }
2203 }
2204
/*
 * Called as part of ipcl_conn_destroy to assert and clear any pointers
 * in the conn_t.
 *
 * Below we list all the pointers in the conn_t as a documentation aid.
 * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
 * If you add any pointers to the conn_t please add an ASSERT here
 * and #ifdef it out if it can't be actually asserted to be NULL.
 * In any case, we bzero most of the conn_t at the end of the function.
 */
void
ipcl_conn_cleanup(conn_t *connp)
{
	ip_xmit_attr_t	*ixa;

	ASSERT(connp->conn_latch == NULL);
	ASSERT(connp->conn_latch_in_policy == NULL);
	ASSERT(connp->conn_latch_in_action == NULL);
#ifdef notdef
	ASSERT(connp->conn_rq == NULL);
	ASSERT(connp->conn_wq == NULL);
#endif
	ASSERT(connp->conn_cred == NULL);
	ASSERT(connp->conn_g_fanout == NULL);
	ASSERT(connp->conn_g_next == NULL);
	ASSERT(connp->conn_g_prev == NULL);
	ASSERT(connp->conn_policy == NULL);
	ASSERT(connp->conn_fanout == NULL);
	ASSERT(connp->conn_next == NULL);
	ASSERT(connp->conn_prev == NULL);
	ASSERT(connp->conn_oper_pending_ill == NULL);
	ASSERT(connp->conn_ilg == NULL);
	ASSERT(connp->conn_drain_next == NULL);
	ASSERT(connp->conn_drain_prev == NULL);
#ifdef notdef
	/* conn_idl is not cleared when removed from idl list */
	ASSERT(connp->conn_idl == NULL);
#endif
	ASSERT(connp->conn_ipsec_opt_mp == NULL);
#ifdef notdef
	/* conn_netstack is cleared by the caller; needed by ixa_cleanup */
	ASSERT(connp->conn_netstack == NULL);
#endif

	ASSERT(connp->conn_helper_info == NULL);
	/* The transmit attributes always exist once constructed */
	ASSERT(connp->conn_ixa != NULL);
	ixa = connp->conn_ixa;
	ASSERT(ixa->ixa_refcnt == 1);
	/* Need to preserve ixa_protocol */
	ixa_cleanup(ixa);
	ixa->ixa_flags = 0;

	/*
	 * Clear out the conn_t fields that are not preserved.
	 * Everything from conn_start_clr to the end of the conn_t is
	 * zeroed; fields laid out before conn_start_clr (locks, cv's,
	 * conn_ixa, etc.) are left intact.
	 */
	bzero(&connp->conn_start_clr,
	    sizeof (conn_t) -
	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
}
2262
2263 /*
2264 * All conns are inserted in a global multi-list for the benefit of
2265 * walkers. The walk is guaranteed to walk all open conns at the time
2266 * of the start of the walk exactly once. This property is needed to
2267 * achieve some cleanups during unplumb of interfaces. This is achieved
2268 * as follows.
2269 *
2270 * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2271 * call the insert and delete functions below at creation and deletion
2272 * time respectively. The conn never moves or changes its position in this
2273 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2274 * won't increase due to walkers, once the conn deletion has started. Note
2275 * that we can't remove the conn from the global list and then wait for
2276 * the refcnt to drop to zero, since walkers would then see a truncated
2277 * list. CONN_INCIPIENT ensures that walkers don't start looking at
2278 * conns until ip_open is ready to make them globally visible.
2279 * The global round robin multi-list locks are held only to get the
2280 * next member/insertion/deletion and contention should be negligible
2281 * if the multi-list is much greater than the number of cpus.
2282 */
/*
 * Insert connp at the head of one of the global hash buckets so that
 * walkers (see ipcl_walk) can find it.  The conn is marked
 * CONN_INCIPIENT so walkers ignore it until ip_open makes it visible.
 */
void
ipcl_globalhash_insert(conn_t *connp)
{
	int		index;
	struct connf_s	*connfp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	/*
	 * No need for atomic here. Approximate even distribution
	 * in the global lists is sufficient.
	 */
	ipst->ips_conn_g_index++;
	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);

	connp->conn_g_prev = NULL;
	/*
	 * Mark as INCIPIENT, so that walkers will ignore this
	 * for now, till ip_open is ready to make it visible globally.
	 */
	connp->conn_state_flags |= CONN_INCIPIENT;

	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
	/* Insert at the head of the list */
	mutex_enter(&connfp->connf_lock);
	connp->conn_g_next = connfp->connf_head;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp;
	connfp->connf_head = connp;

	/* The fanout bucket this conn points to */
	connp->conn_g_fanout = connfp;

	mutex_exit(&connfp->connf_lock);
}
2317
/*
 * Remove connp from the global hash bucket it was inserted into by
 * ipcl_globalhash_insert().  A conn that was never inserted has a NULL
 * conn_g_fanout and is left alone.
 */
void
ipcl_globalhash_remove(conn_t *connp)
{
	struct connf_s	*connfp;

	/*
	 * We were never inserted in the global multi list.
	 * IPCL_NONE variety is never inserted in the global multilist
	 * since it is presumed to not need any cleanup and is transient.
	 */
	if (connp->conn_g_fanout == NULL)
		return;

	connfp = connp->conn_g_fanout;
	mutex_enter(&connfp->connf_lock);
	/* Doubly-linked-list unlink; fix the bucket head if we are first */
	if (connp->conn_g_prev != NULL)
		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
	else
		connfp->connf_head = connp->conn_g_next;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
	mutex_exit(&connfp->connf_lock);

	/* Better to stumble on a null pointer than to corrupt memory */
	connp->conn_g_next = NULL;
	connp->conn_g_prev = NULL;
	connp->conn_g_fanout = NULL;
}
2346
/*
 * Walk the list of all conn_t's in the system, calling the function provided
 * with the specified argument for each.
 * Applies to both IPv4 and IPv6.
 *
 * CONNs may hold pointers to ills (conn_dhcpinit_ill and
 * conn_oper_pending_ill). To guard against stale pointers
 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
 * unplumbed or removed. New conn_t's that are created while we are walking
 * may be missed by this walk, because they are not necessarily inserted
 * at the tail of the list. They are new conn_t's and thus don't have any
 * stale pointers. The CONN_CLOSING flag ensures that no new reference
 * is created to the struct that is going away.
 */
void
ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
{
	int	i;
	conn_t	*connp;
	conn_t	*prev_connp;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		prev_connp = NULL;
		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
		while (connp != NULL) {
			mutex_enter(&connp->conn_lock);
			/* Skip conns that are going away or not yet visible */
			if (connp->conn_state_flags &
			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
				mutex_exit(&connp->conn_lock);
				connp = connp->conn_g_next;
				continue;
			}
			/*
			 * Take a reference so connp stays on the list and
			 * conn_g_next remains valid while we drop the bucket
			 * lock to call func.
			 */
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&connp->conn_lock);
			mutex_exit(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			(*func)(connp, arg);
			/*
			 * Release the previously visited conn here, with no
			 * locks held: dropping the last reference may tear
			 * the conn down.
			 */
			if (prev_connp != NULL)
				CONN_DEC_REF(prev_connp);
			mutex_enter(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			prev_connp = connp;
			connp = connp->conn_g_next;
		}
		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		/* Drop the reference on the last conn visited in this bucket */
		if (prev_connp != NULL)
			CONN_DEC_REF(prev_connp);
	}
}
2397
/*
 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
 * held; caller must call CONN_DEC_REF. Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t	ports;
	uint16_t	*pports = (uint16_t *)&ports;
	connf_t		*connfp;
	conn_t		*tconnp;
	boolean_t	zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone. Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones.
	 */
	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));

	/* Pack both ports into the single 32-bit key the match macro uses */
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* Match with src and dst swapped: this is a reverse lookup */
		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			/* The peer must be a different conn than ourselves */
			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2447
/*
 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
 * held; caller must call CONN_DEC_REF. Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t	ports;
	uint16_t	*pports = (uint16_t *)&ports;
	connf_t		*connfp;
	conn_t		*tconnp;
	boolean_t	zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone. Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones. We
	 * don't do Zone check for link local address(es) because the
	 * current Zone implementation treats each link local address as
	 * being unique per system node, i.e. they belong to global Zone.
	 */
	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));

	/* Pack both ports into the single 32-bit key the match macro uses */
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* We skip conn_bound_if check here as this is loopback tcp */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			/* The peer must be a different conn than ourselves */
			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2501
/*
 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
 * Returns with conn reference held. Caller must call CONN_DEC_REF.
 * Only checks for connected entries i.e. no INADDR_ANY checks.
 * min_state is the lowest TCP state that still qualifies as a match.
 */
conn_t *
ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
    ip_stack_t *ipst)
{
	uint32_t	ports;
	uint16_t	*pports;
	connf_t		*connfp;
	conn_t		*tconnp;

	/* Pack both ports into the single 32-bit key the match macro uses */
	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* Match with src and dst swapped: this is a reverse lookup */
		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state >= min_state) {

			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2539
/*
 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
 * Returns with conn reference held. Caller must call CONN_DEC_REF.
 * Only checks for connected entries i.e. no INADDR_ANY checks.
 * Match on ifindex in addition to addresses.
 */
conn_t *
ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
    uint_t ifindex, ip_stack_t *ipst)
{
	tcp_t		*tcp;
	uint32_t	ports;
	uint16_t	*pports;
	connf_t		*connfp;
	conn_t		*tconnp;

	/* Pack both ports into the single 32-bit key the match macro uses */
	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		tcp = tconnp->conn_tcp;
		/*
		 * Reverse match (src/dst swapped); conn_bound_if == 0 means
		 * the conn is not bound to a particular interface, so any
		 * ifindex matches it.
		 */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tcp->tcp_state >= min_state &&
		    (tconnp->conn_bound_if == 0 ||
		    tconnp->conn_bound_if == ifindex)) {

			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2582
/*
 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
 * a listener when changing state.  Returns with a conn reference held;
 * caller must call CONN_DEC_REF.
 */
conn_t *
ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	connf_t	*bind_connfp;
	conn_t	*connp;
	tcp_t	*tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (laddr == 0)
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		/* Only conns with no tcp_listener attached qualify */
		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (tcp->tcp_listener == NULL)) {
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}
2620
/*
 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
 * a listener when changing state.  Returns with a conn reference held;
 * caller must call CONN_DEC_REF.
 */
conn_t *
ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
    zoneid_t zoneid, ip_stack_t *ipst)
{
	connf_t	*bind_connfp;
	conn_t	*connp = NULL;
	tcp_t	*tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		/*
		 * conn_bound_if == 0 means the conn is not bound to a
		 * particular interface; only conns with no tcp_listener
		 * attached qualify.
		 */
		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (connp->conn_bound_if == 0 ||
		    connp->conn_bound_if == ifindex) &&
		    tcp->tcp_listener == NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}
2660
/*
 * ipcl_get_next_conn
 *	get the next entry in the conn global list
 *	and put a reference on the next_conn.
 *	decrement the reference on the current conn.
 *
 * This is an iterator based walker function that also provides for
 * some selection by the caller. It walks through the conn_hash bucket
 * searching for the next valid connp in the list, and selects connections
 * that are neither closed nor condemned. It also REFHOLDS the conn
 * thus ensuring that the conn exists when the caller uses the conn.
 *
 * connp is the current position; pass NULL to start at the bucket head.
 * conn_flags selects which conn varieties (conn_flags bits) qualify.
 */
conn_t *
ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
{
	conn_t	*next_connp;

	if (connfp == NULL)
		return (NULL);

	mutex_enter(&connfp->connf_lock);

	/* Resume after the current conn, or start at the bucket head */
	next_connp = (connp == NULL) ?
	    connfp->connf_head : connp->conn_g_next;

	while (next_connp != NULL) {
		mutex_enter(&next_connp->conn_lock);
		if (!(next_connp->conn_flags & conn_flags) ||
		    (next_connp->conn_state_flags &
		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
			/*
			 * This conn has been condemned or
			 * is closing, or the flags don't match
			 */
			mutex_exit(&next_connp->conn_lock);
			next_connp = next_connp->conn_g_next;
			continue;
		}
		/* Hold a reference for the caller before dropping the locks */
		CONN_INC_REF_LOCKED(next_connp);
		mutex_exit(&next_connp->conn_lock);
		break;
	}

	mutex_exit(&connfp->connf_lock);

	/* Release the caller's hold on the previous position */
	if (connp != NULL)
		CONN_DEC_REF(connp);

	return (next_connp);
}
2711
2712 #ifdef CONN_DEBUG
2713 /*
2714 * Trace of the last NBUF refhold/refrele
2715 */
2716 int
2717 conn_trace_ref(conn_t *connp)
2718 {
2719 int last;
2720 conn_trace_t *ctb;
2721
2722 ASSERT(MUTEX_HELD(&connp->conn_lock));
2723 last = connp->conn_trace_last;
2724 last++;
2725 if (last == CONN_TRACE_MAX)
2726 last = 0;
2727
2728 ctb = &connp->conn_trace_buf[last];
2729 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2730 connp->conn_trace_last = last;
2731 return (1);
2732 }
2733
2734 int
2735 conn_untrace_ref(conn_t *connp)
2736 {
2737 int last;
2738 conn_trace_t *ctb;
2739
2740 ASSERT(MUTEX_HELD(&connp->conn_lock));
2741 last = connp->conn_trace_last;
2742 last++;
2743 if (last == CONN_TRACE_MAX)
2744 last = 0;
2745
2746 ctb = &connp->conn_trace_buf[last];
2747 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2748 connp->conn_trace_last = last;
2749 return (1);
2750 }
2751 #endif