1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
24 */
25
26 /*
27 * IP PACKET CLASSIFIER
28 *
29 * The IP packet classifier provides mapping between IP packets and persistent
30 * connection state for connection-oriented protocols. It also provides
31 * interface for managing connection states.
32 *
33 * The connection state is kept in conn_t data structure and contains, among
34 * other things:
35 *
36 * o local/remote address and ports
37 * o Transport protocol
38 * o squeue for the connection (for TCP only)
39 * o reference counter
40 * o Connection state
41 * o hash table linkage
42 * o interface/ire information
43 * o credentials
44 * o ipsec policy
45 * o send and receive functions.
46 * o mutex lock.
47 *
48 * Connections use a reference counting scheme. They are freed when the
49 * reference counter drops to zero. A reference is incremented when connection
50 * is placed in a list or table, when incoming packet for the connection arrives
51 * and when connection is processed via squeue (squeue processing may be
52 * asynchronous and the reference protects the connection from being destroyed
53 * before its processing is finished).
54 *
55 * conn_recv is used to pass up packets to the ULP.
56 * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
57 * a listener, and changes to tcp_input_listener as the listener has picked a
58 * good squeue. For other cases it is set to tcp_input_data.
59 *
60 * conn_recvicmp is used to pass up ICMP errors to the ULP.
61 *
62 * Classifier uses several hash tables:
63 *
64 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state
65 * ipcl_bind_fanout: contains all connections in BOUND state
66 * ipcl_proto_fanout: IPv4 protocol fanout
67 * ipcl_proto_fanout_v6: IPv6 protocol fanout
68 * ipcl_udp_fanout: contains all UDP connections
69 * ipcl_iptun_fanout: contains all IP tunnel connections
70 * ipcl_globalhash_fanout: contains all connections
71 *
72 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
73 * which need to view all existing connections.
74 *
75 * All tables are protected by per-bucket locks. When both per-bucket lock and
76 * connection lock need to be held, the per-bucket lock should be acquired
77 * first, followed by the connection lock.
78 *
79 * All functions doing search in one of these tables increment a reference
80 * counter on the connection found (if any). This reference should be dropped
81 * when the caller has finished processing the connection.
82 *
83 *
84 * INTERFACES:
85 * ===========
86 *
87 * Connection Lookup:
88 * ------------------
89 *
90 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
91 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
92 *
93 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
94 * it can't find any associated connection. If the connection is found, its
95 * reference counter is incremented.
96 *
97 * mp: mblock, containing packet header. The full header should fit
98 * into a single mblock. It should also contain at least full IP
99 * and TCP or UDP header.
100 *
101 * protocol: Either IPPROTO_TCP or IPPROTO_UDP.
102 *
103 * hdr_len: The size of IP header. It is used to find TCP or UDP header in
104 * the packet.
105 *
106 * ira->ira_zoneid: The zone in which the returned connection must be; the
107 * zoneid corresponding to the ire_zoneid on the IRE located for
108 * the packet's destination address.
109 *
110 * ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
111 * IRAF_TX_SHARED_ADDR flags
112 *
113 * For TCP connections, the lookup order is as follows:
114 * 5-tuple {src, dst, protocol, local port, remote port}
115 * lookup in ipcl_conn_fanout table.
116 * 3-tuple {dst, remote port, protocol} lookup in
117 * ipcl_bind_fanout table.
118 *
119 * For UDP connections, a 5-tuple {src, dst, protocol, local port,
120 * remote port} lookup is done on ipcl_udp_fanout. Note that,
 *			these interfaces do not handle cases where a packet
 *			belongs to multiple UDP clients, which is handled in
 *			IP itself.
123 *
124 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
125 * determine which actual zone gets the segment. This is used only in a
126 * labeled environment. The matching rules are:
127 *
128 * - If it's not a multilevel port, then the label on the packet selects
129 * the zone. Unlabeled packets are delivered to the global zone.
130 *
131 * - If it's a multilevel port, then only the zone registered to receive
132 * packets on that port matches.
133 *
134 * Also, in a labeled environment, packet labels need to be checked. For fully
135 * bound TCP connections, we can assume that the packet label was checked
136 * during connection establishment, and doesn't need to be checked on each
137 * packet. For others, though, we need to check for strict equality or, for
138 * multilevel ports, membership in the range or set. This part currently does
139 * a tnrh lookup on each packet, but could be optimized to use cached results
140 * if that were necessary. (SCTP doesn't come through here, but if it did,
141 * we would apply the same rules as TCP.)
142 *
143 * An implication of the above is that fully-bound TCP sockets must always use
144 * distinct 4-tuples; they can't be discriminated by label alone.
145 *
146 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
147 * as there's no connection set-up handshake and no shared state.
148 *
149 * Labels on looped-back packets within a single zone do not need to be
150 * checked, as all processes in the same zone have the same label.
151 *
152 * Finally, for unlabeled packets received by a labeled system, special rules
153 * apply. We consider only the MLP if there is one. Otherwise, we prefer a
154 * socket in the zone whose label matches the default label of the sender, if
155 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the
156 * receiver's label must dominate the sender's default label.
157 *
158 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
159 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
160 * ip_stack);
161 *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
164 * ports are read from the IP and TCP header respectively.
165 *
166 * conn_t *ipcl_lookup_listener_v4(lport, laddr, protocol,
167 * zoneid, ip_stack);
168 * conn_t *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
169 * zoneid, ip_stack);
170 *
171 * Lookup routine to find a listener with the tuple {lport, laddr,
172 * protocol} in the ipcl_bind_fanout table. For IPv6, an additional
173 * parameter interface index is also compared.
174 *
175 * void ipcl_walk(func, arg, ip_stack)
176 *
177 * Apply 'func' to every connection available. The 'func' is called as
178 * (*func)(connp, arg). The walk is non-atomic so connections may be
179 * created and destroyed during the walk. The CONN_CONDEMNED and
180 * CONN_INCIPIENT flags ensure that connections which are newly created
181 * or being destroyed are not selected by the walker.
182 *
183 * Table Updates
184 * -------------
185 *
186 * int ipcl_conn_insert(connp);
187 * int ipcl_conn_insert_v4(connp);
188 * int ipcl_conn_insert_v6(connp);
189 *
190 * Insert 'connp' in the ipcl_conn_fanout.
191 * Arguments :
192 * connp conn_t to be inserted
193 *
194 * Return value :
195 * 0 if connp was inserted
196 * EADDRINUSE if the connection with the same tuple
197 * already exists.
198 *
199 * int ipcl_bind_insert(connp);
200 * int ipcl_bind_insert_v4(connp);
201 * int ipcl_bind_insert_v6(connp);
202 *
203 * Insert 'connp' in ipcl_bind_fanout.
204 * Arguments :
205 * connp conn_t to be inserted
206 *
207 *
208 * void ipcl_hash_remove(connp);
209 *
210 * Removes the 'connp' from the connection fanout table.
211 *
212 * Connection Creation/Destruction
213 * -------------------------------
214 *
215 * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
216 *
217 * Creates a new conn based on the type flag, inserts it into
218 * globalhash table.
219 *
220 * type: This flag determines the type of conn_t which needs to be
221 * created i.e., which kmem_cache it comes from.
222 * IPCL_TCPCONN indicates a TCP connection
223 * IPCL_SCTPCONN indicates a SCTP connection
224 * IPCL_UDPCONN indicates a UDP conn_t.
225 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t.
226 * IPCL_RTSCONN indicates a RTS conn_t.
227 * IPCL_IPCCONN indicates all other connections.
228 *
229 * void ipcl_conn_destroy(connp)
230 *
231 * Destroys the connection state, removes it from the global
232 * connection hash table and frees its memory.
233 */
234
235 #include <sys/types.h>
236 #include <sys/stream.h>
237 #include <sys/stropts.h>
238 #include <sys/sysmacros.h>
239 #include <sys/strsubr.h>
240 #include <sys/strsun.h>
241 #define _SUN_TPI_VERSION 2
242 #include <sys/ddi.h>
243 #include <sys/cmn_err.h>
244 #include <sys/debug.h>
245
246 #include <sys/systm.h>
247 #include <sys/param.h>
248 #include <sys/kmem.h>
249 #include <sys/isa_defs.h>
250 #include <inet/common.h>
251 #include <netinet/ip6.h>
252 #include <netinet/icmp6.h>
253
254 #include <inet/ip.h>
255 #include <inet/ip_if.h>
256 #include <inet/ip_ire.h>
257 #include <inet/ip6.h>
258 #include <inet/ip_ndp.h>
259 #include <inet/ip_impl.h>
260 #include <inet/udp_impl.h>
261 #include <inet/sctp_ip.h>
262 #include <inet/sctp/sctp_impl.h>
263 #include <inet/rawip_impl.h>
264 #include <inet/rts_impl.h>
265 #include <inet/iptun/iptun_impl.h>
266
267 #include <sys/cpuvar.h>
268
269 #include <inet/ipclassifier.h>
270 #include <inet/tcp.h>
271 #include <inet/ipsec_impl.h>
272
273 #include <sys/tsol/tnet.h>
274 #include <sys/sockio.h>
275
/* Old value for compatibility. Settable in /etc/system */
uint_t tcp_conn_hash_size = 0;

/* New value. Zero means choose automatically. Settable in /etc/system */
uint_t ipcl_conn_hash_size = 0;
/* Bytes of physical memory per conn-fanout bucket when auto-sizing */
uint_t ipcl_conn_hash_memfactor = 8192;
/* Cap on the automatically computed conn-fanout size */
uint_t ipcl_conn_hash_maxsize = 82500;

/* bind/udp fanout table size */
uint_t ipcl_bind_fanout_size = 512;
uint_t ipcl_udp_fanout_size = 16384;

/* Raw socket fanout size. Must be a power of 2. */
uint_t ipcl_raw_fanout_size = 256;

/*
 * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
 * expect that most large deployments would have hundreds of tunnels, and
 * thousands in the extreme case.
 */
uint_t ipcl_iptun_fanout_size = 6143;

/*
 * Power of 2^N Primes useful for hashing for N of 0-28,
 * these primes are the nearest prime <= 2^N - 2^(N-2).
 */

#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \
		6143, 12281, 24571, 49139, 98299, 196597, 393209, \
		786431, 1572853, 3145721, 6291449, 12582893, 25165813, \
		50331599, 100663291, 201326557, 0}
307
308 /*
309 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
310 * are aligned on cache lines.
311 */
312 typedef union itc_s {
313 conn_t itc_conn;
314 char itcu_filler[CACHE_ALIGN(conn_s)];
315 } itc_t;
316
317 struct kmem_cache *tcp_conn_cache;
318 struct kmem_cache *ip_conn_cache;
319 extern struct kmem_cache *sctp_conn_cache;
320 struct kmem_cache *udp_conn_cache;
321 struct kmem_cache *rawip_conn_cache;
322 struct kmem_cache *rts_conn_cache;
323
324 extern void tcp_timermp_free(tcp_t *);
325 extern mblk_t *tcp_timermp_alloc(int);
326
327 static int ip_conn_constructor(void *, void *, int);
328 static void ip_conn_destructor(void *, void *);
329
330 static int tcp_conn_constructor(void *, void *, int);
331 static void tcp_conn_destructor(void *, void *);
332
333 static int udp_conn_constructor(void *, void *, int);
334 static void udp_conn_destructor(void *, void *);
335
336 static int rawip_conn_constructor(void *, void *, int);
337 static void rawip_conn_destructor(void *, void *);
338
339 static int rts_conn_constructor(void *, void *, int);
340 static void rts_conn_destructor(void *, void *);
341
342 /*
343 * Global (for all stack instances) init routine
344 */
345 void
346 ipcl_g_init(void)
347 {
348 ip_conn_cache = kmem_cache_create("ip_conn_cache",
349 sizeof (conn_t), CACHE_ALIGN_SIZE,
350 ip_conn_constructor, ip_conn_destructor,
351 NULL, NULL, NULL, 0);
352
353 tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
354 sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
355 tcp_conn_constructor, tcp_conn_destructor,
356 tcp_conn_reclaim, NULL, NULL, 0);
357
358 udp_conn_cache = kmem_cache_create("udp_conn_cache",
359 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
360 udp_conn_constructor, udp_conn_destructor,
361 NULL, NULL, NULL, 0);
362
363 rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
364 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
365 rawip_conn_constructor, rawip_conn_destructor,
366 NULL, NULL, NULL, 0);
367
368 rts_conn_cache = kmem_cache_create("rts_conn_cache",
369 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
370 rts_conn_constructor, rts_conn_destructor,
371 NULL, NULL, NULL, 0);
372 }
373
374 /*
375 * ipclassifier intialization routine, sets up hash tables.
376 */
377 void
378 ipcl_init(ip_stack_t *ipst)
379 {
380 int i;
381 int sizes[] = P2Ps();
382
383 /*
384 * Calculate size of conn fanout table from /etc/system settings
385 */
386 if (ipcl_conn_hash_size != 0) {
387 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
388 } else if (tcp_conn_hash_size != 0) {
389 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
390 } else {
391 extern pgcnt_t freemem;
392
393 ipst->ips_ipcl_conn_fanout_size =
394 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
395
396 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
397 ipst->ips_ipcl_conn_fanout_size =
398 ipcl_conn_hash_maxsize;
399 }
400 }
401
402 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
403 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
404 break;
405 }
406 }
407 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
408 /* Out of range, use the 2^16 value */
409 ipst->ips_ipcl_conn_fanout_size = sizes[16];
410 }
411
412 /* Take values from /etc/system */
413 ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
414 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
415 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
416 ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
417
418 ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
419
420 ipst->ips_ipcl_conn_fanout = kmem_zalloc(
421 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
422
423 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
424 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
425 MUTEX_DEFAULT, NULL);
426 }
427
428 ipst->ips_ipcl_bind_fanout = kmem_zalloc(
429 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
430
431 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
432 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
433 MUTEX_DEFAULT, NULL);
434 }
435
436 ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
437 sizeof (connf_t), KM_SLEEP);
438 for (i = 0; i < IPPROTO_MAX; i++) {
439 mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
440 MUTEX_DEFAULT, NULL);
441 }
442
443 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
444 sizeof (connf_t), KM_SLEEP);
445 for (i = 0; i < IPPROTO_MAX; i++) {
446 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
447 MUTEX_DEFAULT, NULL);
448 }
449
450 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
451 mutex_init(&ipst->ips_rts_clients->connf_lock,
452 NULL, MUTEX_DEFAULT, NULL);
453
454 ipst->ips_ipcl_udp_fanout = kmem_zalloc(
455 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
456 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
457 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
458 MUTEX_DEFAULT, NULL);
459 }
460
461 ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
462 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
463 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
464 mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
465 MUTEX_DEFAULT, NULL);
466 }
467
468 ipst->ips_ipcl_raw_fanout = kmem_zalloc(
469 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
470 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
471 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
472 MUTEX_DEFAULT, NULL);
473 }
474
475 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
476 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
477 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
478 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
479 NULL, MUTEX_DEFAULT, NULL);
480 }
481 }
482
483 void
484 ipcl_g_destroy(void)
485 {
486 kmem_cache_destroy(ip_conn_cache);
487 kmem_cache_destroy(tcp_conn_cache);
488 kmem_cache_destroy(udp_conn_cache);
489 kmem_cache_destroy(rawip_conn_cache);
490 kmem_cache_destroy(rts_conn_cache);
491 }
492
493 /*
494 * All user-level and kernel use of the stack must be gone
495 * by now.
496 */
497 void
498 ipcl_destroy(ip_stack_t *ipst)
499 {
500 int i;
501
502 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
503 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
504 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
505 }
506 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
507 sizeof (connf_t));
508 ipst->ips_ipcl_conn_fanout = NULL;
509
510 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
511 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
512 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
513 }
514 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
515 sizeof (connf_t));
516 ipst->ips_ipcl_bind_fanout = NULL;
517
518 for (i = 0; i < IPPROTO_MAX; i++) {
519 ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
520 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
521 }
522 kmem_free(ipst->ips_ipcl_proto_fanout_v4,
523 IPPROTO_MAX * sizeof (connf_t));
524 ipst->ips_ipcl_proto_fanout_v4 = NULL;
525
526 for (i = 0; i < IPPROTO_MAX; i++) {
527 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
528 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
529 }
530 kmem_free(ipst->ips_ipcl_proto_fanout_v6,
531 IPPROTO_MAX * sizeof (connf_t));
532 ipst->ips_ipcl_proto_fanout_v6 = NULL;
533
534 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
535 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
536 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
537 }
538 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
539 sizeof (connf_t));
540 ipst->ips_ipcl_udp_fanout = NULL;
541
542 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
543 ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
544 mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
545 }
546 kmem_free(ipst->ips_ipcl_iptun_fanout,
547 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
548 ipst->ips_ipcl_iptun_fanout = NULL;
549
550 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
551 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
552 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
553 }
554 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
555 sizeof (connf_t));
556 ipst->ips_ipcl_raw_fanout = NULL;
557
558 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
559 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
560 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
561 }
562 kmem_free(ipst->ips_ipcl_globalhash_fanout,
563 sizeof (connf_t) * CONN_G_HASH_SIZE);
564 ipst->ips_ipcl_globalhash_fanout = NULL;
565
566 ASSERT(ipst->ips_rts_clients->connf_head == NULL);
567 mutex_destroy(&ipst->ips_rts_clients->connf_lock);
568 kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
569 ipst->ips_rts_clients = NULL;
570 }
571
572 /*
573 * conn creation routine. initialize the conn, sets the reference
574 * and inserts it in the global hash table.
575 */
576 conn_t *
577 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
578 {
579 conn_t *connp;
580 struct kmem_cache *conn_cache;
581
582 switch (type) {
583 case IPCL_SCTPCONN:
584 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
585 return (NULL);
586 sctp_conn_init(connp);
587 netstack_hold(ns);
588 connp->conn_netstack = ns;
589 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
590 connp->conn_ixa->ixa_conn_id = (long)connp;
591 ipcl_globalhash_insert(connp);
592 return (connp);
593
594 case IPCL_TCPCONN:
595 conn_cache = tcp_conn_cache;
596 break;
597
598 case IPCL_UDPCONN:
599 conn_cache = udp_conn_cache;
600 break;
601
602 case IPCL_RAWIPCONN:
603 conn_cache = rawip_conn_cache;
604 break;
605
606 case IPCL_RTSCONN:
607 conn_cache = rts_conn_cache;
608 break;
609
610 case IPCL_IPCCONN:
611 conn_cache = ip_conn_cache;
612 break;
613
614 default:
615 conn_cache = NULL;
616 connp = NULL;
617 ASSERT(0);
618 }
619
620 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
621 return (NULL);
622
623 connp->conn_ref = 1;
624 netstack_hold(ns);
625 connp->conn_netstack = ns;
626 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
627 connp->conn_ixa->ixa_conn_id = (long)connp;
628 ipcl_globalhash_insert(connp);
629 return (connp);
630 }
631
/*
 * Tear down a conn whose last reference has been dropped: release
 * credentials, headers, IPsec state, remove it from the global hash,
 * release the netstack hold, and return it to its kmem cache with
 * conn_flags reset to just the type bit so the cache constructor state
 * is consistent on reuse.
 */
void
ipcl_conn_destroy(conn_t *connp)
{
	mblk_t	*mp;
	netstack_t	*ns = connp->conn_netstack;

	ASSERT(!MUTEX_HELD(&connp->conn_lock));
	ASSERT(connp->conn_ref == 0);
	ASSERT(connp->conn_ioctlref == 0);

	DTRACE_PROBE1(conn__destroy, conn_t *, connp);

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
		/* ixa_cred done in ipcl_conn_cleanup below */
	}

	/* Free the cached header template and clear all its bookkeeping. */
	if (connp->conn_ht_iphc != NULL) {
		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
		connp->conn_ht_iphc = NULL;
		connp->conn_ht_iphc_allocated = 0;
		connp->conn_ht_iphc_len = 0;
		connp->conn_ht_ulp = NULL;
		connp->conn_ht_ulp_len = 0;
	}
	ip_pkt_free(&connp->conn_xmit_ipp);

	ipcl_globalhash_remove(connp);

	/* Drop all IPsec latch/policy references held by this conn. */
	if (connp->conn_latch != NULL) {
		IPLATCH_REFRELE(connp->conn_latch);
		connp->conn_latch = NULL;
	}
	if (connp->conn_latch_in_policy != NULL) {
		IPPOL_REFRELE(connp->conn_latch_in_policy);
		connp->conn_latch_in_policy = NULL;
	}
	if (connp->conn_latch_in_action != NULL) {
		IPACT_REFRELE(connp->conn_latch_in_action);
		connp->conn_latch_in_action = NULL;
	}
	if (connp->conn_policy != NULL) {
		IPPH_REFRELE(connp->conn_policy, ns);
		connp->conn_policy = NULL;
	}

	if (connp->conn_ipsec_opt_mp != NULL) {
		freemsg(connp->conn_ipsec_opt_mp);
		connp->conn_ipsec_opt_mp = NULL;
	}

	if (connp->conn_flags & IPCL_TCPCONN) {
		tcp_t	*tcp = connp->conn_tcp;

		tcp_free(tcp);
		/* Save the timer mblk; it survives the bzero() below. */
		mp = tcp->tcp_timercache;

		tcp->tcp_tcps = NULL;

		/*
		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
		 * the mblk.
		 */
		if (tcp->tcp_rsrv_mp != NULL) {
			freeb(tcp->tcp_rsrv_mp);
			tcp->tcp_rsrv_mp = NULL;
			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
		}

		ipcl_conn_cleanup(connp);
		/* Reset to just the type bit for the cache constructor. */
		connp->conn_flags = IPCL_TCPCONN;
		if (ns != NULL) {
			ASSERT(tcp->tcp_tcps == NULL);
			connp->conn_netstack = NULL;
			connp->conn_ixa->ixa_ipst = NULL;
			netstack_rele(ns);
		}

		/* Wipe the tcp_t, then restore the two fields kept across reuse. */
		bzero(tcp, sizeof (tcp_t));

		tcp->tcp_timercache = mp;
		tcp->tcp_connp = connp;
		kmem_cache_free(tcp_conn_cache, connp);
		return;
	}

	if (connp->conn_flags & IPCL_SCTPCONN) {
		/* SCTP owns its own free path, including the cache free. */
		ASSERT(ns != NULL);
		sctp_free(connp);
		return;
	}

	ipcl_conn_cleanup(connp);
	if (ns != NULL) {
		connp->conn_netstack = NULL;
		connp->conn_ixa->ixa_ipst = NULL;
		netstack_rele(ns);
	}

	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
	if (connp->conn_flags & IPCL_UDPCONN) {
		connp->conn_flags = IPCL_UDPCONN;
		kmem_cache_free(udp_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
		connp->conn_flags = IPCL_RAWIPCONN;
		/* rawip conns come out of the cache as ICMP by default */
		connp->conn_proto = IPPROTO_ICMP;
		connp->conn_ixa->ixa_protocol = connp->conn_proto;
		kmem_cache_free(rawip_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RTSCONN) {
		connp->conn_flags = IPCL_RTSCONN;
		kmem_cache_free(rts_conn_cache, connp);
	} else {
		connp->conn_flags = IPCL_IPCCONN;
		ASSERT(connp->conn_flags & IPCL_IPCCONN);
		ASSERT(connp->conn_priv == NULL);
		kmem_cache_free(ip_conn_cache, connp);
	}
}
751
752 /*
753 * Running in cluster mode - deregister listener information
754 */
755 static void
756 ipcl_conn_unlisten(conn_t *connp)
757 {
758 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
759 ASSERT(connp->conn_lport != 0);
760
761 if (cl_inet_unlisten != NULL) {
762 sa_family_t addr_family;
763 uint8_t *laddrp;
764
765 if (connp->conn_ipversion == IPV6_VERSION) {
766 addr_family = AF_INET6;
767 laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
768 } else {
769 addr_family = AF_INET;
770 laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
771 }
772 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
773 IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
774 }
775 connp->conn_flags &= ~IPCL_CL_LISTENER;
776 }
777
778 /*
779 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
780 * which table the conn belonged to). So for debugging we can see which hash
781 * table this connection was in.
782 */
783 #define IPCL_HASH_REMOVE(connp) { \
784 connf_t *connfp = (connp)->conn_fanout; \
785 ASSERT(!MUTEX_HELD(&((connp)->conn_lock))); \
786 if (connfp != NULL) { \
787 mutex_enter(&connfp->connf_lock); \
788 if ((connp)->conn_next != NULL) \
789 (connp)->conn_next->conn_prev = \
790 (connp)->conn_prev; \
791 if ((connp)->conn_prev != NULL) \
792 (connp)->conn_prev->conn_next = \
793 (connp)->conn_next; \
794 else \
795 connfp->connf_head = (connp)->conn_next; \
796 (connp)->conn_fanout = NULL; \
797 (connp)->conn_next = NULL; \
798 (connp)->conn_prev = NULL; \
799 (connp)->conn_flags |= IPCL_REMOVED; \
800 if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0) \
801 ipcl_conn_unlisten((connp)); \
802 CONN_DEC_REF((connp)); \
803 mutex_exit(&connfp->connf_lock); \
804 } \
805 }
806
/*
 * Public wrapper around IPCL_HASH_REMOVE().  The protocol is captured
 * before removal; for RSVP conns the IP input functions are recomputed
 * afterwards (presumably so RSVP interception stops once no listener
 * remains — confirm against ill_set_inputfn_all()).
 */
void
ipcl_hash_remove(conn_t *connp)
{
	uint8_t		protocol = connp->conn_proto;

	IPCL_HASH_REMOVE(connp);
	if (protocol == IPPROTO_RSVP)
		ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
}
816
817 /*
818 * The whole purpose of this function is allow removal of
819 * a conn_t from the connected hash for timewait reclaim.
820 * This is essentially a TW reclaim fastpath where timewait
821 * collector checks under fanout lock (so no one else can
822 * get access to the conn_t) that refcnt is 2 i.e. one for
823 * TCP and one for the classifier hash list. If ref count
824 * is indeed 2, we can just remove the conn under lock and
825 * avoid cleaning up the conn under squeue. This gives us
826 * improved performance.
827 */
828 void
829 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp)
830 {
831 ASSERT(MUTEX_HELD(&connfp->connf_lock));
832 ASSERT(MUTEX_HELD(&connp->conn_lock));
833 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
834
835 if ((connp)->conn_next != NULL) {
836 (connp)->conn_next->conn_prev = (connp)->conn_prev;
837 }
838 if ((connp)->conn_prev != NULL) {
839 (connp)->conn_prev->conn_next = (connp)->conn_next;
840 } else {
841 connfp->connf_head = (connp)->conn_next;
842 }
843 (connp)->conn_fanout = NULL;
844 (connp)->conn_next = NULL;
845 (connp)->conn_prev = NULL;
846 (connp)->conn_flags |= IPCL_REMOVED;
847 ASSERT((connp)->conn_ref == 2);
848 (connp)->conn_ref--;
849 }
850
/*
 * Insert at the head of an already-locked bucket and mark the conn
 * IPCL_CONNECTED.  The fanout takes its own reference.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}

/*
 * Remove from any current bucket, then insert into (connfp) under its
 * lock as a connected conn.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}

/*
 * Insert a bound conn.  Specific-address entries are kept ahead of
 * V4-any entries, so the scan walks past non-any entries and splices
 * the new conn in just before the first _IPCL_V4_MATCH_ANY entry
 * (or at the tail).
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {		\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}

/*
 * Insert a wildcard-bound conn.  A V4-mapped binding is placed ahead of
 * the first unspecified-address (in6addr_any) entry in the same zone so
 * that V4-mapped listeners are matched before true wildcards; all other
 * conns go to the tail.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&	\
		    connp->conn_zoneid == next->conn_zoneid) {		\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
929
/*
 * Function wrapper so callers outside this file can use the
 * IPCL_HASH_INSERT_WILDCARD() macro.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
935
936 /*
937 * Because the classifier is used to classify inbound packets, the destination
938 * address is meant to be our local tunnel address (tunnel source), and the
939 * source the remote tunnel address (tunnel destination).
940 *
941 * Note that conn_proto can't be used for fanout since the upper protocol
942 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
943 */
944 conn_t *
945 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
946 {
947 connf_t *connfp;
948 conn_t *connp;
949
950 /* first look for IPv4 tunnel links */
951 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
952 mutex_enter(&connfp->connf_lock);
953 for (connp = connfp->connf_head; connp != NULL;
954 connp = connp->conn_next) {
955 if (IPCL_IPTUN_MATCH(connp, *dst, *src))
956 break;
957 }
958 if (connp != NULL)
959 goto done;
960
961 mutex_exit(&connfp->connf_lock);
962
963 /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
964 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
965 INADDR_ANY)];
966 mutex_enter(&connfp->connf_lock);
967 for (connp = connfp->connf_head; connp != NULL;
968 connp = connp->conn_next) {
969 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
970 break;
971 }
972 done:
973 if (connp != NULL)
974 CONN_INC_REF(connp);
975 mutex_exit(&connfp->connf_lock);
976 return (connp);
977 }
978
979 conn_t *
980 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
981 {
982 connf_t *connfp;
983 conn_t *connp;
984
985 /* Look for an IPv6 tunnel link */
986 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
987 mutex_enter(&connfp->connf_lock);
988 for (connp = connfp->connf_head; connp != NULL;
989 connp = connp->conn_next) {
990 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
991 CONN_INC_REF(connp);
992 break;
993 }
994 }
995 mutex_exit(&connfp->connf_lock);
996 return (connp);
997 }
998
999 /*
1000 * This function is used only for inserting SCTP raw socket now.
1001 * This may change later.
1002 *
1003 * Note that only one raw socket can be bound to a port. The param
1004 * lport is in network byte order.
1005 */
1006 static int
1007 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
1008 {
1009 connf_t *connfp;
1010 conn_t *oconnp;
1011 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
1012
1013 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1014
1015 /* Check for existing raw socket already bound to the port. */
1016 mutex_enter(&connfp->connf_lock);
1017 for (oconnp = connfp->connf_head; oconnp != NULL;
1018 oconnp = oconnp->conn_next) {
1019 if (oconnp->conn_lport == lport &&
1020 oconnp->conn_zoneid == connp->conn_zoneid &&
1021 oconnp->conn_family == connp->conn_family &&
1022 ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1023 IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
1024 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
1025 IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
1026 IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
1027 &connp->conn_laddr_v6))) {
1028 break;
1029 }
1030 }
1031 mutex_exit(&connfp->connf_lock);
1032 if (oconnp != NULL)
1033 return (EADDRNOTAVAIL);
1034
1035 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
1036 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1037 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1038 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
1039 IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1040 } else {
1041 IPCL_HASH_INSERT_BOUND(connfp, connp);
1042 }
1043 } else {
1044 IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1045 }
1046 return (0);
1047 }
1048
1049 static int
1050 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1051 {
1052 connf_t *connfp;
1053 conn_t *tconnp;
1054 ipaddr_t laddr = connp->conn_laddr_v4;
1055 ipaddr_t faddr = connp->conn_faddr_v4;
1056
1057 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1058 mutex_enter(&connfp->connf_lock);
1059 for (tconnp = connfp->connf_head; tconnp != NULL;
1060 tconnp = tconnp->conn_next) {
1061 if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1062 /* A tunnel is already bound to these addresses. */
1063 mutex_exit(&connfp->connf_lock);
1064 return (EADDRINUSE);
1065 }
1066 }
1067 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1068 mutex_exit(&connfp->connf_lock);
1069 return (0);
1070 }
1071
1072 static int
1073 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1074 {
1075 connf_t *connfp;
1076 conn_t *tconnp;
1077 in6_addr_t *laddr = &connp->conn_laddr_v6;
1078 in6_addr_t *faddr = &connp->conn_faddr_v6;
1079
1080 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1081 mutex_enter(&connfp->connf_lock);
1082 for (tconnp = connfp->connf_head; tconnp != NULL;
1083 tconnp = tconnp->conn_next) {
1084 if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1085 /* A tunnel is already bound to these addresses. */
1086 mutex_exit(&connfp->connf_lock);
1087 return (EADDRINUSE);
1088 }
1089 }
1090 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1091 mutex_exit(&connfp->connf_lock);
1092 return (0);
1093 }
1094
1095 /*
1096 * Check for a MAC exemption conflict on a labeled system. Note that for
1097 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1098 * transport layer. This check is for binding all other protocols.
1099 *
1100 * Returns true if there's a conflict.
1101 */
1102 static boolean_t
1103 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1104 {
1105 connf_t *connfp;
1106 conn_t *tconn;
1107
1108 connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1109 mutex_enter(&connfp->connf_lock);
1110 for (tconn = connfp->connf_head; tconn != NULL;
1111 tconn = tconn->conn_next) {
1112 /* We don't allow v4 fallback for v6 raw socket */
1113 if (connp->conn_family != tconn->conn_family)
1114 continue;
1115 /* If neither is exempt, then there's no conflict */
1116 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1117 (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1118 continue;
1119 /* We are only concerned about sockets for a different zone */
1120 if (connp->conn_zoneid == tconn->conn_zoneid)
1121 continue;
1122 /* If both are bound to different specific addrs, ok */
1123 if (connp->conn_laddr_v4 != INADDR_ANY &&
1124 tconn->conn_laddr_v4 != INADDR_ANY &&
1125 connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1126 continue;
1127 /* These two conflict; fail */
1128 break;
1129 }
1130 mutex_exit(&connfp->connf_lock);
1131 return (tconn != NULL);
1132 }
1133
1134 static boolean_t
1135 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1136 {
1137 connf_t *connfp;
1138 conn_t *tconn;
1139
1140 connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1141 mutex_enter(&connfp->connf_lock);
1142 for (tconn = connfp->connf_head; tconn != NULL;
1143 tconn = tconn->conn_next) {
1144 /* We don't allow v4 fallback for v6 raw socket */
1145 if (connp->conn_family != tconn->conn_family)
1146 continue;
1147 /* If neither is exempt, then there's no conflict */
1148 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1149 (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1150 continue;
1151 /* We are only concerned about sockets for a different zone */
1152 if (connp->conn_zoneid == tconn->conn_zoneid)
1153 continue;
1154 /* If both are bound to different addrs, ok */
1155 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1156 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1157 !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1158 &tconn->conn_laddr_v6))
1159 continue;
1160 /* These two conflict; fail */
1161 break;
1162 }
1163 mutex_exit(&connfp->connf_lock);
1164 return (tconn != NULL);
1165 }
1166
1167 /*
1168 * (v4, v6) bind hash insertion routines
1169 * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1170 */
1171
1172 int
1173 ipcl_bind_insert(conn_t *connp)
1174 {
1175 if (connp->conn_ipversion == IPV6_VERSION)
1176 return (ipcl_bind_insert_v6(connp));
1177 else
1178 return (ipcl_bind_insert_v4(connp));
1179 }
1180
/*
 * Insert an IPv4 conn into the fanout appropriate for its protocol.
 * Returns 0 on success or an errno (EADDRINUSE from the labeled-system
 * conflict check or the SCTP raw-socket insert).
 */
int
ipcl_bind_insert_v4(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	/* IP tunnels hash on the address pair, not the port */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	default:
		/*
		 * Port-less protocols: on labeled systems, refuse to bind
		 * if it would conflict with a MAC-exempt binding in
		 * another zone.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		/* List position reflects how specific the binding is */
		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* RSVP binds change input steering; refresh ill input fns */
		if (protocol == IPPROTO_RSVP)
			ill_set_inputfn_all(ipst);
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Tell the cluster framework about the new listener */
		if (cl_inet_listen != NULL) {
			ASSERT(connp->conn_ipversion == IPV4_VERSION);
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, AF_INET,
			    (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1245
/*
 * IPv6 counterpart of ipcl_bind_insert_v4: insert an IPv6 conn into the
 * fanout appropriate for its protocol.  Returns 0 or an errno.
 */
int
ipcl_bind_insert_v6(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	/* IP tunnels hash on the address pair, not the port */
	if (IPCL_IS_IPTUN(connp)) {
		return (ipcl_iptun_hash_insert_v6(connp, ipst));
	}

	switch (protocol) {
	default:
		/* Port-less protocols: labeled-system conflict check */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/* List position reflects how specific the binding is */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Tell the cluster framework about the new listener */
		if (cl_inet_listen != NULL) {
			sa_family_t	addr_family;
			uint8_t		*laddrp;

			/* An AF_INET6 socket may carry a v4 address */
			if (connp->conn_ipversion == IPV6_VERSION) {
				addr_family = AF_INET6;
				laddrp =
				    (uint8_t *)&connp->conn_bound_addr_v6;
			} else {
				addr_family = AF_INET;
				laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
			}
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1318
1319 /*
1320 * ipcl_conn_hash insertion routines.
1321 * The caller has already set conn_proto and the addresses/ports in the conn_t.
1322 */
1323
1324 int
1325 ipcl_conn_insert(conn_t *connp)
1326 {
1327 if (connp->conn_ipversion == IPV6_VERSION)
1328 return (ipcl_conn_insert_v6(connp));
1329 else
1330 return (ipcl_conn_insert_v4(connp));
1331 }
1332
/*
 * Insert a fully-specified (connected) IPv4 conn into the appropriate
 * fanout.  Returns 0 on success or an errno (EADDRINUSE when the tuple
 * already exists, or from the SCTP/labeled-system paths).
 */
int
ipcl_conn_insert_v4(conn_t *connp)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	/* IP tunnels hash on the address pair, not the port */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:
		/*
		 * For TCP, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * INADDR_LOOPBACK as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_faddr_v4,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
			    connp->conn_faddr_v4, connp->conn_laddr_v4,
			    connp->conn_ports) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * The lock must be dropped around IPCL_HASH_REMOVE
			 * since that macro takes the old fanout's lock.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);
		ASSERT(connp->conn_recvicmp != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings. For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic. For all others, it's
		 * done here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		/* List position reflects how specific the binding is */
		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1430
/*
 * IPv6 counterpart of ipcl_conn_insert_v4; additionally honors
 * conn_bound_if so two conns with the same tuple but different bound
 * interfaces may coexist.
 */
int
ipcl_conn_insert_v6(conn_t *connp)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;
	uint_t	ifindex = connp->conn_bound_if;

	/* IP tunnels hash on the address pair, not the port */
	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert_v6(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:

		/*
		 * For tcp, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * ipv6_loopback as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			/* NOTE: need to match zoneid. Bug in onnv-gate */
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
			    connp->conn_faddr_v6, connp->conn_laddr_v6,
			    connp->conn_ports) &&
			    (tconnp->conn_bound_if == 0 ||
			    tconnp->conn_bound_if == ifindex) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * The lock must be dropped around IPCL_HASH_REMOVE
			 * since that macro takes the old fanout's lock.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/* The raw socket may already be bound; unhash it first */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/* Port-less protocols: labeled-system conflict check */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/* List position reflects how specific the binding is */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1518
1519 /*
1520 * v4 packet classifying function. looks up the fanout table to
1521 * find the conn, the packet belongs to. returns the conn with
1522 * the reference held, null otherwise.
1523 *
1524 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1525 * Lookup" comment block are applied. Labels are also checked as described
1526 * above. If the packet is from the inside (looped back), and is from the same
1527 * zone, then label checks are omitted.
1528 */
1529 conn_t *
1530 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1531 ip_recv_attr_t *ira, ip_stack_t *ipst)
1532 {
1533 ipha_t *ipha;
1534 connf_t *connfp, *bind_connfp;
1535 uint16_t lport;
1536 uint16_t fport;
1537 uint32_t ports;
1538 conn_t *connp;
1539 uint16_t *up;
1540 zoneid_t zoneid = ira->ira_zoneid;
1541
1542 ipha = (ipha_t *)mp->b_rptr;
1543 up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
1544
1545 switch (protocol) {
1546 case IPPROTO_TCP:
1547 ports = *(uint32_t *)up;
1548 connfp =
1549 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
1550 ports, ipst)];
1551 mutex_enter(&connfp->connf_lock);
1552 for (connp = connfp->connf_head; connp != NULL;
1553 connp = connp->conn_next) {
1554 if (IPCL_CONN_MATCH(connp, protocol,
1555 ipha->ipha_src, ipha->ipha_dst, ports) &&
1556 (connp->conn_zoneid == zoneid ||
1557 connp->conn_allzones ||
1558 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1559 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1560 (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1561 break;
1562 }
1563
1564 if (connp != NULL) {
1565 /*
1566 * We have a fully-bound TCP connection.
1567 *
1568 * For labeled systems, there's no need to check the
1569 * label here. It's known to be good as we checked
1570 * before allowing the connection to become bound.
1571 */
1572 CONN_INC_REF(connp);
1573 mutex_exit(&connfp->connf_lock);
1574 return (connp);
1575 }
1576
1577 mutex_exit(&connfp->connf_lock);
1578 lport = up[1];
1579 bind_connfp =
1580 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1581 mutex_enter(&bind_connfp->connf_lock);
1582 for (connp = bind_connfp->connf_head; connp != NULL;
1583 connp = connp->conn_next) {
1584 if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
1585 lport) &&
1586 (connp->conn_zoneid == zoneid ||
1587 connp->conn_allzones ||
1588 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1589 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1590 (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1591 break;
1592 }
1593
1594 /*
1595 * If the matching connection is SLP on a private address, then
1596 * the label on the packet must match the local zone's label.
1597 * Otherwise, it must be in the label range defined by tnrh.
1598 * This is ensured by tsol_receive_local.
1599 *
1600 * Note that we don't check tsol_receive_local for
1601 * the connected case.
1602 */
1603 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1604 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1605 ira, connp)) {
1606 DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
1607 char *, "connp(1) could not receive mp(2)",
1608 conn_t *, connp, mblk_t *, mp);
1609 connp = NULL;
1610 }
1611
1612 if (connp != NULL) {
1613 /* Have a listener at least */
1614 CONN_INC_REF(connp);
1615 mutex_exit(&bind_connfp->connf_lock);
1616 return (connp);
1617 }
1618
1619 mutex_exit(&bind_connfp->connf_lock);
1620 break;
1621
1622 case IPPROTO_UDP:
1623 lport = up[1];
1624 fport = up[0];
1625 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1626 mutex_enter(&connfp->connf_lock);
1627 for (connp = connfp->connf_head; connp != NULL;
1628 connp = connp->conn_next) {
1629 if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1630 fport, ipha->ipha_src) &&
1631 (connp->conn_zoneid == zoneid ||
1632 connp->conn_allzones ||
1633 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1634 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
1635 break;
1636 }
1637
1638 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1639 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1640 ira, connp)) {
1641 DTRACE_PROBE3(tx__ip__log__info__classify__udp,
1642 char *, "connp(1) could not receive mp(2)",
1643 conn_t *, connp, mblk_t *, mp);
1644 connp = NULL;
1645 }
1646
1647 if (connp != NULL) {
1648 CONN_INC_REF(connp);
1649 mutex_exit(&connfp->connf_lock);
1650 return (connp);
1651 }
1652
1653 /*
1654 * We shouldn't come here for multicast/broadcast packets
1655 */
1656 mutex_exit(&connfp->connf_lock);
1657
1658 break;
1659
1660 case IPPROTO_ENCAP:
1661 case IPPROTO_IPV6:
1662 return (ipcl_iptun_classify_v4(&ipha->ipha_src,
1663 &ipha->ipha_dst, ipst));
1664 }
1665
1666 return (NULL);
1667 }
1668
/*
 * v6 counterpart of ipcl_classify_v4: look up the conn an inbound IPv6
 * packet belongs to.  Returns the conn with a reference held, or NULL.
 * Zone and label matching rules are the same as for v4.
 */
conn_t *
ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ip6_t	*ip6h;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	tcpha_t	*tcpha;
	uint32_t ports;
	conn_t	*connp;
	uint16_t *up;
	zoneid_t	zoneid = ira->ira_zoneid;

	ip6h = (ip6_t *)mp->b_rptr;

	switch (protocol) {
	case IPPROTO_TCP:
		/* First look for a fully-connected (5-tuple) match. */
		tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
		up = &tcpha->tha_lport;
		ports = *(uint32_t *)up;

		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH_V6(connp, protocol,
			    ip6h->ip6_src, ip6h->ip6_dst, ports) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/* No connected match; fall back to the listener hash. */
		mutex_exit(&connfp->connf_lock);

		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH_V6(connp, protocol,
			    ip6h->ip6_dst, lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		up = (uint16_t *)&mp->b_rptr[hdr_len];
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
			    fport, ip6h->ip6_src) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		break;
	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
		    &ip6h->ip6_dst, ipst));
	}

	return (NULL);
}
1803
1804 /*
1805 * wrapper around ipcl_classify_(v4,v6) routines.
1806 */
1807 conn_t *
1808 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1809 {
1810 if (ira->ira_flags & IRAF_IS_IPV4) {
1811 return (ipcl_classify_v4(mp, ira->ira_protocol,
1812 ira->ira_ip_hdr_length, ira, ipst));
1813 } else {
1814 return (ipcl_classify_v6(mp, ira->ira_protocol,
1815 ira->ira_ip_hdr_length, ira, ipst));
1816 }
1817 }
1818
1819 /*
1820 * Only used to classify SCTP RAW sockets
1821 */
1822 conn_t *
1823 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
1824 ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
1825 {
1826 connf_t *connfp;
1827 conn_t *connp;
1828 in_port_t lport;
1829 int ipversion;
1830 const void *dst;
1831 zoneid_t zoneid = ira->ira_zoneid;
1832
1833 lport = ((uint16_t *)&ports)[1];
1834 if (ira->ira_flags & IRAF_IS_IPV4) {
1835 dst = (const void *)&ipha->ipha_dst;
1836 ipversion = IPV4_VERSION;
1837 } else {
1838 dst = (const void *)&ip6h->ip6_dst;
1839 ipversion = IPV6_VERSION;
1840 }
1841
1842 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1843 mutex_enter(&connfp->connf_lock);
1844 for (connp = connfp->connf_head; connp != NULL;
1845 connp = connp->conn_next) {
1846 /* We don't allow v4 fallback for v6 raw socket. */
1847 if (ipversion != connp->conn_ipversion)
1848 continue;
1849 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1850 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1851 if (ipversion == IPV4_VERSION) {
1852 if (!IPCL_CONN_MATCH(connp, protocol,
1853 ipha->ipha_src, ipha->ipha_dst, ports))
1854 continue;
1855 } else {
1856 if (!IPCL_CONN_MATCH_V6(connp, protocol,
1857 ip6h->ip6_src, ip6h->ip6_dst, ports))
1858 continue;
1859 }
1860 } else {
1861 if (ipversion == IPV4_VERSION) {
1862 if (!IPCL_BIND_MATCH(connp, protocol,
1863 ipha->ipha_dst, lport))
1864 continue;
1865 } else {
1866 if (!IPCL_BIND_MATCH_V6(connp, protocol,
1867 ip6h->ip6_dst, lport))
1868 continue;
1869 }
1870 }
1871
1872 if (connp->conn_zoneid == zoneid ||
1873 connp->conn_allzones ||
1874 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1875 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1876 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
1877 break;
1878 }
1879
1880 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1881 !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
1882 DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
1883 char *, "connp(1) could not receive mp(2)",
1884 conn_t *, connp, mblk_t *, mp);
1885 connp = NULL;
1886 }
1887
1888 if (connp != NULL)
1889 goto found;
1890 mutex_exit(&connfp->connf_lock);
1891
1892 /* Try to look for a wildcard SCTP RAW socket match. */
1893 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
1894 mutex_enter(&connfp->connf_lock);
1895 for (connp = connfp->connf_head; connp != NULL;
1896 connp = connp->conn_next) {
1897 /* We don't allow v4 fallback for v6 raw socket. */
1898 if (ipversion != connp->conn_ipversion)
1899 continue;
1900 if (!IPCL_ZONE_MATCH(connp, zoneid))
1901 continue;
1902
1903 if (ipversion == IPV4_VERSION) {
1904 if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
1905 break;
1906 } else {
1907 if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
1908 break;
1909 }
1910 }
1911 }
1912
1913 if (connp != NULL)
1914 goto found;
1915
1916 mutex_exit(&connfp->connf_lock);
1917 return (NULL);
1918
1919 found:
1920 ASSERT(connp != NULL);
1921 CONN_INC_REF(connp);
1922 mutex_exit(&connfp->connf_lock);
1923 return (connp);
1924 }
1925
/* ARGSUSED */
/*
 * kmem cache constructor for TCP conns.  The cache object is an itc_t
 * with a tcp_t placed immediately after it.  Initializes the locks/CVs,
 * links conn and tcp together, and allocates the per-conn timer cache
 * and transmit attributes.  Returns 0 or ENOMEM (the failure path frees
 * the timer cache it already allocated; the destructor tolerates a NULL
 * conn_ixa).
 */
static int
tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	bzero(connp, sizeof (conn_t));
	bzero(tcp, sizeof (tcp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
	tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
	if (tcp->tcp_timercache == NULL)
		return (ENOMEM);
	connp->conn_tcp = tcp;
	connp->conn_flags = IPCL_TCPCONN;
	connp->conn_proto = IPPROTO_TCP;
	tcp->tcp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);

	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL) {
		tcp_timermp_free(tcp);
		return (ENOMEM);
	}
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
1959
/* ARGSUSED */
/*
 * kmem cache destructor for TCP conns: tears down everything the
 * constructor set up (timer cache, locks/CVs, transmit attributes).
 */
static void
tcp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_TCPCONN);
	ASSERT(tcp->tcp_connp == connp);
	ASSERT(connp->conn_tcp == tcp);
	tcp_timermp_free(tcp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	cv_destroy(&connp->conn_sq_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
1985
/* ARGSUSED */
/*
 * kmem cache constructor for plain IP conns (no transport state after
 * the itc_t).  Returns 0 or ENOMEM; the destructor tolerates a NULL
 * conn_ixa from the failure path.
 */
static int
ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;

	bzero(connp, sizeof (conn_t));
	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_flags = IPCL_IPCCONN;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);

	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2006
/*
 * kmem cache destructor for plain IP conn_t's.  Mirror image of
 * ip_conn_constructor: destroys the locks/cv and drops the conn's
 * reference on its transmit attributes.
 */
/* ARGSUSED */
static void
ip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;

	ASSERT(connp->conn_flags & IPCL_IPCCONN);
	ASSERT(connp->conn_priv == NULL);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		/* The ixa must be quiescent: just our reference, no cached routes */
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2028
/*
 * kmem cache constructor for UDP conn_t's.  The udp_t is co-allocated
 * immediately after the conn_t in the same buffer (itc_t layout); the
 * two are zeroed and cross-linked here.  Returns 0 on success, ENOMEM
 * if the conn_ixa allocation fails.
 */
/* ARGSUSED */
static int
udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];	/* udp_t follows the conn_t */

	bzero(connp, sizeof (conn_t));
	bzero(udp, sizeof (udp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_udp = udp;
	connp->conn_flags = IPCL_UDPCONN;
	connp->conn_proto = IPPROTO_UDP;
	udp->udp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;	/* the conn's own reference */
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2055
/*
 * kmem cache destructor for UDP conn_t's.  Verifies the conn_t/udp_t
 * cross-links set up by udp_conn_constructor, then destroys the locks
 * and drops the conn's reference on its transmit attributes.
 */
/* ARGSUSED */
static void
udp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];	/* udp_t co-allocated after the conn_t */

	ASSERT(connp->conn_flags & IPCL_UDPCONN);
	ASSERT(udp->udp_connp == connp);
	ASSERT(connp->conn_udp == udp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		/* The ixa must be quiescent: just our reference, no cached routes */
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2079
/*
 * kmem cache constructor for raw IP (icmp_t) conn_t's.  The icmp_t is
 * co-allocated immediately after the conn_t (itc_t layout) and the two
 * are cross-linked here.  conn_proto defaults to IPPROTO_ICMP; callers
 * presumably rebind it for other raw protocols (TODO confirm).
 * Returns 0 on success, ENOMEM if the conn_ixa allocation fails.
 */
/* ARGSUSED */
static int
rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];	/* icmp_t follows the conn_t */

	bzero(connp, sizeof (conn_t));
	bzero(icmp, sizeof (icmp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_icmp = icmp;
	connp->conn_flags = IPCL_RAWIPCONN;
	connp->conn_proto = IPPROTO_ICMP;
	icmp->icmp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;	/* the conn's own reference */
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2106
/*
 * kmem cache destructor for raw IP conn_t's.  Verifies the
 * conn_t/icmp_t cross-links set up by rawip_conn_constructor, then
 * destroys the locks and drops the conn's ixa reference.
 */
/* ARGSUSED */
static void
rawip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];	/* icmp_t co-allocated after the conn_t */

	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
	ASSERT(icmp->icmp_connp == connp);
	ASSERT(connp->conn_icmp == icmp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		/* The ixa must be quiescent: just our reference, no cached routes */
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2130
/*
 * kmem cache constructor for routing-socket (rts_t) conn_t's.  The
 * rts_t is co-allocated immediately after the conn_t (itc_t layout)
 * and the two are cross-linked here.  No conn_proto/ixa_protocol is
 * set — routing sockets are not an IP transport.  Returns 0 on
 * success, ENOMEM if the conn_ixa allocation fails.
 */
/* ARGSUSED */
static int
rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	rts_t	*rts = (rts_t *)&itc[1];	/* rts_t follows the conn_t */

	bzero(connp, sizeof (conn_t));
	bzero(rts, sizeof (rts_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_rts = rts;
	connp->conn_flags = IPCL_RTSCONN;
	rts->rts_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;	/* the conn's own reference */
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2155
/*
 * kmem cache destructor for routing-socket conn_t's.  Verifies the
 * conn_t/rts_t cross-links set up by rts_conn_constructor, then
 * destroys the locks and drops the conn's ixa reference.
 */
/* ARGSUSED */
static void
rts_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	rts_t	*rts = (rts_t *)&itc[1];	/* rts_t co-allocated after the conn_t */

	ASSERT(connp->conn_flags & IPCL_RTSCONN);
	ASSERT(rts->rts_connp == connp);
	ASSERT(connp->conn_rts == rts);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		/* The ixa must be quiescent: just our reference, no cached routes */
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2179
2180 /*
2181 * Called as part of ipcl_conn_destroy to assert and clear any pointers
2182 * in the conn_t.
2183 *
2184 * Below we list all the pointers in the conn_t as a documentation aid.
2185 * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
2186 * If you add any pointers to the conn_t please add an ASSERT here
2187 * and #ifdef it out if it can't be actually asserted to be NULL.
2188 * In any case, we bzero most of the conn_t at the end of the function.
2189 */
void
ipcl_conn_cleanup(conn_t *connp)
{
	ip_xmit_attr_t	*ixa;

	/*
	 * Everything asserted NULL below must already have been released
	 * by the transport-specific close path before we get here.
	 */
	ASSERT(connp->conn_latch == NULL);
	ASSERT(connp->conn_latch_in_policy == NULL);
	ASSERT(connp->conn_latch_in_action == NULL);
#ifdef notdef
	ASSERT(connp->conn_rq == NULL);
	ASSERT(connp->conn_wq == NULL);
#endif
	ASSERT(connp->conn_cred == NULL);
	ASSERT(connp->conn_g_fanout == NULL);
	ASSERT(connp->conn_g_next == NULL);
	ASSERT(connp->conn_g_prev == NULL);
	ASSERT(connp->conn_policy == NULL);
	ASSERT(connp->conn_fanout == NULL);
	ASSERT(connp->conn_next == NULL);
	ASSERT(connp->conn_prev == NULL);
	ASSERT(connp->conn_oper_pending_ill == NULL);
	ASSERT(connp->conn_ilg == NULL);
	ASSERT(connp->conn_drain_next == NULL);
	ASSERT(connp->conn_drain_prev == NULL);
#ifdef notdef
	/* conn_idl is not cleared when removed from idl list */
	ASSERT(connp->conn_idl == NULL);
#endif
	ASSERT(connp->conn_ipsec_opt_mp == NULL);
#ifdef notdef
	/* conn_netstack is cleared by the caller; needed by ixa_cleanup */
	ASSERT(connp->conn_netstack == NULL);
#endif

	ASSERT(connp->conn_helper_info == NULL);
	ASSERT(connp->conn_ixa != NULL);
	ixa = connp->conn_ixa;
	ASSERT(ixa->ixa_refcnt == 1);	/* only the conn's own reference remains */
	/* Need to preserve ixa_protocol */
	ixa_cleanup(ixa);
	ixa->ixa_flags = 0;

	/* Clear out the conn_t fields that are not preserved */
	/*
	 * Only the tail of the conn_t, starting at conn_start_clr, is
	 * zeroed; fields before it (locks, cv's, conn_ixa, ...) survive
	 * for reuse by the kmem cache.
	 */
	bzero(&connp->conn_start_clr,
	    sizeof (conn_t) -
	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
}
2237
2238 /*
2239 * All conns are inserted in a global multi-list for the benefit of
2240 * walkers. The walk is guaranteed to walk all open conns at the time
2241 * of the start of the walk exactly once. This property is needed to
2242 * achieve some cleanups during unplumb of interfaces. This is achieved
2243 * as follows.
2244 *
2245 * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2246 * call the insert and delete functions below at creation and deletion
2247 * time respectively. The conn never moves or changes its position in this
2248 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2249 * won't increase due to walkers, once the conn deletion has started. Note
2250 * that we can't remove the conn from the global list and then wait for
2251 * the refcnt to drop to zero, since walkers would then see a truncated
2252 * list. CONN_INCIPIENT ensures that walkers don't start looking at
2253 * conns until ip_open is ready to make them globally visible.
2254 * The global round robin multi-list locks are held only to get the
2255 * next member/insertion/deletion and contention should be negligible
2256 * if the multi-list is much greater than the number of cpus.
2257 */
/*
 * Insert connp at the head of one of the global multi-list buckets,
 * chosen round-robin via ips_conn_g_index.  The conn is marked
 * CONN_INCIPIENT so walkers skip it until ip_open makes it visible.
 */
void
ipcl_globalhash_insert(conn_t *connp)
{
	int	index;
	struct connf_s	*connfp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	/*
	 * No need for atomic here. Approximate even distribution
	 * in the global lists is sufficient.
	 */
	ipst->ips_conn_g_index++;
	/* CONN_G_HASH_SIZE is a power of two, so mask instead of modulo */
	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);

	connp->conn_g_prev = NULL;
	/*
	 * Mark as INCIPIENT, so that walkers will ignore this
	 * for now, till ip_open is ready to make it visible globally.
	 */
	connp->conn_state_flags |= CONN_INCIPIENT;

	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
	/* Insert at the head of the list */
	mutex_enter(&connfp->connf_lock);
	connp->conn_g_next = connfp->connf_head;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp;
	connfp->connf_head = connp;

	/* The fanout bucket this conn points to */
	connp->conn_g_fanout = connfp;

	mutex_exit(&connfp->connf_lock);
}
2292
/*
 * Remove connp from its global multi-list bucket.  Safe to call on a
 * conn that was never inserted (conn_g_fanout == NULL), which is the
 * case for the transient IPCL_NONE variety.
 */
void
ipcl_globalhash_remove(conn_t *connp)
{
	struct connf_s	*connfp;

	/*
	 * We were never inserted in the global multi list.
	 * IPCL_NONE variety is never inserted in the global multilist
	 * since it is presumed to not need any cleanup and is transient.
	 */
	if (connp->conn_g_fanout == NULL)
		return;

	connfp = connp->conn_g_fanout;
	mutex_enter(&connfp->connf_lock);
	/* Standard doubly-linked-list unlink; head pointer case first */
	if (connp->conn_g_prev != NULL)
		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
	else
		connfp->connf_head = connp->conn_g_next;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
	mutex_exit(&connfp->connf_lock);

	/* Better to stumble on a null pointer than to corrupt memory */
	connp->conn_g_next = NULL;
	connp->conn_g_prev = NULL;
	connp->conn_g_fanout = NULL;
}
2321
2322 /*
2323 * Walk the list of all conn_t's in the system, calling the function provided
 * with the specified argument for each.
2325 * Applies to both IPv4 and IPv6.
2326 *
2327 * CONNs may hold pointers to ills (conn_dhcpinit_ill and
2328 * conn_oper_pending_ill). To guard against stale pointers
2329 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2330 * unplumbed or removed. New conn_t's that are created while we are walking
2331 * may be missed by this walk, because they are not necessarily inserted
2332 * at the tail of the list. They are new conn_t's and thus don't have any
2333 * stale pointers. The CONN_CLOSING flag ensures that no new reference
2334 * is created to the struct that is going away.
2335 */
void
ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
{
	int	i;
	conn_t	*connp;
	conn_t	*prev_connp;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		prev_connp = NULL;
		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
		while (connp != NULL) {
			mutex_enter(&connp->conn_lock);
			/* Skip conns being torn down or not yet visible */
			if (connp->conn_state_flags &
			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
				mutex_exit(&connp->conn_lock);
				connp = connp->conn_g_next;
				continue;
			}
			/*
			 * Hold a reference so connp stays on the list (and
			 * conn_g_next stays valid) while we drop the bucket
			 * lock to call func without holding any list locks.
			 */
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&connp->conn_lock);
			mutex_exit(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			(*func)(connp, arg);
			/*
			 * The previous conn's reference is only released now,
			 * after we have advanced past it, so it could not be
			 * unlinked while we depended on its conn_g_next.
			 */
			if (prev_connp != NULL)
				CONN_DEC_REF(prev_connp);
			mutex_enter(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			prev_connp = connp;
			connp = connp->conn_g_next;
		}
		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		/* Drop the reference on the last conn visited in this bucket */
		if (prev_connp != NULL)
			CONN_DEC_REF(prev_connp);
	}
}
2372
2373 /*
2374 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2375 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
2376 * held; caller must call CONN_DEC_REF. Only checks for connected entries
2377 * (peer tcp in ESTABLISHED state).
2378 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t	*connfp;
	conn_t	*tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone. Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones.
	 */
	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));

	/* Reverse lookup: remote port in the local slot and vice versa */
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* Addresses are swapped too: dst matched as local, src as remote */
		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			ASSERT(tconnp != connp);
			/* Return with a reference held; caller does CONN_DEC_REF */
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2422
2423 /*
2424 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2425 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
2426 * held; caller must call CONN_DEC_REF. Only checks for connected entries
2427 * (peer tcp in ESTABLISHED state).
2428 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t	*connfp;
	conn_t	*tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone. Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones. We
	 * don't do Zone check for link local address(es) because the
	 * current Zone implementation treats each link local address as
	 * being unique per system node, i.e. they belong to global Zone.
	 */
	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));

	/* Reverse lookup: remote port in the local slot and vice versa */
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* We skip conn_bound_if check here as this is loopback tcp */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			ASSERT(tconnp != connp);
			/* Return with a reference held; caller does CONN_DEC_REF */
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2476
2477 /*
2478 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2479 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2480 * Only checks for connected entries i.e. no INADDR_ANY checks.
2481 */
2482 conn_t *
2483 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2484 ip_stack_t *ipst)
2485 {
2486 uint32_t ports;
2487 uint16_t *pports;
2488 connf_t *connfp;
2489 conn_t *tconnp;
2490
2491 pports = (uint16_t *)&ports;
2492 pports[0] = tcpha->tha_fport;
2493 pports[1] = tcpha->tha_lport;
2494
2495 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2496 ports, ipst)];
2497
2498 mutex_enter(&connfp->connf_lock);
2499 for (tconnp = connfp->connf_head; tconnp != NULL;
2500 tconnp = tconnp->conn_next) {
2501
2502 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2503 ipha->ipha_dst, ipha->ipha_src, ports) &&
2504 tconnp->conn_tcp->tcp_state >= min_state) {
2505
2506 CONN_INC_REF(tconnp);
2507 mutex_exit(&connfp->connf_lock);
2508 return (tconnp);
2509 }
2510 }
2511 mutex_exit(&connfp->connf_lock);
2512 return (NULL);
2513 }
2514
2515 /*
2516 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2517 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2518 * Only checks for connected entries i.e. no INADDR_ANY checks.
2519 * Match on ifindex in addition to addresses.
2520 */
conn_t *
ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
    uint_t ifindex, ip_stack_t *ipst)
{
	tcp_t	*tcp;
	uint32_t ports;
	uint16_t *pports;
	connf_t	*connfp;
	conn_t	*tconnp;

	/* Reverse lookup: remote port in the local slot and vice versa */
	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		tcp = tconnp->conn_tcp;
		/*
		 * Match addresses/ports swapped, require at least min_state,
		 * and honor a binding to a specific interface if present.
		 */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tcp->tcp_state >= min_state &&
		    (tconnp->conn_bound_if == 0 ||
		    tconnp->conn_bound_if == ifindex)) {

			/* Return with a reference held; caller does CONN_DEC_REF */
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2557
2558 /*
2559 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2560 * a listener when changing state.
2561 */
2562 conn_t *
2563 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2564 ip_stack_t *ipst)
2565 {
2566 connf_t *bind_connfp;
2567 conn_t *connp;
2568 tcp_t *tcp;
2569
2570 /*
2571 * Avoid false matches for packets sent to an IP destination of
2572 * all zeros.
2573 */
2574 if (laddr == 0)
2575 return (NULL);
2576
2577 ASSERT(zoneid != ALL_ZONES);
2578
2579 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2580 mutex_enter(&bind_connfp->connf_lock);
2581 for (connp = bind_connfp->connf_head; connp != NULL;
2582 connp = connp->conn_next) {
2583 tcp = connp->conn_tcp;
2584 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2585 IPCL_ZONE_MATCH(connp, zoneid) &&
2586 (tcp->tcp_listener == NULL)) {
2587 CONN_INC_REF(connp);
2588 mutex_exit(&bind_connfp->connf_lock);
2589 return (connp);
2590 }
2591 }
2592 mutex_exit(&bind_connfp->connf_lock);
2593 return (NULL);
2594 }
2595
2596 /*
2597 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2598 * a listener when changing state.
2599 */
conn_t *
ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
    zoneid_t zoneid, ip_stack_t *ipst)
{
	connf_t	*bind_connfp;
	conn_t	*connp = NULL;
	tcp_t	*tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		/*
		 * Match the bound address/port and zone, honor a binding
		 * to a specific interface, and skip eagers (tcp_listener
		 * set means this conn was accepted, not listening).
		 */
		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (connp->conn_bound_if == 0 ||
		    connp->conn_bound_if == ifindex) &&
		    tcp->tcp_listener == NULL) {
			/* Return with a reference held; caller does CONN_DEC_REF */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}
2635
2636 /*
2637 * ipcl_get_next_conn
2638 * get the next entry in the conn global list
2639 * and put a reference on the next_conn.
2640 * decrement the reference on the current conn.
2641 *
2642 * This is an iterator based walker function that also provides for
2643 * some selection by the caller. It walks through the conn_hash bucket
2644 * searching for the next valid connp in the list, and selects connections
2645 * that are neither closed nor condemned. It also REFHOLDS the conn
2646 * thus ensuring that the conn exists when the caller uses the conn.
2647 */
conn_t *
ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
{
	conn_t	*next_connp;

	if (connfp == NULL)
		return (NULL);

	mutex_enter(&connfp->connf_lock);

	/* NULL connp means start from the head of the bucket */
	next_connp = (connp == NULL) ?
	    connfp->connf_head : connp->conn_g_next;

	while (next_connp != NULL) {
		mutex_enter(&next_connp->conn_lock);
		if (!(next_connp->conn_flags & conn_flags) ||
		    (next_connp->conn_state_flags &
		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
			/*
			 * This conn has been condemned or
			 * is closing, or the flags don't match
			 */
			mutex_exit(&next_connp->conn_lock);
			next_connp = next_connp->conn_g_next;
			continue;
		}
		/* Found a live, matching conn; hold it for the caller */
		CONN_INC_REF_LOCKED(next_connp);
		mutex_exit(&next_connp->conn_lock);
		break;
	}

	mutex_exit(&connfp->connf_lock);

	/* Release the reference the caller held on the previous conn */
	if (connp != NULL)
		CONN_DEC_REF(connp);

	return (next_connp);
}
2686
2687 #ifdef CONN_DEBUG
2688 /*
2689 * Trace of the last NBUF refhold/refrele
2690 */
2691 int
2692 conn_trace_ref(conn_t *connp)
2693 {
2694 int last;
2695 conn_trace_t *ctb;
2696
2697 ASSERT(MUTEX_HELD(&connp->conn_lock));
2698 last = connp->conn_trace_last;
2699 last++;
2700 if (last == CONN_TRACE_MAX)
2701 last = 0;
2702
2703 ctb = &connp->conn_trace_buf[last];
2704 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2705 connp->conn_trace_last = last;
2706 return (1);
2707 }
2708
2709 int
2710 conn_untrace_ref(conn_t *connp)
2711 {
2712 int last;
2713 conn_trace_t *ctb;
2714
2715 ASSERT(MUTEX_HELD(&connp->conn_lock));
2716 last = connp->conn_trace_last;
2717 last++;
2718 if (last == CONN_TRACE_MAX)
2719 last = 0;
2720
2721 ctb = &connp->conn_trace_buf[last];
2722 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2723 connp->conn_trace_last = last;
2724 return (1);
2725 }
2726 #endif
2727
/*
 * Fill in *sie with socket identity information (inode, device, flags)
 * for the vnode underlying connp's socket or stream.  Returns sie on
 * success, NULL if the conn is closing, has no vnode, or VOP_GETATTR
 * fails.
 */
mib2_socketInfoEntry_t *
conn_get_socket_info(conn_t *connp, mib2_socketInfoEntry_t *sie)
{
	vnode_t *vn = NULL;
	vattr_t attr;
	uint64_t flags = 0;

	/*
	 * If the connection is closing, it is not safe to make an upcall or
	 * access the stream associated with the connection.
	 * The callers of this function have a reference on connp itself
	 * so, as long as it is not closing, it's safe to continue.
	 */
	mutex_enter(&connp->conn_lock);

	if ((connp->conn_state_flags & CONN_CLOSING)) {
		mutex_exit(&connp->conn_lock);
		return (NULL);
	}

	mutex_exit(&connp->conn_lock);

	if (connp->conn_upper_handle != NULL) {
		/* Non-STREAMS socket: ask the upper layer for its vnode */
		vn = (*connp->conn_upcalls->su_get_vnode)
		    (connp->conn_upper_handle);
	} else if (!IPCL_IS_NONSTR(connp) && connp->conn_rq != NULL) {
		/* STREAMS endpoint: use the stream head's vnode */
		vn = STREAM(connp->conn_rq)->sd_pvnode;
		if (vn != NULL)
			VN_HOLD(vn);
		flags |= MIB2_SOCKINFO_STREAM;
	}

	if (vn == NULL || VOP_GETATTR(vn, &attr, 0, CRED(), NULL) != 0) {
		if (vn != NULL)
			VN_RELE(vn);
		return (NULL);
	}

	VN_RELE(vn);

	bzero(sie, sizeof (*sie));

	sie->sie_flags = flags;
	sie->sie_inode = attr.va_nodeid;
	sie->sie_dev = attr.va_rdev;

	return (sie);
}