5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
24 * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
25 * Copyright 2015, Joyent, Inc.
26 */
27 /* Copyright (c) 1990 Mentat Inc. */
28
29 #include <sys/sysmacros.h>
30 #include <sys/types.h>
31 #include <sys/stream.h>
32 #include <sys/stropts.h>
33 #include <sys/strlog.h>
34 #include <sys/strsun.h>
35 #define _SUN_TPI_VERSION 2
36 #include <sys/tihdr.h>
37 #include <sys/timod.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/strsubr.h>
41 #include <sys/suntpi.h>
42 #include <sys/xti_inet.h>
43 #include <sys/kmem.h>
44 #include <sys/cred_impl.h>
45 #include <sys/policy.h>
60 #include <netinet/icmp6.h>
61 #include <netinet/udp.h>
62
63 #include <inet/common.h>
64 #include <inet/ip.h>
65 #include <inet/ip_impl.h>
66 #include <inet/ipsec_impl.h>
67 #include <inet/ip6.h>
68 #include <inet/ip_ire.h>
69 #include <inet/ip_if.h>
70 #include <inet/ip_multi.h>
71 #include <inet/ip_ndp.h>
72 #include <inet/proto_set.h>
73 #include <inet/mib2.h>
74 #include <inet/optcom.h>
75 #include <inet/snmpcom.h>
76 #include <inet/kstatcom.h>
77 #include <inet/ipclassifier.h>
78 #include <sys/squeue_impl.h>
79 #include <inet/ipnet.h>
80 #include <sys/vxlan.h>
81 #include <inet/inet_hash.h>
82
83 #include <sys/tsol/label.h>
84 #include <sys/tsol/tnet.h>
85 #include <rpc/pmap_prot.h>
86
87 #include <inet/udp_impl.h>
88
89 /*
90 * Synchronization notes:
91 *
92 * UDP is MT and uses the usual kernel synchronization primitives. There are 2
93 * locks, the fanout lock (uf_lock) and conn_lock. conn_lock
94 * protects the contents of the udp_t. uf_lock protects the address and the
95 * fanout information.
96 * The lock order is conn_lock -> uf_lock.
97 *
98 * The fanout lock uf_lock:
99 * When a UDP endpoint is bound to a local port, it is inserted into
100 * a bind hash list. The list consists of an array of udp_fanout_t buckets.
101 * The size of the array is controlled by the udp_bind_fanout_size variable.
331
332 /* Settable in /etc/system */
333 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
334 uint32_t udp_random_anon_port = 1;
335
336 /*
337 * Hook functions to enable cluster networking.
338 * On non-clustered systems these vectors must always be NULL
339 */
340
341 void (*cl_inet_bind)(netstackid_t stack_id, uchar_t protocol,
342 sa_family_t addr_family, uint8_t *laddrp, in_port_t lport,
343 void *args) = NULL;
344 void (*cl_inet_unbind)(netstackid_t stack_id, uint8_t protocol,
345 sa_family_t addr_family, uint8_t *laddrp, in_port_t lport,
346 void *args) = NULL;
347
348 typedef union T_primitives *t_primp_t;
349
350 /*
351 * Various protocols that encapsulate UDP have no real use for the source port.
352 * Instead, they want to vary the source port to provide better equal-cost
353 * multipathing and other systems that use fanout. Consider something like
354 * VXLAN. If you're actually sending multiple different streams to a single
355 * host, if you don't vary the source port, then the tuple of ( SRC IP, DST IP,
356 * SRC Port, DST Port) will always be the same.
357 *
358 * Here, we return a port to hash this to, if we know how to hash it. If for
359 * some reason we can't perform an L4 hash, then we just return the default
360 * value, usually the default port. After we determine the hash we transform it
361 * so that it's in the range of [ min, max ].
362 *
363 * We'd like to avoid a pull up for the sake of performing the hash. If the
364 * first mblk_t doesn't have the full protocol header, then we just send it to
365 * the default. If for some reason we have an encapsulated packet that has its
366 * protocol header in different parts of an mblk_t, then we'll go with the
367 * default port. This means that that if a driver isn't consistent about how it
368 * generates the frames for a given flow, it will not always be consistently
369 * hashed. That should be an uncommon event.
370 */
371 uint16_t
372 udp_srcport_hash(mblk_t *mp, int type, uint16_t min, uint16_t max,
373 uint16_t def)
374 {
375 size_t szused = 0;
376 struct ether_header *ether;
377 struct ether_vlan_header *vether;
378 ip6_t *ip6h;
379 ipha_t *ipha;
380 uint16_t sap;
381 uint64_t hash;
382 uint32_t mod;
383
384 ASSERT(min <= max);
385
386 if (type != UDP_HASH_VXLAN)
387 return (def);
388
389 if (!IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)))
390 return (def);
391
392 /*
393 * The following logic is VXLAN specific to get at the header, if we
394 * have formats, eg. GENEVE, then we should ignore this.
395 *
396 * The kernel overlay device often puts a first mblk_t for the data
397 * which is just the encap. If so, then we're going to use that and try
398 * to avoid a pull up.
399 */
400 if (MBLKL(mp) == VXLAN_HDR_LEN) {
401 if (mp->b_cont == NULL)
402 return (def);
403 mp = mp->b_cont;
404 ether = (struct ether_header *)mp->b_rptr;
405 } else if (MBLKL(mp) < VXLAN_HDR_LEN) {
406 return (def);
407 } else {
408 szused = VXLAN_HDR_LEN;
409 ether = (struct ether_header *)((uintptr_t)mp->b_rptr + szused);
410 }
411
412 /* Can we hold a MAC header? */
413 if (MBLKL(mp) + szused < sizeof (struct ether_header))
414 return (def);
415
416 /*
417 * We need to lie about the starting offset into the message block for
418 * convenience. Undo it at the end. We know that inet_pkt_hash() won't
419 * modify the mblk_t.
420 */
421 mp->b_rptr += szused;
422 hash = inet_pkt_hash(DL_ETHER, mp, INET_PKT_HASH_L2 |
423 INET_PKT_HASH_L3 | INET_PKT_HASH_L4);
424 mp->b_rptr -= szused;
425
426 if (hash == 0)
427 return (def);
428
429 mod = max - min + 1;
430 return ((hash % mod) + min);
431 }
432
433 /*
434 * Return the next anonymous port in the privileged port range for
435 * bind checking.
436 *
437 * Trusted Extension (TX) notes: TX allows administrator to mark or
438 * reserve ports as Multilevel ports (MLP). MLP has special function
439 * on TX systems. Once a port is made MLP, it's not available as
440 * ordinary port. This creates "holes" in the port name space. It
441 * may be necessary to skip the "holes" to find a suitable anon port.
442 */
443 static in_port_t
444 udp_get_next_priv_port(udp_t *udp)
445 {
446 static in_port_t next_priv_port = IPPORT_RESERVED - 1;
447 in_port_t nextport;
448 boolean_t restart = B_FALSE;
449 udp_stack_t *us = udp->udp_us;
450
451 retry:
452 if (next_priv_port < us->us_min_anonpriv_port ||
453 next_priv_port >= IPPORT_RESERVED) {
1651
1652 len = udp->udp_recv_ipp.ipp_ipv4_options_len;
1653 ASSERT(len != 0);
1654 bcopy(udp->udp_recv_ipp.ipp_ipv4_options, ptr, len);
1655 mutex_exit(&connp->conn_lock);
1656 return (len);
1657 }
1658 break;
1659 case IPPROTO_UDP:
1660 switch (name) {
1661 case UDP_NAT_T_ENDPOINT:
1662 mutex_enter(&connp->conn_lock);
1663 *i1 = udp->udp_nat_t_endpoint;
1664 mutex_exit(&connp->conn_lock);
1665 return (sizeof (int));
1666 case UDP_RCVHDR:
1667 mutex_enter(&connp->conn_lock);
1668 *i1 = udp->udp_rcvhdr ? 1 : 0;
1669 mutex_exit(&connp->conn_lock);
1670 return (sizeof (int));
1671 case UDP_SRCPORT_HASH:
1672 mutex_enter(&connp->conn_lock);
1673 *i1 = udp->udp_vxlanhash;
1674 mutex_exit(&connp->conn_lock);
1675 return (sizeof (int));
1676 case UDP_SND_TO_CONNECTED:
1677 mutex_enter(&connp->conn_lock);
1678 *i1 = udp->udp_snd_to_conn ? 1 : 0;
1679 mutex_exit(&connp->conn_lock);
1680 return (sizeof (int));
1681 }
1682 }
1683 mutex_enter(&connp->conn_lock);
1684 retval = conn_opt_get(&coas, level, name, ptr);
1685 mutex_exit(&connp->conn_lock);
1686 return (retval);
1687 }
1688
1689 /*
1690 * This routine retrieves the current status of socket options.
1691 * It returns the size of the option retrieved, or -1.
1692 */
1693 int
1694 udp_tpi_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
1695 {
1796 * to IPv6.
1797 */
1798 if (connp->conn_family != AF_INET) {
1799 return (EAFNOSUPPORT);
1800 }
1801
1802 if (!checkonly) {
1803 mutex_enter(&connp->conn_lock);
1804 udp->udp_nat_t_endpoint = onoff;
1805 mutex_exit(&connp->conn_lock);
1806 coa->coa_changed |= COA_HEADER_CHANGED;
1807 coa->coa_changed |= COA_WROFF_CHANGED;
1808 }
1809 /* Fully handled this option. */
1810 return (0);
1811 case UDP_RCVHDR:
1812 mutex_enter(&connp->conn_lock);
1813 udp->udp_rcvhdr = onoff;
1814 mutex_exit(&connp->conn_lock);
1815 return (0);
1816 case UDP_SRCPORT_HASH:
1817 /*
1818 * This should have already been verified, but double
1819 * check.
1820 */
1821 if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) {
1822 return (error);
1823 }
1824
1825 /* First see if the val is something we understand */
1826 if (*i1 != UDP_HASH_DISABLE && *i1 != UDP_HASH_VXLAN)
1827 return (EINVAL);
1828
1829 if (!checkonly) {
1830 mutex_enter(&connp->conn_lock);
1831 udp->udp_vxlanhash = *i1;
1832 mutex_exit(&connp->conn_lock);
1833 }
1834 /* Fully handled this option. */
1835 return (0);
1836 case UDP_SND_TO_CONNECTED:
1837 mutex_enter(&connp->conn_lock);
1838 udp->udp_snd_to_conn = onoff;
1839 mutex_exit(&connp->conn_lock);
1840 return (0);
1841 }
1842 break;
1843 }
1844 error = conn_opt_set(coa, level, name, inlen, invalp,
1845 checkonly, cr);
1846 return (error);
1847 }
1848
1849 /*
1850 * This routine sets socket options.
1851 */
1852 int
1853 udp_opt_set(conn_t *connp, uint_t optset_context, int level,
1854 int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
1855 uchar_t *outvalp, void *thisdg_attrs, cred_t *cr)
2104 outlenp, outvalp, thisdg_attrs, cr);
2105 return (error);
2106 }
2107
2108 /*
2109 * Setup IP and UDP headers.
2110 * Returns NULL on allocation failure, in which case data_mp is freed.
2111 */
2112 mblk_t *
2113 udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
2114 const in6_addr_t *v6src, const in6_addr_t *v6dst, in_port_t dstport,
2115 uint32_t flowinfo, mblk_t *data_mp, int *errorp)
2116 {
2117 mblk_t *mp;
2118 udpha_t *udpha;
2119 udp_stack_t *us = connp->conn_netstack->netstack_udp;
2120 uint_t data_len;
2121 uint32_t cksum;
2122 udp_t *udp = connp->conn_udp;
2123 boolean_t insert_spi = udp->udp_nat_t_endpoint;
2124 boolean_t hash_srcport = udp->udp_vxlanhash;
2125 uint_t ulp_hdr_len;
2126 uint16_t srcport;
2127
2128 data_len = msgdsize(data_mp);
2129 ulp_hdr_len = UDPH_SIZE;
2130 if (insert_spi)
2131 ulp_hdr_len += sizeof (uint32_t);
2132
2133 /*
2134 * If we have source port hashing going on, determine the hash before
2135 * we modify the mblk_t.
2136 */
2137 if (hash_srcport == B_TRUE) {
2138 srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN,
2139 IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX,
2140 ntohs(connp->conn_lport));
2141 }
2142
2143 mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, IPPROTO_UDP, flowinfo,
2144 ulp_hdr_len, data_mp, data_len, us->us_wroff_extra, &cksum, errorp);
2145 if (mp == NULL) {
2146 ASSERT(*errorp != 0);
2147 return (NULL);
2148 }
2149
2150 data_len += ulp_hdr_len;
2151 ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
2152
2153 udpha = (udpha_t *)(mp->b_rptr + ixa->ixa_ip_hdr_length);
2154 if (hash_srcport == B_TRUE) {
2155 udpha->uha_src_port = htons(srcport);
2156 } else {
2157 udpha->uha_src_port = connp->conn_lport;
2158 }
2159 udpha->uha_dst_port = dstport;
2160 udpha->uha_checksum = 0;
2161 udpha->uha_length = htons(data_len);
2162
2163 /*
2164 * If there was a routing option/header then conn_prepend_hdr
2165 * has massaged it and placed the pseudo-header checksum difference
2166 * in the cksum argument.
2167 *
2168 * Setup header length and prepare for ULP checksum done in IP.
2169 *
2170 * We make it easy for IP to include our pseudo header
2171 * by putting our length in uha_checksum.
2172 * The IP source, destination, and length have already been set by
2173 * conn_prepend_hdr.
2174 */
2175 cksum += data_len;
2176 cksum = (cksum >> 16) + (cksum & 0xFFFF);
2177 ASSERT(cksum < 0x10000);
2178
3313 ixa_refrele(ixa);
3314 return (error);
3315 }
3316
3317
3318 /*
3319 * Prepend the header template and then fill in the source and
3320 * flowinfo. The caller needs to handle the destination address since
3321 * its setting is different if rthdr or source route.
3322 *
3323 * Returns NULL if allocation failed or if the packet would exceed IP_MAXPACKET.
3324 * When it returns NULL it sets errorp.
3325 */
3326 static mblk_t *
3327 udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
3328 const in6_addr_t *v6src, in_port_t dstport, uint32_t flowinfo, int *errorp)
3329 {
3330 udp_t *udp = connp->conn_udp;
3331 udp_stack_t *us = udp->udp_us;
3332 boolean_t insert_spi = udp->udp_nat_t_endpoint;
3333 boolean_t hash_srcport = udp->udp_vxlanhash;
3334 uint_t pktlen;
3335 uint_t alloclen;
3336 uint_t copylen;
3337 uint8_t *iph;
3338 uint_t ip_hdr_length;
3339 udpha_t *udpha;
3340 uint32_t cksum;
3341 ip_pkt_t *ipp;
3342 uint16_t srcport;
3343
3344 ASSERT(MUTEX_HELD(&connp->conn_lock));
3345
3346 /*
3347 * If we have source port hashing going on, determine the hash before
3348 * we modify the mblk_t.
3349 */
3350 if (hash_srcport == B_TRUE) {
3351 srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN,
3352 IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX,
3353 ntohs(connp->conn_lport));
3354 }
3355
3356 /*
3357 * Copy the header template and leave space for an SPI
3358 */
3359 copylen = connp->conn_ht_iphc_len;
3360 alloclen = copylen + (insert_spi ? sizeof (uint32_t) : 0);
3361 pktlen = alloclen + msgdsize(mp);
3362 if (pktlen > IP_MAXPACKET) {
3363 freemsg(mp);
3364 *errorp = EMSGSIZE;
3365 return (NULL);
3366 }
3367 ixa->ixa_pktlen = pktlen;
3368
3369 /* check/fix buffer config, setup pointers into it */
3370 iph = mp->b_rptr - alloclen;
3371 if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
3372 mblk_t *mp1;
3373
3374 mp1 = allocb(alloclen + us->us_wroff_extra, BPRI_MED);
3375 if (mp1 == NULL) {
3376 freemsg(mp);
3434 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
3435 ip6h->ip6_src = ipp->ipp_addr;
3436 } else {
3437 ip6h->ip6_src = *v6src;
3438 }
3439 ip6h->ip6_vcf =
3440 (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
3441 (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
3442 if (ipp->ipp_fields & IPPF_TCLASS) {
3443 /* Overrides the class part of flowinfo */
3444 ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
3445 ipp->ipp_tclass);
3446 }
3447 }
3448
3449 /* Insert all-0s SPI now. */
3450 if (insert_spi)
3451 *((uint32_t *)(udpha + 1)) = 0;
3452
3453 udpha->uha_dst_port = dstport;
3454 if (hash_srcport == B_TRUE)
3455 udpha->uha_src_port = htons(srcport);
3456
3457 return (mp);
3458 }
3459
3460 /*
3461 * Send a T_UDERR_IND in response to an M_DATA
3462 */
3463 static void
3464 udp_ud_err_connected(conn_t *connp, t_scalar_t error)
3465 {
3466 struct sockaddr_storage ss;
3467 sin_t *sin;
3468 sin6_t *sin6;
3469 struct sockaddr *addr;
3470 socklen_t addrlen;
3471 mblk_t *mp1;
3472
3473 mutex_enter(&connp->conn_lock);
3474 /* Initialize addr and addrlen as if they're passed in */
3475 if (connp->conn_family == AF_INET) {
3476 sin = (sin_t *)&ss;
6044 */
6045 udp->udp_state = TS_IDLE;
6046 }
6047 return (error);
6048 }
6049
6050 int
6051 udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
6052 cred_t *cr)
6053 {
6054 sin6_t *sin6;
6055 sin_t *sin = NULL;
6056 uint_t srcid;
6057 conn_t *connp = (conn_t *)proto_handle;
6058 udp_t *udp = connp->conn_udp;
6059 int error = 0;
6060 udp_stack_t *us = udp->udp_us;
6061 ushort_t ipversion;
6062 pid_t pid = curproc->p_pid;
6063 ip_xmit_attr_t *ixa;
6064
6065 ASSERT(DB_TYPE(mp) == M_DATA);
6066
6067 /* All Solaris components should pass a cred for this operation. */
6068 ASSERT(cr != NULL);
6069
6070 /* do an implicit bind if necessary */
6071 if (udp->udp_state == TS_UNBND) {
6072 error = udp_implicit_bind(connp, cr);
6073 /*
6074 * We could be racing with an actual bind, in which case
6075 * we would see EPROTO. We cross our fingers and try
6076 * to connect.
6077 */
6078 if (!(error == 0 || error == EPROTO)) {
6079 freemsg(mp);
6080 return (error);
6081 }
6082 }
6083
6088 return (EDESTADDRREQ);
6089 }
6090 if (msg->msg_controllen != 0) {
6091 error = udp_output_ancillary(connp, NULL, NULL, mp,
6092 NULL, msg, cr, pid);
6093 } else {
6094 error = udp_output_connected(connp, mp, cr, pid);
6095 }
6096 if (us->us_sendto_ignerr)
6097 return (0);
6098 else
6099 return (error);
6100 }
6101
6102 /*
6103 * Check if we're allowed to send to a connection on which we've
6104 * already called 'connect'. The posix spec. allows both behaviors but
6105 * historically we've returned an error if already connected. The
6106 * client can allow this via a sockopt.
6107 */
6108 if (udp->udp_state == TS_DATA_XFER && !udp->udp_snd_to_conn) {
6109 UDPS_BUMP_MIB(us, udpOutErrors);
6110 return (EISCONN);
6111 }
6112
6113 error = proto_verify_ip_addr(connp->conn_family,
6114 (struct sockaddr *)msg->msg_name, msg->msg_namelen);
6115 if (error != 0) {
6116 UDPS_BUMP_MIB(us, udpOutErrors);
6117 return (error);
6118 }
6119 switch (connp->conn_family) {
6120 case AF_INET6:
6121 sin6 = (sin6_t *)msg->msg_name;
6122
6123 srcid = sin6->__sin6_src_id;
6124
6125 if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
6126 /*
6127 * Destination is a non-IPv4-compatible IPv6 address.
6128 * Send out an IPv6 format packet.
|
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
24 * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
25 */
26 /* Copyright (c) 1990 Mentat Inc. */
27
28 #include <sys/sysmacros.h>
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/stropts.h>
32 #include <sys/strlog.h>
33 #include <sys/strsun.h>
34 #define _SUN_TPI_VERSION 2
35 #include <sys/tihdr.h>
36 #include <sys/timod.h>
37 #include <sys/ddi.h>
38 #include <sys/sunddi.h>
39 #include <sys/strsubr.h>
40 #include <sys/suntpi.h>
41 #include <sys/xti_inet.h>
42 #include <sys/kmem.h>
43 #include <sys/cred_impl.h>
44 #include <sys/policy.h>
59 #include <netinet/icmp6.h>
60 #include <netinet/udp.h>
61
62 #include <inet/common.h>
63 #include <inet/ip.h>
64 #include <inet/ip_impl.h>
65 #include <inet/ipsec_impl.h>
66 #include <inet/ip6.h>
67 #include <inet/ip_ire.h>
68 #include <inet/ip_if.h>
69 #include <inet/ip_multi.h>
70 #include <inet/ip_ndp.h>
71 #include <inet/proto_set.h>
72 #include <inet/mib2.h>
73 #include <inet/optcom.h>
74 #include <inet/snmpcom.h>
75 #include <inet/kstatcom.h>
76 #include <inet/ipclassifier.h>
77 #include <sys/squeue_impl.h>
78 #include <inet/ipnet.h>
79 #include <sys/ethernet.h>
80
81 #include <sys/tsol/label.h>
82 #include <sys/tsol/tnet.h>
83 #include <rpc/pmap_prot.h>
84
85 #include <inet/udp_impl.h>
86
87 /*
88 * Synchronization notes:
89 *
90 * UDP is MT and uses the usual kernel synchronization primitives. There are 2
91 * locks, the fanout lock (uf_lock) and conn_lock. conn_lock
92 * protects the contents of the udp_t. uf_lock protects the address and the
93 * fanout information.
94 * The lock order is conn_lock -> uf_lock.
95 *
96 * The fanout lock uf_lock:
97 * When a UDP endpoint is bound to a local port, it is inserted into
98 * a bind hash list. The list consists of an array of udp_fanout_t buckets.
99 * The size of the array is controlled by the udp_bind_fanout_size variable.
329
330 /* Settable in /etc/system */
331 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
332 uint32_t udp_random_anon_port = 1;
333
334 /*
335 * Hook functions to enable cluster networking.
336 * On non-clustered systems these vectors must always be NULL
337 */
338
339 void (*cl_inet_bind)(netstackid_t stack_id, uchar_t protocol,
340 sa_family_t addr_family, uint8_t *laddrp, in_port_t lport,
341 void *args) = NULL;
342 void (*cl_inet_unbind)(netstackid_t stack_id, uint8_t protocol,
343 sa_family_t addr_family, uint8_t *laddrp, in_port_t lport,
344 void *args) = NULL;
345
346 typedef union T_primitives *t_primp_t;
347
348 /*
349 * Return the next anonymous port in the privileged port range for
350 * bind checking.
351 *
352 * Trusted Extension (TX) notes: TX allows administrator to mark or
353 * reserve ports as Multilevel ports (MLP). MLP has special function
354 * on TX systems. Once a port is made MLP, it's not available as
355 * ordinary port. This creates "holes" in the port name space. It
356 * may be necessary to skip the "holes" to find a suitable anon port.
357 */
358 static in_port_t
359 udp_get_next_priv_port(udp_t *udp)
360 {
361 static in_port_t next_priv_port = IPPORT_RESERVED - 1;
362 in_port_t nextport;
363 boolean_t restart = B_FALSE;
364 udp_stack_t *us = udp->udp_us;
365
366 retry:
367 if (next_priv_port < us->us_min_anonpriv_port ||
368 next_priv_port >= IPPORT_RESERVED) {
1566
1567 len = udp->udp_recv_ipp.ipp_ipv4_options_len;
1568 ASSERT(len != 0);
1569 bcopy(udp->udp_recv_ipp.ipp_ipv4_options, ptr, len);
1570 mutex_exit(&connp->conn_lock);
1571 return (len);
1572 }
1573 break;
1574 case IPPROTO_UDP:
1575 switch (name) {
1576 case UDP_NAT_T_ENDPOINT:
1577 mutex_enter(&connp->conn_lock);
1578 *i1 = udp->udp_nat_t_endpoint;
1579 mutex_exit(&connp->conn_lock);
1580 return (sizeof (int));
1581 case UDP_RCVHDR:
1582 mutex_enter(&connp->conn_lock);
1583 *i1 = udp->udp_rcvhdr ? 1 : 0;
1584 mutex_exit(&connp->conn_lock);
1585 return (sizeof (int));
1586 case UDP_SND_TO_CONNECTED:
1587 mutex_enter(&connp->conn_lock);
1588 *i1 = udp->udp_snd_to_conn ? 1 : 0;
1589 mutex_exit(&connp->conn_lock);
1590 return (sizeof (int));
1591 }
1592 }
1593 mutex_enter(&connp->conn_lock);
1594 retval = conn_opt_get(&coas, level, name, ptr);
1595 mutex_exit(&connp->conn_lock);
1596 return (retval);
1597 }
1598
1599 /*
1600 * This routine retrieves the current status of socket options.
1601 * It returns the size of the option retrieved, or -1.
1602 */
1603 int
1604 udp_tpi_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
1605 {
1706 * to IPv6.
1707 */
1708 if (connp->conn_family != AF_INET) {
1709 return (EAFNOSUPPORT);
1710 }
1711
1712 if (!checkonly) {
1713 mutex_enter(&connp->conn_lock);
1714 udp->udp_nat_t_endpoint = onoff;
1715 mutex_exit(&connp->conn_lock);
1716 coa->coa_changed |= COA_HEADER_CHANGED;
1717 coa->coa_changed |= COA_WROFF_CHANGED;
1718 }
1719 /* Fully handled this option. */
1720 return (0);
1721 case UDP_RCVHDR:
1722 mutex_enter(&connp->conn_lock);
1723 udp->udp_rcvhdr = onoff;
1724 mutex_exit(&connp->conn_lock);
1725 return (0);
1726 case UDP_SND_TO_CONNECTED:
1727 mutex_enter(&connp->conn_lock);
1728 udp->udp_snd_to_conn = onoff;
1729 mutex_exit(&connp->conn_lock);
1730 return (0);
1731 }
1732 break;
1733 }
1734 error = conn_opt_set(coa, level, name, inlen, invalp,
1735 checkonly, cr);
1736 return (error);
1737 }
1738
1739 /*
1740 * This routine sets socket options.
1741 */
1742 int
1743 udp_opt_set(conn_t *connp, uint_t optset_context, int level,
1744 int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
1745 uchar_t *outvalp, void *thisdg_attrs, cred_t *cr)
1994 outlenp, outvalp, thisdg_attrs, cr);
1995 return (error);
1996 }
1997
1998 /*
1999 * Setup IP and UDP headers.
2000 * Returns NULL on allocation failure, in which case data_mp is freed.
2001 */
2002 mblk_t *
2003 udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
2004 const in6_addr_t *v6src, const in6_addr_t *v6dst, in_port_t dstport,
2005 uint32_t flowinfo, mblk_t *data_mp, int *errorp)
2006 {
2007 mblk_t *mp;
2008 udpha_t *udpha;
2009 udp_stack_t *us = connp->conn_netstack->netstack_udp;
2010 uint_t data_len;
2011 uint32_t cksum;
2012 udp_t *udp = connp->conn_udp;
2013 boolean_t insert_spi = udp->udp_nat_t_endpoint;
2014 uint_t ulp_hdr_len;
2015
2016 data_len = msgdsize(data_mp);
2017 ulp_hdr_len = UDPH_SIZE;
2018 if (insert_spi)
2019 ulp_hdr_len += sizeof (uint32_t);
2020
2021 mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, IPPROTO_UDP, flowinfo,
2022 ulp_hdr_len, data_mp, data_len, us->us_wroff_extra, &cksum, errorp);
2023 if (mp == NULL) {
2024 ASSERT(*errorp != 0);
2025 return (NULL);
2026 }
2027
2028 data_len += ulp_hdr_len;
2029 ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
2030
2031 udpha = (udpha_t *)(mp->b_rptr + ixa->ixa_ip_hdr_length);
2032 udpha->uha_src_port = connp->conn_lport;
2033 udpha->uha_dst_port = dstport;
2034 udpha->uha_checksum = 0;
2035 udpha->uha_length = htons(data_len);
2036
2037 /*
2038 * If there was a routing option/header then conn_prepend_hdr
2039 * has massaged it and placed the pseudo-header checksum difference
2040 * in the cksum argument.
2041 *
2042 * Setup header length and prepare for ULP checksum done in IP.
2043 *
2044 * We make it easy for IP to include our pseudo header
2045 * by putting our length in uha_checksum.
2046 * The IP source, destination, and length have already been set by
2047 * conn_prepend_hdr.
2048 */
2049 cksum += data_len;
2050 cksum = (cksum >> 16) + (cksum & 0xFFFF);
2051 ASSERT(cksum < 0x10000);
2052
3187 ixa_refrele(ixa);
3188 return (error);
3189 }
3190
3191
3192 /*
3193 * Prepend the header template and then fill in the source and
3194 * flowinfo. The caller needs to handle the destination address since
3195 * its setting is different if rthdr or source route.
3196 *
3197 * Returns NULL if allocation failed or if the packet would exceed IP_MAXPACKET.
3198 * When it returns NULL it sets errorp.
3199 */
3200 static mblk_t *
3201 udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
3202 const in6_addr_t *v6src, in_port_t dstport, uint32_t flowinfo, int *errorp)
3203 {
3204 udp_t *udp = connp->conn_udp;
3205 udp_stack_t *us = udp->udp_us;
3206 boolean_t insert_spi = udp->udp_nat_t_endpoint;
3207 uint_t pktlen;
3208 uint_t alloclen;
3209 uint_t copylen;
3210 uint8_t *iph;
3211 uint_t ip_hdr_length;
3212 udpha_t *udpha;
3213 uint32_t cksum;
3214 ip_pkt_t *ipp;
3215
3216 ASSERT(MUTEX_HELD(&connp->conn_lock));
3217
3218 /*
3219 * Copy the header template and leave space for an SPI
3220 */
3221 copylen = connp->conn_ht_iphc_len;
3222 alloclen = copylen + (insert_spi ? sizeof (uint32_t) : 0);
3223 pktlen = alloclen + msgdsize(mp);
3224 if (pktlen > IP_MAXPACKET) {
3225 freemsg(mp);
3226 *errorp = EMSGSIZE;
3227 return (NULL);
3228 }
3229 ixa->ixa_pktlen = pktlen;
3230
3231 /* check/fix buffer config, setup pointers into it */
3232 iph = mp->b_rptr - alloclen;
3233 if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
3234 mblk_t *mp1;
3235
3236 mp1 = allocb(alloclen + us->us_wroff_extra, BPRI_MED);
3237 if (mp1 == NULL) {
3238 freemsg(mp);
3296 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
3297 ip6h->ip6_src = ipp->ipp_addr;
3298 } else {
3299 ip6h->ip6_src = *v6src;
3300 }
3301 ip6h->ip6_vcf =
3302 (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
3303 (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
3304 if (ipp->ipp_fields & IPPF_TCLASS) {
3305 /* Overrides the class part of flowinfo */
3306 ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
3307 ipp->ipp_tclass);
3308 }
3309 }
3310
3311 /* Insert all-0s SPI now. */
3312 if (insert_spi)
3313 *((uint32_t *)(udpha + 1)) = 0;
3314
3315 udpha->uha_dst_port = dstport;
3316 return (mp);
3317 }
3318
3319 /*
3320 * Send a T_UDERR_IND in response to an M_DATA
3321 */
3322 static void
3323 udp_ud_err_connected(conn_t *connp, t_scalar_t error)
3324 {
3325 struct sockaddr_storage ss;
3326 sin_t *sin;
3327 sin6_t *sin6;
3328 struct sockaddr *addr;
3329 socklen_t addrlen;
3330 mblk_t *mp1;
3331
3332 mutex_enter(&connp->conn_lock);
3333 /* Initialize addr and addrlen as if they're passed in */
3334 if (connp->conn_family == AF_INET) {
3335 sin = (sin_t *)&ss;
5903 */
5904 udp->udp_state = TS_IDLE;
5905 }
5906 return (error);
5907 }
5908
5909 int
5910 udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
5911 cred_t *cr)
5912 {
5913 sin6_t *sin6;
5914 sin_t *sin = NULL;
5915 uint_t srcid;
5916 conn_t *connp = (conn_t *)proto_handle;
5917 udp_t *udp = connp->conn_udp;
5918 int error = 0;
5919 udp_stack_t *us = udp->udp_us;
5920 ushort_t ipversion;
5921 pid_t pid = curproc->p_pid;
5922 ip_xmit_attr_t *ixa;
5923 boolean_t snd_to_conn;
5924
5925 ASSERT(DB_TYPE(mp) == M_DATA);
5926
5927 /* All Solaris components should pass a cred for this operation. */
5928 ASSERT(cr != NULL);
5929
5930 /* do an implicit bind if necessary */
5931 if (udp->udp_state == TS_UNBND) {
5932 error = udp_implicit_bind(connp, cr);
5933 /*
5934 * We could be racing with an actual bind, in which case
5935 * we would see EPROTO. We cross our fingers and try
5936 * to connect.
5937 */
5938 if (!(error == 0 || error == EPROTO)) {
5939 freemsg(mp);
5940 return (error);
5941 }
5942 }
5943
5948 return (EDESTADDRREQ);
5949 }
5950 if (msg->msg_controllen != 0) {
5951 error = udp_output_ancillary(connp, NULL, NULL, mp,
5952 NULL, msg, cr, pid);
5953 } else {
5954 error = udp_output_connected(connp, mp, cr, pid);
5955 }
5956 if (us->us_sendto_ignerr)
5957 return (0);
5958 else
5959 return (error);
5960 }
5961
5962 /*
5963 * Check if we're allowed to send to a connection on which we've
5964 * already called 'connect'. The posix spec. allows both behaviors but
5965 * historically we've returned an error if already connected. The
5966 * client can allow this via a sockopt.
5967 */
5968 mutex_enter(&connp->conn_lock);
5969 snd_to_conn = (udp->udp_snd_to_conn != 0);
5970 mutex_exit(&connp->conn_lock);
5971 if (udp->udp_state == TS_DATA_XFER && !snd_to_conn) {
5972 UDPS_BUMP_MIB(us, udpOutErrors);
5973 return (EISCONN);
5974 }
5975
5976 error = proto_verify_ip_addr(connp->conn_family,
5977 (struct sockaddr *)msg->msg_name, msg->msg_namelen);
5978 if (error != 0) {
5979 UDPS_BUMP_MIB(us, udpOutErrors);
5980 return (error);
5981 }
5982 switch (connp->conn_family) {
5983 case AF_INET6:
5984 sin6 = (sin6_t *)msg->msg_name;
5985
5986 srcid = sin6->__sin6_src_id;
5987
5988 if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
5989 /*
5990 * Destination is a non-IPv4-compatible IPv6 address.
5991 * Send out an IPv6 format packet.
|