Print this page




   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
  25  * Copyright 2015, Joyent, Inc.
  26  */
  27 /* Copyright (c) 1990 Mentat Inc. */
  28 
  29 #include <sys/sysmacros.h>
  30 #include <sys/types.h>
  31 #include <sys/stream.h>
  32 #include <sys/stropts.h>
  33 #include <sys/strlog.h>
  34 #include <sys/strsun.h>
  35 #define _SUN_TPI_VERSION 2
  36 #include <sys/tihdr.h>
  37 #include <sys/timod.h>
  38 #include <sys/ddi.h>
  39 #include <sys/sunddi.h>
  40 #include <sys/strsubr.h>
  41 #include <sys/suntpi.h>
  42 #include <sys/xti_inet.h>
  43 #include <sys/kmem.h>
  44 #include <sys/cred_impl.h>
  45 #include <sys/policy.h>


  60 #include <netinet/icmp6.h>
  61 #include <netinet/udp.h>
  62 
  63 #include <inet/common.h>
  64 #include <inet/ip.h>
  65 #include <inet/ip_impl.h>
  66 #include <inet/ipsec_impl.h>
  67 #include <inet/ip6.h>
  68 #include <inet/ip_ire.h>
  69 #include <inet/ip_if.h>
  70 #include <inet/ip_multi.h>
  71 #include <inet/ip_ndp.h>
  72 #include <inet/proto_set.h>
  73 #include <inet/mib2.h>
  74 #include <inet/optcom.h>
  75 #include <inet/snmpcom.h>
  76 #include <inet/kstatcom.h>
  77 #include <inet/ipclassifier.h>
  78 #include <sys/squeue_impl.h>
  79 #include <inet/ipnet.h>
  80 #include <sys/vxlan.h>
  81 #include <inet/inet_hash.h>
  82 
  83 #include <sys/tsol/label.h>
  84 #include <sys/tsol/tnet.h>
  85 #include <rpc/pmap_prot.h>
  86 
  87 #include <inet/udp_impl.h>
  88 
  89 /*
  90  * Synchronization notes:
  91  *
  92  * UDP is MT and uses the usual kernel synchronization primitives. There are 2
  93  * locks, the fanout lock (uf_lock) and conn_lock. conn_lock
  94  * protects the contents of the udp_t. uf_lock protects the address and the
  95  * fanout information.
  96  * The lock order is conn_lock -> uf_lock.
  97  *
  98  * The fanout lock uf_lock:
  99  * When a UDP endpoint is bound to a local port, it is inserted into
 100  * a bind hash list.  The list consists of an array of udp_fanout_t buckets.
 101  * The size of the array is controlled by the udp_bind_fanout_size variable.


 331 
 332 /* Setable in /etc/system */
 333 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
 334 uint32_t udp_random_anon_port = 1;
 335 
 336 /*
 337  * Hook functions to enable cluster networking.
 338  * On non-clustered systems these vectors must always be NULL
 339  */
 340 
 341 void (*cl_inet_bind)(netstackid_t stack_id, uchar_t protocol,
 342     sa_family_t addr_family, uint8_t *laddrp, in_port_t lport,
 343     void *args) = NULL;
 344 void (*cl_inet_unbind)(netstackid_t stack_id, uint8_t protocol,
 345     sa_family_t addr_family, uint8_t *laddrp, in_port_t lport,
 346     void *args) = NULL;
 347 
 348 typedef union T_primitives *t_primp_t;
 349 
 350 /*
 351  * Various protocols that encapsulate UDP have no real use for the source port.
 352  * Instead, they want to vary the source port to provide better equal-cost
 353  * multipathing and other systems that use fanout. Consider something like
 354  * VXLAN. If you're actually sending multiple different streams to a single
 355  * host, if you don't vary the source port, then the tuple of ( SRC IP, DST IP,
 356  * SRC Port, DST Port) will always be the same.
 357  *
 358  * Here, we return a port to hash this to, if we know how to hash it. If for
 359  * some reason we can't perform an L4 hash, then we just return the default
 360  * value, usually the default port. After we determine the hash we transform it
 361  * so that it's in the range of [ min, max ].
 362  *
 363  * We'd like to avoid a pull up for the sake of performing the hash. If the
 364  * first mblk_t doesn't have the full protocol header, then we just send it to
 365  * the default. If for some reason we have an encapsulated packet that has its
 366  * protocol header in different parts of an mblk_t, then we'll go with the
 367  * default port. This means that that if a driver isn't consistent about how it
 368  * generates the frames for a given flow, it will not always be consistently
 369  * hashed. That should be an uncommon event.
 370  */
 371 uint16_t
 372 udp_srcport_hash(mblk_t *mp, int type, uint16_t min, uint16_t max,
 373     uint16_t def)
 374 {
 375         size_t szused = 0;
 376         struct ether_header *ether;
 377         struct ether_vlan_header *vether;
 378         ip6_t *ip6h;
 379         ipha_t *ipha;
 380         uint16_t sap;
 381         uint64_t hash;
 382         uint32_t mod;
 383 
 384         ASSERT(min <= max);
 385 
 386         if (type != UDP_HASH_VXLAN)
 387                 return (def);
 388 
 389         if (!IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)))
 390                 return (def);
 391 
 392         /*
 393          * The following logic is VXLAN specific to get at the header, if we
 394          * have formats, eg. GENEVE, then we should ignore this.
 395          *
 396          * The kernel overlay device often puts a first mblk_t for the data
 397          * which is just the encap. If so, then we're going to use that and try
 398          * to avoid a pull up.
 399          */
 400         if (MBLKL(mp) == VXLAN_HDR_LEN) {
 401                 if (mp->b_cont == NULL)
 402                         return (def);
 403                 mp = mp->b_cont;
 404                 ether = (struct ether_header *)mp->b_rptr;
 405         } else if (MBLKL(mp) < VXLAN_HDR_LEN) {
 406                 return (def);
 407         } else {
 408                 szused = VXLAN_HDR_LEN;
 409                 ether = (struct ether_header *)((uintptr_t)mp->b_rptr + szused);
 410         }
 411 
 412         /* Can we hold a MAC header? */
 413         if (MBLKL(mp) + szused < sizeof (struct ether_header))
 414                 return (def);
 415 
 416         /*
 417          * We need to lie about the starting offset into the message block for
 418          * convenience. Undo it at the end. We know that inet_pkt_hash() won't
 419          * modify the mblk_t.
 420          */
 421         mp->b_rptr += szused;
 422         hash = inet_pkt_hash(DL_ETHER, mp, INET_PKT_HASH_L2 |
 423             INET_PKT_HASH_L3 | INET_PKT_HASH_L4);
 424         mp->b_rptr -= szused;
 425 
 426         if (hash == 0)
 427                 return (def);
 428 
 429         mod = max - min + 1;
 430         return ((hash % mod) + min);
 431 }
 432 
 433 /*
 434  * Return the next anonymous port in the privileged port range for
 435  * bind checking.
 436  *
 437  * Trusted Extension (TX) notes: TX allows administrator to mark or
 438  * reserve ports as Multilevel ports (MLP). MLP has special function
 439  * on TX systems. Once a port is made MLP, it's not available as
 440  * ordinary port. This creates "holes" in the port name space. It
 441  * may be necessary to skip the "holes" to find a suitable anon port.
 442  */
 443 static in_port_t
 444 udp_get_next_priv_port(udp_t *udp)
 445 {
 446         static in_port_t next_priv_port = IPPORT_RESERVED - 1;
 447         in_port_t nextport;
 448         boolean_t restart = B_FALSE;
 449         udp_stack_t *us = udp->udp_us;
 450 
 451 retry:
 452         if (next_priv_port < us->us_min_anonpriv_port ||
 453             next_priv_port >= IPPORT_RESERVED) {


1651 
1652                         len = udp->udp_recv_ipp.ipp_ipv4_options_len;
1653                         ASSERT(len != 0);
1654                         bcopy(udp->udp_recv_ipp.ipp_ipv4_options, ptr, len);
1655                         mutex_exit(&connp->conn_lock);
1656                         return (len);
1657                 }
1658                 break;
1659         case IPPROTO_UDP:
1660                 switch (name) {
1661                 case UDP_NAT_T_ENDPOINT:
1662                         mutex_enter(&connp->conn_lock);
1663                         *i1 = udp->udp_nat_t_endpoint;
1664                         mutex_exit(&connp->conn_lock);
1665                         return (sizeof (int));
1666                 case UDP_RCVHDR:
1667                         mutex_enter(&connp->conn_lock);
1668                         *i1 = udp->udp_rcvhdr ? 1 : 0;
1669                         mutex_exit(&connp->conn_lock);
1670                         return (sizeof (int));
1671                 case UDP_SRCPORT_HASH:
1672                         mutex_enter(&connp->conn_lock);
1673                         *i1 = udp->udp_vxlanhash;
1674                         mutex_exit(&connp->conn_lock);
1675                         return (sizeof (int));
1676                 case UDP_SND_TO_CONNECTED:
1677                         mutex_enter(&connp->conn_lock);
1678                         *i1 = udp->udp_snd_to_conn ? 1 : 0;
1679                         mutex_exit(&connp->conn_lock);
1680                         return (sizeof (int));
1681                 }
1682         }
1683         mutex_enter(&connp->conn_lock);
1684         retval = conn_opt_get(&coas, level, name, ptr);
1685         mutex_exit(&connp->conn_lock);
1686         return (retval);
1687 }
1688 
1689 /*
1690  * This routine retrieves the current status of socket options.
1691  * It returns the size of the option retrieved, or -1.
1692  */
1693 int
1694 udp_tpi_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
1695 {


1796                          * to IPv6.
1797                          */
1798                         if (connp->conn_family != AF_INET) {
1799                                 return (EAFNOSUPPORT);
1800                         }
1801 
1802                         if (!checkonly) {
1803                                 mutex_enter(&connp->conn_lock);
1804                                 udp->udp_nat_t_endpoint = onoff;
1805                                 mutex_exit(&connp->conn_lock);
1806                                 coa->coa_changed |= COA_HEADER_CHANGED;
1807                                 coa->coa_changed |= COA_WROFF_CHANGED;
1808                         }
1809                         /* Fully handled this option. */
1810                         return (0);
1811                 case UDP_RCVHDR:
1812                         mutex_enter(&connp->conn_lock);
1813                         udp->udp_rcvhdr = onoff;
1814                         mutex_exit(&connp->conn_lock);
1815                         return (0);
1816                 case UDP_SRCPORT_HASH:
1817                         /*
1818                          * This should have already been verified, but double
1819                          * check.
1820                          */
1821                         if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) {
1822                                 return (error);
1823                         }
1824 
1825                         /* First see if the val is something we understand */
1826                         if (*i1 != UDP_HASH_DISABLE && *i1 != UDP_HASH_VXLAN)
1827                                 return (EINVAL);
1828 
1829                         if (!checkonly) {
1830                                 mutex_enter(&connp->conn_lock);
1831                                 udp->udp_vxlanhash = *i1;
1832                                 mutex_exit(&connp->conn_lock);
1833                         }
1834                         /* Fully handled this option. */
1835                         return (0);
1836                 case UDP_SND_TO_CONNECTED:
1837                         mutex_enter(&connp->conn_lock);
1838                         udp->udp_snd_to_conn = onoff;
1839                         mutex_exit(&connp->conn_lock);
1840                         return (0);
1841                 }
1842                 break;
1843         }
1844         error = conn_opt_set(coa, level, name, inlen, invalp,
1845             checkonly, cr);
1846         return (error);
1847 }
1848 
1849 /*
1850  * This routine sets socket options.
1851  */
1852 int
1853 udp_opt_set(conn_t *connp, uint_t optset_context, int level,
1854     int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
1855     uchar_t *outvalp, void *thisdg_attrs, cred_t *cr)


2104             outlenp, outvalp, thisdg_attrs, cr);
2105         return (error);
2106 }
2107 
2108 /*
2109  * Setup IP and UDP headers.
2110  * Returns NULL on allocation failure, in which case data_mp is freed.
2111  */
2112 mblk_t *
2113 udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
2114     const in6_addr_t *v6src, const in6_addr_t *v6dst, in_port_t dstport,
2115     uint32_t flowinfo, mblk_t *data_mp, int *errorp)
2116 {
2117         mblk_t          *mp;
2118         udpha_t         *udpha;
2119         udp_stack_t     *us = connp->conn_netstack->netstack_udp;
2120         uint_t          data_len;
2121         uint32_t        cksum;
2122         udp_t           *udp = connp->conn_udp;
2123         boolean_t       insert_spi = udp->udp_nat_t_endpoint;
2124         boolean_t       hash_srcport = udp->udp_vxlanhash;
2125         uint_t          ulp_hdr_len;
2126         uint16_t        srcport;
2127 
2128         data_len = msgdsize(data_mp);
2129         ulp_hdr_len = UDPH_SIZE;
2130         if (insert_spi)
2131                 ulp_hdr_len += sizeof (uint32_t);
2132 
2133         /*
2134          * If we have source port hashing going on, determine the hash before
2135          * we modify the mblk_t.
2136          */
2137         if (hash_srcport == B_TRUE) {
2138                 srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN,
2139                     IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX,
2140                     ntohs(connp->conn_lport));
2141         }
2142 
2143         mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, IPPROTO_UDP, flowinfo,
2144             ulp_hdr_len, data_mp, data_len, us->us_wroff_extra, &cksum, errorp);
2145         if (mp == NULL) {
2146                 ASSERT(*errorp != 0);
2147                 return (NULL);
2148         }
2149 
2150         data_len += ulp_hdr_len;
2151         ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
2152 
2153         udpha = (udpha_t *)(mp->b_rptr + ixa->ixa_ip_hdr_length);
2154         if (hash_srcport == B_TRUE) {
2155                 udpha->uha_src_port = htons(srcport);
2156         } else {
2157                 udpha->uha_src_port = connp->conn_lport;
2158         }
2159         udpha->uha_dst_port = dstport;
2160         udpha->uha_checksum = 0;
2161         udpha->uha_length = htons(data_len);
2162 
2163         /*
2164          * If there was a routing option/header then conn_prepend_hdr
2165          * has massaged it and placed the pseudo-header checksum difference
2166          * in the cksum argument.
2167          *
2168          * Setup header length and prepare for ULP checksum done in IP.
2169          *
2170          * We make it easy for IP to include our pseudo header
2171          * by putting our length in uha_checksum.
2172          * The IP source, destination, and length have already been set by
2173          * conn_prepend_hdr.
2174          */
2175         cksum += data_len;
2176         cksum = (cksum >> 16) + (cksum & 0xFFFF);
2177         ASSERT(cksum < 0x10000);
2178 


3313         ixa_refrele(ixa);
3314         return (error);
3315 }
3316 
3317 
3318 /*
3319  * Prepend the header template and then fill in the source and
3320  * flowinfo. The caller needs to handle the destination address since
3321  * its setting is different if rthdr or source route.
3322  *
3323  * Returns NULL if allocation failed or if the packet would exceed IP_MAXPACKET.
3324  * When it returns NULL it sets errorp.
3325  */
3326 static mblk_t *
3327 udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
3328     const in6_addr_t *v6src, in_port_t dstport, uint32_t flowinfo, int *errorp)
3329 {
3330         udp_t           *udp = connp->conn_udp;
3331         udp_stack_t     *us = udp->udp_us;
3332         boolean_t       insert_spi = udp->udp_nat_t_endpoint;
3333         boolean_t       hash_srcport = udp->udp_vxlanhash;
3334         uint_t          pktlen;
3335         uint_t          alloclen;
3336         uint_t          copylen;
3337         uint8_t         *iph;
3338         uint_t          ip_hdr_length;
3339         udpha_t         *udpha;
3340         uint32_t        cksum;
3341         ip_pkt_t        *ipp;
3342         uint16_t        srcport;
3343 
3344         ASSERT(MUTEX_HELD(&connp->conn_lock));
3345 
3346         /*
3347          * If we have source port hashing going on, determine the hash before
3348          * we modify the mblk_t.
3349          */
3350         if (hash_srcport == B_TRUE) {
3351                 srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN,
3352                     IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX,
3353                     ntohs(connp->conn_lport));
3354         }
3355 
3356         /*
3357          * Copy the header template and leave space for an SPI
3358          */
3359         copylen = connp->conn_ht_iphc_len;
3360         alloclen = copylen + (insert_spi ? sizeof (uint32_t) : 0);
3361         pktlen = alloclen + msgdsize(mp);
3362         if (pktlen > IP_MAXPACKET) {
3363                 freemsg(mp);
3364                 *errorp = EMSGSIZE;
3365                 return (NULL);
3366         }
3367         ixa->ixa_pktlen = pktlen;
3368 
3369         /* check/fix buffer config, setup pointers into it */
3370         iph = mp->b_rptr - alloclen;
3371         if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
3372                 mblk_t *mp1;
3373 
3374                 mp1 = allocb(alloclen + us->us_wroff_extra, BPRI_MED);
3375                 if (mp1 == NULL) {
3376                         freemsg(mp);


3434                         ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
3435                         ip6h->ip6_src = ipp->ipp_addr;
3436                 } else {
3437                         ip6h->ip6_src = *v6src;
3438                 }
3439                 ip6h->ip6_vcf =
3440                     (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
3441                     (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
3442                 if (ipp->ipp_fields & IPPF_TCLASS) {
3443                         /* Overrides the class part of flowinfo */
3444                         ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
3445                             ipp->ipp_tclass);
3446                 }
3447         }
3448 
3449         /* Insert all-0s SPI now. */
3450         if (insert_spi)
3451                 *((uint32_t *)(udpha + 1)) = 0;
3452 
3453         udpha->uha_dst_port = dstport;
3454         if (hash_srcport == B_TRUE)
3455                 udpha->uha_src_port = htons(srcport);
3456 
3457         return (mp);
3458 }
3459 
3460 /*
3461  * Send a T_UDERR_IND in response to an M_DATA
3462  */
3463 static void
3464 udp_ud_err_connected(conn_t *connp, t_scalar_t error)
3465 {
3466         struct sockaddr_storage ss;
3467         sin_t           *sin;
3468         sin6_t          *sin6;
3469         struct sockaddr *addr;
3470         socklen_t       addrlen;
3471         mblk_t          *mp1;
3472 
3473         mutex_enter(&connp->conn_lock);
3474         /* Initialize addr and addrlen as if they're passed in */
3475         if (connp->conn_family == AF_INET) {
3476                 sin = (sin_t *)&ss;


6044                  */
6045                 udp->udp_state = TS_IDLE;
6046         }
6047         return (error);
6048 }
6049 
6050 int
6051 udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
6052     cred_t *cr)
6053 {
6054         sin6_t          *sin6;
6055         sin_t           *sin = NULL;
6056         uint_t          srcid;
6057         conn_t          *connp = (conn_t *)proto_handle;
6058         udp_t           *udp = connp->conn_udp;
6059         int             error = 0;
6060         udp_stack_t     *us = udp->udp_us;
6061         ushort_t        ipversion;
6062         pid_t           pid = curproc->p_pid;
6063         ip_xmit_attr_t  *ixa;

6064 
6065         ASSERT(DB_TYPE(mp) == M_DATA);
6066 
6067         /* All Solaris components should pass a cred for this operation. */
6068         ASSERT(cr != NULL);
6069 
6070         /* do an implicit bind if necessary */
6071         if (udp->udp_state == TS_UNBND) {
6072                 error = udp_implicit_bind(connp, cr);
6073                 /*
6074                  * We could be racing with an actual bind, in which case
6075                  * we would see EPROTO. We cross our fingers and try
6076                  * to connect.
6077                  */
6078                 if (!(error == 0 || error == EPROTO)) {
6079                         freemsg(mp);
6080                         return (error);
6081                 }
6082         }
6083 


6088                         return (EDESTADDRREQ);
6089                 }
6090                 if (msg->msg_controllen != 0) {
6091                         error = udp_output_ancillary(connp, NULL, NULL, mp,
6092                             NULL, msg, cr, pid);
6093                 } else {
6094                         error = udp_output_connected(connp, mp, cr, pid);
6095                 }
6096                 if (us->us_sendto_ignerr)
6097                         return (0);
6098                 else
6099                         return (error);
6100         }
6101 
6102         /*
6103          * Check if we're allowed to send to a connection on which we've
6104          * already called 'connect'. The posix spec. allows both behaviors but
6105          * historically we've returned an error if already connected. The
6106          * client can allow this via a sockopt.
6107          */
6108         if (udp->udp_state == TS_DATA_XFER && !udp->udp_snd_to_conn) {



6109                 UDPS_BUMP_MIB(us, udpOutErrors);
6110                 return (EISCONN);
6111         }
6112 
6113         error = proto_verify_ip_addr(connp->conn_family,
6114             (struct sockaddr *)msg->msg_name, msg->msg_namelen);
6115         if (error != 0) {
6116                 UDPS_BUMP_MIB(us, udpOutErrors);
6117                 return (error);
6118         }
6119         switch (connp->conn_family) {
6120         case AF_INET6:
6121                 sin6 = (sin6_t *)msg->msg_name;
6122 
6123                 srcid = sin6->__sin6_src_id;
6124 
6125                 if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
6126                         /*
6127                          * Destination is a non-IPv4-compatible IPv6 address.
6128                          * Send out an IPv6 format packet.




   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.

  25  */
  26 /* Copyright (c) 1990 Mentat Inc. */
  27 
  28 #include <sys/sysmacros.h>
  29 #include <sys/types.h>
  30 #include <sys/stream.h>
  31 #include <sys/stropts.h>
  32 #include <sys/strlog.h>
  33 #include <sys/strsun.h>
  34 #define _SUN_TPI_VERSION 2
  35 #include <sys/tihdr.h>
  36 #include <sys/timod.h>
  37 #include <sys/ddi.h>
  38 #include <sys/sunddi.h>
  39 #include <sys/strsubr.h>
  40 #include <sys/suntpi.h>
  41 #include <sys/xti_inet.h>
  42 #include <sys/kmem.h>
  43 #include <sys/cred_impl.h>
  44 #include <sys/policy.h>


  59 #include <netinet/icmp6.h>
  60 #include <netinet/udp.h>
  61 
  62 #include <inet/common.h>
  63 #include <inet/ip.h>
  64 #include <inet/ip_impl.h>
  65 #include <inet/ipsec_impl.h>
  66 #include <inet/ip6.h>
  67 #include <inet/ip_ire.h>
  68 #include <inet/ip_if.h>
  69 #include <inet/ip_multi.h>
  70 #include <inet/ip_ndp.h>
  71 #include <inet/proto_set.h>
  72 #include <inet/mib2.h>
  73 #include <inet/optcom.h>
  74 #include <inet/snmpcom.h>
  75 #include <inet/kstatcom.h>
  76 #include <inet/ipclassifier.h>
  77 #include <sys/squeue_impl.h>
  78 #include <inet/ipnet.h>
  79 #include <sys/ethernet.h>

  80 
  81 #include <sys/tsol/label.h>
  82 #include <sys/tsol/tnet.h>
  83 #include <rpc/pmap_prot.h>
  84 
  85 #include <inet/udp_impl.h>
  86 
  87 /*
  88  * Synchronization notes:
  89  *
  90  * UDP is MT and uses the usual kernel synchronization primitives. There are 2
  91  * locks, the fanout lock (uf_lock) and conn_lock. conn_lock
  92  * protects the contents of the udp_t. uf_lock protects the address and the
  93  * fanout information.
  94  * The lock order is conn_lock -> uf_lock.
  95  *
  96  * The fanout lock uf_lock:
  97  * When a UDP endpoint is bound to a local port, it is inserted into
  98  * a bind hash list.  The list consists of an array of udp_fanout_t buckets.
  99  * The size of the array is controlled by the udp_bind_fanout_size variable.


 329 
 330 /* Setable in /etc/system */
 331 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
 332 uint32_t udp_random_anon_port = 1;
 333 
 334 /*
 335  * Hook functions to enable cluster networking.
 336  * On non-clustered systems these vectors must always be NULL
 337  */
 338 
 339 void (*cl_inet_bind)(netstackid_t stack_id, uchar_t protocol,
 340     sa_family_t addr_family, uint8_t *laddrp, in_port_t lport,
 341     void *args) = NULL;
 342 void (*cl_inet_unbind)(netstackid_t stack_id, uint8_t protocol,
 343     sa_family_t addr_family, uint8_t *laddrp, in_port_t lport,
 344     void *args) = NULL;
 345 
 346 typedef union T_primitives *t_primp_t;
 347 
 348 /*



















































































 349  * Return the next anonymous port in the privileged port range for
 350  * bind checking.
 351  *
 352  * Trusted Extension (TX) notes: TX allows administrator to mark or
 353  * reserve ports as Multilevel ports (MLP). MLP has special function
 354  * on TX systems. Once a port is made MLP, it's not available as
 355  * ordinary port. This creates "holes" in the port name space. It
 356  * may be necessary to skip the "holes" find a suitable anon port.
 357  */
 358 static in_port_t
 359 udp_get_next_priv_port(udp_t *udp)
 360 {
 361         static in_port_t next_priv_port = IPPORT_RESERVED - 1;
 362         in_port_t nextport;
 363         boolean_t restart = B_FALSE;
 364         udp_stack_t *us = udp->udp_us;
 365 
 366 retry:
 367         if (next_priv_port < us->us_min_anonpriv_port ||
 368             next_priv_port >= IPPORT_RESERVED) {


1566 
1567                         len = udp->udp_recv_ipp.ipp_ipv4_options_len;
1568                         ASSERT(len != 0);
1569                         bcopy(udp->udp_recv_ipp.ipp_ipv4_options, ptr, len);
1570                         mutex_exit(&connp->conn_lock);
1571                         return (len);
1572                 }
1573                 break;
1574         case IPPROTO_UDP:
1575                 switch (name) {
1576                 case UDP_NAT_T_ENDPOINT:
1577                         mutex_enter(&connp->conn_lock);
1578                         *i1 = udp->udp_nat_t_endpoint;
1579                         mutex_exit(&connp->conn_lock);
1580                         return (sizeof (int));
1581                 case UDP_RCVHDR:
1582                         mutex_enter(&connp->conn_lock);
1583                         *i1 = udp->udp_rcvhdr ? 1 : 0;
1584                         mutex_exit(&connp->conn_lock);
1585                         return (sizeof (int));





1586                 case UDP_SND_TO_CONNECTED:
1587                         mutex_enter(&connp->conn_lock);
1588                         *i1 = udp->udp_snd_to_conn ? 1 : 0;
1589                         mutex_exit(&connp->conn_lock);
1590                         return (sizeof (int));
1591                 }
1592         }
1593         mutex_enter(&connp->conn_lock);
1594         retval = conn_opt_get(&coas, level, name, ptr);
1595         mutex_exit(&connp->conn_lock);
1596         return (retval);
1597 }
1598 
1599 /*
1600  * This routine retrieves the current status of socket options.
1601  * It returns the size of the option retrieved, or -1.
1602  */
1603 int
1604 udp_tpi_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
1605 {


1706                          * to IPv6.
1707                          */
1708                         if (connp->conn_family != AF_INET) {
1709                                 return (EAFNOSUPPORT);
1710                         }
1711 
1712                         if (!checkonly) {
1713                                 mutex_enter(&connp->conn_lock);
1714                                 udp->udp_nat_t_endpoint = onoff;
1715                                 mutex_exit(&connp->conn_lock);
1716                                 coa->coa_changed |= COA_HEADER_CHANGED;
1717                                 coa->coa_changed |= COA_WROFF_CHANGED;
1718                         }
1719                         /* Fully handled this option. */
1720                         return (0);
1721                 case UDP_RCVHDR:
1722                         mutex_enter(&connp->conn_lock);
1723                         udp->udp_rcvhdr = onoff;
1724                         mutex_exit(&connp->conn_lock);
1725                         return (0);




















1726                 case UDP_SND_TO_CONNECTED:
1727                         mutex_enter(&connp->conn_lock);
1728                         udp->udp_snd_to_conn = onoff;
1729                         mutex_exit(&connp->conn_lock);
1730                         return (0);
1731                 }
1732                 break;
1733         }
1734         error = conn_opt_set(coa, level, name, inlen, invalp,
1735             checkonly, cr);
1736         return (error);
1737 }
1738 
1739 /*
1740  * This routine sets socket options.
1741  */
1742 int
1743 udp_opt_set(conn_t *connp, uint_t optset_context, int level,
1744     int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
1745     uchar_t *outvalp, void *thisdg_attrs, cred_t *cr)


1994             outlenp, outvalp, thisdg_attrs, cr);
1995         return (error);
1996 }
1997 
1998 /*
1999  * Setup IP and UDP headers.
2000  * Returns NULL on allocation failure, in which case data_mp is freed.
2001  */
2002 mblk_t *
2003 udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
2004     const in6_addr_t *v6src, const in6_addr_t *v6dst, in_port_t dstport,
2005     uint32_t flowinfo, mblk_t *data_mp, int *errorp)
2006 {
2007         mblk_t          *mp;
2008         udpha_t         *udpha;
2009         udp_stack_t     *us = connp->conn_netstack->netstack_udp;
2010         uint_t          data_len;
2011         uint32_t        cksum;
2012         udp_t           *udp = connp->conn_udp;
2013         boolean_t       insert_spi = udp->udp_nat_t_endpoint;

2014         uint_t          ulp_hdr_len;

2015 
2016         data_len = msgdsize(data_mp);
2017         ulp_hdr_len = UDPH_SIZE;
2018         if (insert_spi)
2019                 ulp_hdr_len += sizeof (uint32_t);
2020 










2021         mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, IPPROTO_UDP, flowinfo,
2022             ulp_hdr_len, data_mp, data_len, us->us_wroff_extra, &cksum, errorp);
2023         if (mp == NULL) {
2024                 ASSERT(*errorp != 0);
2025                 return (NULL);
2026         }
2027 
2028         data_len += ulp_hdr_len;
2029         ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
2030 
2031         udpha = (udpha_t *)(mp->b_rptr + ixa->ixa_ip_hdr_length);



2032         udpha->uha_src_port = connp->conn_lport;

2033         udpha->uha_dst_port = dstport;
2034         udpha->uha_checksum = 0;
2035         udpha->uha_length = htons(data_len);
2036 
2037         /*
2038          * If there was a routing option/header then conn_prepend_hdr
2039          * has massaged it and placed the pseudo-header checksum difference
2040          * in the cksum argument.
2041          *
2042          * Setup header length and prepare for ULP checksum done in IP.
2043          *
2044          * We make it easy for IP to include our pseudo header
2045          * by putting our length in uha_checksum.
2046          * The IP source, destination, and length have already been set by
2047          * conn_prepend_hdr.
2048          */
2049         cksum += data_len;
2050         cksum = (cksum >> 16) + (cksum & 0xFFFF);
2051         ASSERT(cksum < 0x10000);
2052 


3187         ixa_refrele(ixa);
3188         return (error);
3189 }
3190 
3191 
3192 /*
3193  * Prepend the header template and then fill in the source and
3194  * flowinfo. The caller needs to handle the destination address since
3195  * its setting is different if rthdr or source route.
3196  *
3197  * Returns NULL if allocation failed or if the packet would exceed IP_MAXPACKET.
3198  * When it returns NULL it sets errorp.
3199  */
3200 static mblk_t *
3201 udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
3202     const in6_addr_t *v6src, in_port_t dstport, uint32_t flowinfo, int *errorp)
3203 {
3204         udp_t           *udp = connp->conn_udp;
3205         udp_stack_t     *us = udp->udp_us;
3206         boolean_t       insert_spi = udp->udp_nat_t_endpoint;

3207         uint_t          pktlen;
3208         uint_t          alloclen;
3209         uint_t          copylen;
3210         uint8_t         *iph;
3211         uint_t          ip_hdr_length;
3212         udpha_t         *udpha;
3213         uint32_t        cksum;
3214         ip_pkt_t        *ipp;

3215 
3216         ASSERT(MUTEX_HELD(&connp->conn_lock));
3217 
3218         /*










3219          * Copy the header template and leave space for an SPI
3220          */
3221         copylen = connp->conn_ht_iphc_len;
3222         alloclen = copylen + (insert_spi ? sizeof (uint32_t) : 0);
3223         pktlen = alloclen + msgdsize(mp);
3224         if (pktlen > IP_MAXPACKET) {
3225                 freemsg(mp);
3226                 *errorp = EMSGSIZE;
3227                 return (NULL);
3228         }
3229         ixa->ixa_pktlen = pktlen;
3230 
3231         /* check/fix buffer config, setup pointers into it */
3232         iph = mp->b_rptr - alloclen;
3233         if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
3234                 mblk_t *mp1;
3235 
3236                 mp1 = allocb(alloclen + us->us_wroff_extra, BPRI_MED);
3237                 if (mp1 == NULL) {
3238                         freemsg(mp);


3296                         ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
3297                         ip6h->ip6_src = ipp->ipp_addr;
3298                 } else {
3299                         ip6h->ip6_src = *v6src;
3300                 }
3301                 ip6h->ip6_vcf =
3302                     (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
3303                     (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
3304                 if (ipp->ipp_fields & IPPF_TCLASS) {
3305                         /* Overrides the class part of flowinfo */
3306                         ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
3307                             ipp->ipp_tclass);
3308                 }
3309         }
3310 
3311         /* Insert all-0s SPI now. */
3312         if (insert_spi)
3313                 *((uint32_t *)(udpha + 1)) = 0;
3314 
3315         udpha->uha_dst_port = dstport;



3316         return (mp);
3317 }
3318 
3319 /*
3320  * Send a T_UDERR_IND in response to an M_DATA
3321  */
3322 static void
3323 udp_ud_err_connected(conn_t *connp, t_scalar_t error)
3324 {
3325         struct sockaddr_storage ss;
3326         sin_t           *sin;
3327         sin6_t          *sin6;
3328         struct sockaddr *addr;
3329         socklen_t       addrlen;
3330         mblk_t          *mp1;
3331 
3332         mutex_enter(&connp->conn_lock);
3333         /* Initialize addr and addrlen as if they're passed in */
3334         if (connp->conn_family == AF_INET) {
3335                 sin = (sin_t *)&ss;


5903                  */
5904                 udp->udp_state = TS_IDLE;
5905         }
5906         return (error);
5907 }
5908 
5909 int
5910 udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
5911     cred_t *cr)
5912 {
5913         sin6_t          *sin6;
5914         sin_t           *sin = NULL;
5915         uint_t          srcid;
5916         conn_t          *connp = (conn_t *)proto_handle;
5917         udp_t           *udp = connp->conn_udp;
5918         int             error = 0;
5919         udp_stack_t     *us = udp->udp_us;
5920         ushort_t        ipversion;
5921         pid_t           pid = curproc->p_pid;
5922         ip_xmit_attr_t  *ixa;
5923         boolean_t       snd_to_conn;
5924 
5925         ASSERT(DB_TYPE(mp) == M_DATA);
5926 
5927         /* All Solaris components should pass a cred for this operation. */
5928         ASSERT(cr != NULL);
5929 
5930         /* do an implicit bind if necessary */
5931         if (udp->udp_state == TS_UNBND) {
5932                 error = udp_implicit_bind(connp, cr);
5933                 /*
5934                  * We could be racing with an actual bind, in which case
5935                  * we would see EPROTO. We cross our fingers and try
5936                  * to connect.
5937                  */
5938                 if (!(error == 0 || error == EPROTO)) {
5939                         freemsg(mp);
5940                         return (error);
5941                 }
5942         }
5943 


5948                         return (EDESTADDRREQ);
5949                 }
5950                 if (msg->msg_controllen != 0) {
5951                         error = udp_output_ancillary(connp, NULL, NULL, mp,
5952                             NULL, msg, cr, pid);
5953                 } else {
5954                         error = udp_output_connected(connp, mp, cr, pid);
5955                 }
5956                 if (us->us_sendto_ignerr)
5957                         return (0);
5958                 else
5959                         return (error);
5960         }
5961 
5962         /*
5963          * Check if we're allowed to send to a connection on which we've
5964          * already called 'connect'. The posix spec. allows both behaviors but
5965          * historically we've returned an error if already connected. The
5966          * client can allow this via a sockopt.
5967          */
5968         mutex_enter(&connp->conn_lock);
5969         snd_to_conn = (udp->udp_snd_to_conn != 0);
5970         mutex_exit(&connp->conn_lock);
5971         if (udp->udp_state == TS_DATA_XFER && !snd_to_conn) {
5972                 UDPS_BUMP_MIB(us, udpOutErrors);
5973                 return (EISCONN);
5974         }
5975 
5976         error = proto_verify_ip_addr(connp->conn_family,
5977             (struct sockaddr *)msg->msg_name, msg->msg_namelen);
5978         if (error != 0) {
5979                 UDPS_BUMP_MIB(us, udpOutErrors);
5980                 return (error);
5981         }
5982         switch (connp->conn_family) {
5983         case AF_INET6:
5984                 sin6 = (sin6_t *)msg->msg_name;
5985 
5986                 srcid = sin6->__sin6_src_id;
5987 
5988                 if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
5989                         /*
5990                          * Destination is a non-IPv4-compatible IPv6 address.
5991                          * Send out an IPv6 format packet.