Print this page
918 Need better IP fanout (esp. with VLANs present)

*** 19,31 **** * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - */ - /* * Copyright 2011 Joyent, Inc. All rights reserved. */ #include <sys/types.h> #include <sys/callb.h> #include <sys/sdt.h> --- 19,30 ---- * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2011 Joyent, Inc. All rights reserved. + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. */ #include <sys/types.h> #include <sys/callb.h> #include <sys/sdt.h>
*** 528,543 **** UNDEF }; /* * In general we do port based hashing to spread traffic over different ! * softrings. The below tunable allows to override that behavior. Setting it ! * to B_TRUE allows to do a fanout based on src ipv6 address. This behavior ! * is also the applicable to ipv6 packets carrying multiple optional headers ! * and other uncommon packet types. */ boolean_t mac_src_ipv6_fanout = B_FALSE; /* * Pair of local and remote ports in the transport header */ #define PORTS_SIZE 4 --- 527,543 ---- UNDEF }; /* * In general we do port based hashing to spread traffic over different ! * softrings. The tunables below allow that behavior to be overridden. Setting ! * one (the IPv6 or the IPv4 tunable, respectively) to B_TRUE selects a fanout ! * based on the source IPv6 or IPv4 address instead. Source based fanout is ! * also applied to IPv6 packets carrying multiple optional headers and to ! * other uncommon packet types. */ boolean_t mac_src_ipv6_fanout = B_FALSE; + boolean_t mac_src_ipv4_fanout = B_FALSE; /* * Pair of local and remote ports in the transport header */ #define PORTS_SIZE 4
*** 758,907 **** headmp[type], tailmp[type], cnt[type], sz[type]); } } } ! int fanout_unalligned = 0; /* * mac_rx_srs_long_fanout * ! * The fanout routine for IPv6 */ static int mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx) { ip6_t *ip6h; uint8_t *whereptr; uint_t hash; uint16_t remlen; uint8_t nexthdr; uint16_t hdr_len; ! ! if (sap == ETHERTYPE_IPV6) { boolean_t modifiable = B_TRUE; ASSERT(MBLKL(mp) >= hdrsize); ip6h = (ip6_t *)(mp->b_rptr + hdrsize); ! if ((unsigned char *)ip6h == mp->b_wptr) { /* * The first mblk_t only includes the mac header. * Note that it is safe to change the mp pointer here, * as the subsequent operation does not assume mp * points to the start of the mac header. */ mp = mp->b_cont; /* ! * Make sure ip6h holds the full ip6_t structure. */ if (mp == NULL) return (-1); ! if (MBLKL(mp) < IPV6_HDR_LEN) { modifiable = (DB_REF(mp) == 1); ! if (modifiable && ! !pullupmsg(mp, IPV6_HDR_LEN)) { return (-1); } - } ip6h = (ip6_t *)mp->b_rptr; } if (!modifiable || !(OK_32PTR((char *)ip6h)) || ! ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) { /* ! * If either ip6h is not alligned, or ip6h does not ! * hold the complete ip6_t structure (a pullupmsg() ! * is not an option since it would result in an ! * unalligned ip6h), fanout to the default ring. Note ! * that this may cause packets reordering. */ *indx = 0; *type = OTH; ! fanout_unalligned++; return (0); } remlen = ntohs(ip6h->ip6_plen); nexthdr = ip6h->ip6_nxt; ! ! if (remlen < MIN_EHDR_LEN) ! return (-1); /* * Do src based fanout if below tunable is set to B_TRUE or * when mac_ip_hdr_length_v6() fails because of malformed ! * packets or because mblk's need to be concatenated using * pullupmsg(). */ if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(ip6h, mp->b_wptr, &hdr_len, &nexthdr, NULL)) { goto src_based_fanout; } whereptr = (uint8_t *)ip6h + hdr_len; ! 
/* If the transport is one of below, we do port based fanout */ switch (nexthdr) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_SCTP: case IPPROTO_ESP: /* ! * If the ports in the transport header is not part of * the mblk, do src_based_fanout, instead of calling * pullupmsg(). */ ! if (mp->b_cont != NULL && ! whereptr + PORTS_SIZE > mp->b_wptr) { goto src_based_fanout; } - break; - default: - break; - } switch (nexthdr) { case IPPROTO_TCP: ! hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), ! *(uint32_t *)whereptr); ! *indx = COMPUTE_INDEX(hash, ! mac_srs->srs_tcp_ring_count); *type = OTH; break; - case IPPROTO_UDP: case IPPROTO_SCTP: case IPPROTO_ESP: if (mac_fanout_type == MAC_FANOUT_DEFAULT) { ! hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), ! *(uint32_t *)whereptr); *indx = COMPUTE_INDEX(hash, mac_srs->srs_udp_ring_count); } else { ! *indx = mac_srs->srs_ind % ! mac_srs->srs_udp_ring_count; mac_srs->srs_ind++; } *type = OTH; break; - - /* For all other protocol, do source based fanout */ - default: - goto src_based_fanout; } - } else { - *indx = 0; - *type = OTH; - } return (0); src_based_fanout: ! hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0); *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); *type = OTH; return (0); } --- 758,929 ---- headmp[type], tailmp[type], cnt[type], sz[type]); } } } ! int fanout_unaligned = 0; /* * mac_rx_srs_long_fanout * ! * The fanout routine for VLANs, and for anything else that isn't performing ! * explicit dls bypass. Returns -1 on an error (drop the packet due to a ! * malformed packet), 0 on success, with values written in *indx and *type. */ static int mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx) { ip6_t *ip6h; + ipha_t *ipha; uint8_t *whereptr; uint_t hash; uint16_t remlen; uint8_t nexthdr; uint16_t hdr_len; ! 
uint32_t src_val; boolean_t modifiable = B_TRUE; + boolean_t v6; ASSERT(MBLKL(mp) >= hdrsize); + if (sap == ETHERTYPE_IPV6) { + v6 = B_TRUE; + hdr_len = IPV6_HDR_LEN; + } else if (sap == ETHERTYPE_IP) { + v6 = B_FALSE; + hdr_len = IP_SIMPLE_HDR_LENGTH; + } else { + *indx = 0; + *type = OTH; + return (0); + } + ip6h = (ip6_t *)(mp->b_rptr + hdrsize); ! ipha = (ipha_t *)ip6h; ! ! if ((uint8_t *)ip6h == mp->b_wptr) { /* * The first mblk_t only includes the mac header. * Note that it is safe to change the mp pointer here, * as the subsequent operation does not assume mp * points to the start of the mac header. */ mp = mp->b_cont; /* ! * Make sure the IP header points to an entire one. */ if (mp == NULL) return (-1); ! if (MBLKL(mp) < hdr_len) { modifiable = (DB_REF(mp) == 1); ! if (modifiable && !pullupmsg(mp, hdr_len)) return (-1); } ip6h = (ip6_t *)mp->b_rptr; + ipha = (ipha_t *)ip6h; } if (!modifiable || !(OK_32PTR((char *)ip6h)) || ! ((uint8_t *)ip6h + hdr_len > mp->b_wptr)) { /* ! * If either the IP header is not aligned, or it does not hold ! * the complete simple structure (a pullupmsg() is not an ! * option since it would result in an unaligned IP header), ! * fanout to the default ring. ! * ! * Note that this may cause packet reordering. */ *indx = 0; *type = OTH; ! fanout_unaligned++; return (0); } + /* + * Extract next-header, full header length, and source-hash value + * using v4/v6 specific fields. + */ + if (v6) { remlen = ntohs(ip6h->ip6_plen); nexthdr = ip6h->ip6_nxt; ! src_val = V4_PART_OF_V6(ip6h->ip6_src); /* * Do src based fanout if below tunable is set to B_TRUE or * when mac_ip_hdr_length_v6() fails because of malformed ! * packets or because mblks need to be concatenated using * pullupmsg(). 
*/ if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(ip6h, mp->b_wptr, &hdr_len, &nexthdr, NULL)) { goto src_based_fanout; } + } else { + hdr_len = IPH_HDR_LENGTH(ipha); + remlen = ntohs(ipha->ipha_length) - hdr_len; + nexthdr = ipha->ipha_protocol; + src_val = (uint32_t)ipha->ipha_src; + /* + * Catch IPv4 fragment case here. IPv6 has nexthdr == FRAG + * for its equivalent case. + */ + if (mac_src_ipv4_fanout || + (ntohs(ipha->ipha_fragment_offset_and_flags) & + (IPH_MF | IPH_OFFSET)) != 0) { + goto src_based_fanout; + } + } + if (remlen < MIN_EHDR_LEN) + return (-1); whereptr = (uint8_t *)ip6h + hdr_len; ! /* If the transport is one of below, we do port/SPI based fanout */ switch (nexthdr) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_SCTP: case IPPROTO_ESP: /* ! * If the ports or SPI in the transport header is not part of * the mblk, do src_based_fanout, instead of calling * pullupmsg(). */ ! if (mp->b_cont == NULL || whereptr + PORTS_SIZE <= mp->b_wptr) ! break; /* out of switch... */ ! /* FALLTHRU */ ! default: goto src_based_fanout; } switch (nexthdr) { case IPPROTO_TCP: ! hash = HASH_ADDR(src_val, *(uint32_t *)whereptr); ! *indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count); *type = OTH; break; case IPPROTO_UDP: case IPPROTO_SCTP: case IPPROTO_ESP: if (mac_fanout_type == MAC_FANOUT_DEFAULT) { ! hash = HASH_ADDR(src_val, *(uint32_t *)whereptr); *indx = COMPUTE_INDEX(hash, mac_srs->srs_udp_ring_count); } else { ! *indx = mac_srs->srs_ind % mac_srs->srs_udp_ring_count; mac_srs->srs_ind++; } *type = OTH; break; } return (0); src_based_fanout: ! hash = HASH_ADDR(src_val, (uint32_t)0); *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); *type = OTH; return (0); }