Print this page
918 Need better IP fanout (esp. with VLANs present)
@@ -19,13 +19,12 @@
* CDDL HEADER END
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- */
-/*
* Copyright 2011 Joyent, Inc. All rights reserved.
+ * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/types.h>
#include <sys/callb.h>
#include <sys/sdt.h>
@@ -528,16 +527,17 @@
UNDEF
};
/*
* In general we do port based hashing to spread traffic over different
- * softrings. The below tunable allows to override that behavior. Setting it
- * to B_TRUE allows to do a fanout based on src ipv6 address. This behavior
- * is also the applicable to ipv6 packets carrying multiple optional headers
- * and other uncommon packet types.
+ * softrings. The tunables below allow that behavior to be overridden. Setting
+ * one (depending on IPv6 or IPv4) to B_TRUE causes fanout to be based on the
+ * source IPv6 or IPv4 address. This behavior also applies to IPv6 packets
+ * carrying multiple optional headers and other uncommon packet types.
*/
boolean_t mac_src_ipv6_fanout = B_FALSE;
+boolean_t mac_src_ipv4_fanout = B_FALSE;
/*
* Pair of local and remote ports in the transport header
*/
#define PORTS_SIZE 4
@@ -758,150 +758,172 @@
headmp[type], tailmp[type], cnt[type], sz[type]);
}
}
}
-int fanout_unalligned = 0;
+int fanout_unaligned = 0;
/*
* mac_rx_srs_long_fanout
*
- * The fanout routine for IPv6
+ * The fanout routine for VLANs, and for anything else that isn't performing
+ * explicit dls bypass. Returns -1 on an error (drop the packet due to a
+ * malformed packet), 0 on success, with values written in *indx and *type.
*/
static int
mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
{
ip6_t *ip6h;
+ ipha_t *ipha;
uint8_t *whereptr;
uint_t hash;
uint16_t remlen;
uint8_t nexthdr;
uint16_t hdr_len;
-
- if (sap == ETHERTYPE_IPV6) {
+ uint32_t src_val;
boolean_t modifiable = B_TRUE;
+ boolean_t v6;
ASSERT(MBLKL(mp) >= hdrsize);
+ if (sap == ETHERTYPE_IPV6) {
+ v6 = B_TRUE;
+ hdr_len = IPV6_HDR_LEN;
+ } else if (sap == ETHERTYPE_IP) {
+ v6 = B_FALSE;
+ hdr_len = IP_SIMPLE_HDR_LENGTH;
+ } else {
+ *indx = 0;
+ *type = OTH;
+ return (0);
+ }
+
ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
- if ((unsigned char *)ip6h == mp->b_wptr) {
+ ipha = (ipha_t *)ip6h;
+
+ if ((uint8_t *)ip6h == mp->b_wptr) {
/*
* The first mblk_t only includes the mac header.
* Note that it is safe to change the mp pointer here,
* as the subsequent operation does not assume mp
* points to the start of the mac header.
*/
mp = mp->b_cont;
/*
- * Make sure ip6h holds the full ip6_t structure.
+ * Make sure this mblk holds an entire basic IP header.
*/
if (mp == NULL)
return (-1);
- if (MBLKL(mp) < IPV6_HDR_LEN) {
+ if (MBLKL(mp) < hdr_len) {
modifiable = (DB_REF(mp) == 1);
- if (modifiable &&
- !pullupmsg(mp, IPV6_HDR_LEN)) {
+ if (modifiable && !pullupmsg(mp, hdr_len))
return (-1);
}
- }
ip6h = (ip6_t *)mp->b_rptr;
+ ipha = (ipha_t *)ip6h;
}
if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
- ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) {
+ ((uint8_t *)ip6h + hdr_len > mp->b_wptr)) {
/*
- * If either ip6h is not alligned, or ip6h does not
- * hold the complete ip6_t structure (a pullupmsg()
- * is not an option since it would result in an
- * unalligned ip6h), fanout to the default ring. Note
- * that this may cause packets reordering.
+ * If the IP header is not aligned, or the mblk does not hold
+ * the complete simple header (a pullupmsg() is not an
+ * option since it would result in an unaligned IP header),
+ * fanout to the default ring.
+ *
+ * Note that this may cause packet reordering.
*/
*indx = 0;
*type = OTH;
- fanout_unalligned++;
+ fanout_unaligned++;
return (0);
}
+ /*
+ * Extract next-header, full header length, and source-hash value
+ * using v4/v6 specific fields.
+ */
+ if (v6) {
remlen = ntohs(ip6h->ip6_plen);
nexthdr = ip6h->ip6_nxt;
-
- if (remlen < MIN_EHDR_LEN)
- return (-1);
+ src_val = V4_PART_OF_V6(ip6h->ip6_src);
/*
* Do src based fanout if below tunable is set to B_TRUE or
* when mac_ip_hdr_length_v6() fails because of malformed
- * packets or because mblk's need to be concatenated using
+ * packets or because mblks need to be concatenated using
* pullupmsg().
*/
if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(ip6h,
mp->b_wptr, &hdr_len, &nexthdr, NULL)) {
goto src_based_fanout;
}
+ } else {
+ hdr_len = IPH_HDR_LENGTH(ipha);
+ remlen = ntohs(ipha->ipha_length) - hdr_len;
+ nexthdr = ipha->ipha_protocol;
+ src_val = (uint32_t)ipha->ipha_src;
+ /*
+ * Catch IPv4 fragment case here. IPv6 has nexthdr == FRAG
+ * for its equivalent case.
+ */
+ if (mac_src_ipv4_fanout ||
+ (ntohs(ipha->ipha_fragment_offset_and_flags) &
+ (IPH_MF | IPH_OFFSET)) != 0) {
+ goto src_based_fanout;
+ }
+ }
+ if (remlen < MIN_EHDR_LEN)
+ return (-1);
whereptr = (uint8_t *)ip6h + hdr_len;
- /* If the transport is one of below, we do port based fanout */
+ /* If the transport is one of those below, do port/SPI based fanout */
switch (nexthdr) {
case IPPROTO_TCP:
case IPPROTO_UDP:
case IPPROTO_SCTP:
case IPPROTO_ESP:
/*
- * If the ports in the transport header is not part of
+ * If the ports or SPI in the transport header are not part of
* the mblk, do src_based_fanout, instead of calling
* pullupmsg().
*/
- if (mp->b_cont != NULL &&
- whereptr + PORTS_SIZE > mp->b_wptr) {
+ if (mp->b_cont == NULL || whereptr + PORTS_SIZE <= mp->b_wptr)
+ break; /* out of switch... */
+ /* FALLTHRU */
+ default:
goto src_based_fanout;
}
- break;
- default:
- break;
- }
switch (nexthdr) {
case IPPROTO_TCP:
- hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
- *(uint32_t *)whereptr);
- *indx = COMPUTE_INDEX(hash,
- mac_srs->srs_tcp_ring_count);
+ hash = HASH_ADDR(src_val, *(uint32_t *)whereptr);
+ *indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
*type = OTH;
break;
-
case IPPROTO_UDP:
case IPPROTO_SCTP:
case IPPROTO_ESP:
if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
- hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
- *(uint32_t *)whereptr);
+ hash = HASH_ADDR(src_val, *(uint32_t *)whereptr);
*indx = COMPUTE_INDEX(hash,
mac_srs->srs_udp_ring_count);
} else {
- *indx = mac_srs->srs_ind %
- mac_srs->srs_udp_ring_count;
+ *indx = mac_srs->srs_ind % mac_srs->srs_udp_ring_count;
mac_srs->srs_ind++;
}
*type = OTH;
break;
-
- /* For all other protocol, do source based fanout */
- default:
- goto src_based_fanout;
}
- } else {
- *indx = 0;
- *type = OTH;
- }
return (0);
src_based_fanout:
- hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0);
+ hash = HASH_ADDR(src_val, (uint32_t)0);
*indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
*type = OTH;
return (0);
}