Print this page
918 Need better IP fanout (esp. with VLANs present)
*** 19,31 ****
* CDDL HEADER END
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- */
- /*
* Copyright 2011 Joyent, Inc. All rights reserved.
*/
#include <sys/types.h>
#include <sys/callb.h>
#include <sys/sdt.h>
--- 19,30 ----
* CDDL HEADER END
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright 2011 Joyent, Inc. All rights reserved.
+ * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/types.h>
#include <sys/callb.h>
#include <sys/sdt.h>
*** 528,543 ****
UNDEF
};
/*
* In general we do port based hashing to spread traffic over different
! * softrings. The below tunable allows to override that behavior. Setting it
! * to B_TRUE allows to do a fanout based on src ipv6 address. This behavior
! * is also the applicable to ipv6 packets carrying multiple optional headers
! * and other uncommon packet types.
*/
boolean_t mac_src_ipv6_fanout = B_FALSE;
/*
* Pair of local and remote ports in the transport header
*/
#define PORTS_SIZE 4
--- 527,543 ----
UNDEF
};
/*
* In general we do port based hashing to spread traffic over different
! * softrings. The tunables below override that behavior. Setting one of
! * them (one each for IPv6 and IPv4) to B_TRUE selects a fanout based on the
! * src IPv6 or IPv4 address. This behavior is also applicable to IPv6 packets
! * carrying multiple optional headers and other uncommon packet types.
*/
boolean_t mac_src_ipv6_fanout = B_FALSE;
+ boolean_t mac_src_ipv4_fanout = B_FALSE;
/*
* Pair of local and remote ports in the transport header
*/
#define PORTS_SIZE 4
*** 758,907 ****
headmp[type], tailmp[type], cnt[type], sz[type]);
}
}
}
! int fanout_unalligned = 0;
/*
* mac_rx_srs_long_fanout
*
! * The fanout routine for IPv6
*/
static int
mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
{
ip6_t *ip6h;
uint8_t *whereptr;
uint_t hash;
uint16_t remlen;
uint8_t nexthdr;
uint16_t hdr_len;
!
! if (sap == ETHERTYPE_IPV6) {
boolean_t modifiable = B_TRUE;
ASSERT(MBLKL(mp) >= hdrsize);
ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
! if ((unsigned char *)ip6h == mp->b_wptr) {
/*
* The first mblk_t only includes the mac header.
* Note that it is safe to change the mp pointer here,
* as the subsequent operation does not assume mp
* points to the start of the mac header.
*/
mp = mp->b_cont;
/*
! * Make sure ip6h holds the full ip6_t structure.
*/
if (mp == NULL)
return (-1);
! if (MBLKL(mp) < IPV6_HDR_LEN) {
modifiable = (DB_REF(mp) == 1);
! if (modifiable &&
! !pullupmsg(mp, IPV6_HDR_LEN)) {
return (-1);
}
- }
ip6h = (ip6_t *)mp->b_rptr;
}
if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
! ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) {
/*
! * If either ip6h is not alligned, or ip6h does not
! * hold the complete ip6_t structure (a pullupmsg()
! * is not an option since it would result in an
! * unalligned ip6h), fanout to the default ring. Note
! * that this may cause packets reordering.
*/
*indx = 0;
*type = OTH;
! fanout_unalligned++;
return (0);
}
remlen = ntohs(ip6h->ip6_plen);
nexthdr = ip6h->ip6_nxt;
!
! if (remlen < MIN_EHDR_LEN)
! return (-1);
/*
* Do src based fanout if below tunable is set to B_TRUE or
* when mac_ip_hdr_length_v6() fails because of malformed
! * packets or because mblk's need to be concatenated using
* pullupmsg().
*/
if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(ip6h,
mp->b_wptr, &hdr_len, &nexthdr, NULL)) {
goto src_based_fanout;
}
whereptr = (uint8_t *)ip6h + hdr_len;
! /* If the transport is one of below, we do port based fanout */
switch (nexthdr) {
case IPPROTO_TCP:
case IPPROTO_UDP:
case IPPROTO_SCTP:
case IPPROTO_ESP:
/*
! * If the ports in the transport header is not part of
* the mblk, do src_based_fanout, instead of calling
* pullupmsg().
*/
! if (mp->b_cont != NULL &&
! whereptr + PORTS_SIZE > mp->b_wptr) {
goto src_based_fanout;
}
- break;
- default:
- break;
- }
switch (nexthdr) {
case IPPROTO_TCP:
! hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
! *(uint32_t *)whereptr);
! *indx = COMPUTE_INDEX(hash,
! mac_srs->srs_tcp_ring_count);
*type = OTH;
break;
-
case IPPROTO_UDP:
case IPPROTO_SCTP:
case IPPROTO_ESP:
if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
! hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
! *(uint32_t *)whereptr);
*indx = COMPUTE_INDEX(hash,
mac_srs->srs_udp_ring_count);
} else {
! *indx = mac_srs->srs_ind %
! mac_srs->srs_udp_ring_count;
mac_srs->srs_ind++;
}
*type = OTH;
break;
-
- /* For all other protocol, do source based fanout */
- default:
- goto src_based_fanout;
}
- } else {
- *indx = 0;
- *type = OTH;
- }
return (0);
src_based_fanout:
! hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0);
*indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
*type = OTH;
return (0);
}
--- 758,929 ----
headmp[type], tailmp[type], cnt[type], sz[type]);
}
}
}
! int fanout_unaligned = 0;
/*
* mac_rx_srs_long_fanout
*
! * The fanout routine for VLANs, and for anything else that isn't performing
! * explicit dls bypass. Returns -1 on an error (drop the packet due to a
! * malformed packet), 0 on success, with values written in *indx and *type.
*/
static int
mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
{
ip6_t *ip6h;
+ ipha_t *ipha;
uint8_t *whereptr;
uint_t hash;
uint16_t remlen;
uint8_t nexthdr;
uint16_t hdr_len;
! uint32_t src_val;
boolean_t modifiable = B_TRUE;
+ boolean_t v6;
ASSERT(MBLKL(mp) >= hdrsize);
+ if (sap == ETHERTYPE_IPV6) {
+ v6 = B_TRUE;
+ hdr_len = IPV6_HDR_LEN;
+ } else if (sap == ETHERTYPE_IP) {
+ v6 = B_FALSE;
+ hdr_len = IP_SIMPLE_HDR_LENGTH;
+ } else {
+ *indx = 0;
+ *type = OTH;
+ return (0);
+ }
+
ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
! ipha = (ipha_t *)ip6h;
!
! if ((uint8_t *)ip6h == mp->b_wptr) {
/*
* The first mblk_t only includes the mac header.
* Note that it is safe to change the mp pointer here,
* as the subsequent operation does not assume mp
* points to the start of the mac header.
*/
mp = mp->b_cont;
/*
! * Make sure this mblk holds an entire (simple) IP header.
*/
if (mp == NULL)
return (-1);
! if (MBLKL(mp) < hdr_len) {
modifiable = (DB_REF(mp) == 1);
! if (modifiable && !pullupmsg(mp, hdr_len))
return (-1);
}
ip6h = (ip6_t *)mp->b_rptr;
+ ipha = (ipha_t *)ip6h;
}
if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
! ((uint8_t *)ip6h + hdr_len > mp->b_wptr)) {
/*
! * If the IP header is not aligned, or the mblk does not hold
! * the complete simple IP header (a pullupmsg() is not an
! * option since it would result in an unaligned IP header),
! * fanout to the default ring.
! *
! * Note that this may cause packet reordering.
*/
*indx = 0;
*type = OTH;
! fanout_unaligned++;
return (0);
}
+ /*
+ * Extract next-header, full header length, and source-hash value
+ * using v4/v6 specific fields.
+ */
+ if (v6) {
remlen = ntohs(ip6h->ip6_plen);
nexthdr = ip6h->ip6_nxt;
! src_val = V4_PART_OF_V6(ip6h->ip6_src);
/*
* Do src based fanout if below tunable is set to B_TRUE or
* when mac_ip_hdr_length_v6() fails because of malformed
! * packets or because mblks need to be concatenated using
* pullupmsg().
*/
if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(ip6h,
mp->b_wptr, &hdr_len, &nexthdr, NULL)) {
goto src_based_fanout;
}
+ } else {
+ hdr_len = IPH_HDR_LENGTH(ipha);
+ remlen = ntohs(ipha->ipha_length) - hdr_len;
+ nexthdr = ipha->ipha_protocol;
+ src_val = (uint32_t)ipha->ipha_src;
+ /*
+ * Catch IPv4 fragment case here. IPv6 has nexthdr == FRAG
+ * for its equivalent case.
+ */
+ if (mac_src_ipv4_fanout ||
+ (ntohs(ipha->ipha_fragment_offset_and_flags) &
+ (IPH_MF | IPH_OFFSET)) != 0) {
+ goto src_based_fanout;
+ }
+ }
+ if (remlen < MIN_EHDR_LEN)
+ return (-1);
whereptr = (uint8_t *)ip6h + hdr_len;
! /* If the transport is one of below, we do port/SPI based fanout */
switch (nexthdr) {
case IPPROTO_TCP:
case IPPROTO_UDP:
case IPPROTO_SCTP:
case IPPROTO_ESP:
/*
! * If the ports or SPI in the transport header is not part of
* the mblk, do src_based_fanout, instead of calling
* pullupmsg().
*/
! if (mp->b_cont == NULL || whereptr + PORTS_SIZE <= mp->b_wptr)
! break; /* out of switch... */
! /* FALLTHRU */
! default:
goto src_based_fanout;
}
switch (nexthdr) {
case IPPROTO_TCP:
! hash = HASH_ADDR(src_val, *(uint32_t *)whereptr);
! *indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
*type = OTH;
break;
case IPPROTO_UDP:
case IPPROTO_SCTP:
case IPPROTO_ESP:
if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
! hash = HASH_ADDR(src_val, *(uint32_t *)whereptr);
*indx = COMPUTE_INDEX(hash,
mac_srs->srs_udp_ring_count);
} else {
! *indx = mac_srs->srs_ind % mac_srs->srs_udp_ring_count;
mac_srs->srs_ind++;
}
*type = OTH;
break;
}
return (0);
src_based_fanout:
! hash = HASH_ADDR(src_val, (uint32_t)0);
*indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
*type = OTH;
return (0);
}