Print this page
918 Need better IP fanout (esp. with VLANs present)

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/io/mac/mac_sched.c
          +++ new/usr/src/uts/common/io/mac/mac_sched.c
↓ open down ↓ 13 lines elided ↑ open up ↑
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24      - */
  25      -/*
  26   24   * Copyright 2011 Joyent, Inc.  All rights reserved.
       25 + * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
  27   26   */
  28   27  
  29   28  #include <sys/types.h>
  30   29  #include <sys/callb.h>
  31   30  #include <sys/sdt.h>
  32   31  #include <sys/strsubr.h>
  33   32  #include <sys/strsun.h>
  34   33  #include <sys/vlan.h>
  35   34  #include <sys/stack.h>
  36   35  #include <sys/archsystm.h>
↓ open down ↓ 486 lines elided ↑ open up ↑
 523  522  /* fanout types for port based hashing */
 524  523  enum pkt_type {
 525  524          V4_TCP = 0,
 526  525          V4_UDP,
 527  526          OTH,
 528  527          UNDEF
 529  528  };
 530  529  
 531  530  /*
 532  531   * In general we do port based hashing to spread traffic over different
 533      - * softrings. The below tunable allows to override that behavior. Setting it
 534      - * to B_TRUE allows to do a fanout based on src ipv6 address. This behavior
 535      - * is also the applicable to ipv6 packets carrying multiple optional headers
 536      - * and other uncommon packet types.
      532 + * softrings. The tunables below override that behavior: setting one to
      533 + * B_TRUE selects a fanout based on the source IPv6 or IPv4 address,
      534 + * respectively. Source-based fanout is also used for IPv6 packets
      535 + * carrying multiple optional headers and other uncommon packet types.
 537  536   */
 538  537  boolean_t mac_src_ipv6_fanout = B_FALSE;
      538 +boolean_t mac_src_ipv4_fanout = B_FALSE;
 539  539  
 540  540  /*
 541  541   * Pair of local and remote ports in the transport header
 542  542   */
 543  543  #define PORTS_SIZE 4
 544  544  
 545  545  /*
 546  546   * mac_rx_srs_proto_fanout
 547  547   *
 548  548   * This routine delivers packets destined to an SRS into one of the
↓ open down ↓ 204 lines elided ↑ open up ↑
 753  753                                  break;
 754  754                          case OTH:
 755  755                                  softring = mac_srs->srs_oth_soft_rings[0];
 756  756                          }
 757  757                          mac_rx_soft_ring_process(mcip, softring,
 758  758                              headmp[type], tailmp[type], cnt[type], sz[type]);
 759  759                  }
 760  760          }
 761  761  }
 762  762  
 763      -int     fanout_unalligned = 0;
      763 +int     fanout_unaligned = 0;
 764  764  
 765  765  /*
 766  766   * mac_rx_srs_long_fanout
 767  767   *
 768      - * The fanout routine for IPv6
      768 + * The fanout routine for VLANs, and for anything else that isn't performing
      769 + * explicit dls bypass.  Returns -1 on an error (drop the packet due to a
      770 + * malformed packet), 0 on success, with values written in *indx and *type.
 769  771   */
 770  772  static int
 771  773  mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
 772  774      uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
 773  775  {
 774  776          ip6_t           *ip6h;
      777 +        ipha_t          *ipha;
 775  778          uint8_t         *whereptr;
 776  779          uint_t          hash;
 777  780          uint16_t        remlen;
 778  781          uint8_t         nexthdr;
 779  782          uint16_t        hdr_len;
      783 +        uint32_t        src_val;
      784 +        boolean_t       modifiable = B_TRUE;
      785 +        boolean_t       v6;
 780  786  
      787 +        ASSERT(MBLKL(mp) >= hdrsize);
      788 +
 781  789          if (sap == ETHERTYPE_IPV6) {
 782      -                boolean_t       modifiable = B_TRUE;
      790 +                v6 = B_TRUE;
      791 +                hdr_len = IPV6_HDR_LEN;
      792 +        } else if (sap == ETHERTYPE_IP) {
      793 +                v6 = B_FALSE;
      794 +                hdr_len = IP_SIMPLE_HDR_LENGTH;
      795 +        } else {
      796 +                *indx = 0;
      797 +                *type = OTH;
      798 +                return (0);
      799 +        }
 783  800  
 784      -                ASSERT(MBLKL(mp) >= hdrsize);
      801 +        ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
      802 +        ipha = (ipha_t *)ip6h;
 785  803  
 786      -                ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
 787      -                if ((unsigned char *)ip6h == mp->b_wptr) {
 788      -                        /*
 789      -                         * The first mblk_t only includes the mac header.
 790      -                         * Note that it is safe to change the mp pointer here,
 791      -                         * as the subsequent operation does not assume mp
 792      -                         * points to the start of the mac header.
 793      -                         */
 794      -                        mp = mp->b_cont;
      804 +        if ((uint8_t *)ip6h == mp->b_wptr) {
      805 +                /*
      806 +                 * The first mblk_t only includes the mac header.
      807 +                 * Note that it is safe to change the mp pointer here,
      808 +                 * as the subsequent operation does not assume mp
      809 +                 * points to the start of the mac header.
      810 +                 */
      811 +                mp = mp->b_cont;
 795  812  
 796      -                        /*
 797      -                         * Make sure ip6h holds the full ip6_t structure.
 798      -                         */
 799      -                        if (mp == NULL)
 800      -                                return (-1);
      813 +                /*
      814 +                 * Make sure this mblk holds a complete basic IP header.
      815 +                 */
      816 +                if (mp == NULL)
      817 +                        return (-1);
 801  818  
 802      -                        if (MBLKL(mp) < IPV6_HDR_LEN) {
 803      -                                modifiable = (DB_REF(mp) == 1);
      819 +                if (MBLKL(mp) < hdr_len) {
      820 +                        modifiable = (DB_REF(mp) == 1);
 804  821  
 805      -                                if (modifiable &&
 806      -                                    !pullupmsg(mp, IPV6_HDR_LEN)) {
 807      -                                        return (-1);
 808      -                                }
 809      -                        }
 810      -
 811      -                        ip6h = (ip6_t *)mp->b_rptr;
      822 +                        if (modifiable && !pullupmsg(mp, hdr_len))
      823 +                                return (-1);
 812  824                  }
 813  825  
 814      -                if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
 815      -                    ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) {
 816      -                        /*
 817      -                         * If either ip6h is not alligned, or ip6h does not
 818      -                         * hold the complete ip6_t structure (a pullupmsg()
 819      -                         * is not an option since it would result in an
 820      -                         * unalligned ip6h), fanout to the default ring. Note
 821      -                         * that this may cause packets reordering.
 822      -                         */
 823      -                        *indx = 0;
 824      -                        *type = OTH;
 825      -                        fanout_unalligned++;
 826      -                        return (0);
 827      -                }
      826 +                ip6h = (ip6_t *)mp->b_rptr;
      827 +                ipha = (ipha_t *)ip6h;
      828 +        }
 828  829  
      830 +        if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
      831 +            ((uint8_t *)ip6h + hdr_len > mp->b_wptr)) {
      832 +                /*
      833 +                 * If either the IP header is not aligned, or it does not hold
      834 +                 * the complete simple structure (a pullupmsg() is not an
      835 +                 * option since it would result in an unaligned IP header),
      836 +                 * fanout to the default ring.
      837 +                 *
      838 +                 * Note that this may cause packet reordering.
      839 +                 */
      840 +                *indx = 0;
      841 +                *type = OTH;
      842 +                fanout_unaligned++;
      843 +                return (0);
      844 +        }
      845 +
      846 +        /*
      847 +         * Extract next-header, full header length, and source-hash value
      848 +         * using v4/v6 specific fields.
      849 +         */
      850 +        if (v6) {
 829  851                  remlen = ntohs(ip6h->ip6_plen);
 830  852                  nexthdr = ip6h->ip6_nxt;
 831      -
 832      -                if (remlen < MIN_EHDR_LEN)
 833      -                        return (-1);
      853 +                src_val = V4_PART_OF_V6(ip6h->ip6_src);
 834  854                  /*
 835  855                   * Do src based fanout if below tunable is set to B_TRUE or
 836  856                   * when mac_ip_hdr_length_v6() fails because of malformed
 837      -                 * packets or because mblk's need to be concatenated using
      857 +                 * packets or because mblks need to be concatenated using
 838  858                   * pullupmsg().
 839  859                   */
 840  860                  if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(ip6h,
 841  861                      mp->b_wptr, &hdr_len, &nexthdr, NULL)) {
 842  862                          goto src_based_fanout;
 843  863                  }
 844      -                whereptr = (uint8_t *)ip6h + hdr_len;
 845      -
 846      -                /* If the transport is one of below, we do port based fanout */
 847      -                switch (nexthdr) {
 848      -                case IPPROTO_TCP:
 849      -                case IPPROTO_UDP:
 850      -                case IPPROTO_SCTP:
 851      -                case IPPROTO_ESP:
 852      -                        /*
 853      -                         * If the ports in the transport header is not part of
 854      -                         * the mblk, do src_based_fanout, instead of calling
 855      -                         * pullupmsg().
 856      -                         */
 857      -                        if (mp->b_cont != NULL &&
 858      -                            whereptr + PORTS_SIZE > mp->b_wptr) {
 859      -                                goto src_based_fanout;
 860      -                        }
 861      -                        break;
 862      -                default:
 863      -                        break;
      864 +        } else {
      865 +                hdr_len = IPH_HDR_LENGTH(ipha);
      866 +                remlen = ntohs(ipha->ipha_length) - hdr_len;
      867 +                nexthdr = ipha->ipha_protocol;
      868 +                src_val = (uint32_t)ipha->ipha_src;
      869 +                /*
      870 +                 * Catch IPv4 fragment case here.  IPv6 has nexthdr == FRAG
      871 +                 * for its equivalent case.
      872 +                 */
      873 +                if (mac_src_ipv4_fanout ||
      874 +                    (ntohs(ipha->ipha_fragment_offset_and_flags) &
      875 +                    (IPH_MF | IPH_OFFSET)) != 0) {
      876 +                        goto src_based_fanout;
 864  877                  }
      878 +        }
      879 +        if (remlen < MIN_EHDR_LEN)
      880 +                return (-1);
      881 +        whereptr = (uint8_t *)ip6h + hdr_len;
 865  882  
 866      -                switch (nexthdr) {
 867      -                case IPPROTO_TCP:
 868      -                        hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
 869      -                            *(uint32_t *)whereptr);
 870      -                        *indx = COMPUTE_INDEX(hash,
 871      -                            mac_srs->srs_tcp_ring_count);
 872      -                        *type = OTH;
 873      -                        break;
      883 +        /* If the transport is one of below, we do port/SPI based fanout */
      884 +        switch (nexthdr) {
      885 +        case IPPROTO_TCP:
      886 +        case IPPROTO_UDP:
      887 +        case IPPROTO_SCTP:
      888 +        case IPPROTO_ESP:
      889 +                /*
      890 +                 * If the ports or SPI in the transport header is not part of
      891 +                 * the mblk, do src_based_fanout, instead of calling
      892 +                 * pullupmsg().
      893 +                 */
      894 +                if (mp->b_cont == NULL || whereptr + PORTS_SIZE <= mp->b_wptr)
      895 +                        break;  /* out of switch... */
      896 +                /* FALLTHRU */
      897 +        default:
      898 +                goto src_based_fanout;
      899 +        }
 874  900  
 875      -                case IPPROTO_UDP:
 876      -                case IPPROTO_SCTP:
 877      -                case IPPROTO_ESP:
 878      -                        if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
 879      -                                hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
 880      -                                    *(uint32_t *)whereptr);
 881      -                                *indx = COMPUTE_INDEX(hash,
 882      -                                    mac_srs->srs_udp_ring_count);
 883      -                        } else {
 884      -                                *indx = mac_srs->srs_ind %
 885      -                                    mac_srs->srs_udp_ring_count;
 886      -                                mac_srs->srs_ind++;
 887      -                        }
 888      -                        *type = OTH;
 889      -                        break;
 890      -
 891      -                        /* For all other protocol, do source based fanout */
 892      -                default:
 893      -                        goto src_based_fanout;
      901 +        switch (nexthdr) {
      902 +        case IPPROTO_TCP:
      903 +                hash = HASH_ADDR(src_val, *(uint32_t *)whereptr);
      904 +                *indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
      905 +                *type = OTH;
      906 +                break;
      907 +        case IPPROTO_UDP:
      908 +        case IPPROTO_SCTP:
      909 +        case IPPROTO_ESP:
      910 +                if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
      911 +                        hash = HASH_ADDR(src_val, *(uint32_t *)whereptr);
      912 +                        *indx = COMPUTE_INDEX(hash,
      913 +                            mac_srs->srs_udp_ring_count);
      914 +                } else {
      915 +                        *indx = mac_srs->srs_ind % mac_srs->srs_udp_ring_count;
      916 +                        mac_srs->srs_ind++;
 894  917                  }
 895      -        } else {
 896      -                *indx = 0;
 897  918                  *type = OTH;
      919 +                break;
 898  920          }
 899  921          return (0);
 900  922  
 901  923  src_based_fanout:
 902      -        hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0);
      924 +        hash = HASH_ADDR(src_val, (uint32_t)0);
 903  925          *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
 904  926          *type = OTH;
 905  927          return (0);
 906  928  }
 907  929  
 908  930  /*
 909  931   * mac_rx_srs_fanout
 910  932   *
 911  933   * This routine delivers packets destined to an SRS into a soft ring member
 912  934   * of the set.
↓ open down ↓ 3123 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX