1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  * IEEE 802.3ad Link Aggregation - Send code.
  28  *
  29  * Implements the Distributor function.
  30  */
  31 
  32 #include <sys/conf.h>
  33 #include <sys/modctl.h>
  34 #include <sys/sunddi.h>
  35 #include <sys/callb.h>
  36 #include <sys/vlan.h>
  37 #include <sys/strsun.h>
  38 #include <sys/strsubr.h>
  39 #include <sys/dlpi.h>
  40 
  41 #include <inet/common.h>
  42 #include <inet/led.h>
  43 #include <inet/ip.h>
  44 #include <inet/ip6.h>
  45 #include <inet/tcp.h>
  46 #include <netinet/udp.h>
  47 
  48 #include <sys/aggr.h>
  49 #include <sys/aggr_impl.h>
  50 
  51 /*
  52  * Update the TX load balancing policy of the specified group.
  53  */
  54 void
  55 aggr_send_update_policy(aggr_grp_t *grp, uint32_t policy)
  56 {
  57         uint8_t mac_policy = 0;
  58 
  59         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
  60 
  61         if ((policy & AGGR_POLICY_L2) != 0)
  62                 mac_policy |= MAC_PKT_HASH_L2;
  63         if ((policy & AGGR_POLICY_L3) != 0)
  64                 mac_policy |= MAC_PKT_HASH_L3;
  65         if ((policy & AGGR_POLICY_L4) != 0)
  66                 mac_policy |= MAC_PKT_HASH_L4;
  67 
  68         grp->lg_tx_policy = policy;
  69         grp->lg_mac_tx_policy = mac_policy;
  70 }
  71 
  72 #define HASH_HINT(hint) \
  73         ((hint) ^ ((hint) >> 24) ^ ((hint) >> 16) ^ ((hint) >> 8))
  74 
  75 /*
  76  * Function invoked by mac layer to find a specific TX ring on a port
  77  * to send data.
  78  */
  79 mblk_t *
  80 aggr_find_tx_ring(void *arg, mblk_t *mp, uintptr_t hint, mac_ring_handle_t *rh)
  81 {
  82         aggr_grp_t *grp = arg;
  83         aggr_port_t *port;
  84         uint64_t hash;
  85 
  86         rw_enter(&grp->lg_tx_lock, RW_READER);
  87         if (grp->lg_ntx_ports == 0) {
  88                 /*
  89                  * We could have returned from aggr_m_start() before
  90                  * the ports were actually attached. Drop the chain.
  91                  */
  92                 rw_exit(&grp->lg_tx_lock);
  93                 freemsgchain(mp);
  94                 return (NULL);
  95         }
  96         hash = mac_pkt_hash(DL_ETHER, mp, grp->lg_mac_tx_policy, B_TRUE);
  97         port = grp->lg_tx_ports[hash % grp->lg_ntx_ports];
  98 
  99         /*
 100          * Use hash as the hint so to direct traffic to
 101          * different TX rings. Note below bit operation
 102          * is needed in case hint is 0 to get the most
 103          * benefit from HASH_HINT() algorithm.
 104          */
 105         if (port->lp_tx_ring_cnt > 1) {
 106                 if (hint == 0) {
 107                         hash = (hash << 24 | hash << 16 | hash);
 108                         hash = (hash << 32 | hash);
 109                 } else {
 110                         hash = hint;
 111                 }
 112                 hash = HASH_HINT(hash);
 113                 *rh = port->lp_pseudo_tx_rings[hash % port->lp_tx_ring_cnt];
 114         } else {
 115                 *rh = port->lp_pseudo_tx_rings[0];
 116         }
 117         rw_exit(&grp->lg_tx_lock);
 118 
 119         return (mp);
 120 }
 121 
 122 /*
 123  * aggr_tx_notify_thread:
 124  *
 125  * aggr_tx_ring_update() callback function wakes up this thread when
 126  * it gets called. This thread will call mac_tx_ring_update() to
 127  * notify upper mac of flow control getting relieved. Note that
 128  * aggr_tx_ring_update() cannot call mac_tx_ring_update() directly
 129  * because aggr_tx_ring_update() is called from lower mac with
 130  * mi_rw_lock held.
 131  */
 132 void
 133 aggr_tx_notify_thread(void *arg)
 134 {
 135         callb_cpr_t     cprinfo;
 136         aggr_grp_t      *grp = (aggr_grp_t *)arg;
 137         mac_ring_handle_t       pseudo_mrh;
 138 
 139         CALLB_CPR_INIT(&cprinfo, &grp->lg_tx_flowctl_lock, callb_generic_cpr,
 140             "aggr_tx_notify_thread");
 141 
 142         mutex_enter(&grp->lg_tx_flowctl_lock);
 143         while (!grp->lg_tx_notify_done) {
 144                 if ((grp->lg_tx_blocked_cnt) == 0) {
 145                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
 146                         cv_wait(&grp->lg_tx_flowctl_cv,
 147                             &grp->lg_tx_flowctl_lock);
 148                         CALLB_CPR_SAFE_END(&cprinfo, &grp->lg_tx_flowctl_lock);
 149                         continue;
 150                 }
 151                 while (grp->lg_tx_blocked_cnt != 0) {
 152                         grp->lg_tx_blocked_cnt--;
 153                         pseudo_mrh =
 154                             grp->lg_tx_blocked_rings[grp->lg_tx_blocked_cnt];
 155                         mutex_exit(&grp->lg_tx_flowctl_lock);
 156                         mac_tx_ring_update(grp->lg_mh, pseudo_mrh);
 157                         mutex_enter(&grp->lg_tx_flowctl_lock);
 158                 }
 159         }
 160         /*
 161          * The grp is being destroyed, exit the thread.
 162          */
 163         grp->lg_tx_notify_thread = NULL;
 164         CALLB_CPR_EXIT(&cprinfo);
 165         thread_exit();
 166 }
 167 
 168 /*
 169  * Callback function registered with lower mac to receive wakeups from
 170  * drivers when flow control is relieved (i.e. Tx descriptors are
 171  * available).
 172  */
 173 void
 174 aggr_tx_ring_update(void *arg1, uintptr_t arg2)
 175 {
 176         aggr_port_t *port = (aggr_port_t *)arg1;
 177         mac_ring_handle_t mrh = (mac_ring_handle_t)arg2;
 178         mac_ring_handle_t pseudo_mrh;
 179         aggr_grp_t *grp = port->lp_grp;
 180         int i = 0;
 181 
 182         if (mrh == NULL) {
 183                 /*
 184                  * If the underlying NIC does not expose TX rings,
 185                  * still as pseudo TX ring is presented to the
 186                  * aggr mac.
 187                  */
 188                 pseudo_mrh = port->lp_pseudo_tx_rings[0];
 189         } else {
 190                 for (i = 0; i < port->lp_tx_ring_cnt; i++) {
 191                         if (port->lp_tx_rings[i] == mrh)
 192                                 break;
 193                 }
 194                 ASSERT(i < port->lp_tx_ring_cnt);
 195                 pseudo_mrh = port->lp_pseudo_tx_rings[i];
 196         }
 197         mutex_enter(&grp->lg_tx_flowctl_lock);
 198         /*
 199          * It could be possible that some (broken?) device driver
 200          * could send more than one wakeup on the same ring. In
 201          * such a case, multiple instances of the same pseudo TX
 202          * ring should not be saved in lg_tx_blocked_rings[]
 203          * array. So first check if woken up ring (pseudo_mrh) is
 204          * already in the lg_tx_blocked_rings[] array.
 205          */
 206         for (i = 0; i < grp->lg_tx_blocked_cnt; i++) {
 207                 if (grp->lg_tx_blocked_rings[i] == pseudo_mrh) {
 208                         mutex_exit(&grp->lg_tx_flowctl_lock);
 209                         return;
 210                 }
 211         }
 212         /* A distinct mac_ring_handle. Save and increment count */
 213         grp->lg_tx_blocked_rings[grp->lg_tx_blocked_cnt] = pseudo_mrh;
 214         grp->lg_tx_blocked_cnt++;
 215         cv_signal(&grp->lg_tx_flowctl_cv);
 216         mutex_exit(&grp->lg_tx_flowctl_lock);
 217 }
 218 
 219 /*
 220  * Send function invoked by the MAC service module.
 221  */
 222 mblk_t *
 223 aggr_ring_tx(void *arg, mblk_t *mp)
 224 {
 225         aggr_pseudo_tx_ring_t *pseudo_ring = (aggr_pseudo_tx_ring_t *)arg;
 226         aggr_port_t *port = pseudo_ring->atr_port;
 227 
 228         return (mac_hwring_send_priv(port->lp_mch, pseudo_ring->atr_hw_rh, mp));
 229 }
 230 
 231 /*
 232  * Enable sending on the specified port.
 233  */
 234 void
 235 aggr_send_port_enable(aggr_port_t *port)
 236 {
 237         aggr_grp_t *grp = port->lp_grp;
 238 
 239         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 240 
 241         if (port->lp_tx_enabled || (port->lp_state !=
 242             AGGR_PORT_STATE_ATTACHED)) {
 243                 /* already enabled or port not yet attached */
 244                 return;
 245         }
 246 
 247         /*
 248          * Add to group's array of tx ports.
 249          */
 250         rw_enter(&grp->lg_tx_lock, RW_WRITER);
 251         if (grp->lg_tx_ports_size < grp->lg_ntx_ports+1) {
 252                 /* current array too small */
 253                 aggr_port_t **new_ports;
 254                 uint_t new_size;
 255 
 256                 new_size = grp->lg_ntx_ports+1;
 257                 new_ports = kmem_zalloc(new_size * sizeof (aggr_port_t *),
 258                     KM_SLEEP);
 259 
 260                 if (grp->lg_tx_ports_size > 0) {
 261                         ASSERT(grp->lg_tx_ports != NULL);
 262                         bcopy(grp->lg_tx_ports, new_ports,
 263                             grp->lg_ntx_ports * sizeof (aggr_port_t *));
 264                         kmem_free(grp->lg_tx_ports,
 265                             grp->lg_tx_ports_size * sizeof (aggr_port_t *));
 266                 }
 267 
 268                 grp->lg_tx_ports = new_ports;
 269                 grp->lg_tx_ports_size = new_size;
 270         }
 271 
 272         grp->lg_tx_ports[grp->lg_ntx_ports++] = port;
 273         port->lp_tx_idx = grp->lg_ntx_ports-1;
 274         rw_exit(&grp->lg_tx_lock);
 275 
 276         port->lp_tx_enabled = B_TRUE;
 277 
 278         aggr_grp_update_default(grp);
 279 }
 280 
 281 /*
 282  * Disable sending from the specified port.
 283  */
 284 void
 285 aggr_send_port_disable(aggr_port_t *port)
 286 {
 287         uint_t idx, ntx;
 288         aggr_grp_t *grp = port->lp_grp;
 289 
 290         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 291         ASSERT(MAC_PERIM_HELD(port->lp_mh));
 292 
 293         if (!port->lp_tx_enabled) {
 294                 /* not yet enabled */
 295                 return;
 296         }
 297 
 298         rw_enter(&grp->lg_tx_lock, RW_WRITER);
 299         idx = port->lp_tx_idx;
 300         ntx = grp->lg_ntx_ports;
 301         ASSERT(idx < ntx);
 302 
 303         /* remove from array of attached ports */
 304         if (idx == (ntx - 1)) {
 305                 grp->lg_tx_ports[idx] = NULL;
 306         } else {
 307                 /* not the last entry, replace with last one */
 308                 aggr_port_t *victim;
 309 
 310                 victim = grp->lg_tx_ports[ntx - 1];
 311                 grp->lg_tx_ports[ntx - 1] = NULL;
 312                 victim->lp_tx_idx = idx;
 313                 grp->lg_tx_ports[idx] = victim;
 314         }
 315 
 316         port->lp_tx_idx = 0;
 317         grp->lg_ntx_ports--;
 318         rw_exit(&grp->lg_tx_lock);
 319 
 320         port->lp_tx_enabled = B_FALSE;
 321 
 322         aggr_grp_update_default(grp);
 323 }