Print this page
MFV: illumos-joyent@61dc3dec4f82a3e13e94609a0a83d5f66c64e760
OS-6846 want i40e multi-group support
OS-7372 i40e_alloc_ring_mem() unwinds when it shouldn't
Reviewed by: Robert Mustacchi <rm@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Author: Ryan Zezeski <rpz@joyent.com>
NEX-13226 xvv710 25Gb NIC panics system under load
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
NEX-7822 40Gb Intel XL710 NIC performance data
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
NEX-6977 Ericsson hangs on reboot with Intel XL710 NICs
Reviewed by: Rick McNeal <rick.mcneal@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/io/i40e/i40e_intr.c
          +++ new/usr/src/uts/common/io/i40e/i40e_intr.c
↓ open down ↓ 2 lines elided ↑ open up ↑
   3    3   * Common Development and Distribution License ("CDDL"), version 1.0.
   4    4   * You may only use this file in accordance with the terms of version
   5    5   * 1.0 of the CDDL.
   6    6   *
   7    7   * A full copy of the text of the CDDL should have accompanied this
   8    8   * source.  A copy of the CDDL is also available via the Internet at
   9    9   * http://www.illumos.org/license/CDDL.
  10   10   */
  11   11  
  12   12  /*
  13      - * Copyright (c) 2017, Joyent, Inc.
       13 + * Copyright 2018 Joyent, Inc.
  14   14   * Copyright 2017 Tegile Systems, Inc.  All rights reserved.
  15   15   */
  16   16  
  17   17  /*
  18   18   * -------------------------
  19   19   * Interrupt Handling Theory
  20   20   * -------------------------
  21   21   *
  22   22   * There are a couple different sets of interrupts that we need to worry about:
  23   23   *
↓ open down ↓ 198 lines elided ↑ open up ↑
 222  222  static void
 223  223  i40e_intr_adminq_disable(i40e_t *i40e)
 224  224  {
 225  225          i40e_hw_t *hw = &i40e->i40e_hw_space;
 226  226          uint32_t reg;
 227  227  
 228  228          reg = I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTL0_ITR_INDX_SHIFT;
 229  229          I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTL0, reg);
 230  230  }
 231  231  
      232 +/*
      233 + * The next two functions enable/disable the reception of interrupts
      234 + * on the given vector. Only vectors 1..N are programmed by these
      235 + * functions; vector 0 is special and handled by a different register.
      236 + * We must subtract one from the vector because i40e implicitly adds
      237 + * one to the vector value. See section 10.2.2.10.13 for more details.
      238 + */
 232  239  static void
 233  240  i40e_intr_io_enable(i40e_t *i40e, int vector)
 234  241  {
 235  242          uint32_t reg;
 236  243          i40e_hw_t *hw = &i40e->i40e_hw_space;
 237  244  
      245 +        ASSERT3S(vector, >, 0);
 238  246          reg = I40E_PFINT_DYN_CTLN_INTENA_MASK |
 239  247              I40E_PFINT_DYN_CTLN_CLEARPBA_MASK |
 240  248              (I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT);
 241  249          I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTLN(vector - 1), reg);
 242  250  }
 243  251  
 244  252  static void
 245  253  i40e_intr_io_disable(i40e_t *i40e, int vector)
 246  254  {
 247  255          uint32_t reg;
 248  256          i40e_hw_t *hw = &i40e->i40e_hw_space;
 249  257  
      258 +        ASSERT3S(vector, >, 0);
 250  259          reg = I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT;
 251  260          I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTLN(vector - 1), reg);
 252  261  }
 253  262  
 254  263  /*
 255  264   * When MSI-X interrupts are being used, then we can enable the actual
 256  265   * interrupts themselves. However, when they are not, we instead have to turn
 257  266   * towards the queue's CAUSE_ENA bit and enable that.
 258  267   */
 259  268  void
↓ open down ↓ 108 lines elided ↑ open up ↑
 368  377                          reg = I40E_READ_REG(hw, I40E_PFINT_LNKLSTN(i));
 369  378                          VERIFY3U(reg, ==, I40E_QUEUE_TYPE_EOL);
 370  379                  }
 371  380          }
 372  381  #endif
 373  382  
 374  383          i40e_intr_adminq_disable(i40e);
 375  384  }
 376  385  
 377  386  /*
 378      - * Enable all of the queues and set the corresponding LNKLSTN registers. Note
 379      - * that we always enable queues as interrupt sources, even though we don't
 380      - * enable the MSI-X interrupt vectors.
      387 + * Set the head of the interrupt linked list. The PFINT_LNKLSTN[N]
      388 + * register actually refers to the 'N + 1' interrupt vector. E.g.,
      389 + * PFINT_LNKLSTN[0] refers to interrupt vector 1.
 381  390   */
 382  391  static void
      392 +i40e_set_lnklstn(i40e_t *i40e, uint_t vector, uint_t queue)
      393 +{
      394 +        uint32_t        reg;
      395 +        i40e_hw_t       *hw = &i40e->i40e_hw_space;
      396 +
      397 +        reg = (queue << I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT) |
      398 +            (I40E_QUEUE_TYPE_RX << I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT);
      399 +
      400 +        I40E_WRITE_REG(hw, I40E_PFINT_LNKLSTN(vector), reg);
      401 +        DEBUGOUT2("PFINT_LNKLSTN[%u] = 0x%x", vector, reg);
      402 +}
      403 +
      404 +/*
      405 + * Set the QINT_RQCTL[queue] register. The next queue is always the Tx
      406 + * queue associated with this Rx queue. Unlike PFINT_LNKLSTN, the
      407 + * vector should be the actual vector this queue is on -- i.e., it
      408 + * should be equal to itrq_rx_intrvec.
      409 + */
      410 +static void
      411 +i40e_set_rqctl(i40e_t *i40e, uint_t vector, uint_t queue)
      412 +{
      413 +        uint32_t        reg;
      414 +        i40e_hw_t       *hw = &i40e->i40e_hw_space;
      415 +
      416 +        ASSERT3U(vector, ==, i40e->i40e_trqpairs[queue].itrq_rx_intrvec);
      417 +
      418 +        reg = (vector << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) |
      419 +            (I40E_ITR_INDEX_RX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) |
      420 +            (queue << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) |
      421 +            (I40E_QUEUE_TYPE_TX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) |
      422 +            I40E_QINT_RQCTL_CAUSE_ENA_MASK;
      423 +
      424 +        I40E_WRITE_REG(hw, I40E_QINT_RQCTL(queue), reg);
      425 +        DEBUGOUT2("QINT_RQCTL[%u] = 0x%x", queue, reg);
      426 +}
      427 +
      428 +/*
      429 + * Like i40e_set_rqctl(), but for QINT_TQCTL[queue]. The next queue is
      430 + * either the Rx queue of another TRQP, or EOL.
      431 + */
      432 +static void
      433 +i40e_set_tqctl(i40e_t *i40e, uint_t vector, uint_t queue, uint_t next_queue)
      434 +{
      435 +        uint32_t        reg;
      436 +        i40e_hw_t       *hw = &i40e->i40e_hw_space;
      437 +
      438 +        ASSERT3U(vector, ==, i40e->i40e_trqpairs[queue].itrq_tx_intrvec);
      439 +
      440 +        reg = (vector << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) |
      441 +            (I40E_ITR_INDEX_TX << I40E_QINT_TQCTL_ITR_INDX_SHIFT) |
      442 +            (next_queue << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) |
      443 +            (I40E_QUEUE_TYPE_RX << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT) |
      444 +            I40E_QINT_TQCTL_CAUSE_ENA_MASK;
      445 +
      446 +        I40E_WRITE_REG(hw, I40E_QINT_TQCTL(queue), reg);
      447 +        DEBUGOUT2("QINT_TQCTL[%u] = 0x%x", queue, reg);
      448 +}
      449 +
      450 +/*
      451 + * Program the interrupt linked list. Each vector has a linked list of
      452 + * queues which act as event sources for that vector. When one of
      453 + * those sources has an event the associated interrupt vector is
      454 + * fired. This mapping must match the mapping found in
      455 + * i40e_map_intrs_to_vectors(). The Tx queue of the last TRQP on a
      456 + * vector is terminated with EOL; any index >= i40e_num_trqpairs is
      457 + * past the end. See section 7.5.3 for more information about the
      458 + * configuration of the interrupt linked list.
      459 + */
      460 +static void
 383  461  i40e_intr_init_queue_msix(i40e_t *i40e)
 384  462  {
 385      -        i40e_hw_t *hw = &i40e->i40e_hw_space;
 386      -        uint32_t reg;
 387      -        int i;
      463 +        uint_t intr_count;
 388  464  
 389  465          /*
 390      -         * Map queues to MSI-X interrupts. Queue i is mapped to vector i + 1.
 391      -         * Note that we skip the ITR logic for the moment, just to make our
 392      -         * lives as explicit and simple as possible.
      466 +         * The 0th vector is for 'Other Interrupts' only (subject to
      467 +         * change in the future).
 393  468           */
 394      -        for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
 395      -                i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i];
      469 +        intr_count = i40e->i40e_intr_count - 1;
 396  470  
 397      -                reg = (i << I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT) |
 398      -                    (I40E_QUEUE_TYPE_RX <<
 399      -                    I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT);
 400      -                I40E_WRITE_REG(hw, I40E_PFINT_LNKLSTN(i), reg);
      471 +        for (uint_t vec = 0; vec < intr_count; vec++) {
      472 +                boolean_t head = B_TRUE;
 401  473  
 402      -                reg =
 403      -                    (itrq->itrq_rx_intrvec << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) |
 404      -                    (I40E_ITR_INDEX_RX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) |
 405      -                    (i << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) |
 406      -                    (I40E_QUEUE_TYPE_TX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) |
 407      -                    I40E_QINT_RQCTL_CAUSE_ENA_MASK;
      474 +                for (uint_t qidx = vec; qidx < i40e->i40e_num_trqpairs;
      475 +                     qidx += intr_count) {
      476 +                        uint_t next_qidx = qidx + intr_count;
 408  477  
 409      -                I40E_WRITE_REG(hw, I40E_QINT_RQCTL(i), reg);
      478 +                        next_qidx = (next_qidx >= i40e->i40e_num_trqpairs) ?
      479 +                            I40E_QUEUE_TYPE_EOL : next_qidx;
 410  480  
 411      -                reg =
 412      -                    (itrq->itrq_tx_intrvec << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) |
 413      -                    (I40E_ITR_INDEX_TX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) |
 414      -                    (I40E_QUEUE_TYPE_EOL << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) |
 415      -                    (I40E_QUEUE_TYPE_RX << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT) |
 416      -                    I40E_QINT_TQCTL_CAUSE_ENA_MASK;
      481 +                        if (head) {
      482 +                                i40e_set_lnklstn(i40e, vec, qidx);
      483 +                                head = B_FALSE;
      484 +                        }
 417  485  
 418      -                I40E_WRITE_REG(hw, I40E_QINT_TQCTL(i), reg);
      486 +                        i40e_set_rqctl(i40e, vec + 1, qidx);
      487 +                        i40e_set_tqctl(i40e, vec + 1, qidx, next_qidx);
      488 +                }
 419  489          }
 420      -
 421  490  }
 422  491  
 423  492  /*
 424  493   * Set up a single queue to share the admin queue interrupt in the non-MSI-X
 425  494   * world. Note we do not enable the queue as an interrupt cause at this time. We
 426  495   * don't have any other vector of control here, unlike with the MSI-X interrupt
 427  496   * case.
 428  497   */
 429  498  static void
 430  499  i40e_intr_init_queue_shared(i40e_t *i40e)
↓ open down ↓ 150 lines elided ↑ open up ↑
 581  650                   * At the moment, the only error code that seems to be returned
 582  651                   * is one saying that there's no work. In such a case we leave
 583  652                   * this be.
 584  653                   */
 585  654                  ret = i40e_clean_arq_element(hw, &evt, &remain);
 586  655                  if (ret != I40E_SUCCESS)
 587  656                          break;
 588  657  
 589  658                  opcode = LE_16(evt.desc.opcode);
 590  659                  switch (opcode) {
 591      -                case i40e_aqc_opc_get_link_status:
 592      -                        mutex_enter(&i40e->i40e_general_lock);
 593      -                        i40e_link_check(i40e);
 594      -                        mutex_exit(&i40e->i40e_general_lock);
 595      -                        break;
      660 +                /*
      661 +                 * Disable link checks for NEX-6977. With the fibers unplugged
      662 +                 * we can end up receiving too many link check interrupts,
      663 +                 * saturating one CPU for each link. This can cause system hangs
      664 +                 * at boot or shutdown when the system is running single-threaded.
      665 +                 *
      666 +                 * case i40e_aqc_opc_get_link_status:
      667 +                 *      mutex_enter(&i40e->i40e_general_lock);
      668 +                 *      i40e_link_check(i40e);
      669 +                 *      mutex_exit(&i40e->i40e_general_lock);
      670 +                 *      break;
      671 +                 */
 596  672                  default:
 597  673                          /*
 598  674                           * Longer term we'll want to enable other causes here
 599  675                           * and get these cleaned up and doing something.
 600  676                           */
 601  677                          break;
 602  678                  }
 603  679          }
 604  680  }
 605  681  
 606  682  static void
 607      -i40e_intr_rx_work(i40e_t *i40e, int queue)
      683 +i40e_intr_rx_work(i40e_t *i40e, i40e_trqpair_t *itrq)
 608  684  {
 609  685          mblk_t *mp = NULL;
 610      -        i40e_trqpair_t *itrq;
 611  686  
 612      -        ASSERT(queue < i40e->i40e_num_trqpairs);
 613      -        itrq = &i40e->i40e_trqpairs[queue];
 614      -
 615  687          mutex_enter(&itrq->itrq_rx_lock);
 616  688          if (!itrq->itrq_intr_poll)
 617  689                  mp = i40e_ring_rx(itrq, I40E_POLL_NULL);
 618  690          mutex_exit(&itrq->itrq_rx_lock);
 619  691  
 620      -        if (mp != NULL) {
 621      -                mac_rx_ring(i40e->i40e_mac_hdl, itrq->itrq_macrxring, mp,
 622      -                    itrq->itrq_rxgen);
 623      -        }
      692 +        if (mp == NULL)
      693 +                return;
      694 +
      695 +        mac_rx_ring(i40e->i40e_mac_hdl, itrq->itrq_macrxring, mp,
      696 +            itrq->itrq_rxgen);
 624  697  }
 625  698  
      699 +/* ARGSUSED */
 626  700  static void
 627      -i40e_intr_tx_work(i40e_t *i40e, int queue)
      701 +i40e_intr_tx_work(i40e_t *i40e, i40e_trqpair_t *itrq)
 628  702  {
 629      -        i40e_trqpair_t *itrq;
 630      -
 631      -        itrq = &i40e->i40e_trqpairs[queue];
 632  703          i40e_tx_recycle_ring(itrq);
 633  704  }
 634  705  
 635  706  /*
 636  707   * At the moment, the only 'other' interrupt on ICR0 that we handle is the
 637  708   * adminq. We should go through and support the other notifications at some
 638  709   * point.
 639  710   */
 640  711  static void
 641  712  i40e_intr_other_work(i40e_t *i40e)
↓ open down ↓ 16 lines elided ↑ open up ↑
 658  729           * Make sure that the adminq interrupt is not masked and then explicitly
 659  730           * enable the adminq and thus the other interrupt.
 660  731           */
 661  732          reg = I40E_READ_REG(hw, I40E_PFINT_ICR0_ENA);
 662  733          reg |= I40E_PFINT_ICR0_ENA_ADMINQ_MASK;
 663  734          I40E_WRITE_REG(hw, I40E_PFINT_ICR0_ENA, reg);
 664  735  
 665  736          i40e_intr_adminq_enable(i40e);
 666  737  }
 667  738  
      739 +/*
      740 + * Handle an MSI-X interrupt. See section 7.5.1.3 for an overview of
      741 + * the MSI-X interrupt sequence.
      742 + */
 668  743  uint_t
 669  744  i40e_intr_msix(void *arg1, void *arg2)
 670  745  {
 671  746          i40e_t *i40e = (i40e_t *)arg1;
 672      -        int vector_idx = (int)(uintptr_t)arg2;
      747 +        uint_t vector_idx = (uint_t)(uintptr_t)arg2;
 673  748  
      749 +        ASSERT3U(vector_idx, <, i40e->i40e_intr_count);
      750 +
 674  751          /*
 675  752           * When using MSI-X interrupts, vector 0 is always reserved for the
 676  753           * adminq at this time, though longer term we'll want to also bridge
 677  754           * some I/O to it.
 678  755           */
 679  756          if (vector_idx == 0) {
 680  757                  i40e_intr_other_work(i40e);
 681  758                  return (DDI_INTR_CLAIMED);
 682  759          }
 683  760  
 684      -        i40e_intr_rx_work(i40e, vector_idx - 1);
 685      -        i40e_intr_tx_work(i40e, vector_idx - 1);
 686      -        i40e_intr_io_enable(i40e, vector_idx);
      761 +        ASSERT3U(vector_idx, >, 0);
 687  762  
      763 +        /*
      764 +         * We determine the queue indexes via simple arithmetic (as
      765 +         * opposed to keeping explicit state like a bitmap). While
      766 +         * convenient, it does mean that i40e_map_intrs_to_vectors(),
      767 +         * i40e_intr_init_queue_msix(), and this function must be
      768 +         * modified as a unit.
      769 +         *
      770 +         * We subtract 1 from the vector to offset the addition we
      771 +         * performed during i40e_map_intrs_to_vectors().
      772 +         */
      773 +        for (uint_t i = vector_idx - 1; i < i40e->i40e_num_trqpairs;
      774 +             i += (i40e->i40e_intr_count - 1)) {
      775 +                i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i];
      776 +
      777 +                ASSERT3U(i, <, i40e->i40e_num_trqpairs);
      778 +                ASSERT3P(itrq, !=, NULL);
      779 +                i40e_intr_rx_work(i40e, itrq);
      780 +                i40e_intr_tx_work(i40e, itrq);
      781 +        }
      782 +
      783 +        i40e_intr_io_enable(i40e, vector_idx);
 688  784          return (DDI_INTR_CLAIMED);
 689  785  }
 690  786  
 691  787  static uint_t
 692  788  i40e_intr_notx(i40e_t *i40e, boolean_t shared)
 693  789  {
 694  790          i40e_hw_t *hw = &i40e->i40e_hw_space;
 695  791          uint32_t reg;
      792 +        i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[0];
 696  793          int ret = DDI_INTR_CLAIMED;
 697  794  
 698  795          if (shared == B_TRUE) {
 699  796                  mutex_enter(&i40e->i40e_general_lock);
 700  797                  if (i40e->i40e_state & I40E_SUSPENDED) {
 701  798                          mutex_exit(&i40e->i40e_general_lock);
 702  799                          return (DDI_INTR_UNCLAIMED);
 703  800                  }
 704  801                  mutex_exit(&i40e->i40e_general_lock);
 705  802          }
↓ open down ↓ 9 lines elided ↑ open up ↑
 715  812          if (reg == 0) {
 716  813                  if (shared == B_TRUE)
 717  814                          ret = DDI_INTR_UNCLAIMED;
 718  815                  goto done;
 719  816          }
 720  817  
 721  818          if (reg & I40E_PFINT_ICR0_ADMINQ_MASK)
 722  819                  i40e_intr_adminq_work(i40e);
 723  820  
 724  821          if (reg & I40E_INTR_NOTX_RX_MASK)
 725      -                i40e_intr_rx_work(i40e, 0);
      822 +                i40e_intr_rx_work(i40e, itrq);
 726  823  
 727  824          if (reg & I40E_INTR_NOTX_TX_MASK)
 728      -                i40e_intr_tx_work(i40e, 0);
      825 +                i40e_intr_tx_work(i40e, itrq);
 729  826  
 730  827  done:
 731  828          i40e_intr_adminq_enable(i40e);
 732  829          return (ret);
 733  830  
 734  831  }
 735  832  
 736  833  /* ARGSUSED */
 737  834  uint_t
 738  835  i40e_intr_msi(void *arg1, void *arg2)
↓ open down ↓ 14 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX