1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2019 Joyent, Inc.
  24  */
  25 
  26 /*
  27  * MAC Services Module - misc utilities
  28  */
  29 
  30 #include <sys/types.h>
  31 #include <sys/mac.h>
  32 #include <sys/mac_impl.h>
  33 #include <sys/mac_client_priv.h>
  34 #include <sys/mac_client_impl.h>
  35 #include <sys/mac_soft_ring.h>
  36 #include <sys/strsubr.h>
  37 #include <sys/strsun.h>
  38 #include <sys/vlan.h>
  39 #include <sys/pattr.h>
  40 #include <sys/pci_tools.h>
  41 #include <inet/ip.h>
  42 #include <inet/ip_impl.h>
  43 #include <inet/ip6.h>
  44 #include <sys/vtrace.h>
  45 #include <sys/dlpi.h>
  46 #include <sys/sunndi.h>
  47 #include <inet/ipsec_impl.h>
  48 #include <inet/sadb.h>
  49 #include <inet/ipsecesp.h>
  50 #include <inet/ipsecah.h>
  51 
  52 /*
  53  * Copy an mblk, preserving its hardware checksum flags.
  54  */
  55 static mblk_t *
  56 mac_copymsg_cksum(mblk_t *mp)
  57 {
  58         mblk_t *mp1;
  59         uint32_t start, stuff, end, value, flags;
  60 
  61         mp1 = copymsg(mp);
  62         if (mp1 == NULL)
  63                 return (NULL);
  64 
  65         hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
  66         (void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value,
  67             flags, KM_NOSLEEP);
  68 
  69         return (mp1);
  70 }
  71 
  72 /*
  73  * Copy an mblk chain, presenting the hardware checksum flags of the
  74  * individual mblks.
  75  */
  76 mblk_t *
  77 mac_copymsgchain_cksum(mblk_t *mp)
  78 {
  79         mblk_t *nmp = NULL;
  80         mblk_t **nmpp = &nmp;
  81 
  82         for (; mp != NULL; mp = mp->b_next) {
  83                 if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
  84                         freemsgchain(nmp);
  85                         return (NULL);
  86                 }
  87 
  88                 nmpp = &((*nmpp)->b_next);
  89         }
  90 
  91         return (nmp);
  92 }
  93 
  94 /*
  95  * Process the specified mblk chain for proper handling of hardware
  96  * checksum offload. This routine is invoked for loopback traffic
  97  * between MAC clients.
  98  * The function handles a NULL mblk chain passed as argument.
  99  */
 100 mblk_t *
 101 mac_fix_cksum(mblk_t *mp_chain)
 102 {
 103         mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1;
 104         uint32_t flags, start, stuff, end, value;
 105 
 106         for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) {
 107                 uint16_t len;
 108                 uint32_t offset;
 109                 struct ether_header *ehp;
 110                 uint16_t sap;
 111 
 112                 hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value,
 113                     &flags);
 114                 if (flags == 0)
 115                         continue;
 116 
 117                 /*
 118                  * Since the processing of checksum offload for loopback
 119                  * traffic requires modification of the packet contents,
 120                  * ensure sure that we are always modifying our own copy.
 121                  */
 122                 if (DB_REF(mp) > 1) {
 123                         mp1 = copymsg(mp);
 124                         if (mp1 == NULL)
 125                                 continue;
 126                         mp1->b_next = mp->b_next;
 127                         mp->b_next = NULL;
 128                         freemsg(mp);
 129                         if (prev != NULL)
 130                                 prev->b_next = mp1;
 131                         else
 132                                 new_chain = mp1;
 133                         mp = mp1;
 134                 }
 135 
 136                 /*
 137                  * Ethernet, and optionally VLAN header.
 138                  */
 139                 /* LINTED: improper alignment cast */
 140                 ehp = (struct ether_header *)mp->b_rptr;
 141                 if (ntohs(ehp->ether_type) == VLAN_TPID) {
 142                         struct ether_vlan_header *evhp;
 143 
 144                         ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
 145                         /* LINTED: improper alignment cast */
 146                         evhp = (struct ether_vlan_header *)mp->b_rptr;
 147                         sap = ntohs(evhp->ether_type);
 148                         offset = sizeof (struct ether_vlan_header);
 149                 } else {
 150                         sap = ntohs(ehp->ether_type);
 151                         offset = sizeof (struct ether_header);
 152                 }
 153 
 154                 if (MBLKL(mp) <= offset) {
 155                         offset -= MBLKL(mp);
 156                         if (mp->b_cont == NULL) {
 157                                 /* corrupted packet, skip it */
 158                                 if (prev != NULL)
 159                                         prev->b_next = mp->b_next;
 160                                 else
 161                                         new_chain = mp->b_next;
 162                                 mp1 = mp->b_next;
 163                                 mp->b_next = NULL;
 164                                 freemsg(mp);
 165                                 mp = mp1;
 166                                 continue;
 167                         }
 168                         mp = mp->b_cont;
 169                 }
 170 
 171                 if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
 172                         ipha_t *ipha = NULL;
 173 
 174                         /*
 175                          * In order to compute the full and header
 176                          * checksums, we need to find and parse
 177                          * the IP and/or ULP headers.
 178                          */
 179 
 180                         sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
 181 
 182                         /*
 183                          * IP header.
 184                          */
 185                         if (sap != ETHERTYPE_IP)
 186                                 continue;
 187 
 188                         ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t));
 189                         /* LINTED: improper alignment cast */
 190                         ipha = (ipha_t *)(mp->b_rptr + offset);
 191 
 192                         if (flags & HCK_FULLCKSUM) {
 193                                 ipaddr_t src, dst;
 194                                 uint32_t cksum;
 195                                 uint16_t *up;
 196                                 uint8_t proto;
 197 
 198                                 /*
 199                                  * Pointer to checksum field in ULP header.
 200                                  */
 201                                 proto = ipha->ipha_protocol;
 202                                 ASSERT(ipha->ipha_version_and_hdr_length ==
 203                                     IP_SIMPLE_HDR_VERSION);
 204 
 205                                 switch (proto) {
 206                                 case IPPROTO_TCP:
 207                                         /* LINTED: improper alignment cast */
 208                                         up = IPH_TCPH_CHECKSUMP(ipha,
 209                                             IP_SIMPLE_HDR_LENGTH);
 210                                         break;
 211 
 212                                 case IPPROTO_UDP:
 213                                         /* LINTED: improper alignment cast */
 214                                         up = IPH_UDPH_CHECKSUMP(ipha,
 215                                             IP_SIMPLE_HDR_LENGTH);
 216                                         break;
 217 
 218                                 default:
 219                                         cmn_err(CE_WARN, "mac_fix_cksum: "
 220                                             "unexpected protocol: %d", proto);
 221                                         continue;
 222                                 }
 223 
 224                                 /*
 225                                  * Pseudo-header checksum.
 226                                  */
 227                                 src = ipha->ipha_src;
 228                                 dst = ipha->ipha_dst;
 229                                 len = ntohs(ipha->ipha_length) -
 230                                     IP_SIMPLE_HDR_LENGTH;
 231 
 232                                 cksum = (dst >> 16) + (dst & 0xFFFF) +
 233                                     (src >> 16) + (src & 0xFFFF);
 234                                 cksum += htons(len);
 235 
 236                                 /*
 237                                  * The checksum value stored in the packet needs
 238                                  * to be correct. Compute it here.
 239                                  */
 240                                 *up = 0;
 241                                 cksum += (((proto) == IPPROTO_UDP) ?
 242                                     IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
 243                                 cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
 244                                     offset, cksum);
 245                                 *(up) = (uint16_t)(cksum ? cksum : ~cksum);
 246 
 247                                 /*
 248                                  * Flag the packet so that it appears
 249                                  * that the checksum has already been
 250                                  * verified by the hardware.
 251                                  */
 252                                 flags &= ~HCK_FULLCKSUM;
 253                                 flags |= HCK_FULLCKSUM_OK;
 254                                 value = 0;
 255                         }
 256 
 257                         if (flags & HCK_IPV4_HDRCKSUM) {
 258                                 ASSERT(ipha != NULL);
 259                                 ipha->ipha_hdr_checksum =
 260                                     (uint16_t)ip_csum_hdr(ipha);
 261                                 flags &= ~HCK_IPV4_HDRCKSUM;
 262                                 flags |= HCK_IPV4_HDRCKSUM_OK;
 263 
 264                         }
 265                 }
 266 
 267                 if (flags & HCK_PARTIALCKSUM) {
 268                         uint16_t *up, partial, cksum;
 269                         uchar_t *ipp; /* ptr to beginning of IP header */
 270 
 271                         if (mp->b_cont != NULL) {
 272                                 mblk_t *mp1;
 273 
 274                                 mp1 = msgpullup(mp, offset + end);
 275                                 if (mp1 == NULL)
 276                                         continue;
 277                                 mp1->b_next = mp->b_next;
 278                                 mp->b_next = NULL;
 279                                 freemsg(mp);
 280                                 if (prev != NULL)
 281                                         prev->b_next = mp1;
 282                                 else
 283                                         new_chain = mp1;
 284                                 mp = mp1;
 285                         }
 286 
 287                         ipp = mp->b_rptr + offset;
 288                         /* LINTED: cast may result in improper alignment */
 289                         up = (uint16_t *)((uchar_t *)ipp + stuff);
 290                         partial = *up;
 291                         *up = 0;
 292 
 293                         cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start,
 294                             end - start, partial);
 295                         cksum = ~cksum;
 296                         *up = cksum ? cksum : ~cksum;
 297 
 298                         /*
 299                          * Since we already computed the whole checksum,
 300                          * indicate to the stack that it has already
 301                          * been verified by the hardware.
 302                          */
 303                         flags &= ~HCK_PARTIALCKSUM;
 304                         flags |= HCK_FULLCKSUM_OK;
 305                         value = 0;
 306                 }
 307 
 308                 (void) hcksum_assoc(mp, NULL, NULL, start, stuff, end,
 309                     value, flags, KM_NOSLEEP);
 310         }
 311 
 312         return (new_chain);
 313 }
 314 
 315 /*
 316  * Add VLAN tag to the specified mblk.
 317  */
 318 mblk_t *
 319 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
 320 {
 321         mblk_t *hmp;
 322         struct ether_vlan_header *evhp;
 323         struct ether_header *ehp;
 324         uint32_t start, stuff, end, value, flags;
 325 
 326         ASSERT(pri != 0 || vid != 0);
 327 
 328         /*
 329          * Allocate an mblk for the new tagged ethernet header,
 330          * and copy the MAC addresses and ethertype from the
 331          * original header.
 332          */
 333 
 334         hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
 335         if (hmp == NULL) {
 336                 freemsg(mp);
 337                 return (NULL);
 338         }
 339 
 340         evhp = (struct ether_vlan_header *)hmp->b_rptr;
 341         ehp = (struct ether_header *)mp->b_rptr;
 342 
 343         bcopy(ehp, evhp, (ETHERADDRL * 2));
 344         evhp->ether_type = ehp->ether_type;
 345         evhp->ether_tpid = htons(ETHERTYPE_VLAN);
 346 
 347         hmp->b_wptr += sizeof (struct ether_vlan_header);
 348         mp->b_rptr += sizeof (struct ether_header);
 349 
 350         /*
 351          * Free the original message if it's now empty. Link the
 352          * rest of messages to the header message.
 353          */
 354         hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
 355         (void) hcksum_assoc(hmp, NULL, NULL, start, stuff, end, value, flags,
 356             KM_NOSLEEP);
 357         if (MBLKL(mp) == 0) {
 358                 hmp->b_cont = mp->b_cont;
 359                 freeb(mp);
 360         } else {
 361                 hmp->b_cont = mp;
 362         }
 363         ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header));
 364 
 365         /*
 366          * Initialize the new TCI (Tag Control Information).
 367          */
 368         evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid));
 369 
 370         return (hmp);
 371 }
 372 
 373 /*
 374  * Adds a VLAN tag with the specified VID and priority to each mblk of
 375  * the specified chain.
 376  */
 377 mblk_t *
 378 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid)
 379 {
 380         mblk_t *next_mp, **prev, *mp;
 381 
 382         mp = mp_chain;
 383         prev = &mp_chain;
 384 
 385         while (mp != NULL) {
 386                 next_mp = mp->b_next;
 387                 mp->b_next = NULL;
 388                 if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) {
 389                         freemsgchain(next_mp);
 390                         break;
 391                 }
 392                 *prev = mp;
 393                 prev = &mp->b_next;
 394                 mp = mp->b_next = next_mp;
 395         }
 396 
 397         return (mp_chain);
 398 }
 399 
 400 /*
 401  * Strip VLAN tag
 402  */
 403 mblk_t *
 404 mac_strip_vlan_tag(mblk_t *mp)
 405 {
 406         mblk_t *newmp;
 407         struct ether_vlan_header *evhp;
 408 
 409         evhp = (struct ether_vlan_header *)mp->b_rptr;
 410         if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
 411                 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
 412 
 413                 if (DB_REF(mp) > 1) {
 414                         newmp = copymsg(mp);
 415                         if (newmp == NULL)
 416                                 return (NULL);
 417                         freemsg(mp);
 418                         mp = newmp;
 419                 }
 420 
 421                 evhp = (struct ether_vlan_header *)mp->b_rptr;
 422 
 423                 ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
 424                 mp->b_rptr += VLAN_TAGSZ;
 425         }
 426         return (mp);
 427 }
 428 
 429 /*
 430  * Strip VLAN tag from each mblk of the chain.
 431  */
 432 mblk_t *
 433 mac_strip_vlan_tag_chain(mblk_t *mp_chain)
 434 {
 435         mblk_t *mp, *next_mp, **prev;
 436 
 437         mp = mp_chain;
 438         prev = &mp_chain;
 439 
 440         while (mp != NULL) {
 441                 next_mp = mp->b_next;
 442                 mp->b_next = NULL;
 443                 if ((mp = mac_strip_vlan_tag(mp)) == NULL) {
 444                         freemsgchain(next_mp);
 445                         break;
 446                 }
 447                 *prev = mp;
 448                 prev = &mp->b_next;
 449                 mp = mp->b_next = next_mp;
 450         }
 451 
 452         return (mp_chain);
 453 }
 454 
 455 /*
 456  * Default callback function. Used when the datapath is not yet initialized.
 457  */
 458 /* ARGSUSED */
 459 void
 460 mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp,
 461     boolean_t loopback)
 462 {
 463         mblk_t  *mp1 = mp;
 464 
 465         while (mp1 != NULL) {
 466                 mp1->b_prev = NULL;
 467                 mp1->b_queue = NULL;
 468                 mp1 = mp1->b_next;
 469         }
 470         freemsgchain(mp);
 471 }
 472 
 473 /*
 474  * Determines the IPv6 header length accounting for all the optional IPv6
 475  * headers (hop-by-hop, destination, routing and fragment). The header length
 476  * and next header value (a transport header) is captured.
 477  *
 478  * Returns B_FALSE if all the IP headers are not in the same mblk otherwise
 479  * returns B_TRUE.
 480  */
 481 int
 482 mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length,
 483     uint8_t *next_hdr, ip6_frag_t **fragp)
 484 {
 485         uint16_t length;
 486         uint_t  ehdrlen;
 487         uint8_t *whereptr;
 488         uint8_t *nexthdrp;
 489         ip6_dest_t *desthdr;
 490         ip6_rthdr_t *rthdr;
 491         ip6_frag_t *fraghdr;
 492 
 493         if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
 494                 return (ENOSPC);
 495         /*
 496          * Return EINVAL, which mac_protect callers treat explicitly as "let
 497          * pass", flow callers treat as "not in a flow", and the rest treat
 498          * as "don't do special processing".
 499          */
 500         if (IPH_HDR_VERSION(ip6h) != IPV6_VERSION)
 501                 return (EINVAL);
 502         length = IPV6_HDR_LEN;
 503         whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
 504 
 505         if (fragp != NULL)
 506                 *fragp = NULL;
 507 
 508         nexthdrp = &ip6h->ip6_nxt;
 509         while (whereptr < endptr) {
 510                 /* Is there enough left for len + nexthdr? */
 511                 if (whereptr + MIN_EHDR_LEN > endptr)
 512                         break;
 513 
 514                 switch (*nexthdrp) {
 515                 case IPPROTO_HOPOPTS:
 516                 case IPPROTO_DSTOPTS:
 517                         /* Assumes the headers are identical for hbh and dst */
 518                         desthdr = (ip6_dest_t *)whereptr;
 519                         ehdrlen = 8 * (desthdr->ip6d_len + 1);
 520                         if ((uchar_t *)desthdr +  ehdrlen > endptr)
 521                                 return (ENOSPC);
 522                         nexthdrp = &desthdr->ip6d_nxt;
 523                         break;
 524                 case IPPROTO_ROUTING:
 525                         rthdr = (ip6_rthdr_t *)whereptr;
 526                         ehdrlen =  8 * (rthdr->ip6r_len + 1);
 527                         if ((uchar_t *)rthdr +  ehdrlen > endptr)
 528                                 return (ENOSPC);
 529                         nexthdrp = &rthdr->ip6r_nxt;
 530                         break;
 531                 case IPPROTO_FRAGMENT:
 532                         fraghdr = (ip6_frag_t *)whereptr;
 533                         ehdrlen = sizeof (ip6_frag_t);
 534                         if ((uchar_t *)&fraghdr[1] > endptr)
 535                                 return (ENOSPC);
 536                         nexthdrp = &fraghdr->ip6f_nxt;
 537                         if (fragp != NULL)
 538                                 *fragp = fraghdr;
 539                         break;
 540                 case IPPROTO_NONE:
 541                         /* No next header means we're finished */
 542                 default:
 543                         *hdr_length = length;
 544                         *next_hdr = *nexthdrp;
 545                         return (0);
 546                 }
 547                 length += ehdrlen;
 548                 whereptr += ehdrlen;
 549                 *hdr_length = length;
 550                 *next_hdr = *nexthdrp;
 551         }
 552         switch (*nexthdrp) {
 553         case IPPROTO_HOPOPTS:
 554         case IPPROTO_DSTOPTS:
 555         case IPPROTO_ROUTING:
 556         case IPPROTO_FRAGMENT:
 557                 /*
 558                  * If any know extension headers are still to be processed,
 559                  * the packet's malformed (or at least all the IP header(s) are
 560                  * not in the same mblk - and that should never happen.
 561                  *
 562                  * Return ENOSPC because it MAY be spread across mblks, and
 563                  * and the rest of MAC or IPv6 itself can cope.
 564                  */
 565                 return (ENOSPC);
 566 
 567         default:
 568                 /*
 569                  * If we get here, we know that all of the IP headers were in
 570                  * the same mblk, even if the ULP header is in the next mblk.
 571                  */
 572                 *hdr_length = length;
 573                 *next_hdr = *nexthdrp;
 574                 return (0);
 575         }
 576 }
 577 
 578 /*
 579  * The following set of routines are there to take care of interrupt
 580  * re-targeting for legacy (fixed) interrupts. Some older versions
 581  * of the popular NICs like e1000g do not support MSI-X interrupts
 582  * and they reserve fixed interrupts for RX/TX rings. To re-target
 583  * these interrupts, PCITOOL ioctls need to be used.
 584  */
 585 typedef struct mac_dladm_intr {
 586         int     ino;
 587         int     cpu_id;
 588         char    driver_path[MAXPATHLEN];
 589         char    nexus_path[MAXPATHLEN];
 590 } mac_dladm_intr_t;
 591 
 592 /* Bind the interrupt to cpu_num */
 593 static int
 594 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino)
 595 {
 596         pcitool_intr_set_t      iset;
 597         int                     err;
 598 
 599         iset.old_cpu = oldcpuid;
 600         iset.ino = ino;
 601         iset.cpu_id = cpu_num;
 602         iset.user_version = PCITOOL_VERSION;
 603         err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
 604             kcred, NULL);
 605 
 606         return (err);
 607 }
 608 
 609 /*
 610  * Search interrupt information. iget is filled in with the info to search
 611  */
 612 static boolean_t
 613 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln)
 614 {
 615         int     i;
 616         char    driver_path[2 * MAXPATHLEN];
 617 
 618         for (i = 0; i < iget_p->num_devs; i++) {
 619                 (void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN);
 620                 (void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN,
 621                     ":%s%d", iget_p->dev[i].driver_name,
 622                     iget_p->dev[i].dev_inst);
 623                 /* Match the device path for the device path */
 624                 if (strcmp(driver_path, dln->driver_path) == 0) {
 625                         dln->ino = iget_p->ino;
 626                         dln->cpu_id = iget_p->cpu_id;
 627                         return (B_TRUE);
 628                 }
 629         }
 630         return (B_FALSE);
 631 }
 632 
 633 /*
 634  * Get information about ino, i.e. if this is the interrupt for our
 635  * device and where it is bound etc.
 636  */
 637 static boolean_t
 638 mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino,
 639     mac_dladm_intr_t *dln)
 640 {
 641         pcitool_intr_get_t      *iget_p;
 642         int                     ipsz;
 643         int                     nipsz;
 644         int                     err;
 645         uint8_t                 inum;
 646 
 647         /*
 648          * Check if SLEEP is OK, i.e if could come here in response to
 649          * changing the fanout due to some callback from the driver, say
 650          * link speed changes.
 651          */
 652         ipsz = PCITOOL_IGET_SIZE(0);
 653         iget_p = kmem_zalloc(ipsz, KM_SLEEP);
 654 
 655         iget_p->num_devs_ret = 0;
 656         iget_p->user_version = PCITOOL_VERSION;
 657         iget_p->cpu_id = oldcpuid;
 658         iget_p->ino = ino;
 659 
 660         err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
 661             FKIOCTL, kcred, NULL);
 662         if (err != 0) {
 663                 kmem_free(iget_p, ipsz);
 664                 return (B_FALSE);
 665         }
 666         if (iget_p->num_devs == 0) {
 667                 kmem_free(iget_p, ipsz);
 668                 return (B_FALSE);
 669         }
 670         inum = iget_p->num_devs;
 671         if (iget_p->num_devs_ret < iget_p->num_devs) {
 672                 /* Reallocate */
 673                 nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);
 674 
 675                 kmem_free(iget_p, ipsz);
 676                 ipsz = nipsz;
 677                 iget_p = kmem_zalloc(ipsz, KM_SLEEP);
 678 
 679                 iget_p->num_devs_ret = inum;
 680                 iget_p->cpu_id = oldcpuid;
 681                 iget_p->ino = ino;
 682                 iget_p->user_version = PCITOOL_VERSION;
 683                 err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
 684                     FKIOCTL, kcred, NULL);
 685                 if (err != 0) {
 686                         kmem_free(iget_p, ipsz);
 687                         return (B_FALSE);
 688                 }
 689                 /* defensive */
 690                 if (iget_p->num_devs != iget_p->num_devs_ret) {
 691                         kmem_free(iget_p, ipsz);
 692                         return (B_FALSE);
 693                 }
 694         }
 695 
 696         if (mac_search_intrinfo(iget_p, dln)) {
 697                 kmem_free(iget_p, ipsz);
 698                 return (B_TRUE);
 699         }
 700         kmem_free(iget_p, ipsz);
 701         return (B_FALSE);
 702 }
 703 
 704 /*
 705  * Get the interrupts and check each one to see if it is for our device.
 706  */
 707 static int
 708 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid)
 709 {
 710         pcitool_intr_info_t     intr_info;
 711         int                     err;
 712         int                     ino;
 713         int                     oldcpuid;
 714 
 715         err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info,
 716             FKIOCTL, kcred, NULL);
 717         if (err != 0)
 718                 return (-1);
 719 
 720         for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) {
 721                 for (ino = 0; ino < intr_info.num_intr; ino++) {
 722                         if (mac_get_single_intr(lh, oldcpuid, ino, dln)) {
 723                                 if (dln->cpu_id == cpuid)
 724                                         return (0);
 725                                 return (1);
 726                         }
 727                 }
 728         }
 729         return (-1);
 730 }
 731 
 732 /*
 733  * Obtain the nexus parent node info. for mdip.
 734  */
 735 static dev_info_t *
 736 mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
 737 {
 738         struct dev_info         *tdip = (struct dev_info *)mdip;
 739         struct ddi_minor_data   *minordata;
 740         int                     circ;
 741         dev_info_t              *pdip;
 742         char                    pathname[MAXPATHLEN];
 743 
 744         while (tdip != NULL) {
 745                 /*
 746                  * The netboot code could call this function while walking the
 747                  * device tree so we need to use ndi_devi_tryenter() here to
 748                  * avoid deadlock.
 749                  */
 750                 if (ndi_devi_tryenter((dev_info_t *)tdip, &circ) == 0)
 751                         break;
 752 
 753                 for (minordata = tdip->devi_minor; minordata != NULL;
 754                     minordata = minordata->next) {
 755                         if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL,
 756                             strlen(DDI_NT_INTRCTL)) == 0) {
 757                                 pdip = minordata->dip;
 758                                 (void) ddi_pathname(pdip, pathname);
 759                                 (void) snprintf(dln->nexus_path, MAXPATHLEN,
 760                                     "/devices%s:intr", pathname);
 761                                 (void) ddi_pathname_minor(minordata, pathname);
 762                                 ndi_devi_exit((dev_info_t *)tdip, circ);
 763                                 return (pdip);
 764                         }
 765                 }
 766                 ndi_devi_exit((dev_info_t *)tdip, circ);
 767                 tdip = tdip->devi_parent;
 768         }
 769         return (NULL);
 770 }
 771 
 772 /*
 773  * For a primary MAC client, if the user has set a list or CPUs or
 774  * we have obtained it implicitly, we try to retarget the interrupt
 775  * for that device on one of the CPUs in the list.
 776  * We assign the interrupt to the same CPU as the poll thread.
 777  */
 778 static boolean_t
 779 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid)
 780 {
 781         ldi_handle_t            lh = NULL;
 782         ldi_ident_t             li = NULL;
 783         int                     err;
 784         int                     ret;
 785         mac_dladm_intr_t        dln;
 786         dev_info_t              *dip;
 787         struct ddi_minor_data   *minordata;
 788 
 789         dln.nexus_path[0] = '\0';
 790         dln.driver_path[0] = '\0';
 791 
 792         minordata = ((struct dev_info *)mdip)->devi_minor;
 793         while (minordata != NULL) {
 794                 if (minordata->type == DDM_MINOR)
 795                         break;
 796                 minordata = minordata->next;
 797         }
 798         if (minordata == NULL)
 799                 return (B_FALSE);
 800 
 801         (void) ddi_pathname_minor(minordata, dln.driver_path);
 802 
 803         dip = mac_get_nexus_node(mdip, &dln);
 804         /* defensive */
 805         if (dip == NULL)
 806                 return (B_FALSE);
 807 
 808         err = ldi_ident_from_major(ddi_driver_major(dip), &li);
 809         if (err != 0)
 810                 return (B_FALSE);
 811 
 812         err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li);
 813         if (err != 0)
 814                 return (B_FALSE);
 815 
 816         ret = mac_validate_intr(lh, &dln, cpuid);
 817         if (ret < 0) {
 818                 (void) ldi_close(lh, FREAD|FWRITE, kcred);
 819                 return (B_FALSE);
 820         }
 821         /* cmn_note? */
 822         if (ret != 0)
 823                 if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino)))
 824                     != 0) {
 825                         (void) ldi_close(lh, FREAD|FWRITE, kcred);
 826                         return (B_FALSE);
 827                 }
 828         (void) ldi_close(lh, FREAD|FWRITE, kcred);
 829         return (B_TRUE);
 830 }
 831 
 832 void
 833 mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid)
 834 {
 835         dev_info_t              *mdip = (dev_info_t *)arg;
 836         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
 837         mac_resource_props_t    *mrp;
 838         mac_perim_handle_t      mph;
 839         flow_entry_t            *flent = mcip->mci_flent;
 840         mac_soft_ring_set_t     *rx_srs;
 841         mac_cpus_t              *srs_cpu;
 842 
 843         if (!mac_check_interrupt_binding(mdip, cpuid))
 844                 cpuid = -1;
 845         mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph);
 846         mrp = MCIP_RESOURCE_PROPS(mcip);
 847         mrp->mrp_rx_intr_cpu = cpuid;
 848         if (flent != NULL && flent->fe_rx_srs_cnt == 2) {
 849                 rx_srs = flent->fe_rx_srs[1];
 850                 srs_cpu = &rx_srs->srs_cpu;
 851                 srs_cpu->mc_rx_intr_cpu = cpuid;
 852         }
 853         mac_perim_exit(mph);
 854 }
 855 
 856 int32_t
 857 mac_client_intr_cpu(mac_client_handle_t mch)
 858 {
 859         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
 860         mac_cpus_t              *srs_cpu;
 861         mac_soft_ring_set_t     *rx_srs;
 862         flow_entry_t            *flent = mcip->mci_flent;
 863         mac_resource_props_t    *mrp = MCIP_RESOURCE_PROPS(mcip);
 864         mac_ring_t              *ring;
 865         mac_intr_t              *mintr;
 866 
 867         /*
 868          * Check if we need to retarget the interrupt. We do this only
 869          * for the primary MAC client. We do this if we have the only
 870          * exclusive ring in the group.
 871          */
 872         if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) {
 873                 rx_srs = flent->fe_rx_srs[1];
 874                 srs_cpu = &rx_srs->srs_cpu;
 875                 ring = rx_srs->srs_ring;
 876                 mintr = &ring->mr_info.mri_intr;
 877                 /*
 878                  * If ddi_handle is present or the poll CPU is
 879                  * already bound to the interrupt CPU, return -1.
 880                  */
 881                 if (mintr->mi_ddi_handle != NULL ||
 882                     ((mrp->mrp_ncpus != 0) &&
 883                     (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) {
 884                         return (-1);
 885                 }
 886                 return (srs_cpu->mc_rx_pollid);
 887         }
 888         return (-1);
 889 }
 890 
 891 void *
 892 mac_get_devinfo(mac_handle_t mh)
 893 {
 894         mac_impl_t      *mip = (mac_impl_t *)mh;
 895 
 896         return ((void *)mip->mi_dip);
 897 }
 898 
 899 #define PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1])
 900 #define PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])
 901 #define PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])
 902 
 903 uint64_t
 904 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
 905 {
 906         struct ether_header *ehp;
 907         uint64_t hash = 0;
 908         uint16_t sap;
 909         uint_t skip_len;
 910         uint8_t proto;
 911         boolean_t ip_fragmented;
 912 
 913         /*
 914          * We may want to have one of these per MAC type plugin in the
 915          * future. For now supports only ethernet.
 916          */
 917         if (media != DL_ETHER)
 918                 return (0L);
 919 
 920         /* for now we support only outbound packets */
 921         ASSERT(is_outbound);
 922         ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
 923         ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
 924 
 925         /* compute L2 hash */
 926 
 927         ehp = (struct ether_header *)mp->b_rptr;
 928 
 929         if ((policy & MAC_PKT_HASH_L2) != 0) {
 930                 uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
 931                 uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
 932                 hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
 933                 policy &= ~MAC_PKT_HASH_L2;
 934         }
 935 
 936         if (policy == 0)
 937                 goto done;
 938 
 939         /* skip ethernet header */
 940 
 941         sap = ntohs(ehp->ether_type);
 942         if (sap == ETHERTYPE_VLAN) {
 943                 struct ether_vlan_header *evhp;
 944                 mblk_t *newmp = NULL;
 945 
 946                 skip_len = sizeof (struct ether_vlan_header);
 947                 if (MBLKL(mp) < skip_len) {
 948                         /* the vlan tag is the payload, pull up first */
 949                         newmp = msgpullup(mp, -1);
 950                         if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
 951                                 goto done;
 952                         }
 953                         evhp = (struct ether_vlan_header *)newmp->b_rptr;
 954                 } else {
 955                         evhp = (struct ether_vlan_header *)mp->b_rptr;
 956                 }
 957 
 958                 sap = ntohs(evhp->ether_type);
 959                 freemsg(newmp);
 960         } else {
 961                 skip_len = sizeof (struct ether_header);
 962         }
 963 
 964         /* if ethernet header is in its own mblk, skip it */
 965         if (MBLKL(mp) <= skip_len) {
 966                 skip_len -= MBLKL(mp);
 967                 mp = mp->b_cont;
 968                 if (mp == NULL)
 969                         goto done;
 970         }
 971 
 972         sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
 973 
 974         /* compute IP src/dst addresses hash and skip IPv{4,6} header */
 975 
 976         switch (sap) {
 977         case ETHERTYPE_IP: {
 978                 ipha_t *iphp;
 979 
 980                 /*
 981                  * If the header is not aligned, the header doesn't fit in the
 982                  * mblk, OR we have a bad IP version, bail now. Note that this
 983                  * may cause packets reordering.
 984                  */
 985                 iphp = (ipha_t *)(mp->b_rptr + skip_len);
 986                 if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
 987                     !OK_32PTR((char *)iphp) ||
 988                     IPH_HDR_VERSION(iphp) != IPV4_VERSION)
 989                         goto done;
 990 
 991                 proto = iphp->ipha_protocol;
 992                 skip_len += IPH_HDR_LENGTH(iphp);
 993 
 994                 /* Check if the packet is fragmented. */
 995                 ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) &
 996                     IPH_OFFSET;
 997 
 998                 /*
 999                  * For fragmented packets, use addresses in addition to
1000                  * the frag_id to generate the hash inorder to get
1001                  * better distribution.
1002                  */
1003                 if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
1004                         uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
1005                         uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
1006 
1007                         hash ^= (PKT_HASH_4BYTES(ip_src) ^
1008                             PKT_HASH_4BYTES(ip_dst));
1009                         policy &= ~MAC_PKT_HASH_L3;
1010                 }
1011 
1012                 if (ip_fragmented) {
1013                         uint8_t *identp = (uint8_t *)&iphp->ipha_ident;
1014                         hash ^= PKT_HASH_2BYTES(identp);
1015                         goto done;
1016                 }
1017                 break;
1018         }
1019         case ETHERTYPE_IPV6: {
1020                 ip6_t *ip6hp;
1021                 ip6_frag_t *frag = NULL;
1022                 uint16_t hdr_length;
1023 
1024                 /*
1025                  * If the header is not aligned or the header doesn't fit
1026                  * in the mblk, bail now. Note that this may cause packets
1027                  * reordering.
1028                  */
1029 
1030                 ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
1031                 if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
1032                     !OK_32PTR((char *)ip6hp))
1033                         goto done;
1034 
1035                 /* Also bail, regardless of why, if the function below fails. */
1036                 if (mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length,
1037                     &proto, &frag) != 0)
1038                         goto done;
1039                 skip_len += hdr_length;
1040 
1041                 /*
1042                  * For fragmented packets, use addresses in addition to
1043                  * the frag_id to generate the hash inorder to get
1044                  * better distribution.
1045                  */
1046                 if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) {
1047                         uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
1048                         uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
1049 
1050                         hash ^= (PKT_HASH_4BYTES(ip_src) ^
1051                             PKT_HASH_4BYTES(ip_dst));
1052                         policy &= ~MAC_PKT_HASH_L3;
1053                 }
1054 
1055                 if (frag != NULL) {
1056                         uint8_t *identp = (uint8_t *)&frag->ip6f_ident;
1057                         hash ^= PKT_HASH_4BYTES(identp);
1058                         goto done;
1059                 }
1060                 break;
1061         }
1062         default:
1063                 goto done;
1064         }
1065 
1066         if (policy == 0)
1067                 goto done;
1068 
1069         /* if ip header is in its own mblk, skip it */
1070         if (MBLKL(mp) <= skip_len) {
1071                 skip_len -= MBLKL(mp);
1072                 mp = mp->b_cont;
1073                 if (mp == NULL)
1074                         goto done;
1075         }
1076 
1077         /* parse ULP header */
1078 again:
1079         switch (proto) {
1080         case IPPROTO_TCP:
1081         case IPPROTO_UDP:
1082         case IPPROTO_ESP:
1083         case IPPROTO_SCTP:
1084                 /*
1085                  * These Internet Protocols are intentionally designed
1086                  * for hashing from the git-go.  Port numbers are in the first
1087                  * word for transports, SPI is first for ESP.
1088                  */
1089                 if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
1090                         goto done;
1091                 hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
1092                 break;
1093 
1094         case IPPROTO_AH: {
1095                 ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
1096                 uint_t ah_length = AH_TOTAL_LEN(ah);
1097 
1098                 if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
1099                         goto done;
1100 
1101                 proto = ah->ah_nexthdr;
1102                 skip_len += ah_length;
1103 
1104                 /* if AH header is in its own mblk, skip it */
1105                 if (MBLKL(mp) <= skip_len) {
1106                         skip_len -= MBLKL(mp);
1107                         mp = mp->b_cont;
1108                         if (mp == NULL)
1109                                 goto done;
1110                 }
1111 
1112                 goto again;
1113         }
1114         }
1115 
1116 done:
1117         return (hash);
1118 }