1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*
  26  * MAC Services Module - misc utilities
  27  */
  28 
  29 #include <sys/types.h>
  30 #include <sys/mac.h>
  31 #include <sys/mac_impl.h>
  32 #include <sys/mac_client_priv.h>
  33 #include <sys/mac_client_impl.h>
  34 #include <sys/mac_soft_ring.h>
  35 #include <sys/strsubr.h>
  36 #include <sys/strsun.h>
  37 #include <sys/vlan.h>
  38 #include <sys/pattr.h>
  39 #include <sys/pci_tools.h>
  40 #include <inet/ip.h>
  41 #include <inet/ip_impl.h>
  42 #include <inet/ip6.h>
  43 #include <sys/vtrace.h>
  44 #include <sys/dlpi.h>
  45 #include <sys/sunndi.h>
  46 #include <inet/ipsec_impl.h>
  47 #include <inet/sadb.h>
  48 #include <inet/ipsecesp.h>
  49 #include <inet/ipsecah.h>
  50 
  51 /*
  52  * Copy an mblk, preserving its hardware checksum flags.
  53  */
  54 static mblk_t *
  55 mac_copymsg_cksum(mblk_t *mp)
  56 {
  57         mblk_t *mp1;
  58         uint32_t start, stuff, end, value, flags;
  59 
  60         mp1 = copymsg(mp);
  61         if (mp1 == NULL)
  62                 return (NULL);
  63 
  64         hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
  65         (void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value,
  66             flags, KM_NOSLEEP);
  67 
  68         return (mp1);
  69 }
  70 
  71 /*
  72  * Copy an mblk chain, presenting the hardware checksum flags of the
  73  * individual mblks.
  74  */
  75 mblk_t *
  76 mac_copymsgchain_cksum(mblk_t *mp)
  77 {
  78         mblk_t *nmp = NULL;
  79         mblk_t **nmpp = &nmp;
  80 
  81         for (; mp != NULL; mp = mp->b_next) {
  82                 if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
  83                         freemsgchain(nmp);
  84                         return (NULL);
  85                 }
  86 
  87                 nmpp = &((*nmpp)->b_next);
  88         }
  89 
  90         return (nmp);
  91 }
  92 
  93 /*
  94  * Process the specified mblk chain for proper handling of hardware
  95  * checksum offload. This routine is invoked for loopback traffic
  96  * between MAC clients.
  97  * The function handles a NULL mblk chain passed as argument.
  98  */
  99 mblk_t *
 100 mac_fix_cksum(mblk_t *mp_chain)
 101 {
 102         mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1;
 103         uint32_t flags, start, stuff, end, value;
 104 
 105         for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) {
 106                 uint16_t len;
 107                 uint32_t offset;
 108                 struct ether_header *ehp;
 109                 uint16_t sap;
 110 
 111                 hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value,
 112                     &flags);
 113                 if (flags == 0)
 114                         continue;
 115 
 116                 /*
 117                  * Since the processing of checksum offload for loopback
 118                  * traffic requires modification of the packet contents,
 119                  * ensure sure that we are always modifying our own copy.
 120                  */
 121                 if (DB_REF(mp) > 1) {
 122                         mp1 = copymsg(mp);
 123                         if (mp1 == NULL)
 124                                 continue;
 125                         mp1->b_next = mp->b_next;
 126                         mp->b_next = NULL;
 127                         freemsg(mp);
 128                         if (prev != NULL)
 129                                 prev->b_next = mp1;
 130                         else
 131                                 new_chain = mp1;
 132                         mp = mp1;
 133                 }
 134 
 135                 /*
 136                  * Ethernet, and optionally VLAN header.
 137                  */
 138                 /* LINTED: improper alignment cast */
 139                 ehp = (struct ether_header *)mp->b_rptr;
 140                 if (ntohs(ehp->ether_type) == VLAN_TPID) {
 141                         struct ether_vlan_header *evhp;
 142 
 143                         ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
 144                         /* LINTED: improper alignment cast */
 145                         evhp = (struct ether_vlan_header *)mp->b_rptr;
 146                         sap = ntohs(evhp->ether_type);
 147                         offset = sizeof (struct ether_vlan_header);
 148                 } else {
 149                         sap = ntohs(ehp->ether_type);
 150                         offset = sizeof (struct ether_header);
 151                 }
 152 
 153                 if (MBLKL(mp) <= offset) {
 154                         offset -= MBLKL(mp);
 155                         if (mp->b_cont == NULL) {
 156                                 /* corrupted packet, skip it */
 157                                 if (prev != NULL)
 158                                         prev->b_next = mp->b_next;
 159                                 else
 160                                         new_chain = mp->b_next;
 161                                 mp1 = mp->b_next;
 162                                 mp->b_next = NULL;
 163                                 freemsg(mp);
 164                                 mp = mp1;
 165                                 continue;
 166                         }
 167                         mp = mp->b_cont;
 168                 }
 169 
 170                 if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
 171                         ipha_t *ipha = NULL;
 172 
 173                         /*
 174                          * In order to compute the full and header
 175                          * checksums, we need to find and parse
 176                          * the IP and/or ULP headers.
 177                          */
 178 
 179                         sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
 180 
 181                         /*
 182                          * IP header.
 183                          */
 184                         if (sap != ETHERTYPE_IP)
 185                                 continue;
 186 
 187                         ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t));
 188                         /* LINTED: improper alignment cast */
 189                         ipha = (ipha_t *)(mp->b_rptr + offset);
 190 
 191                         if (flags & HCK_FULLCKSUM) {
 192                                 ipaddr_t src, dst;
 193                                 uint32_t cksum;
 194                                 uint16_t *up;
 195                                 uint8_t proto;
 196 
 197                                 /*
 198                                  * Pointer to checksum field in ULP header.
 199                                  */
 200                                 proto = ipha->ipha_protocol;
 201                                 ASSERT(ipha->ipha_version_and_hdr_length ==
 202                                     IP_SIMPLE_HDR_VERSION);
 203 
 204                                 switch (proto) {
 205                                 case IPPROTO_TCP:
 206                                         /* LINTED: improper alignment cast */
 207                                         up = IPH_TCPH_CHECKSUMP(ipha,
 208                                             IP_SIMPLE_HDR_LENGTH);
 209                                         break;
 210 
 211                                 case IPPROTO_UDP:
 212                                         /* LINTED: improper alignment cast */
 213                                         up = IPH_UDPH_CHECKSUMP(ipha,
 214                                             IP_SIMPLE_HDR_LENGTH);
 215                                         break;
 216 
 217                                 default:
 218                                         cmn_err(CE_WARN, "mac_fix_cksum: "
 219                                             "unexpected protocol: %d", proto);
 220                                         continue;
 221                                 }
 222 
 223                                 /*
 224                                  * Pseudo-header checksum.
 225                                  */
 226                                 src = ipha->ipha_src;
 227                                 dst = ipha->ipha_dst;
 228                                 len = ntohs(ipha->ipha_length) -
 229                                     IP_SIMPLE_HDR_LENGTH;
 230 
 231                                 cksum = (dst >> 16) + (dst & 0xFFFF) +
 232                                     (src >> 16) + (src & 0xFFFF);
 233                                 cksum += htons(len);
 234 
 235                                 /*
 236                                  * The checksum value stored in the packet needs
 237                                  * to be correct. Compute it here.
 238                                  */
 239                                 *up = 0;
 240                                 cksum += (((proto) == IPPROTO_UDP) ?
 241                                     IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
 242                                 cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
 243                                     offset, cksum);
 244                                 *(up) = (uint16_t)(cksum ? cksum : ~cksum);
 245 
 246                                 /*
 247                                  * Flag the packet so that it appears
 248                                  * that the checksum has already been
 249                                  * verified by the hardware.
 250                                  */
 251                                 flags &= ~HCK_FULLCKSUM;
 252                                 flags |= HCK_FULLCKSUM_OK;
 253                                 value = 0;
 254                         }
 255 
 256                         if (flags & HCK_IPV4_HDRCKSUM) {
 257                                 ASSERT(ipha != NULL);
 258                                 ipha->ipha_hdr_checksum =
 259                                     (uint16_t)ip_csum_hdr(ipha);
 260                                 flags &= ~HCK_IPV4_HDRCKSUM;
 261                                 flags |= HCK_IPV4_HDRCKSUM_OK;
 262 
 263                         }
 264                 }
 265 
 266                 if (flags & HCK_PARTIALCKSUM) {
 267                         uint16_t *up, partial, cksum;
 268                         uchar_t *ipp; /* ptr to beginning of IP header */
 269 
 270                         if (mp->b_cont != NULL) {
 271                                 mblk_t *mp1;
 272 
 273                                 mp1 = msgpullup(mp, offset + end);
 274                                 if (mp1 == NULL)
 275                                         continue;
 276                                 mp1->b_next = mp->b_next;
 277                                 mp->b_next = NULL;
 278                                 freemsg(mp);
 279                                 if (prev != NULL)
 280                                         prev->b_next = mp1;
 281                                 else
 282                                         new_chain = mp1;
 283                                 mp = mp1;
 284                         }
 285 
 286                         ipp = mp->b_rptr + offset;
 287                         /* LINTED: cast may result in improper alignment */
 288                         up = (uint16_t *)((uchar_t *)ipp + stuff);
 289                         partial = *up;
 290                         *up = 0;
 291 
 292                         cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start,
 293                             end - start, partial);
 294                         cksum = ~cksum;
 295                         *up = cksum ? cksum : ~cksum;
 296 
 297                         /*
 298                          * Since we already computed the whole checksum,
 299                          * indicate to the stack that it has already
 300                          * been verified by the hardware.
 301                          */
 302                         flags &= ~HCK_PARTIALCKSUM;
 303                         flags |= HCK_FULLCKSUM_OK;
 304                         value = 0;
 305                 }
 306 
 307                 (void) hcksum_assoc(mp, NULL, NULL, start, stuff, end,
 308                     value, flags, KM_NOSLEEP);
 309         }
 310 
 311         return (new_chain);
 312 }
 313 
 314 /*
 315  * Add VLAN tag to the specified mblk.
 316  */
 317 mblk_t *
 318 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
 319 {
 320         mblk_t *hmp;
 321         struct ether_vlan_header *evhp;
 322         struct ether_header *ehp;
 323         uint32_t start, stuff, end, value, flags;
 324 
 325         ASSERT(pri != 0 || vid != 0);
 326 
 327         /*
 328          * Allocate an mblk for the new tagged ethernet header,
 329          * and copy the MAC addresses and ethertype from the
 330          * original header.
 331          */
 332 
 333         hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
 334         if (hmp == NULL) {
 335                 freemsg(mp);
 336                 return (NULL);
 337         }
 338 
 339         evhp = (struct ether_vlan_header *)hmp->b_rptr;
 340         ehp = (struct ether_header *)mp->b_rptr;
 341 
 342         bcopy(ehp, evhp, (ETHERADDRL * 2));
 343         evhp->ether_type = ehp->ether_type;
 344         evhp->ether_tpid = htons(ETHERTYPE_VLAN);
 345 
 346         hmp->b_wptr += sizeof (struct ether_vlan_header);
 347         mp->b_rptr += sizeof (struct ether_header);
 348 
 349         /*
 350          * Free the original message if it's now empty. Link the
 351          * rest of messages to the header message.
 352          */
 353         hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
 354         (void) hcksum_assoc(hmp, NULL, NULL, start, stuff, end, value, flags,
 355             KM_NOSLEEP);
 356         if (MBLKL(mp) == 0) {
 357                 hmp->b_cont = mp->b_cont;
 358                 freeb(mp);
 359         } else {
 360                 hmp->b_cont = mp;
 361         }
 362         ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header));
 363 
 364         /*
 365          * Initialize the new TCI (Tag Control Information).
 366          */
 367         evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid));
 368 
 369         return (hmp);
 370 }
 371 
 372 /*
 373  * Adds a VLAN tag with the specified VID and priority to each mblk of
 374  * the specified chain.
 375  */
 376 mblk_t *
 377 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid)
 378 {
 379         mblk_t *next_mp, **prev, *mp;
 380 
 381         mp = mp_chain;
 382         prev = &mp_chain;
 383 
 384         while (mp != NULL) {
 385                 next_mp = mp->b_next;
 386                 mp->b_next = NULL;
 387                 if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) {
 388                         freemsgchain(next_mp);
 389                         break;
 390                 }
 391                 *prev = mp;
 392                 prev = &mp->b_next;
 393                 mp = mp->b_next = next_mp;
 394         }
 395 
 396         return (mp_chain);
 397 }
 398 
 399 /*
 400  * Strip VLAN tag
 401  */
 402 mblk_t *
 403 mac_strip_vlan_tag(mblk_t *mp)
 404 {
 405         mblk_t *newmp;
 406         struct ether_vlan_header *evhp;
 407 
 408         evhp = (struct ether_vlan_header *)mp->b_rptr;
 409         if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
 410                 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
 411 
 412                 if (DB_REF(mp) > 1) {
 413                         newmp = copymsg(mp);
 414                         if (newmp == NULL)
 415                                 return (NULL);
 416                         freemsg(mp);
 417                         mp = newmp;
 418                 }
 419 
 420                 evhp = (struct ether_vlan_header *)mp->b_rptr;
 421 
 422                 ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
 423                 mp->b_rptr += VLAN_TAGSZ;
 424         }
 425         return (mp);
 426 }
 427 
 428 /*
 429  * Strip VLAN tag from each mblk of the chain.
 430  */
 431 mblk_t *
 432 mac_strip_vlan_tag_chain(mblk_t *mp_chain)
 433 {
 434         mblk_t *mp, *next_mp, **prev;
 435 
 436         mp = mp_chain;
 437         prev = &mp_chain;
 438 
 439         while (mp != NULL) {
 440                 next_mp = mp->b_next;
 441                 mp->b_next = NULL;
 442                 if ((mp = mac_strip_vlan_tag(mp)) == NULL) {
 443                         freemsgchain(next_mp);
 444                         break;
 445                 }
 446                 *prev = mp;
 447                 prev = &mp->b_next;
 448                 mp = mp->b_next = next_mp;
 449         }
 450 
 451         return (mp_chain);
 452 }
 453 
 454 /*
 455  * Default callback function. Used when the datapath is not yet initialized.
 456  */
 457 /* ARGSUSED */
 458 void
 459 mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp,
 460     boolean_t loopback)
 461 {
 462         mblk_t  *mp1 = mp;
 463 
 464         while (mp1 != NULL) {
 465                 mp1->b_prev = NULL;
 466                 mp1->b_queue = NULL;
 467                 mp1 = mp1->b_next;
 468         }
 469         freemsgchain(mp);
 470 }
 471 
 472 /*
 473  * Determines the IPv6 header length accounting for all the optional IPv6
 474  * headers (hop-by-hop, destination, routing and fragment). The header length
 475  * and next header value (a transport header) is captured.
 476  *
 477  * Returns B_FALSE if all the IP headers are not in the same mblk otherwise
 478  * returns B_TRUE.
 479  */
 480 boolean_t
 481 mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length,
 482     uint8_t *next_hdr, ip6_frag_t **fragp)
 483 {
 484         uint16_t length;
 485         uint_t  ehdrlen;
 486         uint8_t *whereptr;
 487         uint8_t *nexthdrp;
 488         ip6_dest_t *desthdr;
 489         ip6_rthdr_t *rthdr;
 490         ip6_frag_t *fraghdr;
 491 
 492         if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
 493                 return (B_FALSE);
 494         ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
 495         length = IPV6_HDR_LEN;
 496         whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
 497 
 498         if (fragp != NULL)
 499                 *fragp = NULL;
 500 
 501         nexthdrp = &ip6h->ip6_nxt;
 502         while (whereptr < endptr) {
 503                 /* Is there enough left for len + nexthdr? */
 504                 if (whereptr + MIN_EHDR_LEN > endptr)
 505                         break;
 506 
 507                 switch (*nexthdrp) {
 508                 case IPPROTO_HOPOPTS:
 509                 case IPPROTO_DSTOPTS:
 510                         /* Assumes the headers are identical for hbh and dst */
 511                         desthdr = (ip6_dest_t *)whereptr;
 512                         ehdrlen = 8 * (desthdr->ip6d_len + 1);
 513                         if ((uchar_t *)desthdr +  ehdrlen > endptr)
 514                                 return (B_FALSE);
 515                         nexthdrp = &desthdr->ip6d_nxt;
 516                         break;
 517                 case IPPROTO_ROUTING:
 518                         rthdr = (ip6_rthdr_t *)whereptr;
 519                         ehdrlen =  8 * (rthdr->ip6r_len + 1);
 520                         if ((uchar_t *)rthdr +  ehdrlen > endptr)
 521                                 return (B_FALSE);
 522                         nexthdrp = &rthdr->ip6r_nxt;
 523                         break;
 524                 case IPPROTO_FRAGMENT:
 525                         fraghdr = (ip6_frag_t *)whereptr;
 526                         ehdrlen = sizeof (ip6_frag_t);
 527                         if ((uchar_t *)&fraghdr[1] > endptr)
 528                                 return (B_FALSE);
 529                         nexthdrp = &fraghdr->ip6f_nxt;
 530                         if (fragp != NULL)
 531                                 *fragp = fraghdr;
 532                         break;
 533                 case IPPROTO_NONE:
 534                         /* No next header means we're finished */
 535                 default:
 536                         *hdr_length = length;
 537                         *next_hdr = *nexthdrp;
 538                         return (B_TRUE);
 539                 }
 540                 length += ehdrlen;
 541                 whereptr += ehdrlen;
 542                 *hdr_length = length;
 543                 *next_hdr = *nexthdrp;
 544         }
 545         switch (*nexthdrp) {
 546         case IPPROTO_HOPOPTS:
 547         case IPPROTO_DSTOPTS:
 548         case IPPROTO_ROUTING:
 549         case IPPROTO_FRAGMENT:
 550                 /*
 551                  * If any know extension headers are still to be processed,
 552                  * the packet's malformed (or at least all the IP header(s) are
 553                  * not in the same mblk - and that should never happen.
 554                  */
 555                 return (B_FALSE);
 556 
 557         default:
 558                 /*
 559                  * If we get here, we know that all of the IP headers were in
 560                  * the same mblk, even if the ULP header is in the next mblk.
 561                  */
 562                 *hdr_length = length;
 563                 *next_hdr = *nexthdrp;
 564                 return (B_TRUE);
 565         }
 566 }
 567 
/*
 * The following set of routines take care of interrupt re-targeting
 * for legacy (fixed) interrupts. Some older versions of popular NICs
 * like e1000g do not support MSI-X interrupts and reserve fixed
 * interrupts for their RX/TX rings. To re-target these interrupts,
 * PCITOOL ioctls need to be used.
 *
 * mac_dladm_intr_t is the scratch state shared by those routines while
 * they look up, and possibly rebind, the interrupt of one device.
 */
typedef struct mac_dladm_intr {
	/* interrupt number reported by PCITOOL for the matched device */
	int	ino;
	/* CPU id reported by PCITOOL for that interrupt */
	int	cpu_id;
	/* path of the device's minor node; the key devices are matched on */
	char	driver_path[MAXPATHLEN];
	/* "/devices<nexus path>:intr" node the PCITOOL ioctls are sent to */
	char	nexus_path[MAXPATHLEN];
} mac_dladm_intr_t;
 581 
 582 /* Bind the interrupt to cpu_num */
 583 static int
 584 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino)
 585 {
 586         pcitool_intr_set_t      iset;
 587         int                     err;
 588 
 589         iset.old_cpu = oldcpuid;
 590         iset.ino = ino;
 591         iset.cpu_id = cpu_num;
 592         iset.user_version = PCITOOL_VERSION;
 593         err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
 594             kcred, NULL);
 595 
 596         return (err);
 597 }
 598 
 599 /*
 600  * Search interrupt information. iget is filled in with the info to search
 601  */
 602 static boolean_t
 603 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln)
 604 {
 605         int     i;
 606         char    driver_path[2 * MAXPATHLEN];
 607 
 608         for (i = 0; i < iget_p->num_devs; i++) {
 609                 (void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN);
 610                 (void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN,
 611                     ":%s%d", iget_p->dev[i].driver_name,
 612                     iget_p->dev[i].dev_inst);
 613                 /* Match the device path for the device path */
 614                 if (strcmp(driver_path, dln->driver_path) == 0) {
 615                         dln->ino = iget_p->ino;
 616                         dln->cpu_id = iget_p->cpu_id;
 617                         return (B_TRUE);
 618                 }
 619         }
 620         return (B_FALSE);
 621 }
 622 
 623 /*
 624  * Get information about ino, i.e. if this is the interrupt for our
 625  * device and where it is bound etc.
 626  */
 627 static boolean_t
 628 mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino,
 629     mac_dladm_intr_t *dln)
 630 {
 631         pcitool_intr_get_t      *iget_p;
 632         int                     ipsz;
 633         int                     nipsz;
 634         int                     err;
 635         uint8_t                 inum;
 636 
 637         /*
 638          * Check if SLEEP is OK, i.e if could come here in response to
 639          * changing the fanout due to some callback from the driver, say
 640          * link speed changes.
 641          */
 642         ipsz = PCITOOL_IGET_SIZE(0);
 643         iget_p = kmem_zalloc(ipsz, KM_SLEEP);
 644 
 645         iget_p->num_devs_ret = 0;
 646         iget_p->user_version = PCITOOL_VERSION;
 647         iget_p->cpu_id = oldcpuid;
 648         iget_p->ino = ino;
 649 
 650         err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
 651             FKIOCTL, kcred, NULL);
 652         if (err != 0) {
 653                 kmem_free(iget_p, ipsz);
 654                 return (B_FALSE);
 655         }
 656         if (iget_p->num_devs == 0) {
 657                 kmem_free(iget_p, ipsz);
 658                 return (B_FALSE);
 659         }
 660         inum = iget_p->num_devs;
 661         if (iget_p->num_devs_ret < iget_p->num_devs) {
 662                 /* Reallocate */
 663                 nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);
 664 
 665                 kmem_free(iget_p, ipsz);
 666                 ipsz = nipsz;
 667                 iget_p = kmem_zalloc(ipsz, KM_SLEEP);
 668 
 669                 iget_p->num_devs_ret = inum;
 670                 iget_p->cpu_id = oldcpuid;
 671                 iget_p->ino = ino;
 672                 iget_p->user_version = PCITOOL_VERSION;
 673                 err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
 674                     FKIOCTL, kcred, NULL);
 675                 if (err != 0) {
 676                         kmem_free(iget_p, ipsz);
 677                         return (B_FALSE);
 678                 }
 679                 /* defensive */
 680                 if (iget_p->num_devs != iget_p->num_devs_ret) {
 681                         kmem_free(iget_p, ipsz);
 682                         return (B_FALSE);
 683                 }
 684         }
 685 
 686         if (mac_search_intrinfo(iget_p, dln)) {
 687                 kmem_free(iget_p, ipsz);
 688                 return (B_TRUE);
 689         }
 690         kmem_free(iget_p, ipsz);
 691         return (B_FALSE);
 692 }
 693 
 694 /*
 695  * Get the interrupts and check each one to see if it is for our device.
 696  */
 697 static int
 698 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid)
 699 {
 700         pcitool_intr_info_t     intr_info;
 701         int                     err;
 702         int                     ino;
 703         int                     oldcpuid;
 704 
 705         err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info,
 706             FKIOCTL, kcred, NULL);
 707         if (err != 0)
 708                 return (-1);
 709 
 710         for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) {
 711                 for (ino = 0; ino < intr_info.num_intr; ino++) {
 712                         if (mac_get_single_intr(lh, oldcpuid, ino, dln)) {
 713                                 if (dln->cpu_id == cpuid)
 714                                         return (0);
 715                                 return (1);
 716                         }
 717                 }
 718         }
 719         return (-1);
 720 }
 721 
 722 /*
 723  * Obtain the nexus parent node info. for mdip.
 724  */
 725 static dev_info_t *
 726 mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
 727 {
 728         struct dev_info         *tdip = (struct dev_info *)mdip;
 729         struct ddi_minor_data   *minordata;
 730         int                     circ;
 731         dev_info_t              *pdip;
 732         char                    pathname[MAXPATHLEN];
 733 
 734         while (tdip != NULL) {
 735                 /*
 736                  * The netboot code could call this function while walking the
 737                  * device tree so we need to use ndi_devi_tryenter() here to
 738                  * avoid deadlock.
 739                  */
 740                 if (ndi_devi_tryenter((dev_info_t *)tdip, &circ) == 0)
 741                         break;
 742 
 743                 for (minordata = tdip->devi_minor; minordata != NULL;
 744                     minordata = minordata->next) {
 745                         if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL,
 746                             strlen(DDI_NT_INTRCTL)) == 0) {
 747                                 pdip = minordata->dip;
 748                                 (void) ddi_pathname(pdip, pathname);
 749                                 (void) snprintf(dln->nexus_path, MAXPATHLEN,
 750                                     "/devices%s:intr", pathname);
 751                                 (void) ddi_pathname_minor(minordata, pathname);
 752                                 ndi_devi_exit((dev_info_t *)tdip, circ);
 753                                 return (pdip);
 754                         }
 755                 }
 756                 ndi_devi_exit((dev_info_t *)tdip, circ);
 757                 tdip = tdip->devi_parent;
 758         }
 759         return (NULL);
 760 }
 761 
 762 /*
 763  * For a primary MAC client, if the user has set a list or CPUs or
 764  * we have obtained it implicitly, we try to retarget the interrupt
 765  * for that device on one of the CPUs in the list.
 766  * We assign the interrupt to the same CPU as the poll thread.
 767  */
 768 static boolean_t
 769 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid)
 770 {
 771         ldi_handle_t            lh = NULL;
 772         ldi_ident_t             li = NULL;
 773         int                     err;
 774         int                     ret;
 775         mac_dladm_intr_t        dln;
 776         dev_info_t              *dip;
 777         struct ddi_minor_data   *minordata;
 778 
 779         dln.nexus_path[0] = '\0';
 780         dln.driver_path[0] = '\0';
 781 
 782         minordata = ((struct dev_info *)mdip)->devi_minor;
 783         while (minordata != NULL) {
 784                 if (minordata->type == DDM_MINOR)
 785                         break;
 786                 minordata = minordata->next;
 787         }
 788         if (minordata == NULL)
 789                 return (B_FALSE);
 790 
 791         (void) ddi_pathname_minor(minordata, dln.driver_path);
 792 
 793         dip = mac_get_nexus_node(mdip, &dln);
 794         /* defensive */
 795         if (dip == NULL)
 796                 return (B_FALSE);
 797 
 798         err = ldi_ident_from_major(ddi_driver_major(dip), &li);
 799         if (err != 0)
 800                 return (B_FALSE);
 801 
 802         err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li);
 803         if (err != 0)
 804                 return (B_FALSE);
 805 
 806         ret = mac_validate_intr(lh, &dln, cpuid);
 807         if (ret < 0) {
 808                 (void) ldi_close(lh, FREAD|FWRITE, kcred);
 809                 return (B_FALSE);
 810         }
 811         /* cmn_note? */
 812         if (ret != 0)
 813                 if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino)))
 814                     != 0) {
 815                         (void) ldi_close(lh, FREAD|FWRITE, kcred);
 816                         return (B_FALSE);
 817                 }
 818         (void) ldi_close(lh, FREAD|FWRITE, kcred);
 819         return (B_TRUE);
 820 }
 821 
 822 void
 823 mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid)
 824 {
 825         dev_info_t              *mdip = (dev_info_t *)arg;
 826         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
 827         mac_resource_props_t    *mrp;
 828         mac_perim_handle_t      mph;
 829         flow_entry_t            *flent = mcip->mci_flent;
 830         mac_soft_ring_set_t     *rx_srs;
 831         mac_cpus_t              *srs_cpu;
 832 
 833         if (!mac_check_interrupt_binding(mdip, cpuid))
 834                 cpuid = -1;
 835         mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph);
 836         mrp = MCIP_RESOURCE_PROPS(mcip);
 837         mrp->mrp_rx_intr_cpu = cpuid;
 838         if (flent != NULL && flent->fe_rx_srs_cnt == 2) {
 839                 rx_srs = flent->fe_rx_srs[1];
 840                 srs_cpu = &rx_srs->srs_cpu;
 841                 srs_cpu->mc_rx_intr_cpu = cpuid;
 842         }
 843         mac_perim_exit(mph);
 844 }
 845 
 846 int32_t
 847 mac_client_intr_cpu(mac_client_handle_t mch)
 848 {
 849         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
 850         mac_cpus_t              *srs_cpu;
 851         mac_soft_ring_set_t     *rx_srs;
 852         flow_entry_t            *flent = mcip->mci_flent;
 853         mac_resource_props_t    *mrp = MCIP_RESOURCE_PROPS(mcip);
 854         mac_ring_t              *ring;
 855         mac_intr_t              *mintr;
 856 
 857         /*
 858          * Check if we need to retarget the interrupt. We do this only
 859          * for the primary MAC client. We do this if we have the only
 860          * exclusive ring in the group.
 861          */
 862         if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) {
 863                 rx_srs = flent->fe_rx_srs[1];
 864                 srs_cpu = &rx_srs->srs_cpu;
 865                 ring = rx_srs->srs_ring;
 866                 mintr = &ring->mr_info.mri_intr;
 867                 /*
 868                  * If ddi_handle is present or the poll CPU is
 869                  * already bound to the interrupt CPU, return -1.
 870                  */
 871                 if (mintr->mi_ddi_handle != NULL ||
 872                     ((mrp->mrp_ncpus != 0) &&
 873                     (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) {
 874                         return (-1);
 875                 }
 876                 return (srs_cpu->mc_rx_pollid);
 877         }
 878         return (-1);
 879 }
 880 
 881 void *
 882 mac_get_devinfo(mac_handle_t mh)
 883 {
 884         mac_impl_t      *mip = (mac_impl_t *)mh;
 885 
 886         return ((void *)mip->mi_dip);
 887 }
 888 
 889 #define PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1])
 890 #define PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])
 891 #define PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])
 892 
 893 uint64_t
 894 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
 895 {
 896         struct ether_header *ehp;
 897         uint64_t hash = 0;
 898         uint16_t sap;
 899         uint_t skip_len;
 900         uint8_t proto;
 901         boolean_t ip_fragmented;
 902 
 903         /*
 904          * We may want to have one of these per MAC type plugin in the
 905          * future. For now supports only ethernet.
 906          */
 907         if (media != DL_ETHER)
 908                 return (0L);
 909 
 910         /* for now we support only outbound packets */
 911         ASSERT(is_outbound);
 912         ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
 913         ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
 914 
 915         /* compute L2 hash */
 916 
 917         ehp = (struct ether_header *)mp->b_rptr;
 918 
 919         if ((policy & MAC_PKT_HASH_L2) != 0) {
 920                 uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
 921                 uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
 922                 hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
 923                 policy &= ~MAC_PKT_HASH_L2;
 924         }
 925 
 926         if (policy == 0)
 927                 goto done;
 928 
 929         /* skip ethernet header */
 930 
 931         sap = ntohs(ehp->ether_type);
 932         if (sap == ETHERTYPE_VLAN) {
 933                 struct ether_vlan_header *evhp;
 934                 mblk_t *newmp = NULL;
 935 
 936                 skip_len = sizeof (struct ether_vlan_header);
 937                 if (MBLKL(mp) < skip_len) {
 938                         /* the vlan tag is the payload, pull up first */
 939                         newmp = msgpullup(mp, -1);
 940                         if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
 941                                 goto done;
 942                         }
 943                         evhp = (struct ether_vlan_header *)newmp->b_rptr;
 944                 } else {
 945                         evhp = (struct ether_vlan_header *)mp->b_rptr;
 946                 }
 947 
 948                 sap = ntohs(evhp->ether_type);
 949                 freemsg(newmp);
 950         } else {
 951                 skip_len = sizeof (struct ether_header);
 952         }
 953 
 954         /* if ethernet header is in its own mblk, skip it */
 955         if (MBLKL(mp) <= skip_len) {
 956                 skip_len -= MBLKL(mp);
 957                 mp = mp->b_cont;
 958                 if (mp == NULL)
 959                         goto done;
 960         }
 961 
 962         sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
 963 
 964         /* compute IP src/dst addresses hash and skip IPv{4,6} header */
 965 
 966         switch (sap) {
 967         case ETHERTYPE_IP: {
 968                 ipha_t *iphp;
 969 
 970                 /*
 971                  * If the header is not aligned or the header doesn't fit
 972                  * in the mblk, bail now. Note that this may cause packets
 973                  * reordering.
 974                  */
 975                 iphp = (ipha_t *)(mp->b_rptr + skip_len);
 976                 if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
 977                     !OK_32PTR((char *)iphp))
 978                         goto done;
 979 
 980                 proto = iphp->ipha_protocol;
 981                 skip_len += IPH_HDR_LENGTH(iphp);
 982 
 983                 /* Check if the packet is fragmented. */
 984                 ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) &
 985                     IPH_OFFSET;
 986 
 987                 /*
 988                  * For fragmented packets, use addresses in addition to
 989                  * the frag_id to generate the hash inorder to get
 990                  * better distribution.
 991                  */
 992                 if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
 993                         uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
 994                         uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
 995 
 996                         hash ^= (PKT_HASH_4BYTES(ip_src) ^
 997                             PKT_HASH_4BYTES(ip_dst));
 998                         policy &= ~MAC_PKT_HASH_L3;
 999                 }
1000 
1001                 if (ip_fragmented) {
1002                         uint8_t *identp = (uint8_t *)&iphp->ipha_ident;
1003                         hash ^= PKT_HASH_2BYTES(identp);
1004                         goto done;
1005                 }
1006                 break;
1007         }
1008         case ETHERTYPE_IPV6: {
1009                 ip6_t *ip6hp;
1010                 ip6_frag_t *frag = NULL;
1011                 uint16_t hdr_length;
1012 
1013                 /*
1014                  * If the header is not aligned or the header doesn't fit
1015                  * in the mblk, bail now. Note that this may cause packets
1016                  * reordering.
1017                  */
1018 
1019                 ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
1020                 if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
1021                     !OK_32PTR((char *)ip6hp))
1022                         goto done;
1023 
1024                 if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length,
1025                     &proto, &frag))
1026                         goto done;
1027                 skip_len += hdr_length;
1028 
1029                 /*
1030                  * For fragmented packets, use addresses in addition to
1031                  * the frag_id to generate the hash inorder to get
1032                  * better distribution.
1033                  */
1034                 if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) {
1035                         uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
1036                         uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
1037 
1038                         hash ^= (PKT_HASH_4BYTES(ip_src) ^
1039                             PKT_HASH_4BYTES(ip_dst));
1040                         policy &= ~MAC_PKT_HASH_L3;
1041                 }
1042 
1043                 if (frag != NULL) {
1044                         uint8_t *identp = (uint8_t *)&frag->ip6f_ident;
1045                         hash ^= PKT_HASH_4BYTES(identp);
1046                         goto done;
1047                 }
1048                 break;
1049         }
1050         default:
1051                 goto done;
1052         }
1053 
1054         if (policy == 0)
1055                 goto done;
1056 
1057         /* if ip header is in its own mblk, skip it */
1058         if (MBLKL(mp) <= skip_len) {
1059                 skip_len -= MBLKL(mp);
1060                 mp = mp->b_cont;
1061                 if (mp == NULL)
1062                         goto done;
1063         }
1064 
1065         /* parse ULP header */
1066 again:
1067         switch (proto) {
1068         case IPPROTO_TCP:
1069         case IPPROTO_UDP:
1070         case IPPROTO_ESP:
1071         case IPPROTO_SCTP:
1072                 /*
1073                  * These Internet Protocols are intentionally designed
1074                  * for hashing from the git-go.  Port numbers are in the first
1075                  * word for transports, SPI is first for ESP.
1076                  */
1077                 if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
1078                         goto done;
1079                 hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
1080                 break;
1081 
1082         case IPPROTO_AH: {
1083                 ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
1084                 uint_t ah_length = AH_TOTAL_LEN(ah);
1085 
1086                 if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
1087                         goto done;
1088 
1089                 proto = ah->ah_nexthdr;
1090                 skip_len += ah_length;
1091 
1092                 /* if AH header is in its own mblk, skip it */
1093                 if (MBLKL(mp) <= skip_len) {
1094                         skip_len -= MBLKL(mp);
1095                         mp = mp->b_cont;
1096                         if (mp == NULL)
1097                                 goto done;
1098                 }
1099 
1100                 goto again;
1101         }
1102         }
1103 
1104 done:
1105         return (hash);
1106 }