1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2019 Joyent, Inc.
24 */
25
26 /*
27 * MAC Services Module - misc utilities
28 */
29
30 #include <sys/types.h>
31 #include <sys/mac.h>
32 #include <sys/mac_impl.h>
33 #include <sys/mac_client_priv.h>
34 #include <sys/mac_client_impl.h>
35 #include <sys/mac_soft_ring.h>
36 #include <sys/strsubr.h>
37 #include <sys/strsun.h>
38 #include <sys/vlan.h>
39 #include <sys/pattr.h>
40 #include <sys/pci_tools.h>
41 #include <inet/ip.h>
42 #include <inet/ip_impl.h>
43 #include <inet/ip6.h>
44 #include <sys/vtrace.h>
45 #include <sys/dlpi.h>
46 #include <sys/sunndi.h>
47 #include <inet/ipsec_impl.h>
48 #include <inet/sadb.h>
49 #include <inet/ipsecesp.h>
50 #include <inet/ipsecah.h>
51
52 /*
53 * Copy an mblk, preserving its hardware checksum flags.
54 */
55 static mblk_t *
56 mac_copymsg_cksum(mblk_t *mp)
57 {
58 mblk_t *mp1;
59 uint32_t start, stuff, end, value, flags;
60
61 mp1 = copymsg(mp);
62 if (mp1 == NULL)
63 return (NULL);
64
65 hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
66 (void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value,
67 flags, KM_NOSLEEP);
68
69 return (mp1);
70 }
71
72 /*
73 * Copy an mblk chain, presenting the hardware checksum flags of the
74 * individual mblks.
75 */
76 mblk_t *
77 mac_copymsgchain_cksum(mblk_t *mp)
78 {
79 mblk_t *nmp = NULL;
80 mblk_t **nmpp = &nmp;
81
82 for (; mp != NULL; mp = mp->b_next) {
83 if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
84 freemsgchain(nmp);
85 return (NULL);
86 }
87
88 nmpp = &((*nmpp)->b_next);
89 }
90
91 return (nmp);
92 }
93
94 /*
95 * Process the specified mblk chain for proper handling of hardware
96 * checksum offload. This routine is invoked for loopback traffic
97 * between MAC clients.
98 * The function handles a NULL mblk chain passed as argument.
99 */
100 mblk_t *
101 mac_fix_cksum(mblk_t *mp_chain)
102 {
103 mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1;
104 uint32_t flags, start, stuff, end, value;
105
106 for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) {
107 uint16_t len;
108 uint32_t offset;
109 struct ether_header *ehp;
110 uint16_t sap;
111
112 hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value,
113 &flags);
114 if (flags == 0)
115 continue;
116
117 /*
118 * Since the processing of checksum offload for loopback
119 * traffic requires modification of the packet contents,
120 * ensure sure that we are always modifying our own copy.
121 */
122 if (DB_REF(mp) > 1) {
123 mp1 = copymsg(mp);
124 if (mp1 == NULL)
125 continue;
126 mp1->b_next = mp->b_next;
127 mp->b_next = NULL;
128 freemsg(mp);
129 if (prev != NULL)
130 prev->b_next = mp1;
131 else
132 new_chain = mp1;
133 mp = mp1;
134 }
135
136 /*
137 * Ethernet, and optionally VLAN header.
138 */
139 /* LINTED: improper alignment cast */
140 ehp = (struct ether_header *)mp->b_rptr;
141 if (ntohs(ehp->ether_type) == VLAN_TPID) {
142 struct ether_vlan_header *evhp;
143
144 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
145 /* LINTED: improper alignment cast */
146 evhp = (struct ether_vlan_header *)mp->b_rptr;
147 sap = ntohs(evhp->ether_type);
148 offset = sizeof (struct ether_vlan_header);
149 } else {
150 sap = ntohs(ehp->ether_type);
151 offset = sizeof (struct ether_header);
152 }
153
154 if (MBLKL(mp) <= offset) {
155 offset -= MBLKL(mp);
156 if (mp->b_cont == NULL) {
157 /* corrupted packet, skip it */
158 if (prev != NULL)
159 prev->b_next = mp->b_next;
160 else
161 new_chain = mp->b_next;
162 mp1 = mp->b_next;
163 mp->b_next = NULL;
164 freemsg(mp);
165 mp = mp1;
166 continue;
167 }
168 mp = mp->b_cont;
169 }
170
171 if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
172 ipha_t *ipha = NULL;
173
174 /*
175 * In order to compute the full and header
176 * checksums, we need to find and parse
177 * the IP and/or ULP headers.
178 */
179
180 sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
181
182 /*
183 * IP header.
184 */
185 if (sap != ETHERTYPE_IP)
186 continue;
187
188 ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t));
189 /* LINTED: improper alignment cast */
190 ipha = (ipha_t *)(mp->b_rptr + offset);
191
192 if (flags & HCK_FULLCKSUM) {
193 ipaddr_t src, dst;
194 uint32_t cksum;
195 uint16_t *up;
196 uint8_t proto;
197
198 /*
199 * Pointer to checksum field in ULP header.
200 */
201 proto = ipha->ipha_protocol;
202 ASSERT(ipha->ipha_version_and_hdr_length ==
203 IP_SIMPLE_HDR_VERSION);
204
205 switch (proto) {
206 case IPPROTO_TCP:
207 /* LINTED: improper alignment cast */
208 up = IPH_TCPH_CHECKSUMP(ipha,
209 IP_SIMPLE_HDR_LENGTH);
210 break;
211
212 case IPPROTO_UDP:
213 /* LINTED: improper alignment cast */
214 up = IPH_UDPH_CHECKSUMP(ipha,
215 IP_SIMPLE_HDR_LENGTH);
216 break;
217
218 default:
219 cmn_err(CE_WARN, "mac_fix_cksum: "
220 "unexpected protocol: %d", proto);
221 continue;
222 }
223
224 /*
225 * Pseudo-header checksum.
226 */
227 src = ipha->ipha_src;
228 dst = ipha->ipha_dst;
229 len = ntohs(ipha->ipha_length) -
230 IP_SIMPLE_HDR_LENGTH;
231
232 cksum = (dst >> 16) + (dst & 0xFFFF) +
233 (src >> 16) + (src & 0xFFFF);
234 cksum += htons(len);
235
236 /*
237 * The checksum value stored in the packet needs
238 * to be correct. Compute it here.
239 */
240 *up = 0;
241 cksum += (((proto) == IPPROTO_UDP) ?
242 IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
243 cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
244 offset, cksum);
245 *(up) = (uint16_t)(cksum ? cksum : ~cksum);
246
247 /*
248 * Flag the packet so that it appears
249 * that the checksum has already been
250 * verified by the hardware.
251 */
252 flags &= ~HCK_FULLCKSUM;
253 flags |= HCK_FULLCKSUM_OK;
254 value = 0;
255 }
256
257 if (flags & HCK_IPV4_HDRCKSUM) {
258 ASSERT(ipha != NULL);
259 ipha->ipha_hdr_checksum =
260 (uint16_t)ip_csum_hdr(ipha);
261 flags &= ~HCK_IPV4_HDRCKSUM;
262 flags |= HCK_IPV4_HDRCKSUM_OK;
263
264 }
265 }
266
267 if (flags & HCK_PARTIALCKSUM) {
268 uint16_t *up, partial, cksum;
269 uchar_t *ipp; /* ptr to beginning of IP header */
270
271 if (mp->b_cont != NULL) {
272 mblk_t *mp1;
273
274 mp1 = msgpullup(mp, offset + end);
275 if (mp1 == NULL)
276 continue;
277 mp1->b_next = mp->b_next;
278 mp->b_next = NULL;
279 freemsg(mp);
280 if (prev != NULL)
281 prev->b_next = mp1;
282 else
283 new_chain = mp1;
284 mp = mp1;
285 }
286
287 ipp = mp->b_rptr + offset;
288 /* LINTED: cast may result in improper alignment */
289 up = (uint16_t *)((uchar_t *)ipp + stuff);
290 partial = *up;
291 *up = 0;
292
293 cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start,
294 end - start, partial);
295 cksum = ~cksum;
296 *up = cksum ? cksum : ~cksum;
297
298 /*
299 * Since we already computed the whole checksum,
300 * indicate to the stack that it has already
301 * been verified by the hardware.
302 */
303 flags &= ~HCK_PARTIALCKSUM;
304 flags |= HCK_FULLCKSUM_OK;
305 value = 0;
306 }
307
308 (void) hcksum_assoc(mp, NULL, NULL, start, stuff, end,
309 value, flags, KM_NOSLEEP);
310 }
311
312 return (new_chain);
313 }
314
315 /*
316 * Add VLAN tag to the specified mblk.
317 */
318 mblk_t *
319 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
320 {
321 mblk_t *hmp;
322 struct ether_vlan_header *evhp;
323 struct ether_header *ehp;
324 uint32_t start, stuff, end, value, flags;
325
326 ASSERT(pri != 0 || vid != 0);
327
328 /*
329 * Allocate an mblk for the new tagged ethernet header,
330 * and copy the MAC addresses and ethertype from the
331 * original header.
332 */
333
334 hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
335 if (hmp == NULL) {
336 freemsg(mp);
337 return (NULL);
338 }
339
340 evhp = (struct ether_vlan_header *)hmp->b_rptr;
341 ehp = (struct ether_header *)mp->b_rptr;
342
343 bcopy(ehp, evhp, (ETHERADDRL * 2));
344 evhp->ether_type = ehp->ether_type;
345 evhp->ether_tpid = htons(ETHERTYPE_VLAN);
346
347 hmp->b_wptr += sizeof (struct ether_vlan_header);
348 mp->b_rptr += sizeof (struct ether_header);
349
350 /*
351 * Free the original message if it's now empty. Link the
352 * rest of messages to the header message.
353 */
354 hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
355 (void) hcksum_assoc(hmp, NULL, NULL, start, stuff, end, value, flags,
356 KM_NOSLEEP);
357 if (MBLKL(mp) == 0) {
358 hmp->b_cont = mp->b_cont;
359 freeb(mp);
360 } else {
361 hmp->b_cont = mp;
362 }
363 ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header));
364
365 /*
366 * Initialize the new TCI (Tag Control Information).
367 */
368 evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid));
369
370 return (hmp);
371 }
372
373 /*
374 * Adds a VLAN tag with the specified VID and priority to each mblk of
375 * the specified chain.
376 */
377 mblk_t *
378 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid)
379 {
380 mblk_t *next_mp, **prev, *mp;
381
382 mp = mp_chain;
383 prev = &mp_chain;
384
385 while (mp != NULL) {
386 next_mp = mp->b_next;
387 mp->b_next = NULL;
388 if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) {
389 freemsgchain(next_mp);
390 break;
391 }
392 *prev = mp;
393 prev = &mp->b_next;
394 mp = mp->b_next = next_mp;
395 }
396
397 return (mp_chain);
398 }
399
400 /*
401 * Strip VLAN tag
402 */
403 mblk_t *
404 mac_strip_vlan_tag(mblk_t *mp)
405 {
406 mblk_t *newmp;
407 struct ether_vlan_header *evhp;
408
409 evhp = (struct ether_vlan_header *)mp->b_rptr;
410 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
411 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
412
413 if (DB_REF(mp) > 1) {
414 newmp = copymsg(mp);
415 if (newmp == NULL)
416 return (NULL);
417 freemsg(mp);
418 mp = newmp;
419 }
420
421 evhp = (struct ether_vlan_header *)mp->b_rptr;
422
423 ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
424 mp->b_rptr += VLAN_TAGSZ;
425 }
426 return (mp);
427 }
428
429 /*
430 * Strip VLAN tag from each mblk of the chain.
431 */
432 mblk_t *
433 mac_strip_vlan_tag_chain(mblk_t *mp_chain)
434 {
435 mblk_t *mp, *next_mp, **prev;
436
437 mp = mp_chain;
438 prev = &mp_chain;
439
440 while (mp != NULL) {
441 next_mp = mp->b_next;
442 mp->b_next = NULL;
443 if ((mp = mac_strip_vlan_tag(mp)) == NULL) {
444 freemsgchain(next_mp);
445 break;
446 }
447 *prev = mp;
448 prev = &mp->b_next;
449 mp = mp->b_next = next_mp;
450 }
451
452 return (mp_chain);
453 }
454
455 /*
456 * Default callback function. Used when the datapath is not yet initialized.
457 */
458 /* ARGSUSED */
459 void
460 mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp,
461 boolean_t loopback)
462 {
463 mblk_t *mp1 = mp;
464
465 while (mp1 != NULL) {
466 mp1->b_prev = NULL;
467 mp1->b_queue = NULL;
468 mp1 = mp1->b_next;
469 }
470 freemsgchain(mp);
471 }
472
/*
 * Determines the IPv6 header length accounting for all the optional IPv6
 * headers (hop-by-hop, destination, routing and fragment). The header length
 * and next header value (a transport header) are captured.
 *
 * ip6h points at the fixed IPv6 header and endptr at the first byte past
 * the data that may be examined.
 *
 * On success 0 is returned, *hdr_length holds the combined length of the
 * fixed header and the extension headers that were walked, and *next_hdr
 * holds the protocol that follows them.  If fragp is not NULL, *fragp is
 * set to the fragment header when one is encountered, and to NULL
 * otherwise.
 *
 * Returns ENOSPC if the fixed header or an extension header extends past
 * endptr, or if a known extension header remains unparsed when the data
 * runs out.  Returns EINVAL if the IP version is not IPv6.  Note that
 * *hdr_length and *next_hdr may already have been updated when ENOSPC is
 * returned part way through the walk.
 */
int
mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length,
    uint8_t *next_hdr, ip6_frag_t **fragp)
{
	uint16_t length;
	uint_t ehdrlen;
	uint8_t *whereptr;
	uint8_t *nexthdrp;
	ip6_dest_t *desthdr;
	ip6_rthdr_t *rthdr;
	ip6_frag_t *fraghdr;

	if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
		return (ENOSPC);
	/*
	 * Return EINVAL, which mac_protect callers treat explicitly as "let
	 * pass", flow callers treat as "not in a flow", and the rest treat
	 * as "don't do special processing".
	 */
	if (IPH_HDR_VERSION(ip6h) != IPV6_VERSION)
		return (EINVAL);
	length = IPV6_HDR_LEN;
	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */

	if (fragp != NULL)
		*fragp = NULL;

	/*
	 * nexthdrp always points at the "next header" field of the header
	 * most recently walked; whereptr points at the header it names.
	 */
	nexthdrp = &ip6h->ip6_nxt;
	while (whereptr < endptr) {
		/* Is there enough left for len + nexthdr? */
		if (whereptr + MIN_EHDR_LEN > endptr)
			break;

		switch (*nexthdrp) {
		case IPPROTO_HOPOPTS:
		case IPPROTO_DSTOPTS:
			/* Assumes the headers are identical for hbh and dst */
			desthdr = (ip6_dest_t *)whereptr;
			/* The length field counts 8-byte units past the first. */
			ehdrlen = 8 * (desthdr->ip6d_len + 1);
			if ((uchar_t *)desthdr + ehdrlen > endptr)
				return (ENOSPC);
			nexthdrp = &desthdr->ip6d_nxt;
			break;
		case IPPROTO_ROUTING:
			rthdr = (ip6_rthdr_t *)whereptr;
			ehdrlen =  8 * (rthdr->ip6r_len + 1);
			if ((uchar_t *)rthdr + ehdrlen > endptr)
				return (ENOSPC);
			nexthdrp = &rthdr->ip6r_nxt;
			break;
		case IPPROTO_FRAGMENT:
			fraghdr = (ip6_frag_t *)whereptr;
			ehdrlen = sizeof (ip6_frag_t);
			if ((uchar_t *)&fraghdr[1] > endptr)
				return (ENOSPC);
			nexthdrp = &fraghdr->ip6f_nxt;
			if (fragp != NULL)
				*fragp = fraghdr;
			break;
		case IPPROTO_NONE:
			/* No next header means we're finished */
		default:
			/* Anything else is not an extension header we walk. */
			*hdr_length = length;
			*next_hdr = *nexthdrp;
			return (0);
		}
		length += ehdrlen;
		whereptr += ehdrlen;
		*hdr_length = length;
		*next_hdr = *nexthdrp;
	}
	/* Ran out of data; decide based on what was supposed to come next. */
	switch (*nexthdrp) {
	case IPPROTO_HOPOPTS:
	case IPPROTO_DSTOPTS:
	case IPPROTO_ROUTING:
	case IPPROTO_FRAGMENT:
		/*
		 * If any known extension headers are still to be processed,
		 * the packet is malformed (or at least all the IP headers are
		 * not in the same mblk - and that should never happen).
		 *
		 * Return ENOSPC because it MAY be spread across mblks, and
		 * the rest of MAC or IPv6 itself can cope.
		 */
		return (ENOSPC);

	default:
		/*
		 * If we get here, we know that all of the IP headers were in
		 * the same mblk, even if the ULP header is in the next mblk.
		 */
		*hdr_length = length;
		*next_hdr = *nexthdrp;
		return (0);
	}
}
577
/*
 * The following set of routines are there to take care of interrupt
 * re-targeting for legacy (fixed) interrupts. Some older versions
 * of the popular NICs like e1000g do not support MSI-X interrupts
 * and they reserve fixed interrupts for RX/TX rings. To re-target
 * these interrupts, PCITOOL ioctls need to be used.
 *
 * mac_dladm_intr_t carries the state shared by those routines while the
 * interrupt of one device is looked up and re-targeted.
 */
typedef struct mac_dladm_intr {
	/* interrupt number, copied from the matching PCITOOL query result */
	int	ino;
	/* CPU reported for that interrupt by the same query result */
	int	cpu_id;
	/* minor-node path of the driver, compared against "<path>:<drv><inst>" */
	char	driver_path[MAXPATHLEN];
	/* "/devices<nexus>:intr" path opened to issue the PCITOOL ioctls */
	char	nexus_path[MAXPATHLEN];
} mac_dladm_intr_t;
591
592 /* Bind the interrupt to cpu_num */
593 static int
594 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino)
595 {
596 pcitool_intr_set_t iset;
597 int err;
598
599 iset.old_cpu = oldcpuid;
600 iset.ino = ino;
601 iset.cpu_id = cpu_num;
602 iset.user_version = PCITOOL_VERSION;
603 err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
604 kcred, NULL);
605
606 return (err);
607 }
608
609 /*
610 * Search interrupt information. iget is filled in with the info to search
611 */
612 static boolean_t
613 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln)
614 {
615 int i;
616 char driver_path[2 * MAXPATHLEN];
617
618 for (i = 0; i < iget_p->num_devs; i++) {
619 (void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN);
620 (void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN,
621 ":%s%d", iget_p->dev[i].driver_name,
622 iget_p->dev[i].dev_inst);
623 /* Match the device path for the device path */
624 if (strcmp(driver_path, dln->driver_path) == 0) {
625 dln->ino = iget_p->ino;
626 dln->cpu_id = iget_p->cpu_id;
627 return (B_TRUE);
628 }
629 }
630 return (B_FALSE);
631 }
632
633 /*
634 * Get information about ino, i.e. if this is the interrupt for our
635 * device and where it is bound etc.
636 */
637 static boolean_t
638 mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino,
639 mac_dladm_intr_t *dln)
640 {
641 pcitool_intr_get_t *iget_p;
642 int ipsz;
643 int nipsz;
644 int err;
645 uint8_t inum;
646
647 /*
648 * Check if SLEEP is OK, i.e if could come here in response to
649 * changing the fanout due to some callback from the driver, say
650 * link speed changes.
651 */
652 ipsz = PCITOOL_IGET_SIZE(0);
653 iget_p = kmem_zalloc(ipsz, KM_SLEEP);
654
655 iget_p->num_devs_ret = 0;
656 iget_p->user_version = PCITOOL_VERSION;
657 iget_p->cpu_id = oldcpuid;
658 iget_p->ino = ino;
659
660 err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
661 FKIOCTL, kcred, NULL);
662 if (err != 0) {
663 kmem_free(iget_p, ipsz);
664 return (B_FALSE);
665 }
666 if (iget_p->num_devs == 0) {
667 kmem_free(iget_p, ipsz);
668 return (B_FALSE);
669 }
670 inum = iget_p->num_devs;
671 if (iget_p->num_devs_ret < iget_p->num_devs) {
672 /* Reallocate */
673 nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);
674
675 kmem_free(iget_p, ipsz);
676 ipsz = nipsz;
677 iget_p = kmem_zalloc(ipsz, KM_SLEEP);
678
679 iget_p->num_devs_ret = inum;
680 iget_p->cpu_id = oldcpuid;
681 iget_p->ino = ino;
682 iget_p->user_version = PCITOOL_VERSION;
683 err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
684 FKIOCTL, kcred, NULL);
685 if (err != 0) {
686 kmem_free(iget_p, ipsz);
687 return (B_FALSE);
688 }
689 /* defensive */
690 if (iget_p->num_devs != iget_p->num_devs_ret) {
691 kmem_free(iget_p, ipsz);
692 return (B_FALSE);
693 }
694 }
695
696 if (mac_search_intrinfo(iget_p, dln)) {
697 kmem_free(iget_p, ipsz);
698 return (B_TRUE);
699 }
700 kmem_free(iget_p, ipsz);
701 return (B_FALSE);
702 }
703
704 /*
705 * Get the interrupts and check each one to see if it is for our device.
706 */
707 static int
708 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid)
709 {
710 pcitool_intr_info_t intr_info;
711 int err;
712 int ino;
713 int oldcpuid;
714
715 err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info,
716 FKIOCTL, kcred, NULL);
717 if (err != 0)
718 return (-1);
719
720 for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) {
721 for (ino = 0; ino < intr_info.num_intr; ino++) {
722 if (mac_get_single_intr(lh, oldcpuid, ino, dln)) {
723 if (dln->cpu_id == cpuid)
724 return (0);
725 return (1);
726 }
727 }
728 }
729 return (-1);
730 }
731
732 /*
733 * Obtain the nexus parent node info. for mdip.
734 */
735 static dev_info_t *
736 mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
737 {
738 struct dev_info *tdip = (struct dev_info *)mdip;
739 struct ddi_minor_data *minordata;
740 int circ;
741 dev_info_t *pdip;
742 char pathname[MAXPATHLEN];
743
744 while (tdip != NULL) {
745 /*
746 * The netboot code could call this function while walking the
747 * device tree so we need to use ndi_devi_tryenter() here to
748 * avoid deadlock.
749 */
750 if (ndi_devi_tryenter((dev_info_t *)tdip, &circ) == 0)
751 break;
752
753 for (minordata = tdip->devi_minor; minordata != NULL;
754 minordata = minordata->next) {
755 if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL,
756 strlen(DDI_NT_INTRCTL)) == 0) {
757 pdip = minordata->dip;
758 (void) ddi_pathname(pdip, pathname);
759 (void) snprintf(dln->nexus_path, MAXPATHLEN,
760 "/devices%s:intr", pathname);
761 (void) ddi_pathname_minor(minordata, pathname);
762 ndi_devi_exit((dev_info_t *)tdip, circ);
763 return (pdip);
764 }
765 }
766 ndi_devi_exit((dev_info_t *)tdip, circ);
767 tdip = tdip->devi_parent;
768 }
769 return (NULL);
770 }
771
772 /*
773 * For a primary MAC client, if the user has set a list or CPUs or
774 * we have obtained it implicitly, we try to retarget the interrupt
775 * for that device on one of the CPUs in the list.
776 * We assign the interrupt to the same CPU as the poll thread.
777 */
778 static boolean_t
779 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid)
780 {
781 ldi_handle_t lh = NULL;
782 ldi_ident_t li = NULL;
783 int err;
784 int ret;
785 mac_dladm_intr_t dln;
786 dev_info_t *dip;
787 struct ddi_minor_data *minordata;
788
789 dln.nexus_path[0] = '\0';
790 dln.driver_path[0] = '\0';
791
792 minordata = ((struct dev_info *)mdip)->devi_minor;
793 while (minordata != NULL) {
794 if (minordata->type == DDM_MINOR)
795 break;
796 minordata = minordata->next;
797 }
798 if (minordata == NULL)
799 return (B_FALSE);
800
801 (void) ddi_pathname_minor(minordata, dln.driver_path);
802
803 dip = mac_get_nexus_node(mdip, &dln);
804 /* defensive */
805 if (dip == NULL)
806 return (B_FALSE);
807
808 err = ldi_ident_from_major(ddi_driver_major(dip), &li);
809 if (err != 0)
810 return (B_FALSE);
811
812 err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li);
813 if (err != 0)
814 return (B_FALSE);
815
816 ret = mac_validate_intr(lh, &dln, cpuid);
817 if (ret < 0) {
818 (void) ldi_close(lh, FREAD|FWRITE, kcred);
819 return (B_FALSE);
820 }
821 /* cmn_note? */
822 if (ret != 0)
823 if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino)))
824 != 0) {
825 (void) ldi_close(lh, FREAD|FWRITE, kcred);
826 return (B_FALSE);
827 }
828 (void) ldi_close(lh, FREAD|FWRITE, kcred);
829 return (B_TRUE);
830 }
831
832 void
833 mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid)
834 {
835 dev_info_t *mdip = (dev_info_t *)arg;
836 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
837 mac_resource_props_t *mrp;
838 mac_perim_handle_t mph;
839 flow_entry_t *flent = mcip->mci_flent;
840 mac_soft_ring_set_t *rx_srs;
841 mac_cpus_t *srs_cpu;
842
843 if (!mac_check_interrupt_binding(mdip, cpuid))
844 cpuid = -1;
845 mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph);
846 mrp = MCIP_RESOURCE_PROPS(mcip);
847 mrp->mrp_rx_intr_cpu = cpuid;
848 if (flent != NULL && flent->fe_rx_srs_cnt == 2) {
849 rx_srs = flent->fe_rx_srs[1];
850 srs_cpu = &rx_srs->srs_cpu;
851 srs_cpu->mc_rx_intr_cpu = cpuid;
852 }
853 mac_perim_exit(mph);
854 }
855
856 int32_t
857 mac_client_intr_cpu(mac_client_handle_t mch)
858 {
859 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
860 mac_cpus_t *srs_cpu;
861 mac_soft_ring_set_t *rx_srs;
862 flow_entry_t *flent = mcip->mci_flent;
863 mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip);
864 mac_ring_t *ring;
865 mac_intr_t *mintr;
866
867 /*
868 * Check if we need to retarget the interrupt. We do this only
869 * for the primary MAC client. We do this if we have the only
870 * exclusive ring in the group.
871 */
872 if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) {
873 rx_srs = flent->fe_rx_srs[1];
874 srs_cpu = &rx_srs->srs_cpu;
875 ring = rx_srs->srs_ring;
876 mintr = &ring->mr_info.mri_intr;
877 /*
878 * If ddi_handle is present or the poll CPU is
879 * already bound to the interrupt CPU, return -1.
880 */
881 if (mintr->mi_ddi_handle != NULL ||
882 ((mrp->mrp_ncpus != 0) &&
883 (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) {
884 return (-1);
885 }
886 return (srs_cpu->mc_rx_pollid);
887 }
888 return (-1);
889 }
890
891 void *
892 mac_get_devinfo(mac_handle_t mh)
893 {
894 mac_impl_t *mip = (mac_impl_t *)mh;
895
896 return ((void *)mip->mi_dip);
897 }
898
899 #define PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1])
900 #define PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])
901 #define PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])
902
903 uint64_t
904 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
905 {
906 struct ether_header *ehp;
907 uint64_t hash = 0;
908 uint16_t sap;
909 uint_t skip_len;
910 uint8_t proto;
911 boolean_t ip_fragmented;
912
913 /*
914 * We may want to have one of these per MAC type plugin in the
915 * future. For now supports only ethernet.
916 */
917 if (media != DL_ETHER)
918 return (0L);
919
920 /* for now we support only outbound packets */
921 ASSERT(is_outbound);
922 ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
923 ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
924
925 /* compute L2 hash */
926
927 ehp = (struct ether_header *)mp->b_rptr;
928
929 if ((policy & MAC_PKT_HASH_L2) != 0) {
930 uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
931 uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
932 hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
933 policy &= ~MAC_PKT_HASH_L2;
934 }
935
936 if (policy == 0)
937 goto done;
938
939 /* skip ethernet header */
940
941 sap = ntohs(ehp->ether_type);
942 if (sap == ETHERTYPE_VLAN) {
943 struct ether_vlan_header *evhp;
944 mblk_t *newmp = NULL;
945
946 skip_len = sizeof (struct ether_vlan_header);
947 if (MBLKL(mp) < skip_len) {
948 /* the vlan tag is the payload, pull up first */
949 newmp = msgpullup(mp, -1);
950 if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
951 goto done;
952 }
953 evhp = (struct ether_vlan_header *)newmp->b_rptr;
954 } else {
955 evhp = (struct ether_vlan_header *)mp->b_rptr;
956 }
957
958 sap = ntohs(evhp->ether_type);
959 freemsg(newmp);
960 } else {
961 skip_len = sizeof (struct ether_header);
962 }
963
964 /* if ethernet header is in its own mblk, skip it */
965 if (MBLKL(mp) <= skip_len) {
966 skip_len -= MBLKL(mp);
967 mp = mp->b_cont;
968 if (mp == NULL)
969 goto done;
970 }
971
972 sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
973
974 /* compute IP src/dst addresses hash and skip IPv{4,6} header */
975
976 switch (sap) {
977 case ETHERTYPE_IP: {
978 ipha_t *iphp;
979
980 /*
981 * If the header is not aligned, the header doesn't fit in the
982 * mblk, OR we have a bad IP version, bail now. Note that this
983 * may cause packets reordering.
984 */
985 iphp = (ipha_t *)(mp->b_rptr + skip_len);
986 if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
987 !OK_32PTR((char *)iphp) ||
988 IPH_HDR_VERSION(iphp) != IPV4_VERSION)
989 goto done;
990
991 proto = iphp->ipha_protocol;
992 skip_len += IPH_HDR_LENGTH(iphp);
993
994 /* Check if the packet is fragmented. */
995 ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) &
996 IPH_OFFSET;
997
998 /*
999 * For fragmented packets, use addresses in addition to
1000 * the frag_id to generate the hash inorder to get
1001 * better distribution.
1002 */
1003 if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
1004 uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
1005 uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
1006
1007 hash ^= (PKT_HASH_4BYTES(ip_src) ^
1008 PKT_HASH_4BYTES(ip_dst));
1009 policy &= ~MAC_PKT_HASH_L3;
1010 }
1011
1012 if (ip_fragmented) {
1013 uint8_t *identp = (uint8_t *)&iphp->ipha_ident;
1014 hash ^= PKT_HASH_2BYTES(identp);
1015 goto done;
1016 }
1017 break;
1018 }
1019 case ETHERTYPE_IPV6: {
1020 ip6_t *ip6hp;
1021 ip6_frag_t *frag = NULL;
1022 uint16_t hdr_length;
1023
1024 /*
1025 * If the header is not aligned or the header doesn't fit
1026 * in the mblk, bail now. Note that this may cause packets
1027 * reordering.
1028 */
1029
1030 ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
1031 if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
1032 !OK_32PTR((char *)ip6hp))
1033 goto done;
1034
1035 /* Also bail, regardless of why, if the function below fails. */
1036 if (mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length,
1037 &proto, &frag) != 0)
1038 goto done;
1039 skip_len += hdr_length;
1040
1041 /*
1042 * For fragmented packets, use addresses in addition to
1043 * the frag_id to generate the hash inorder to get
1044 * better distribution.
1045 */
1046 if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) {
1047 uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
1048 uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
1049
1050 hash ^= (PKT_HASH_4BYTES(ip_src) ^
1051 PKT_HASH_4BYTES(ip_dst));
1052 policy &= ~MAC_PKT_HASH_L3;
1053 }
1054
1055 if (frag != NULL) {
1056 uint8_t *identp = (uint8_t *)&frag->ip6f_ident;
1057 hash ^= PKT_HASH_4BYTES(identp);
1058 goto done;
1059 }
1060 break;
1061 }
1062 default:
1063 goto done;
1064 }
1065
1066 if (policy == 0)
1067 goto done;
1068
1069 /* if ip header is in its own mblk, skip it */
1070 if (MBLKL(mp) <= skip_len) {
1071 skip_len -= MBLKL(mp);
1072 mp = mp->b_cont;
1073 if (mp == NULL)
1074 goto done;
1075 }
1076
1077 /* parse ULP header */
1078 again:
1079 switch (proto) {
1080 case IPPROTO_TCP:
1081 case IPPROTO_UDP:
1082 case IPPROTO_ESP:
1083 case IPPROTO_SCTP:
1084 /*
1085 * These Internet Protocols are intentionally designed
1086 * for hashing from the git-go. Port numbers are in the first
1087 * word for transports, SPI is first for ESP.
1088 */
1089 if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
1090 goto done;
1091 hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
1092 break;
1093
1094 case IPPROTO_AH: {
1095 ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
1096 uint_t ah_length = AH_TOTAL_LEN(ah);
1097
1098 if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
1099 goto done;
1100
1101 proto = ah->ah_nexthdr;
1102 skip_len += ah_length;
1103
1104 /* if AH header is in its own mblk, skip it */
1105 if (MBLKL(mp) <= skip_len) {
1106 skip_len -= MBLKL(mp);
1107 mp = mp->b_cont;
1108 if (mp == NULL)
1109 goto done;
1110 }
1111
1112 goto again;
1113 }
1114 }
1115
1116 done:
1117 return (hash);
1118 }