9832 Original bug discovered as 9560 has friends IPv4 packets coming in as IPv6 creating chaos
Reviewed by: Robert Mustacchi <rm@joyent.com>
--- old/usr/src/uts/common/io/mac/mac_util.c
+++ new/usr/src/uts/common/io/mac/mac_util.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23 + * Copyright 2019 Joyent, Inc.
23 24 */
24 25
25 26 /*
26 27 * MAC Services Module - misc utilities
27 28 */
28 29
29 30 #include <sys/types.h>
30 31 #include <sys/mac.h>
31 32 #include <sys/mac_impl.h>
32 33 #include <sys/mac_client_priv.h>
33 34 #include <sys/mac_client_impl.h>
34 35 #include <sys/mac_soft_ring.h>
35 36 #include <sys/strsubr.h>
36 37 #include <sys/strsun.h>
37 38 #include <sys/vlan.h>
38 39 #include <sys/pattr.h>
39 40 #include <sys/pci_tools.h>
40 41 #include <inet/ip.h>
41 42 #include <inet/ip_impl.h>
42 43 #include <inet/ip6.h>
43 44 #include <sys/vtrace.h>
44 45 #include <sys/dlpi.h>
45 46 #include <sys/sunndi.h>
46 47 #include <inet/ipsec_impl.h>
47 48 #include <inet/sadb.h>
48 49 #include <inet/ipsecesp.h>
49 50 #include <inet/ipsecah.h>
50 51
51 52 /*
52 53 * Copy an mblk, preserving its hardware checksum flags.
53 54 */
54 55 static mblk_t *
55 56 mac_copymsg_cksum(mblk_t *mp)
56 57 {
57 58 mblk_t *mp1;
58 59 uint32_t start, stuff, end, value, flags;
59 60
60 61 mp1 = copymsg(mp);
61 62 if (mp1 == NULL)
62 63 return (NULL);
63 64
64 65 hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
65 66 (void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value,
66 67 flags, KM_NOSLEEP);
67 68
68 69 return (mp1);
69 70 }
70 71
71 72 /*
71 72 * Copy an mblk chain, preserving the hardware checksum flags of the
73 74 * individual mblks.
74 75 */
75 76 mblk_t *
76 77 mac_copymsgchain_cksum(mblk_t *mp)
77 78 {
78 79 mblk_t *nmp = NULL;
79 80 mblk_t **nmpp = &nmp;
80 81
81 82 for (; mp != NULL; mp = mp->b_next) {
82 83 if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
83 84 freemsgchain(nmp);
84 85 return (NULL);
85 86 }
86 87
87 88 nmpp = &((*nmpp)->b_next);
88 89 }
89 90
90 91 return (nmp);
91 92 }
92 93
93 94 /*
94 95 * Process the specified mblk chain for proper handling of hardware
95 96 * checksum offload. This routine is invoked for loopback traffic
96 97 * between MAC clients.
97 98 * The function handles a NULL mblk chain passed as argument.
98 99 */
99 100 mblk_t *
100 101 mac_fix_cksum(mblk_t *mp_chain)
101 102 {
102 103 mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1;
103 104 uint32_t flags, start, stuff, end, value;
104 105
105 106 for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) {
106 107 uint16_t len;
107 108 uint32_t offset;
108 109 struct ether_header *ehp;
109 110 uint16_t sap;
110 111
111 112 hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value,
112 113 &flags);
113 114 if (flags == 0)
114 115 continue;
115 116
116 117 /*
117 118 * Since the processing of checksum offload for loopback
118 119 * traffic requires modification of the packet contents,
119 120 * ensure that we are always modifying our own copy.
120 121 */
121 122 if (DB_REF(mp) > 1) {
122 123 mp1 = copymsg(mp);
123 124 if (mp1 == NULL)
124 125 continue;
125 126 mp1->b_next = mp->b_next;
126 127 mp->b_next = NULL;
127 128 freemsg(mp);
128 129 if (prev != NULL)
129 130 prev->b_next = mp1;
130 131 else
131 132 new_chain = mp1;
132 133 mp = mp1;
133 134 }
134 135
135 136 /*
136 137 * Ethernet, and optionally VLAN header.
137 138 */
138 139 /* LINTED: improper alignment cast */
139 140 ehp = (struct ether_header *)mp->b_rptr;
140 141 if (ntohs(ehp->ether_type) == VLAN_TPID) {
141 142 struct ether_vlan_header *evhp;
142 143
143 144 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
144 145 /* LINTED: improper alignment cast */
145 146 evhp = (struct ether_vlan_header *)mp->b_rptr;
146 147 sap = ntohs(evhp->ether_type);
147 148 offset = sizeof (struct ether_vlan_header);
148 149 } else {
149 150 sap = ntohs(ehp->ether_type);
150 151 offset = sizeof (struct ether_header);
151 152 }
152 153
153 154 if (MBLKL(mp) <= offset) {
154 155 offset -= MBLKL(mp);
155 156 if (mp->b_cont == NULL) {
156 157 /* corrupted packet, skip it */
157 158 if (prev != NULL)
158 159 prev->b_next = mp->b_next;
159 160 else
160 161 new_chain = mp->b_next;
161 162 mp1 = mp->b_next;
162 163 mp->b_next = NULL;
163 164 freemsg(mp);
164 165 mp = mp1;
165 166 continue;
166 167 }
167 168 mp = mp->b_cont;
168 169 }
169 170
170 171 if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
171 172 ipha_t *ipha = NULL;
172 173
173 174 /*
174 175 * In order to compute the full and header
175 176 * checksums, we need to find and parse
176 177 * the IP and/or ULP headers.
177 178 */
178 179
179 180 sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
180 181
181 182 /*
182 183 * IP header.
183 184 */
184 185 if (sap != ETHERTYPE_IP)
185 186 continue;
186 187
187 188 ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t));
188 189 /* LINTED: improper alignment cast */
189 190 ipha = (ipha_t *)(mp->b_rptr + offset);
190 191
191 192 if (flags & HCK_FULLCKSUM) {
192 193 ipaddr_t src, dst;
193 194 uint32_t cksum;
194 195 uint16_t *up;
195 196 uint8_t proto;
196 197
197 198 /*
198 199 * Pointer to checksum field in ULP header.
199 200 */
200 201 proto = ipha->ipha_protocol;
201 202 ASSERT(ipha->ipha_version_and_hdr_length ==
202 203 IP_SIMPLE_HDR_VERSION);
203 204
204 205 switch (proto) {
205 206 case IPPROTO_TCP:
206 207 /* LINTED: improper alignment cast */
207 208 up = IPH_TCPH_CHECKSUMP(ipha,
208 209 IP_SIMPLE_HDR_LENGTH);
209 210 break;
210 211
211 212 case IPPROTO_UDP:
212 213 /* LINTED: improper alignment cast */
213 214 up = IPH_UDPH_CHECKSUMP(ipha,
214 215 IP_SIMPLE_HDR_LENGTH);
215 216 break;
216 217
217 218 default:
218 219 cmn_err(CE_WARN, "mac_fix_cksum: "
219 220 "unexpected protocol: %d", proto);
220 221 continue;
221 222 }
222 223
223 224 /*
224 225 * Pseudo-header checksum.
225 226 */
226 227 src = ipha->ipha_src;
227 228 dst = ipha->ipha_dst;
228 229 len = ntohs(ipha->ipha_length) -
229 230 IP_SIMPLE_HDR_LENGTH;
230 231
231 232 cksum = (dst >> 16) + (dst & 0xFFFF) +
232 233 (src >> 16) + (src & 0xFFFF);
233 234 cksum += htons(len);
234 235
235 236 /*
236 237 * The checksum value stored in the packet needs
237 238 * to be correct. Compute it here.
238 239 */
239 240 *up = 0;
240 241 cksum += (((proto) == IPPROTO_UDP) ?
241 242 IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
242 243 cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
243 244 offset, cksum);
244 245 *(up) = (uint16_t)(cksum ? cksum : ~cksum);
245 246
246 247 /*
247 248 * Flag the packet so that it appears
248 249 * that the checksum has already been
249 250 * verified by the hardware.
250 251 */
251 252 flags &= ~HCK_FULLCKSUM;
252 253 flags |= HCK_FULLCKSUM_OK;
253 254 value = 0;
254 255 }
255 256
256 257 if (flags & HCK_IPV4_HDRCKSUM) {
257 258 ASSERT(ipha != NULL);
258 259 ipha->ipha_hdr_checksum =
259 260 (uint16_t)ip_csum_hdr(ipha);
260 261 flags &= ~HCK_IPV4_HDRCKSUM;
261 262 flags |= HCK_IPV4_HDRCKSUM_OK;
262 263
263 264 }
264 265 }
265 266
266 267 if (flags & HCK_PARTIALCKSUM) {
267 268 uint16_t *up, partial, cksum;
268 269 uchar_t *ipp; /* ptr to beginning of IP header */
269 270
270 271 if (mp->b_cont != NULL) {
271 272 mblk_t *mp1;
272 273
273 274 mp1 = msgpullup(mp, offset + end);
274 275 if (mp1 == NULL)
275 276 continue;
276 277 mp1->b_next = mp->b_next;
277 278 mp->b_next = NULL;
278 279 freemsg(mp);
279 280 if (prev != NULL)
280 281 prev->b_next = mp1;
281 282 else
282 283 new_chain = mp1;
283 284 mp = mp1;
284 285 }
285 286
286 287 ipp = mp->b_rptr + offset;
287 288 /* LINTED: cast may result in improper alignment */
288 289 up = (uint16_t *)((uchar_t *)ipp + stuff);
289 290 partial = *up;
290 291 *up = 0;
291 292
292 293 cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start,
293 294 end - start, partial);
294 295 cksum = ~cksum;
295 296 *up = cksum ? cksum : ~cksum;
296 297
297 298 /*
298 299 * Since we already computed the whole checksum,
299 300 * indicate to the stack that it has already
300 301 * been verified by the hardware.
301 302 */
302 303 flags &= ~HCK_PARTIALCKSUM;
303 304 flags |= HCK_FULLCKSUM_OK;
304 305 value = 0;
305 306 }
306 307
307 308 (void) hcksum_assoc(mp, NULL, NULL, start, stuff, end,
308 309 value, flags, KM_NOSLEEP);
309 310 }
310 311
311 312 return (new_chain);
312 313 }
313 314
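The HCK_FULLCKSUM branch of mac_fix_cksum() above rebuilds the ULP checksum by hand: a pseudo-header sum over {src, dst, zero, protocol, ULP length} followed by the transport bytes, computed with the checksum field zeroed first. The following is a minimal userspace sketch of that arithmetic (illustrative only and not part of this change; csum_add() and ulp_cksum() are hypothetical names):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/*
 * One's-complement sum of a byte buffer, folding carries (RFC 1071 style).
 */
static uint32_t
csum_add(uint32_t sum, const uint8_t *p, size_t len)
{
        while (len > 1) {
                sum += (uint16_t)((p[0] << 8) | p[1]);
                p += 2;
                len -= 2;
        }
        if (len == 1)
                sum += (uint16_t)(p[0] << 8);
        while ((sum >> 16) != 0)
                sum = (sum & 0xffff) + (sum >> 16);
        return (sum);
}

/*
 * TCP/UDP checksum over the IPv4 pseudo-header plus the ULP bytes.
 * "ulp" must already have its checksum field zeroed, exactly as
 * mac_fix_cksum() zeroes *up before calling IP_CSUM().
 */
static uint16_t
ulp_cksum(const uint8_t src[4], const uint8_t dst[4], uint8_t proto,
    const uint8_t *ulp, uint16_t ulp_len)
{
        uint8_t ph[4] = { 0, proto, (uint8_t)(ulp_len >> 8),
            (uint8_t)(ulp_len & 0xff) };
        uint32_t sum = 0;

        sum = csum_add(sum, src, 4);
        sum = csum_add(sum, dst, 4);
        sum = csum_add(sum, ph, 4);
        sum = csum_add(sum, ulp, ulp_len);
        return ((uint16_t)~sum);
}

int
main(void)
{
        uint8_t src[4] = { 192, 168, 0, 1 };
        uint8_t dst[4] = { 192, 168, 0, 2 };
        /* 8-byte UDP header: sport 1024, dport 53, length 8, cksum 0 */
        uint8_t udp[8] = { 0x04, 0x00, 0x00, 0x35, 0x00, 0x08, 0x00, 0x00 };

        (void) printf("cksum = 0x%04x\n", ulp_cksum(src, dst, 17, udp, 8));
        return (0);
}

A checksum that computes to zero is stored as 0xffff for UDP, which is what the "cksum ? cksum : ~cksum" expression in the kernel code above implements.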
314 315 /*
315 316 * Add VLAN tag to the specified mblk.
316 317 */
317 318 mblk_t *
318 319 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
319 320 {
320 321 mblk_t *hmp;
321 322 struct ether_vlan_header *evhp;
322 323 struct ether_header *ehp;
323 324 uint32_t start, stuff, end, value, flags;
324 325
325 326 ASSERT(pri != 0 || vid != 0);
326 327
327 328 /*
328 329 * Allocate an mblk for the new tagged ethernet header,
329 330 * and copy the MAC addresses and ethertype from the
330 331 * original header.
331 332 */
332 333
333 334 hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
334 335 if (hmp == NULL) {
335 336 freemsg(mp);
336 337 return (NULL);
337 338 }
338 339
339 340 evhp = (struct ether_vlan_header *)hmp->b_rptr;
340 341 ehp = (struct ether_header *)mp->b_rptr;
341 342
342 343 bcopy(ehp, evhp, (ETHERADDRL * 2));
343 344 evhp->ether_type = ehp->ether_type;
344 345 evhp->ether_tpid = htons(ETHERTYPE_VLAN);
345 346
346 347 hmp->b_wptr += sizeof (struct ether_vlan_header);
347 348 mp->b_rptr += sizeof (struct ether_header);
348 349
349 350 /*
350 351 * Free the original message if it's now empty. Link the
351 352 * rest of the messages to the header message.
352 353 */
353 354 hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
354 355 (void) hcksum_assoc(hmp, NULL, NULL, start, stuff, end, value, flags,
355 356 KM_NOSLEEP);
356 357 if (MBLKL(mp) == 0) {
357 358 hmp->b_cont = mp->b_cont;
358 359 freeb(mp);
359 360 } else {
360 361 hmp->b_cont = mp;
361 362 }
362 363 ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header));
363 364
364 365 /*
365 366 * Initialize the new TCI (Tag Control Information).
366 367 */
367 368 evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid));
368 369
369 370 return (hmp);
370 371 }
371 372
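For reference, the TCI written at the end of mac_add_vlan_tag() is the 16-bit 802.1Q Tag Control Information field. Below is a small sketch of the bit packing, under the assumption that VLAN_TCI(pri, cfi, vid) lays out a 3-bit priority, a 1-bit CFI and a 12-bit VID from the most significant bits down (vlan_tci() here is a hypothetical stand-in, not the kernel macro):

#include <stdint.h>
#include <stdio.h>

/*
 * 802.1Q Tag Control Information: 3-bit priority, 1-bit CFI/DEI and a
 * 12-bit VLAN ID, packed from the most significant bits down.
 */
static uint16_t
vlan_tci(uint16_t pri, uint16_t cfi, uint16_t vid)
{
        return ((uint16_t)(((pri & 0x7) << 13) | ((cfi & 0x1) << 12) |
            (vid & 0xfff)));
}

int
main(void)
{
        /* pri 5, CFI 0, VID 100 -> 0xa064 */
        (void) printf("TCI = 0x%04x\n", vlan_tci(5, 0, 100));
        return (0);
}

The kernel stores the result in network byte order, as the htons() in the assignment to ether_tci above shows.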
372 373 /*
373 374 * Adds a VLAN tag with the specified VID and priority to each mblk of
374 375 * the specified chain.
375 376 */
376 377 mblk_t *
377 378 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid)
378 379 {
379 380 mblk_t *next_mp, **prev, *mp;
380 381
381 382 mp = mp_chain;
382 383 prev = &mp_chain;
383 384
384 385 while (mp != NULL) {
385 386 next_mp = mp->b_next;
386 387 mp->b_next = NULL;
387 388 if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) {
388 389 freemsgchain(next_mp);
389 390 break;
390 391 }
391 392 *prev = mp;
392 393 prev = &mp->b_next;
393 394 mp = mp->b_next = next_mp;
394 395 }
395 396
396 397 return (mp_chain);
397 398 }
398 399
399 400 /*
400 401 * Strip VLAN tag
401 402 */
402 403 mblk_t *
403 404 mac_strip_vlan_tag(mblk_t *mp)
404 405 {
405 406 mblk_t *newmp;
406 407 struct ether_vlan_header *evhp;
407 408
408 409 evhp = (struct ether_vlan_header *)mp->b_rptr;
409 410 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
410 411 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
411 412
412 413 if (DB_REF(mp) > 1) {
413 414 newmp = copymsg(mp);
414 415 if (newmp == NULL)
415 416 return (NULL);
416 417 freemsg(mp);
417 418 mp = newmp;
418 419 }
419 420
420 421 evhp = (struct ether_vlan_header *)mp->b_rptr;
421 422
422 423 ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
423 424 mp->b_rptr += VLAN_TAGSZ;
424 425 }
425 426 return (mp);
426 427 }
427 428
428 429 /*
429 430 * Strip VLAN tag from each mblk of the chain.
430 431 */
431 432 mblk_t *
432 433 mac_strip_vlan_tag_chain(mblk_t *mp_chain)
433 434 {
434 435 mblk_t *mp, *next_mp, **prev;
435 436
436 437 mp = mp_chain;
437 438 prev = &mp_chain;
438 439
439 440 while (mp != NULL) {
440 441 next_mp = mp->b_next;
441 442 mp->b_next = NULL;
442 443 if ((mp = mac_strip_vlan_tag(mp)) == NULL) {
443 444 freemsgchain(next_mp);
444 445 break;
445 446 }
446 447 *prev = mp;
447 448 prev = &mp->b_next;
448 449 mp = mp->b_next = next_mp;
449 450 }
450 451
451 452 return (mp_chain);
452 453 }
453 454
454 455 /*
455 456 * Default callback function. Used when the datapath is not yet initialized.
456 457 */
457 458 /* ARGSUSED */
458 459 void
459 460 mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp,
460 461 boolean_t loopback)
461 462 {
462 463 mblk_t *mp1 = mp;
463 464
464 465 while (mp1 != NULL) {
465 466 mp1->b_prev = NULL;
466 467 mp1->b_queue = NULL;
467 468 mp1 = mp1->b_next;
468 469 }
469 470 freemsgchain(mp);
470 471 }
471 472
472 473 /*
473 474 * Determines the IPv6 header length accounting for all the optional IPv6
474 475 * headers (hop-by-hop, destination, routing and fragment). The header length
475 476 * and next header value (a transport header) are captured.
476 477 *
477 478 * Returns 0 on success. Returns ENOSPC if the IP headers are not all in
478 479 * the same mblk, or EINVAL if the packet is not IPv6.
479 480 */
480 -boolean_t
481 +int
481 482 mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length,
482 483 uint8_t *next_hdr, ip6_frag_t **fragp)
483 484 {
484 485 uint16_t length;
485 486 uint_t ehdrlen;
486 487 uint8_t *whereptr;
487 488 uint8_t *nexthdrp;
488 489 ip6_dest_t *desthdr;
489 490 ip6_rthdr_t *rthdr;
490 491 ip6_frag_t *fraghdr;
491 492
492 493 if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
493 - return (B_FALSE);
494 - ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
494 + return (ENOSPC);
495 + /*
496 + * Return EINVAL, which mac_protect callers treat explicitly as "let
497 + * pass", flow callers treat as "not in a flow", and the rest treat
498 + * as "don't do special processing".
499 + */
500 + if (IPH_HDR_VERSION(ip6h) != IPV6_VERSION)
501 + return (EINVAL);
495 502 length = IPV6_HDR_LEN;
496 503 whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
497 504
498 505 if (fragp != NULL)
499 506 *fragp = NULL;
500 507
501 508 nexthdrp = &ip6h->ip6_nxt;
502 509 while (whereptr < endptr) {
503 510 /* Is there enough left for len + nexthdr? */
504 511 if (whereptr + MIN_EHDR_LEN > endptr)
505 512 break;
506 513
507 514 switch (*nexthdrp) {
508 515 case IPPROTO_HOPOPTS:
509 516 case IPPROTO_DSTOPTS:
510 517 /* Assumes the headers are identical for hbh and dst */
511 518 desthdr = (ip6_dest_t *)whereptr;
512 519 ehdrlen = 8 * (desthdr->ip6d_len + 1);
513 520 if ((uchar_t *)desthdr + ehdrlen > endptr)
514 - return (B_FALSE);
521 + return (ENOSPC);
515 522 nexthdrp = &desthdr->ip6d_nxt;
516 523 break;
517 524 case IPPROTO_ROUTING:
518 525 rthdr = (ip6_rthdr_t *)whereptr;
519 526 ehdrlen = 8 * (rthdr->ip6r_len + 1);
520 527 if ((uchar_t *)rthdr + ehdrlen > endptr)
521 - return (B_FALSE);
528 + return (ENOSPC);
522 529 nexthdrp = &rthdr->ip6r_nxt;
523 530 break;
524 531 case IPPROTO_FRAGMENT:
525 532 fraghdr = (ip6_frag_t *)whereptr;
526 533 ehdrlen = sizeof (ip6_frag_t);
527 534 if ((uchar_t *)&fraghdr[1] > endptr)
528 - return (B_FALSE);
535 + return (ENOSPC);
529 536 nexthdrp = &fraghdr->ip6f_nxt;
530 537 if (fragp != NULL)
531 538 *fragp = fraghdr;
532 539 break;
533 540 case IPPROTO_NONE:
534 541 /* No next header means we're finished */
535 542 default:
536 543 *hdr_length = length;
537 544 *next_hdr = *nexthdrp;
538 - return (B_TRUE);
545 + return (0);
539 546 }
540 547 length += ehdrlen;
541 548 whereptr += ehdrlen;
542 549 *hdr_length = length;
543 550 *next_hdr = *nexthdrp;
544 551 }
545 552 switch (*nexthdrp) {
546 553 case IPPROTO_HOPOPTS:
547 554 case IPPROTO_DSTOPTS:
548 555 case IPPROTO_ROUTING:
549 556 case IPPROTO_FRAGMENT:
550 557 /*
551 558 * If any known extension headers are still to be processed,
552 559 * the packet is malformed (or at least the IP header(s) are not
553 560 * all in the same mblk, and that should never happen).
561 + *
562 + * Return ENOSPC because the headers MAY merely be spread across
563 + * mblks, and the rest of MAC or IPv6 itself can cope.
554 564 */
555 - return (B_FALSE);
565 + return (ENOSPC);
556 566
557 567 default:
558 568 /*
559 569 * If we get here, we know that all of the IP headers were in
560 570 * the same mblk, even if the ULP header is in the next mblk.
561 571 */
562 572 *hdr_length = length;
563 573 *next_hdr = *nexthdrp;
564 - return (B_TRUE);
574 + return (0);
565 575 }
566 576 }
567 577
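The change above replaces the boolean return with errno-style values: 0 when the headers were parsed, ENOSPC when they do not all fit in the given buffer (possibly only because they are spread across mblks), and EINVAL when the packet is not IPv6 at all. Here is a rough userspace analogue of the same walk and return contract, built on the standard <netinet/ip6.h> structures (a sketch under those assumptions, not the kernel code; fragment reporting is omitted for brevity):

#include <errno.h>
#include <stdint.h>
#include <netinet/in.h>
#include <netinet/ip6.h>

/*
 * Userspace analogue of the new contract: returns 0 on success and fills
 * in *hdr_length / *next_hdr, ENOSPC when the headers do not all fit in
 * [pkt, endptr), and EINVAL when the buffer is not an IPv6 packet.
 */
int
ip6_hdr_length(const uint8_t *pkt, const uint8_t *endptr,
    uint16_t *hdr_length, uint8_t *next_hdr)
{
        const struct ip6_hdr *ip6h = (const struct ip6_hdr *)pkt;
        const uint8_t *whereptr;
        uint16_t length = sizeof (struct ip6_hdr);
        uint8_t nxt;

        if (pkt + sizeof (struct ip6_hdr) > endptr)
                return (ENOSPC);
        if ((ip6h->ip6_vfc >> 4) != 6)
                return (EINVAL);

        nxt = ip6h->ip6_nxt;
        whereptr = pkt + sizeof (struct ip6_hdr);

        for (;;) {
                const struct ip6_ext *ext;
                uint16_t ehdrlen;

                switch (nxt) {
                case IPPROTO_HOPOPTS:
                case IPPROTO_DSTOPTS:
                case IPPROTO_ROUTING:
                        /* hbh, dstopts and routing share {nxt, len-in-8s} */
                        if (whereptr + sizeof (*ext) > endptr)
                                return (ENOSPC);
                        ext = (const struct ip6_ext *)whereptr;
                        ehdrlen = 8 * (ext->ip6e_len + 1);
                        nxt = ext->ip6e_nxt;
                        break;
                case IPPROTO_FRAGMENT:
                        if (whereptr + sizeof (struct ip6_frag) > endptr)
                                return (ENOSPC);
                        ehdrlen = sizeof (struct ip6_frag);
                        nxt = ((const struct ip6_frag *)whereptr)->ip6f_nxt;
                        break;
                default:
                        /* IPPROTO_NONE or a transport header: finished */
                        *hdr_length = length;
                        *next_hdr = nxt;
                        return (0);
                }
                if (whereptr + ehdrlen > endptr)
                        return (ENOSPC);
                whereptr += ehdrlen;
                length += ehdrlen;
        }
}

Callers that only need a yes/no answer, such as mac_pkt_hash() later in this file, simply treat any nonzero return as a reason to bail.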
568 578 /*
569 579 * The following set of routines are there to take care of interrupt
570 580 * re-targeting for legacy (fixed) interrupts. Some older versions
571 581 * of the popular NICs like e1000g do not support MSI-X interrupts
572 582 * and they reserve fixed interrupts for RX/TX rings. To re-target
573 583 * these interrupts, PCITOOL ioctls need to be used.
574 584 */
575 585 typedef struct mac_dladm_intr {
576 586 int ino;
577 587 int cpu_id;
578 588 char driver_path[MAXPATHLEN];
579 589 char nexus_path[MAXPATHLEN];
580 590 } mac_dladm_intr_t;
581 591
582 592 /* Bind the interrupt to cpu_num */
583 593 static int
584 594 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino)
585 595 {
586 596 pcitool_intr_set_t iset;
587 597 int err;
588 598
589 599 iset.old_cpu = oldcpuid;
590 600 iset.ino = ino;
591 601 iset.cpu_id = cpu_num;
592 602 iset.user_version = PCITOOL_VERSION;
593 603 err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
594 604 kcred, NULL);
595 605
596 606 return (err);
597 607 }
598 608
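As the block comment above explains, retargeting a fixed interrupt means driving the nexus driver's ":intr" minor node with PCITOOL ioctls; mac_set_intr() issues PCITOOL_DEVICE_SET_INTR through an LDI handle. The sketch below shows what the equivalent call might look like from userland, much as pcitool(1M) would issue it (illumos-specific and privilege-requiring; the device path is a placeholder and retarget_intr() is a hypothetical name):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/pci_tools.h>

/*
 * Illustrative only: rebind interrupt "ino", currently handled on CPU
 * "oldcpu", to CPU "newcpu" through a nexus :intr node, mirroring what
 * mac_set_intr() does over LDI.  A real path comes from
 * mac_get_nexus_node(), e.g. "/devices/pci@0,0:intr" (placeholder).
 */
int
retarget_intr(const char *intrpath, int ino, int oldcpu, int newcpu)
{
        pcitool_intr_set_t iset;
        int fd, ret;

        if ((fd = open(intrpath, O_RDWR)) < 0)
                return (-1);

        (void) memset(&iset, 0, sizeof (iset));
        iset.old_cpu = oldcpu;
        iset.ino = ino;
        iset.cpu_id = newcpu;
        iset.user_version = PCITOOL_VERSION;

        ret = ioctl(fd, PCITOOL_DEVICE_SET_INTR, &iset);
        (void) close(fd);
        return (ret);
}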
599 609 /*
600 610 * Search interrupt information. iget is filled in with the info to search
601 611 */
602 612 static boolean_t
603 613 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln)
604 614 {
605 615 int i;
606 616 char driver_path[2 * MAXPATHLEN];
607 617
608 618 for (i = 0; i < iget_p->num_devs; i++) {
609 619 (void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN);
610 620 (void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN,
611 621 ":%s%d", iget_p->dev[i].driver_name,
612 622 iget_p->dev[i].dev_inst);
613 623 /* Match the device path against the one we are searching for */
614 624 if (strcmp(driver_path, dln->driver_path) == 0) {
615 625 dln->ino = iget_p->ino;
616 626 dln->cpu_id = iget_p->cpu_id;
617 627 return (B_TRUE);
618 628 }
619 629 }
620 630 return (B_FALSE);
621 631 }
622 632
623 633 /*
624 634 * Get information about ino, i.e. if this is the interrupt for our
625 635 * device and where it is bound etc.
626 636 */
627 637 static boolean_t
628 638 mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino,
629 639 mac_dladm_intr_t *dln)
630 640 {
631 641 pcitool_intr_get_t *iget_p;
632 642 int ipsz;
633 643 int nipsz;
634 644 int err;
635 645 uint8_t inum;
636 646
637 647 /*
638 648 * Check if SLEEP is OK, i.e. if we could come here in response to
639 649 * changing the fanout due to some callback from the driver, say
640 650 * link speed changes.
641 651 */
642 652 ipsz = PCITOOL_IGET_SIZE(0);
643 653 iget_p = kmem_zalloc(ipsz, KM_SLEEP);
644 654
645 655 iget_p->num_devs_ret = 0;
646 656 iget_p->user_version = PCITOOL_VERSION;
647 657 iget_p->cpu_id = oldcpuid;
648 658 iget_p->ino = ino;
649 659
650 660 err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
651 661 FKIOCTL, kcred, NULL);
652 662 if (err != 0) {
653 663 kmem_free(iget_p, ipsz);
654 664 return (B_FALSE);
655 665 }
656 666 if (iget_p->num_devs == 0) {
657 667 kmem_free(iget_p, ipsz);
658 668 return (B_FALSE);
659 669 }
660 670 inum = iget_p->num_devs;
661 671 if (iget_p->num_devs_ret < iget_p->num_devs) {
662 672 /* Reallocate */
663 673 nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);
664 674
665 675 kmem_free(iget_p, ipsz);
666 676 ipsz = nipsz;
667 677 iget_p = kmem_zalloc(ipsz, KM_SLEEP);
668 678
669 679 iget_p->num_devs_ret = inum;
670 680 iget_p->cpu_id = oldcpuid;
671 681 iget_p->ino = ino;
672 682 iget_p->user_version = PCITOOL_VERSION;
673 683 err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
674 684 FKIOCTL, kcred, NULL);
675 685 if (err != 0) {
676 686 kmem_free(iget_p, ipsz);
677 687 return (B_FALSE);
678 688 }
679 689 /* defensive */
680 690 if (iget_p->num_devs != iget_p->num_devs_ret) {
681 691 kmem_free(iget_p, ipsz);
682 692 return (B_FALSE);
683 693 }
684 694 }
685 695
686 696 if (mac_search_intrinfo(iget_p, dln)) {
687 697 kmem_free(iget_p, ipsz);
688 698 return (B_TRUE);
689 699 }
690 700 kmem_free(iget_p, ipsz);
691 701 return (B_FALSE);
692 702 }
693 703
694 704 /*
695 705 * Get the interrupts and check each one to see if it is for our device.
696 706 */
697 707 static int
698 708 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid)
699 709 {
700 710 pcitool_intr_info_t intr_info;
701 711 int err;
702 712 int ino;
703 713 int oldcpuid;
704 714
705 715 err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info,
706 716 FKIOCTL, kcred, NULL);
707 717 if (err != 0)
708 718 return (-1);
709 719
710 720 for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) {
711 721 for (ino = 0; ino < intr_info.num_intr; ino++) {
712 722 if (mac_get_single_intr(lh, oldcpuid, ino, dln)) {
713 723 if (dln->cpu_id == cpuid)
714 724 return (0);
715 725 return (1);
716 726 }
717 727 }
718 728 }
719 729 return (-1);
720 730 }
721 731
722 732 /*
723 733 * Obtain the nexus parent node info for mdip.
724 734 */
725 735 static dev_info_t *
726 736 mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
727 737 {
728 738 struct dev_info *tdip = (struct dev_info *)mdip;
729 739 struct ddi_minor_data *minordata;
730 740 int circ;
731 741 dev_info_t *pdip;
732 742 char pathname[MAXPATHLEN];
733 743
734 744 while (tdip != NULL) {
735 745 /*
736 746 * The netboot code could call this function while walking the
737 747 * device tree so we need to use ndi_devi_tryenter() here to
738 748 * avoid deadlock.
739 749 */
740 750 if (ndi_devi_tryenter((dev_info_t *)tdip, &circ) == 0)
741 751 break;
742 752
743 753 for (minordata = tdip->devi_minor; minordata != NULL;
744 754 minordata = minordata->next) {
745 755 if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL,
746 756 strlen(DDI_NT_INTRCTL)) == 0) {
747 757 pdip = minordata->dip;
748 758 (void) ddi_pathname(pdip, pathname);
749 759 (void) snprintf(dln->nexus_path, MAXPATHLEN,
750 760 "/devices%s:intr", pathname);
751 761 (void) ddi_pathname_minor(minordata, pathname);
752 762 ndi_devi_exit((dev_info_t *)tdip, circ);
753 763 return (pdip);
754 764 }
755 765 }
756 766 ndi_devi_exit((dev_info_t *)tdip, circ);
757 767 tdip = tdip->devi_parent;
758 768 }
759 769 return (NULL);
760 770 }
761 771
762 772 /*
763 773 * For a primary MAC client, if the user has set a list of CPUs or
764 774 * we have obtained it implicitly, we try to retarget the interrupt
765 775 * for that device on one of the CPUs in the list.
766 776 * We assign the interrupt to the same CPU as the poll thread.
767 777 */
768 778 static boolean_t
769 779 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid)
770 780 {
771 781 ldi_handle_t lh = NULL;
772 782 ldi_ident_t li = NULL;
773 783 int err;
774 784 int ret;
775 785 mac_dladm_intr_t dln;
776 786 dev_info_t *dip;
777 787 struct ddi_minor_data *minordata;
778 788
779 789 dln.nexus_path[0] = '\0';
780 790 dln.driver_path[0] = '\0';
781 791
782 792 minordata = ((struct dev_info *)mdip)->devi_minor;
783 793 while (minordata != NULL) {
784 794 if (minordata->type == DDM_MINOR)
785 795 break;
786 796 minordata = minordata->next;
787 797 }
788 798 if (minordata == NULL)
789 799 return (B_FALSE);
790 800
791 801 (void) ddi_pathname_minor(minordata, dln.driver_path);
792 802
793 803 dip = mac_get_nexus_node(mdip, &dln);
794 804 /* defensive */
795 805 if (dip == NULL)
796 806 return (B_FALSE);
797 807
798 808 err = ldi_ident_from_major(ddi_driver_major(dip), &li);
799 809 if (err != 0)
800 810 return (B_FALSE);
801 811
802 812 err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li);
803 813 if (err != 0)
804 814 return (B_FALSE);
805 815
806 816 ret = mac_validate_intr(lh, &dln, cpuid);
807 817 if (ret < 0) {
808 818 (void) ldi_close(lh, FREAD|FWRITE, kcred);
809 819 return (B_FALSE);
810 820 }
811 821 /* cmn_note? */
812 822 if (ret != 0)
813 823 if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino)))
814 824 != 0) {
815 825 (void) ldi_close(lh, FREAD|FWRITE, kcred);
816 826 return (B_FALSE);
817 827 }
818 828 (void) ldi_close(lh, FREAD|FWRITE, kcred);
819 829 return (B_TRUE);
820 830 }
821 831
822 832 void
823 833 mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid)
824 834 {
825 835 dev_info_t *mdip = (dev_info_t *)arg;
826 836 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
827 837 mac_resource_props_t *mrp;
828 838 mac_perim_handle_t mph;
829 839 flow_entry_t *flent = mcip->mci_flent;
830 840 mac_soft_ring_set_t *rx_srs;
831 841 mac_cpus_t *srs_cpu;
832 842
833 843 if (!mac_check_interrupt_binding(mdip, cpuid))
834 844 cpuid = -1;
835 845 mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph);
836 846 mrp = MCIP_RESOURCE_PROPS(mcip);
837 847 mrp->mrp_rx_intr_cpu = cpuid;
838 848 if (flent != NULL && flent->fe_rx_srs_cnt == 2) {
839 849 rx_srs = flent->fe_rx_srs[1];
840 850 srs_cpu = &rx_srs->srs_cpu;
841 851 srs_cpu->mc_rx_intr_cpu = cpuid;
842 852 }
843 853 mac_perim_exit(mph);
844 854 }
845 855
846 856 int32_t
847 857 mac_client_intr_cpu(mac_client_handle_t mch)
848 858 {
849 859 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
850 860 mac_cpus_t *srs_cpu;
851 861 mac_soft_ring_set_t *rx_srs;
852 862 flow_entry_t *flent = mcip->mci_flent;
853 863 mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip);
854 864 mac_ring_t *ring;
855 865 mac_intr_t *mintr;
856 866
857 867 /*
858 868 * Check if we need to retarget the interrupt. We do this only
859 869 * for the primary MAC client. We do this if we have the only
860 870 * exclusive ring in the group.
861 871 */
862 872 if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) {
863 873 rx_srs = flent->fe_rx_srs[1];
864 874 srs_cpu = &rx_srs->srs_cpu;
865 875 ring = rx_srs->srs_ring;
866 876 mintr = &ring->mr_info.mri_intr;
867 877 /*
868 878 * If ddi_handle is present or the poll CPU is
869 879 * already bound to the interrupt CPU, return -1.
870 880 */
871 881 if (mintr->mi_ddi_handle != NULL ||
872 882 ((mrp->mrp_ncpus != 0) &&
873 883 (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) {
874 884 return (-1);
875 885 }
876 886 return (srs_cpu->mc_rx_pollid);
877 887 }
878 888 return (-1);
879 889 }
880 890
881 891 void *
882 892 mac_get_devinfo(mac_handle_t mh)
883 893 {
884 894 mac_impl_t *mip = (mac_impl_t *)mh;
885 895
886 896 return ((void *)mip->mi_dip);
887 897 }
888 898
889 899 #define PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1])
890 900 #define PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])
891 901 #define PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])
892 902
893 903 uint64_t
894 904 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
895 905 {
896 906 struct ether_header *ehp;
897 907 uint64_t hash = 0;
898 908 uint16_t sap;
899 909 uint_t skip_len;
900 910 uint8_t proto;
901 911 boolean_t ip_fragmented;
902 912
903 913 /*
904 914 * We may want to have one of these per MAC type plugin in the
905 915 * future. For now, only Ethernet is supported.
906 916 */
907 917 if (media != DL_ETHER)
908 918 return (0L);
909 919
910 920 /* for now we support only outbound packets */
911 921 ASSERT(is_outbound);
912 922 ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
913 923 ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
914 924
915 925 /* compute L2 hash */
916 926
917 927 ehp = (struct ether_header *)mp->b_rptr;
918 928
919 929 if ((policy & MAC_PKT_HASH_L2) != 0) {
920 930 uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
921 931 uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
922 932 hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
923 933 policy &= ~MAC_PKT_HASH_L2;
924 934 }
925 935
926 936 if (policy == 0)
927 937 goto done;
928 938
929 939 /* skip ethernet header */
930 940
931 941 sap = ntohs(ehp->ether_type);
932 942 if (sap == ETHERTYPE_VLAN) {
933 943 struct ether_vlan_header *evhp;
934 944 mblk_t *newmp = NULL;
935 945
936 946 skip_len = sizeof (struct ether_vlan_header);
937 947 if (MBLKL(mp) < skip_len) {
938 948 /* the vlan tag is the payload, pull up first */
939 949 newmp = msgpullup(mp, -1);
940 950 if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
941 951 goto done;
942 952 }
943 953 evhp = (struct ether_vlan_header *)newmp->b_rptr;
944 954 } else {
945 955 evhp = (struct ether_vlan_header *)mp->b_rptr;
946 956 }
947 957
948 958 sap = ntohs(evhp->ether_type);
949 959 freemsg(newmp);
950 960 } else {
951 961 skip_len = sizeof (struct ether_header);
952 962 }
953 963
954 964 /* if ethernet header is in its own mblk, skip it */
955 965 if (MBLKL(mp) <= skip_len) {
956 966 skip_len -= MBLKL(mp);
957 967 mp = mp->b_cont;
958 968 if (mp == NULL)
959 969 goto done;
960 970 }
961 971
962 972 sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
963 973
964 974 /* compute IP src/dst addresses hash and skip IPv{4,6} header */
965 975
966 976 switch (sap) {
967 977 case ETHERTYPE_IP: {
968 978 ipha_t *iphp;
969 979
970 980 /*
971 - * If the header is not aligned or the header doesn't fit
972 - * in the mblk, bail now. Note that this may cause packets
973 - * reordering.
981 + * If the header is not aligned, the header doesn't fit in the
982 + * mblk, OR we have a bad IP version, bail now. Note that this
983 + * may cause packet reordering.
974 984 */
975 985 iphp = (ipha_t *)(mp->b_rptr + skip_len);
976 986 if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
977 - !OK_32PTR((char *)iphp))
987 + !OK_32PTR((char *)iphp) ||
988 + IPH_HDR_VERSION(iphp) != IPV4_VERSION)
978 989 goto done;
979 990
980 991 proto = iphp->ipha_protocol;
981 992 skip_len += IPH_HDR_LENGTH(iphp);
982 993
983 994 /* Check if the packet is fragmented. */
984 995 ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) &
985 996 IPH_OFFSET;
986 997
987 998 /*
988 999 * For fragmented packets, use addresses in addition to
989 1000 * the frag_id to generate the hash in order to get
990 1001 * better distribution.
991 1002 */
992 1003 if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
993 1004 uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
994 1005 uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
995 1006
996 1007 hash ^= (PKT_HASH_4BYTES(ip_src) ^
997 1008 PKT_HASH_4BYTES(ip_dst));
998 1009 policy &= ~MAC_PKT_HASH_L3;
999 1010 }
1000 1011
1001 1012 if (ip_fragmented) {
1002 1013 uint8_t *identp = (uint8_t *)&iphp->ipha_ident;
1003 1014 hash ^= PKT_HASH_2BYTES(identp);
1004 1015 goto done;
1005 1016 }
1006 1017 break;
1007 1018 }
1008 1019 case ETHERTYPE_IPV6: {
1009 1020 ip6_t *ip6hp;
1010 1021 ip6_frag_t *frag = NULL;
1011 1022 uint16_t hdr_length;
1012 1023
1013 1024 /*
1014 1025 * If the header is not aligned or the header doesn't fit
1015 1026 * in the mblk, bail now. Note that this may cause packet
1016 1027 * reordering.
1017 1028 */
1018 1029
1019 1030 ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
1020 1031 if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
1021 1032 !OK_32PTR((char *)ip6hp))
1022 1033 goto done;
1023 1034
1024 - if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length,
1025 - &proto, &frag))
1035 + /* Also bail, regardless of why, if the function below fails. */
1036 + if (mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length,
1037 + &proto, &frag) != 0)
1026 1038 goto done;
1027 1039 skip_len += hdr_length;
1028 1040
1029 1041 /*
1030 1042 * For fragmented packets, use addresses in addition to
1031 1043 * the frag_id to generate the hash inorder to get
1032 1044 * better distribution.
1033 1045 */
1034 1046 if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) {
1035 1047 uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
1036 1048 uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
1037 1049
1038 1050 hash ^= (PKT_HASH_4BYTES(ip_src) ^
1039 1051 PKT_HASH_4BYTES(ip_dst));
1040 1052 policy &= ~MAC_PKT_HASH_L3;
1041 1053 }
1042 1054
1043 1055 if (frag != NULL) {
1044 1056 uint8_t *identp = (uint8_t *)&frag->ip6f_ident;
1045 1057 hash ^= PKT_HASH_4BYTES(identp);
1046 1058 goto done;
1047 1059 }
1048 1060 break;
1049 1061 }
1050 1062 default:
1051 1063 goto done;
1052 1064 }
1053 1065
1054 1066 if (policy == 0)
1055 1067 goto done;
1056 1068
1057 1069 /* if ip header is in its own mblk, skip it */
1058 1070 if (MBLKL(mp) <= skip_len) {
1059 1071 skip_len -= MBLKL(mp);
1060 1072 mp = mp->b_cont;
1061 1073 if (mp == NULL)
1062 1074 goto done;
1063 1075 }
1064 1076
1065 1077 /* parse ULP header */
1066 1078 again:
1067 1079 switch (proto) {
1068 1080 case IPPROTO_TCP:
1069 1081 case IPPROTO_UDP:
1070 1082 case IPPROTO_ESP:
1071 1083 case IPPROTO_SCTP:
1072 1084 /*
1073 1085 * These Internet Protocols are intentionally designed
1073 1085 * for hashing from the get-go. Port numbers are in the first
1075 1087 * word for transports, SPI is first for ESP.
1076 1088 */
1077 1089 if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
1078 1090 goto done;
1079 1091 hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
1080 1092 break;
1081 1093
1082 1094 case IPPROTO_AH: {
1083 1095 ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
1084 1096 uint_t ah_length = AH_TOTAL_LEN(ah);
1085 1097
1086 1098 if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
1087 1099 goto done;
1088 1100
1089 1101 proto = ah->ah_nexthdr;
1090 1102 skip_len += ah_length;
1091 1103
1092 1104 /* if AH header is in its own mblk, skip it */
1093 1105 if (MBLKL(mp) <= skip_len) {
1094 1106 skip_len -= MBLKL(mp);
1095 1107 mp = mp->b_cont;
1096 1108 if (mp == NULL)
1097 1109 goto done;
1098 1110 }
1099 1111
1100 1112 goto again;
1101 1113 }
1102 1114 }
1103 1115
1104 1116 done:
1105 1117 return (hash);
1106 1118 }
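Taken together, mac_pkt_hash() XOR-folds whichever header bytes the policy selects (L2 MAC addresses, L3 IP addresses, L4 ports or SPI, plus the fragment ident where applicable) into one value that a fanout consumer then reduces to a ring index. A tiny standalone sketch of the L3 plus L4 composition using the same folding macro (hypothetical addresses and ports, illustrative only):

#include <stdint.h>
#include <stdio.h>

/* Same XOR fold as the PKT_HASH_4BYTES() macro above. */
#define PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])

int
main(void)
{
        /* Hypothetical IPv4 addresses and TCP ports, in wire byte order. */
        uint8_t ip_src[4] = { 192, 168, 1, 10 };
        uint8_t ip_dst[4] = { 192, 168, 1, 20 };
        uint8_t ports[4] = { 0xc0, 0x01, 0x01, 0xbb }; /* 49153 -> 443 */
        uint64_t hash = 0;
        unsigned int nrings = 4;

        /* MAC_PKT_HASH_L3: fold the source and destination addresses. */
        hash ^= PKT_HASH_4BYTES(ip_src) ^ PKT_HASH_4BYTES(ip_dst);
        /* L4: fold the first four ULP bytes (the TCP/UDP ports). */
        hash ^= PKT_HASH_4BYTES(ports);

        /* A fanout consumer would typically pick a ring like this. */
        (void) printf("hash %llu -> ring %llu\n",
            (unsigned long long)hash, (unsigned long long)(hash % nrings));
        return (0);
}

Folding the addresses in for fragmented packets, as the code above does, keeps fragments of different flows from all landing on the same ring even though fragments carry no port information.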