1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved.
  26  */
  27 /* Copyright (c) 1990 Mentat Inc. */
  28 
  29 #include <sys/types.h>
  30 #include <sys/stream.h>
  31 #include <sys/strsubr.h>
  32 #include <sys/dlpi.h>
  33 #include <sys/strsun.h>
  34 #include <sys/zone.h>
  35 #include <sys/ddi.h>
  36 #include <sys/sunddi.h>
  37 #include <sys/cmn_err.h>
  38 #include <sys/debug.h>
  39 #include <sys/atomic.h>
  40 
  41 #include <sys/systm.h>
  42 #include <sys/param.h>
  43 #include <sys/kmem.h>
  44 #include <sys/sdt.h>
  45 #include <sys/socket.h>
  46 #include <sys/mac.h>
  47 #include <net/if.h>
  48 #include <net/if_arp.h>
  49 #include <net/route.h>
  50 #include <sys/sockio.h>
  51 #include <netinet/in.h>
  52 #include <net/if_dl.h>
  53 
  54 #include <inet/common.h>
  55 #include <inet/mi.h>
  56 #include <inet/mib2.h>
  57 #include <inet/nd.h>
  58 #include <inet/arp.h>
  59 #include <inet/snmpcom.h>
  60 #include <inet/kstatcom.h>
  61 
  62 #include <netinet/igmp_var.h>
  63 #include <netinet/ip6.h>
  64 #include <netinet/icmp6.h>
  65 #include <netinet/sctp.h>
  66 
  67 #include <inet/ip.h>
  68 #include <inet/ip_impl.h>
  69 #include <inet/ip6.h>
  70 #include <inet/ip6_asp.h>
  71 #include <inet/tcp.h>
  72 #include <inet/ip_multi.h>
  73 #include <inet/ip_if.h>
  74 #include <inet/ip_ire.h>
  75 #include <inet/ip_ftable.h>
  76 #include <inet/ip_rts.h>
  77 #include <inet/optcom.h>
  78 #include <inet/ip_ndp.h>
  79 #include <inet/ip_listutils.h>
  80 #include <netinet/igmp.h>
  81 #include <netinet/ip_mroute.h>
  82 #include <inet/ipp_common.h>
  83 
  84 #include <net/pfkeyv2.h>
  85 #include <inet/sadb.h>
  86 #include <inet/ipsec_impl.h>
  87 #include <inet/ipdrop.h>
  88 #include <inet/ip_netinfo.h>
  89 
  90 #include <sys/pattr.h>
  91 #include <inet/ipclassifier.h>
  92 #include <inet/sctp_ip.h>
  93 #include <inet/sctp/sctp_impl.h>
  94 #include <inet/udp_impl.h>
  95 #include <sys/sunddi.h>
  96 
  97 #include <sys/tsol/label.h>
  98 #include <sys/tsol/tnet.h>
  99 
 100 #ifdef  DEBUG
 101 extern boolean_t skip_sctp_cksum;
 102 #endif
 103 
 104 int
 105 ip_output_simple_v6(mblk_t *mp, ip_xmit_attr_t *ixa)
 106 {
 107         ip6_t           *ip6h;
 108         in6_addr_t      firsthop; /* In IP header */
 109         in6_addr_t      dst;    /* End of source route, or ip6_dst if none */
 110         ire_t           *ire;
 111         in6_addr_t      setsrc;
 112         int             error;
 113         ill_t           *ill = NULL;
 114         dce_t           *dce = NULL;
 115         nce_t           *nce;
 116         iaflags_t       ixaflags = ixa->ixa_flags;
 117         ip_stack_t      *ipst = ixa->ixa_ipst;
 118         uint8_t         *nexthdrp;
 119         boolean_t       repeat = B_FALSE;
 120         boolean_t       multirt = B_FALSE;
 121         uint_t          ifindex;
 122         int64_t         now;
 123 
 124         ip6h = (ip6_t *)mp->b_rptr;
 125         ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
 126 
 127         ASSERT(ixa->ixa_nce == NULL);
 128 
 129         ixa->ixa_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
 130         ASSERT(ixa->ixa_pktlen == msgdsize(mp));
 131         if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ixa->ixa_ip_hdr_length,
 132             &nexthdrp)) {
 133                 /* Malformed packet */
 134                 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
 135                 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
 136                 ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
 137                 freemsg(mp);
 138                 return (EINVAL);
 139         }
 140         ixa->ixa_protocol = *nexthdrp;
 141 
 142         /*
 143          * Assumes that source routed packets have already been massaged by
 144          * the ULP (ip_massage_options_v6) and as a result ip6_dst is the next
 145          * hop in the source route. The final destination is used for IPsec
 146          * policy and DCE lookup.
 147          */
 148         firsthop = ip6h->ip6_dst;
 149         dst = ip_get_dst_v6(ip6h, mp, NULL);
 150 
 151 repeat_ire:
 152         error = 0;
 153         setsrc = ipv6_all_zeros;
 154         ire = ip_select_route_v6(&firsthop, ip6h->ip6_src, ixa, NULL, &setsrc,
 155             &error, &multirt);
 156         ASSERT(ire != NULL);    /* IRE_NOROUTE if none found */
 157         if (error != 0) {
 158                 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
 159                 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
 160                 ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
 161                 freemsg(mp);
 162                 goto done;
 163         }
 164 
 165         if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) {
 166                 /* ire_ill might be NULL hence need to skip some code */
 167                 if (ixaflags & IXAF_SET_SOURCE)
 168                         ip6h->ip6_src = ipv6_loopback;
 169                 ixa->ixa_fragsize = IP_MAXPACKET;
 170                 ire->ire_ob_pkt_count++;
 171                 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
 172                 /* No dce yet; use default one */
 173                 error = (ire->ire_sendfn)(ire, mp, ip6h, ixa,
 174                     &ipst->ips_dce_default->dce_ident);
 175                 goto done;
 176         }
 177 
 178         /* Note that ip6_dst is only used for IRE_MULTICAST */
 179         nce = ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst);
 180         if (nce == NULL) {
 181                 /* Allocation failure? */
 182                 ip_drop_output("ire_to_nce", mp, ill);
 183                 freemsg(mp);
 184                 error = ENOBUFS;
 185                 goto done;
 186         }
 187         if (nce->nce_is_condemned) {
 188                 nce_t *nce1;
 189 
 190                 nce1 = ire_handle_condemned_nce(nce, ire, NULL, ip6h, B_TRUE);
 191                 nce_refrele(nce);
 192                 if (nce1 == NULL) {
 193                         if (!repeat) {
 194                                 /* Try finding a better IRE */
 195                                 repeat = B_TRUE;
 196                                 ire_refrele(ire);
 197                                 goto repeat_ire;
 198                         }
 199                         /* Tried twice - drop packet */
 200                         BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
 201                         ip_drop_output("No nce", mp, ill);
 202                         freemsg(mp);
 203                         error = ENOBUFS;
 204                         goto done;
 205                 }
 206                 nce = nce1;
 207         }
 208         /*
 209          * For multicast with multirt we have a flag passed back from
 210          * ire_lookup_multi_ill_v6 since we don't have an IRE for each
 211          * possible multicast address.
 212          * We also need a flag for multicast since we can't check
 213          * whether RTF_MULTIRT is set in ixa_ire for multicast.
 214          */
 215         if (multirt) {
 216                 ixa->ixa_postfragfn = ip_postfrag_multirt_v6;
 217                 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
 218         } else {
 219                 ixa->ixa_postfragfn = ire->ire_postfragfn;
 220                 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
 221         }
 222         ASSERT(ixa->ixa_nce == NULL);
 223         ixa->ixa_nce = nce;
 224 
 225         /*
 226          * Check for a dce_t with a path mtu.
 227          */
 228         ifindex = 0;
 229         if (IN6_IS_ADDR_LINKSCOPE(&dst))
 230                 ifindex = nce->nce_common->ncec_ill->ill_phyint->phyint_ifindex;
 231 
 232         dce = dce_lookup_v6(&dst, ifindex, ipst, NULL);
 233         ASSERT(dce != NULL);
 234 
 235         if (!(ixaflags & IXAF_PMTU_DISCOVERY)) {
 236                 ixa->ixa_fragsize = IPV6_MIN_MTU;
 237         } else if (dce->dce_flags & DCEF_PMTU) {
 238                 /*
 239                  * To avoid a periodic timer to increase the path MTU we
 240                  * look at dce_last_change_time each time we send a packet.
 241                  */
 242                 now = ddi_get_lbolt64();
 243                 if (TICK_TO_SEC(now) - dce->dce_last_change_time >
 244                     ipst->ips_ip_pathmtu_interval) {
 245                         /*
 246                          * Older than 20 minutes. Drop the path MTU information.
 247                          */
 248                         mutex_enter(&dce->dce_lock);
 249                         dce->dce_flags &= ~DCEF_PMTU;
 250                         dce->dce_last_change_time = TICK_TO_SEC(now);
 251                         mutex_exit(&dce->dce_lock);
 252                         dce_increment_generation(dce);
 253                         ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
 254                 } else {
 255                         uint_t fragsize;
 256 
 257                         fragsize = ip_get_base_mtu(nce->nce_ill, ire);
 258                         if (fragsize > dce->dce_pmtu)
 259                                 fragsize = dce->dce_pmtu;
 260                         ixa->ixa_fragsize = fragsize;
 261                 }
 262         } else {
 263                 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
 264         }
 265 
 266         /*
 267          * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp
 268          * interface for source address selection.
 269          */
 270         ill = ire_nexthop_ill(ire);
 271 
 272         if (ixaflags & IXAF_SET_SOURCE) {
 273                 in6_addr_t      src;
 274 
 275                 /*
 276                  * We use the final destination to get
 277                  * correct selection for source routed packets
 278                  */
 279 
 280                 /* If unreachable we have no ill but need some source */
 281                 if (ill == NULL) {
 282                         src = ipv6_loopback;
 283                         error = 0;
 284                 } else {
 285                         error = ip_select_source_v6(ill, &setsrc, &dst,
 286                             ixa->ixa_zoneid, ipst, B_FALSE,
 287                             ixa->ixa_src_preferences, &src, NULL, NULL);
 288                 }
 289                 if (error != 0) {
 290                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
 291                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
 292                         ip_drop_output("ipIfStatsOutDiscards - no source",
 293                             mp, ill);
 294                         freemsg(mp);
 295                         goto done;
 296                 }
 297                 ip6h->ip6_src = src;
 298         } else if (ixaflags & IXAF_VERIFY_SOURCE) {
 299                 /* Check if the IP source is assigned to the host. */
 300                 if (!ip_verify_src(mp, ixa, NULL)) {
 301                         /* Don't send a packet with a source that isn't ours */
 302                         BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
 303                         BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
 304                         ip_drop_output("ipIfStatsOutDiscards - invalid source",
 305                             mp, ill);
 306                         freemsg(mp);
 307                         error = EADDRNOTAVAIL;
 308                         goto done;
 309                 }
 310         }
 311 
 312         /*
 313          * Check against global IPsec policy to set the AH/ESP attributes.
 314          * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate.
 315          */
 316         if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
 317                 ASSERT(ixa->ixa_ipsec_policy == NULL);
 318                 mp = ip_output_attach_policy(mp, NULL, ip6h, NULL, ixa);
 319                 if (mp == NULL) {
 320                         /* MIB and ip_drop_packet already done */
 321                         return (EHOSTUNREACH);  /* IPsec policy failure */
 322                 }
 323         }
 324 
 325         if (ill != NULL) {
 326                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
 327         } else {
 328                 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
 329         }
 330 
 331         /*
 332          * We update the statistics on the most specific IRE i.e., the first
 333          * one we found.
 334          * We don't have an IRE when we fragment, hence ire_ob_pkt_count
 335          * can only count the use prior to fragmentation. However the MIB
 336          * counters on the ill will be incremented in post fragmentation.
 337          */
 338         ire->ire_ob_pkt_count++;
 339 
 340         /*
 341          * Based on ire_type and ire_flags call one of:
 342          *      ire_send_local_v6 - for IRE_LOCAL and IRE_LOOPBACK
 343          *      ire_send_multirt_v6 - if RTF_MULTIRT
 344          *      ire_send_noroute_v6 - if RTF_REJECT or RTF_BLACHOLE
 345          *      ire_send_multicast_v6 - for IRE_MULTICAST
 346          *      ire_send_wire_v6 - for the rest.
 347          */
 348         error = (ire->ire_sendfn)(ire, mp, ip6h, ixa, &dce->dce_ident);
 349 done:
 350         ire_refrele(ire);
 351         if (dce != NULL)
 352                 dce_refrele(dce);
 353         if (ill != NULL)
 354                 ill_refrele(ill);
 355         if (ixa->ixa_nce != NULL)
 356                 nce_refrele(ixa->ixa_nce);
 357         ixa->ixa_nce = NULL;
 358         return (error);
 359 }
 360 
 361 /*
 362  * ire_sendfn() functions.
 363  * These functions use the following xmit_attr:
 364  *  - ixa_fragsize - read to determine whether or not to fragment
 365  *  - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec
 366  *  - ixa_ipsec_*  are used inside IPsec
 367  *  - IXAF_LOOPBACK_COPY - for multicast
 368  */
 369 
 370 
 371 /*
 372  * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK
 373  *
 374  * The checks for restrict_interzone_loopback are done in ire_route_recursive.
 375  */
 376 /* ARGSUSED4 */
 377 int
 378 ire_send_local_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
 379     ip_xmit_attr_t *ixa, uint32_t *identp)
 380 {
 381         ip6_t           *ip6h = (ip6_t *)iph_arg;
 382         ip_stack_t      *ipst = ixa->ixa_ipst;
 383         ill_t           *ill = ire->ire_ill;
 384         ip_recv_attr_t  iras;   /* NOTE: No bzero for performance */
 385         uint_t          pktlen = ixa->ixa_pktlen;
 386 
 387         /*
 388          * No fragmentation, no nce, and no application of IPsec.
 389          *
 390          *
 391          * Note different order between IP provider and FW_HOOKS than in
 392          * send_wire case.
 393          */
 394 
 395         /*
 396          * DTrace this as ip:::send.  A packet blocked by FW_HOOKS will fire the
 397          * send probe, but not the receive probe.
 398          */
 399         DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
 400             ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h,
 401             int, 1);
 402 
 403         DTRACE_PROBE4(ip6__loopback__out__start,
 404             ill_t *, NULL, ill_t *, ill,
 405             ip6_t *, ip6h, mblk_t *, mp);
 406 
 407         if (HOOKS6_INTERESTED_LOOPBACK_OUT(ipst)) {
 408                 int     error;
 409 
 410                 FW_HOOKS(ipst->ips_ip6_loopback_out_event,
 411                     ipst->ips_ipv6firewall_loopback_out,
 412                     NULL, ill, ip6h, mp, mp, 0, ipst, error);
 413 
 414                 DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp);
 415                 if (mp == NULL)
 416                         return (error);
 417 
 418                 /*
 419                  * Even if the destination was changed by the filter we use the
 420                  * forwarding decision that was made based on the address
 421                  * in ip_output/ip_set_destination.
 422                  */
 423                 /* Length could be different */
 424                 ip6h = (ip6_t *)mp->b_rptr;
 425                 pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
 426         }
 427 
 428         /*
 429          * If a callback is enabled then we need to know the
 430          * source and destination zoneids for the packet. We already
 431          * have those handy.
 432          */
 433         if (ipst->ips_ip6_observe.he_interested) {
 434                 zoneid_t szone, dzone;
 435                 zoneid_t stackzoneid;
 436 
 437                 stackzoneid = netstackid_to_zoneid(
 438                     ipst->ips_netstack->netstack_stackid);
 439 
 440                 if (stackzoneid == GLOBAL_ZONEID) {
 441                         /* Shared-IP zone */
 442                         dzone = ire->ire_zoneid;
 443                         szone = ixa->ixa_zoneid;
 444                 } else {
 445                         szone = dzone = stackzoneid;
 446                 }
 447                 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst);
 448         }
 449 
 450         /* Handle lo0 stats */
 451         ipst->ips_loopback_packets++;
 452 
 453         /*
 454          * Update output mib stats. Note that we can't move into the icmp
 455          * sender (icmp_output etc) since they don't know the ill and the
 456          * stats are per ill.
 457          */
 458         if (ixa->ixa_protocol == IPPROTO_ICMPV6) {
 459                 icmp6_t         *icmp6;
 460 
 461                 icmp6 = (icmp6_t *)((uchar_t *)ip6h + ixa->ixa_ip_hdr_length);
 462                 icmp_update_out_mib_v6(ill, icmp6);
 463         }
 464 
 465         DTRACE_PROBE4(ip6__loopback__in__start,
 466             ill_t *, ill, ill_t *, NULL,
 467             ip6_t *, ip6h, mblk_t *, mp);
 468 
 469         if (HOOKS6_INTERESTED_LOOPBACK_IN(ipst)) {
 470                 int     error;
 471 
 472                 FW_HOOKS(ipst->ips_ip6_loopback_in_event,
 473                     ipst->ips_ipv6firewall_loopback_in,
 474                     ill, NULL, ip6h, mp, mp, 0, ipst, error);
 475 
 476                 DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp);
 477                 if (mp == NULL)
 478                         return (error);
 479 
 480                 /*
 481                  * Even if the destination was changed by the filter we use the
 482                  * forwarding decision that was made based on the address
 483                  * in ip_output/ip_set_destination.
 484                  */
 485                 /* Length could be different */
 486                 ip6h = (ip6_t *)mp->b_rptr;
 487                 pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
 488         }
 489 
 490         DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
 491             ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h,
 492             int, 1);
 493 
 494         /* Map ixa to ira including IPsec policies */
 495         ipsec_out_to_in(ixa, ill, &iras);
 496         iras.ira_pktlen = pktlen;
 497 
 498         ire->ire_ib_pkt_count++;
 499         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
 500         UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen);
 501 
 502         /* Destined to ire_zoneid - use that for fanout */
 503         iras.ira_zoneid = ire->ire_zoneid;
 504 
 505         if (is_system_labeled()) {
 506                 iras.ira_flags |= IRAF_SYSTEM_LABELED;
 507 
 508                 /*
 509                  * This updates ira_cred, ira_tsl and ira_free_flags based
 510                  * on the label. We don't expect this to ever fail for
 511                  * loopback packets, so we silently drop the packet should it
 512                  * fail.
 513                  */
 514                 if (!tsol_get_pkt_label(mp, IPV6_VERSION, &iras)) {
 515                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 516                         ip_drop_input("tsol_get_pkt_label", mp, ill);
 517                         freemsg(mp);
 518                         return (0);
 519                 }
 520                 ASSERT(iras.ira_tsl != NULL);
 521 
 522                 /* tsol_get_pkt_label sometimes does pullupmsg */
 523                 ip6h = (ip6_t *)mp->b_rptr;
 524         }
 525 
 526         ip_fanout_v6(mp, ip6h, &iras);
 527 
 528         /* We moved any IPsec refs from ixa to iras */
 529         ira_cleanup(&iras, B_FALSE);
 530         return (0);
 531 }
 532 
 533 static void
 534 multirt_check_v6(ire_t *ire, ip6_t *ip6h, ip_xmit_attr_t *ixa)
 535 {
 536         ip_stack_t *ipst = ixa->ixa_ipst;
 537 
 538         /* Limit the TTL on multirt packets. Do this even if IPV6_HOPLIMIT */
 539         if (ire->ire_type & IRE_MULTICAST) {
 540                 if (ip6h->ip6_hops > 1) {
 541                         ip2dbg(("ire_send_multirt_v6: forcing multicast "
 542                             "multirt TTL to 1 (was %d)\n", ip6h->ip6_hops));
 543                         ip6h->ip6_hops = 1;
 544                 }
 545                 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
 546         } else if ((ipst->ips_ip_multirt_ttl > 0) &&
 547             (ip6h->ip6_hops > ipst->ips_ip_multirt_ttl)) {
 548                 ip6h->ip6_hops = ipst->ips_ip_multirt_ttl;
 549                 /*
 550                  * Need to ensure we don't increase the ttl should we go through
 551                  * ire_send_multicast.
 552                  */
 553                 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
 554         }
 555 
 556         /* For IPv6 this also needs to insert a fragment header */
 557         ixa->ixa_flags |= IXAF_IPV6_ADD_FRAGHDR;
 558 }
 559 
 560 /*
 561  * ire_sendfn for IRE_MULTICAST
 562  *
 563  * Note that we do path MTU discovery by default for IPv6 multicast. But
 564  * since unconnected UDP and RAW sockets don't set IXAF_PMTU_DISCOVERY
 565  * only connected sockets get this by default.
 566  */
 567 int
 568 ire_send_multicast_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
 569     ip_xmit_attr_t *ixa, uint32_t *identp)
 570 {
 571         ip6_t           *ip6h = (ip6_t *)iph_arg;
 572         ip_stack_t      *ipst = ixa->ixa_ipst;
 573         ill_t           *ill = ire->ire_ill;
 574         iaflags_t       ixaflags = ixa->ixa_flags;
 575 
 576         /*
 577          * The IRE_MULTICAST is the same whether or not multirt is in use.
 578          * Hence we need special-case code.
 579          */
 580         if (ixaflags & IXAF_MULTIRT_MULTICAST)
 581                 multirt_check_v6(ire, ip6h, ixa);
 582 
 583         /*
 584          * Check if anything in ip_input_v6 wants a copy of the transmitted
 585          * packet (after IPsec and fragmentation)
 586          *
 587          * 1. Multicast routers always need a copy unless SO_DONTROUTE is set
 588          *    RSVP and the rsvp daemon is an example of a
 589          *    protocol and user level process that
 590          *    handles it's own routing. Hence, it uses the
 591          *    SO_DONTROUTE option to accomplish this.
 592          * 2. If the sender has set IP_MULTICAST_LOOP, then we just
 593          *    check whether there are any receivers for the group on the ill
 594          *    (ignoring the zoneid).
 595          * 3. If IP_MULTICAST_LOOP is not set, then we check if there are
 596          *    any members in other shared-IP zones.
 597          *    If such members exist, then we indicate that the sending zone
 598          *    shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP
 599          *    behavior.
 600          *
 601          * When we loopback we skip hardware checksum to make sure loopback
 602          * copy is checksumed.
 603          *
 604          * Note that ire_ill is the upper in the case of IPMP.
 605          */
 606         ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM);
 607         if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 &&
 608             !(ixaflags & IXAF_DONTROUTE)) {
 609                 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
 610         } else if (ixaflags & IXAF_MULTICAST_LOOP) {
 611                 /*
 612                  * If this zone or any other zone has members then loopback
 613                  * a copy.
 614                  */
 615                 if (ill_hasmembers_v6(ill, &ip6h->ip6_dst))
 616                         ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
 617         } else if (ipst->ips_netstack->netstack_numzones > 1) {
 618                 /*
 619                  * This zone should not have a copy. But there are some other
 620                  * zones which might have members.
 621                  */
 622                 if (ill_hasmembers_otherzones_v6(ill, &ip6h->ip6_dst,
 623                     ixa->ixa_zoneid)) {
 624                         ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET;
 625                         ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid;
 626                         ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
 627                 }
 628         }
 629 
 630         /*
 631          * Unless IPV6_HOPLIMIT or ire_send_multirt_v6 already set a ttl,
 632          * force the ttl to the IP_MULTICAST_TTL value
 633          */
 634         if (!(ixaflags & IXAF_NO_TTL_CHANGE)) {
 635                 ip6h->ip6_hops = ixa->ixa_multicast_ttl;
 636         }
 637 
 638         return (ire_send_wire_v6(ire, mp, ip6h, ixa, identp));
 639 }
 640 
 641 /*
 642  * ire_sendfn for IREs with RTF_MULTIRT
 643  */
 644 int
 645 ire_send_multirt_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
 646     ip_xmit_attr_t *ixa, uint32_t *identp)
 647 {
 648         ip6_t           *ip6h = (ip6_t *)iph_arg;
 649 
 650         multirt_check_v6(ire, ip6h, ixa);
 651 
 652         if (ire->ire_type & IRE_MULTICAST)
 653                 return (ire_send_multicast_v6(ire, mp, ip6h, ixa, identp));
 654         else
 655                 return (ire_send_wire_v6(ire, mp, ip6h, ixa, identp));
 656 }
 657 
 658 /*
 659  * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE
 660  */
 661 /* ARGSUSED4 */
 662 int
 663 ire_send_noroute_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
 664     ip_xmit_attr_t *ixa, uint32_t *identp)
 665 {
 666         ip6_t           *ip6h = (ip6_t *)iph_arg;
 667         ip_stack_t      *ipst = ixa->ixa_ipst;
 668         ill_t           *ill;
 669         ip_recv_attr_t  iras;
 670         boolean_t       dummy;
 671 
 672         BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
 673 
 674         if (ire->ire_type & IRE_NOROUTE) {
 675                 /* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */
 676                 ip_rts_change_v6(RTM_MISS, &ip6h->ip6_dst, 0, 0, 0, 0, 0, 0,
 677                     RTA_DST, ipst);
 678         }
 679 
 680         if (ire->ire_flags & RTF_BLACKHOLE) {
 681                 ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL);
 682                 freemsg(mp);
 683                 /* No error even for local senders - silent blackhole */
 684                 return (0);
 685         }
 686         ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL);
 687 
 688         /*
 689          * We need an ill_t for the ip_recv_attr_t even though this packet
 690          * was never received and icmp_unreachable doesn't currently use
 691          * ira_ill.
 692          */
 693         ill = ill_lookup_on_name("lo0", B_FALSE,
 694             !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst);
 695         if (ill == NULL) {
 696                 freemsg(mp);
 697                 return (EHOSTUNREACH);
 698         }
 699 
 700         bzero(&iras, sizeof (iras));
 701         /* Map ixa to ira including IPsec policies */
 702         ipsec_out_to_in(ixa, ill, &iras);
 703 
 704         icmp_unreachable_v6(mp, ICMP6_DST_UNREACH_NOROUTE, B_FALSE, &iras);
 705         /* We moved any IPsec refs from ixa to iras */
 706         ira_cleanup(&iras, B_FALSE);
 707 
 708         ill_refrele(ill);
 709         return (EHOSTUNREACH);
 710 }
 711 
 712 /*
 713  * Calculate a checksum ignoring any hardware capabilities
 714  *
 715  * Returns B_FALSE if the packet was too short for the checksum. Caller
 716  * should free and do stats.
 717  */
 718 static boolean_t
 719 ip_output_sw_cksum_v6(mblk_t *mp, ip6_t *ip6h, ip_xmit_attr_t *ixa)
 720 {
 721         ip_stack_t      *ipst = ixa->ixa_ipst;
 722         uint_t          pktlen = ixa->ixa_pktlen;
 723         uint16_t        *cksump;
 724         uint32_t        cksum;
 725         uint8_t         protocol = ixa->ixa_protocol;
 726         uint16_t        ip_hdr_length = ixa->ixa_ip_hdr_length;
 727 
 728 #define iphs    ((uint16_t *)ip6h)
 729 
 730         /* Just in case it contained garbage */
 731         DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
 732 
 733         /*
 734          * Calculate ULP checksum
 735          */
 736         if (protocol == IPPROTO_TCP) {
 737                 cksump = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_length);
 738                 cksum = IP_TCP_CSUM_COMP;
 739         } else if (protocol == IPPROTO_UDP) {
 740                 cksump = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_length);
 741                 cksum = IP_UDP_CSUM_COMP;
 742         } else if (protocol == IPPROTO_SCTP) {
 743                 sctp_hdr_t      *sctph;
 744 
 745                 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
 746                 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
 747                 /*
 748                  * Zero out the checksum field to ensure proper
 749                  * checksum calculation.
 750                  */
 751                 sctph->sh_chksum = 0;
 752 #ifdef  DEBUG
 753                 if (!skip_sctp_cksum)
 754 #endif
 755                         sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
 756                 return (B_TRUE);
 757         } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
 758                 /*
 759                  * icmp has placed length and routing
 760                  * header adjustment in the checksum field.
 761                  */
 762                 cksump = (uint16_t *)(((uint8_t *)ip6h) + ip_hdr_length +
 763                     ixa->ixa_raw_cksum_offset);
 764                 cksum = htons(protocol);
 765         } else if (protocol == IPPROTO_ICMPV6) {
 766                 cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length);
 767                 cksum = IP_ICMPV6_CSUM_COMP;    /* Pseudo-header cksum */
 768         } else {
 769                 return (B_TRUE);
 770         }
 771 
 772         /* ULP puts the checksum field is in the first mblk */
 773         ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);
 774 
 775         /*
 776          * We accumulate the pseudo header checksum in cksum.
 777          * This is pretty hairy code, so watch close.  One
 778          * thing to keep in mind is that UDP and TCP have
 779          * stored their respective datagram lengths in their
 780          * checksum fields.  This lines things up real nice.
 781          */
 782         cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] +
 783             iphs[8] + iphs[9] + iphs[10] + iphs[11] +
 784             iphs[12] + iphs[13] + iphs[14] + iphs[15] +
 785             iphs[16] + iphs[17] + iphs[18] + iphs[19];
 786         cksum = IP_CSUM(mp, ip_hdr_length, cksum);
 787 
 788         /*
 789          * For UDP/IPv6 a zero UDP checksum is not allowed.
 790          * Change to 0xffff
 791          */
 792         if (protocol == IPPROTO_UDP && cksum == 0)
 793                 *cksump = ~cksum;
 794         else
 795                 *cksump = cksum;
 796 
 797         IP6_STAT(ipst, ip6_out_sw_cksum);
 798         IP6_STAT_UPDATE(ipst, ip6_out_sw_cksum_bytes, pktlen);
 799 
 800         /* No IP header checksum for IPv6 */
 801 
 802         return (B_TRUE);
 803 #undef  iphs
 804 }
 805 
 806 /* There are drivers that can't do partial checksum for ICMPv6 */
 807 int nxge_cksum_workaround = 1;
 808 
 809 /*
 810  * Calculate the ULP checksum - try to use hardware.
 811  * In the case of MULTIRT or multicast the
 812  * IXAF_NO_HW_CKSUM is set in which case we use software.
 813  *
 814  * Returns B_FALSE if the packet was too short for the checksum. Caller
 815  * should free and do stats.
 816  */
 817 static boolean_t
 818 ip_output_cksum_v6(iaflags_t ixaflags, mblk_t *mp, ip6_t *ip6h,
 819     ip_xmit_attr_t *ixa, ill_t *ill)
 820 {
 821         uint_t          pktlen = ixa->ixa_pktlen;
 822         uint16_t        *cksump;
 823         uint16_t        hck_flags;
 824         uint32_t        cksum;
 825         uint8_t         protocol = ixa->ixa_protocol;
 826         uint16_t        ip_hdr_length = ixa->ixa_ip_hdr_length;
 827 
 828 #define iphs    ((uint16_t *)ip6h)
 829 
 830         if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
 831             !dohwcksum) {
 832                 return (ip_output_sw_cksum_v6(mp, ip6h, ixa));
 833         }
 834 
 835         /*
 836          * Calculate ULP checksum. Note that we don't use cksump and cksum
 837          * if the ill has FULL support.
 838          */
 839         if (protocol == IPPROTO_TCP) {
 840                 cksump = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_length);
 841                 cksum = IP_TCP_CSUM_COMP;       /* Pseudo-header cksum */
 842         } else if (protocol == IPPROTO_UDP) {
 843                 cksump = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_length);
 844                 cksum = IP_UDP_CSUM_COMP;       /* Pseudo-header cksum */
 845         } else if (protocol == IPPROTO_SCTP) {
 846                 sctp_hdr_t      *sctph;
 847 
 848                 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
 849                 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
 850                 /*
 851                  * Zero out the checksum field to ensure proper
 852                  * checksum calculation.
 853                  */
 854                 sctph->sh_chksum = 0;
 855 #ifdef  DEBUG
 856                 if (!skip_sctp_cksum)
 857 #endif
 858                         sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
 859                 goto ip_hdr_cksum;
 860         } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
 861                 /*
 862                  * icmp has placed length and routing
 863                  * header adjustment in the checksum field.
 864                  */
 865                 cksump = (uint16_t *)(((uint8_t *)ip6h) + ip_hdr_length +
 866                     ixa->ixa_raw_cksum_offset);
 867                 cksum = htons(protocol);
 868         } else if (protocol == IPPROTO_ICMPV6) {
 869                 cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length);
 870                 cksum = IP_ICMPV6_CSUM_COMP;    /* Pseudo-header cksum */
 871         } else {
 872         ip_hdr_cksum:
 873                 /* No IP header checksum for IPv6 */
 874                 return (B_TRUE);
 875         }
 876 
 877         /* ULP puts the checksum field is in the first mblk */
 878         ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);
 879 
 880         /*
 881          * Underlying interface supports hardware checksum offload for
 882          * the payload; leave the payload checksum for the hardware to
 883          * calculate.  N.B: We only need to set up checksum info on the
 884          * first mblk.
 885          */
 886         hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags;
 887 
 888         DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
 889         if (hck_flags & HCKSUM_INET_FULL_V6) {
 890                 /*
 891                  * Hardware calculates pseudo-header, header and the
 892                  * payload checksums, so clear the checksum field in
 893                  * the protocol header.
 894                  */
 895                 *cksump = 0;
 896                 DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;
 897                 return (B_TRUE);
 898         }
 899         if (((hck_flags) & HCKSUM_INET_PARTIAL) &&
 900             (protocol != IPPROTO_ICMPV6 || !nxge_cksum_workaround)) {
 901                 /*
 902                  * Partial checksum offload has been enabled.  Fill
 903                  * the checksum field in the protocol header with the
 904                  * pseudo-header checksum value.
 905                  *
 906                  * We accumulate the pseudo header checksum in cksum.
 907                  * This is pretty hairy code, so watch close.  One
 908                  * thing to keep in mind is that UDP and TCP have
 909                  * stored their respective datagram lengths in their
 910                  * checksum fields.  This lines things up real nice.
 911                  */
 912                 cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] +
 913                     iphs[8] + iphs[9] + iphs[10] + iphs[11] +
 914                     iphs[12] + iphs[13] + iphs[14] + iphs[15] +
 915                     iphs[16] + iphs[17] + iphs[18] + iphs[19];
 916                 cksum += *(cksump);
 917                 cksum = (cksum & 0xFFFF) + (cksum >> 16);
 918                 *(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
 919 
 920                 /*
 921                  * Offsets are relative to beginning of IP header.
 922                  */
 923                 DB_CKSUMSTART(mp) = ip_hdr_length;
 924                 DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ip6h;
 925                 DB_CKSUMEND(mp) = pktlen;
 926                 DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;
 927                 return (B_TRUE);
 928         }
 929         /* Hardware capabilities include neither full nor partial IPv6 */
 930         return (ip_output_sw_cksum_v6(mp, ip6h, ixa));
 931 #undef  iphs
 932 }
 933 
 934 /*
 935  * ire_sendfn for offlink and onlink destinations.
 936  * Also called from the multicast, and multirt send functions.
 937  *
 938  * Assumes that the caller has a hold on the ire.
 939  *
 940  * This function doesn't care if the IRE just became condemned since that
 941  * can happen at any time.
 942  */
 943 /* ARGSUSED */
 944 int
 945 ire_send_wire_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
 946     ip_xmit_attr_t *ixa, uint32_t *identp)
 947 {
 948         ip_stack_t      *ipst = ixa->ixa_ipst;
 949         ip6_t           *ip6h = (ip6_t *)iph_arg;
 950         iaflags_t       ixaflags = ixa->ixa_flags;
 951         ill_t           *ill;
 952         uint32_t        pktlen = ixa->ixa_pktlen;
 953 
 954         ASSERT(ixa->ixa_nce != NULL);
 955         ill = ixa->ixa_nce->nce_ill;
 956 
 957         /*
 958          * Update output mib stats. Note that we can't move into the icmp
 959          * sender (icmp_output etc) since they don't know the ill and the
 960          * stats are per ill.
 961          *
 962          * With IPMP we record the stats on the upper ill.
 963          */
 964         if (ixa->ixa_protocol == IPPROTO_ICMPV6) {
 965                 icmp6_t         *icmp6;
 966 
 967                 icmp6 = (icmp6_t *)((uchar_t *)ip6h + ixa->ixa_ip_hdr_length);
 968                 icmp_update_out_mib_v6(ixa->ixa_nce->nce_common->ncec_ill,
 969                     icmp6);
 970         }
 971 
 972         if (ixaflags & IXAF_DONTROUTE)
 973                 ip6h->ip6_hops = 1;
 974 
 975         /*
 976          * This might set b_band, thus the IPsec and fragmentation
 977          * code in IP ensures that b_band is updated in the first mblk.
 978          */
 979         if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
 980                 /* ip_process translates an IS_UNDER_IPMP */
 981                 mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill);
 982                 if (mp == NULL) {
 983                         /* ip_drop_packet and MIB done */
 984                         return (0);     /* Might just be delayed */
 985                 }
 986         }
 987 
 988         /*
 989          * To handle IPsec/iptun's labeling needs we need to tag packets
 990          * while we still have ixa_tsl
 991          */
 992         if (is_system_labeled() && ixa->ixa_tsl != NULL &&
 993             (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 ||
 994             ill->ill_mactype == DL_IPV6)) {
 995                 cred_t *newcr;
 996 
 997                 newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl,
 998                     KM_NOSLEEP);
 999                 if (newcr == NULL) {
1000                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1001                         ip_drop_output("ipIfStatsOutDiscards - newcr",
1002                             mp, ill);
1003                         freemsg(mp);
1004                         return (ENOBUFS);
1005                 }
1006                 mblk_setcred(mp, newcr, NOPID);
1007                 crfree(newcr);  /* mblk_setcred did its own crhold */
1008         }
1009 
1010         /*
1011          * IXAF_IPV6_ADD_FRAGHDR is set for CGTP so that we will add a
1012          * fragment header without fragmenting. CGTP on the receiver will
1013          * filter duplicates on the ident field.
1014          */
1015         if (pktlen > ixa->ixa_fragsize ||
1016             (ixaflags & (IXAF_IPSEC_SECURE|IXAF_IPV6_ADD_FRAGHDR))) {
1017                 uint32_t ident;
1018 
1019                 if (ixaflags & IXAF_IPSEC_SECURE)
1020                         pktlen += ipsec_out_extra_length(ixa);
1021 
1022                 if (pktlen > IP_MAXPACKET)
1023                         return (EMSGSIZE);
1024 
1025                 if (ixaflags & IXAF_SET_ULP_CKSUM) {
1026                         /*
1027                          * Compute ULP checksum using software
1028                          */
1029                         if (!ip_output_sw_cksum_v6(mp, ip6h, ixa)) {
1030                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1031                                 ip_drop_output("ipIfStatsOutDiscards", mp, ill);
1032                                 freemsg(mp);
1033                                 return (EINVAL);
1034                         }
1035                         /* Avoid checksum again below if we only add fraghdr */
1036                         ixaflags &= ~IXAF_SET_ULP_CKSUM;
1037                 }
1038 
1039                 /*
1040                  * If we need a fragment header, pick the ident and insert
1041                  * the header before IPsec to we have a place to store
1042                  * the ident value.
1043                  */
1044                 if ((ixaflags & IXAF_IPV6_ADD_FRAGHDR) ||
1045                     pktlen > ixa->ixa_fragsize) {
1046                         /*
1047                          * If this packet would generate a icmp_frag_needed
1048                          * message, we need to handle it before we do the IPsec
1049                          * processing. Otherwise, we need to strip the IPsec
1050                          * headers before we send up the message to the ULPs
1051                          * which becomes messy and difficult.
1052                          */
1053                         if ((pktlen > ixa->ixa_fragsize) &&
1054                             (ixaflags & IXAF_DONTFRAG)) {
1055                                 /* Generate ICMP and return error */
1056                                 ip_recv_attr_t  iras;
1057 
1058                                 DTRACE_PROBE4(ip6__fragsize__fail,
1059                                     uint_t, pktlen, uint_t, ixa->ixa_fragsize,
1060                                     uint_t, ixa->ixa_pktlen,
1061                                     uint_t, ixa->ixa_pmtu);
1062 
1063                                 bzero(&iras, sizeof (iras));
1064                                 /* Map ixa to ira including IPsec policies */
1065                                 ipsec_out_to_in(ixa, ill, &iras);
1066 
1067                                 ip_drop_output("ICMP6_PKT_TOO_BIG", mp, ill);
1068                                 icmp_pkt2big_v6(mp, ixa->ixa_fragsize, B_TRUE,
1069                                     &iras);
1070                                 /* We moved any IPsec refs from ixa to iras */
1071                                 ira_cleanup(&iras, B_FALSE);
1072                                 return (EMSGSIZE);
1073                         }
1074                         DTRACE_PROBE4(ip6__fragsize__ok, uint_t, pktlen,
1075                             uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
1076                             uint_t, ixa->ixa_pmtu);
1077                         /*
1078                          * Assign an ident value for this packet. There could
1079                          * be other threads targeting the same destination, so
1080                          * we have to arrange for a atomic increment.
1081                          * Normally ixa_extra_ident is 0, but in the case of
1082                          * LSO it will be the number of TCP segments  that the
1083                          * driver/hardware will extraly construct.
1084                          *
1085                          * Note that cl_inet_ipident has only been used for
1086                          * IPv4. We don't use it here.
1087                          */
1088                         ident = atomic_add_32_nv(identp, ixa->ixa_extra_ident +
1089                             1);
1090                         ixa->ixa_ident = ident;      /* In case we do IPsec */
1091                 }
1092                 if (ixaflags & IXAF_IPSEC_SECURE) {
1093                         /*
1094                          * Pass in sufficient information so that
1095                          * IPsec can determine whether to fragment, and
1096                          * which function to call after fragmentation.
1097                          */
1098                         return (ipsec_out_process(mp, ixa));
1099                 }
1100 
1101                 mp = ip_fraghdr_add_v6(mp, ident, ixa);
1102                 if (mp == NULL) {
1103                         /* MIB and ip_drop_output already done */
1104                         return (ENOMEM);
1105                 }
1106                 ASSERT(pktlen == ixa->ixa_pktlen);
1107                 pktlen += sizeof (ip6_frag_t);
1108 
1109                 if (pktlen > ixa->ixa_fragsize) {
1110                         return (ip_fragment_v6(mp, ixa->ixa_nce, ixaflags,
1111                             pktlen, ixa->ixa_fragsize,
1112                             ixa->ixa_xmit_hint, ixa->ixa_zoneid,
1113                             ixa->ixa_no_loop_zoneid, ixa->ixa_postfragfn,
1114                             &ixa->ixa_cookie));
1115                 }
1116         }
1117         if (ixaflags & IXAF_SET_ULP_CKSUM) {
1118                 /* Compute ULP checksum and IP header checksum */
1119                 /* An IS_UNDER_IPMP ill is ok here */
1120                 if (!ip_output_cksum_v6(ixaflags, mp, ip6h, ixa, ill)) {
1121                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1122                         ip_drop_output("ipIfStatsOutDiscards", mp, ill);
1123                         freemsg(mp);
1124                         return (EINVAL);
1125                 }
1126         }
1127         return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags,
1128             pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
1129             ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie));
1130 }
1131 
1132 /*
1133  * Post fragmentation function for RTF_MULTIRT routes.
1134  * Since IRE_MULTICASTs might have RTF_MULTIRT, this function
1135  * checks IXAF_LOOPBACK_COPY.
1136  *
1137  * If no packet is sent due to failures then we return an errno, but if at
1138  * least one succeeded we return zero.
1139  */
1140 int
1141 ip_postfrag_multirt_v6(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
1142     uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
1143     uintptr_t *ixacookie)
1144 {
1145         irb_t           *irb;
1146         ip6_t           *ip6h = (ip6_t *)mp->b_rptr;
1147         ire_t           *ire;
1148         ire_t           *ire1;
1149         mblk_t          *mp1;
1150         nce_t           *nce1;
1151         ill_t           *ill = nce->nce_ill;
1152         ill_t           *ill1;
1153         ip_stack_t      *ipst = ill->ill_ipst;
1154         int             error = 0;
1155         int             num_sent = 0;
1156         int             err;
1157         uint_t          ire_type;
1158         in6_addr_t      nexthop;
1159 
1160         ASSERT(!(ixaflags & IXAF_IS_IPV4));
1161 
1162         /* Check for IXAF_LOOPBACK_COPY */
1163         if (ixaflags & IXAF_LOOPBACK_COPY) {
1164                 mblk_t *mp1;
1165 
1166                 mp1 = copymsg(mp);
1167                 if (mp1 == NULL) {
1168                         /* Failed to deliver the loopback copy. */
1169                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1170                         ip_drop_output("ipIfStatsOutDiscards", mp, ill);
1171                         error = ENOBUFS;
1172                 } else {
1173                         ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
1174                             nolzid);
1175                 }
1176         }
1177 
1178         /*
1179          * Loop over RTF_MULTIRT for ip6_dst in the same bucket. Send
1180          * a copy to each one.
1181          * Use the nce (nexthop) and ip6_dst to find the ire.
1182          *
1183          * MULTIRT is not designed to work with shared-IP zones thus we don't
1184          * need to pass a zoneid or a label to the IRE lookup.
1185          */
1186         if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, &ip6h->ip6_dst)) {
1187                 /* Broadcast and multicast case */
1188                 ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, 0, 0, NULL,
1189                     ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
1190         } else {
1191                 /* Unicast case */
1192                 ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, &nce->nce_addr,
1193                     0, NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL);
1194         }
1195 
1196         if (ire == NULL ||
1197             (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
1198             !(ire->ire_flags & RTF_MULTIRT)) {
1199                 /* Drop */
1200                 ip_drop_output("ip_postfrag_multirt didn't find route",
1201                     mp, nce->nce_ill);
1202                 if (ire != NULL)
1203                         ire_refrele(ire);
1204                 return (ENETUNREACH);
1205         }
1206 
1207         irb = ire->ire_bucket;
1208         irb_refhold(irb);
1209         for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
1210                 if (IRE_IS_CONDEMNED(ire1) ||
1211                     !(ire1->ire_flags & RTF_MULTIRT))
1212                         continue;
1213 
1214                 /* Note: When IPv6 uses radix tree we don't need this check */
1215                 if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &ire1->ire_addr_v6))
1216                         continue;
1217 
1218                 /* Do the ire argument one after the loop */
1219                 if (ire1 == ire)
1220                         continue;
1221 
1222                 ill1 = ire_nexthop_ill(ire1);
1223                 if (ill1 == NULL) {
1224                         /*
1225                          * This ire might not have been picked by
1226                          * ire_route_recursive, in which case ire_dep might
1227                          * not have been setup yet.
1228                          * We kick ire_route_recursive to try to resolve
1229                          * starting at ire1.
1230                          */
1231                         ire_t *ire2;
1232                         uint_t match_flags = MATCH_IRE_DSTONLY;
1233 
1234                         if (ire1->ire_ill != NULL)
1235                                 match_flags |= MATCH_IRE_ILL;
1236                         ire2 = ire_route_recursive_impl_v6(ire1,
1237                             &ire1->ire_addr_v6, ire1->ire_type, ire1->ire_ill,
1238                             ire1->ire_zoneid, NULL, match_flags,
1239                             IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
1240                         if (ire2 != NULL)
1241                                 ire_refrele(ire2);
1242                         ill1 = ire_nexthop_ill(ire1);
1243                 }
1244                 if (ill1 == NULL) {
1245                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1246                         ip_drop_output("ipIfStatsOutDiscards - no ill",
1247                             mp, ill);
1248                         error = ENETUNREACH;
1249                         continue;
1250                 }
1251                 /* Pick the addr and type to use for ndp_nce_init */
1252                 if (nce->nce_common->ncec_flags & NCE_F_MCAST) {
1253                         ire_type = IRE_MULTICAST;
1254                         nexthop = ip6h->ip6_dst;
1255                 } else {
1256                         ire_type = ire1->ire_type;   /* Doesn't matter */
1257                         nexthop = ire1->ire_gateway_addr_v6;
1258                 }
1259 
1260                 /* If IPMP meta or under, then we just drop */
1261                 if (ill1->ill_grp != NULL) {
1262                         BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
1263                         ip_drop_output("ipIfStatsOutDiscards - IPMP",
1264                             mp, ill1);
1265                         ill_refrele(ill1);
1266                         error = ENETUNREACH;
1267                         continue;
1268                 }
1269 
1270                 nce1 = ndp_nce_init(ill1, &nexthop, ire_type);
1271                 if (nce1 == NULL) {
1272                         BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
1273                         ip_drop_output("ipIfStatsOutDiscards - no nce",
1274                             mp, ill1);
1275                         ill_refrele(ill1);
1276                         error = ENOBUFS;
1277                         continue;
1278                 }
1279                 mp1 = copymsg(mp);
1280                 if (mp1 == NULL) {
1281                         BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
1282                         ip_drop_output("ipIfStatsOutDiscards", mp, ill1);
1283                         nce_refrele(nce1);
1284                         ill_refrele(ill1);
1285                         error = ENOBUFS;
1286                         continue;
1287                 }
1288                 /* Preserve HW checksum for this copy */
1289                 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
1290                 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
1291                 DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
1292                 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
1293                 DB_LSOMSS(mp1) = DB_LSOMSS(mp);
1294 
1295                 ire1->ire_ob_pkt_count++;
1296                 err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone,
1297                     0, ixacookie);
1298                 if (err == 0)
1299                         num_sent++;
1300                 else
1301                         error = err;
1302                 nce_refrele(nce1);
1303                 ill_refrele(ill1);
1304         }
1305         irb_refrele(irb);
1306         ire_refrele(ire);
1307         /* Finally, the main one */
1308         err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
1309             ixacookie);
1310         if (err == 0)
1311                 num_sent++;
1312         else
1313                 error = err;
1314         if (num_sent > 0)
1315                 return (0);
1316         else
1317                 return (error);
1318 }