1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 /* Copyright (c) 1990 Mentat Inc. */
  27 
  28 #include <sys/types.h>
  29 #include <sys/stream.h>
  30 #include <sys/strsubr.h>
  31 #include <sys/dlpi.h>
  32 #include <sys/strsun.h>
  33 #include <sys/zone.h>
  34 #include <sys/ddi.h>
  35 #include <sys/sunddi.h>
  36 #include <sys/cmn_err.h>
  37 #include <sys/debug.h>
  38 #include <sys/atomic.h>
  39 
  40 #include <sys/systm.h>
  41 #include <sys/param.h>
  42 #include <sys/kmem.h>
  43 #include <sys/sdt.h>
  44 #include <sys/socket.h>
  45 #include <sys/mac.h>
  46 #include <net/if.h>
  47 #include <net/if_arp.h>
  48 #include <net/route.h>
  49 #include <sys/sockio.h>
  50 #include <netinet/in.h>
  51 #include <net/if_dl.h>
  52 
  53 #include <inet/common.h>
  54 #include <inet/mi.h>
  55 #include <inet/mib2.h>
  56 #include <inet/nd.h>
  57 #include <inet/arp.h>
  58 #include <inet/snmpcom.h>
  59 #include <inet/kstatcom.h>
  60 
  61 #include <netinet/igmp_var.h>
  62 #include <netinet/ip6.h>
  63 #include <netinet/icmp6.h>
  64 #include <netinet/sctp.h>
  65 
  66 #include <inet/ip.h>
  67 #include <inet/ip_impl.h>
  68 #include <inet/ip6.h>
  69 #include <inet/ip6_asp.h>
  70 #include <inet/tcp.h>
  71 #include <inet/ip_multi.h>
  72 #include <inet/ip_if.h>
  73 #include <inet/ip_ire.h>
  74 #include <inet/ip_ftable.h>
  75 #include <inet/ip_rts.h>
  76 #include <inet/optcom.h>
  77 #include <inet/ip_ndp.h>
  78 #include <inet/ip_listutils.h>
  79 #include <netinet/igmp.h>
  80 #include <netinet/ip_mroute.h>
  81 #include <inet/ipp_common.h>
  82 
  83 #include <net/pfkeyv2.h>
  84 #include <inet/sadb.h>
  85 #include <inet/ipsec_impl.h>
  86 #include <inet/ipdrop.h>
  87 #include <inet/ip_netinfo.h>
  88 
  89 #include <sys/pattr.h>
  90 #include <inet/ipclassifier.h>
  91 #include <inet/sctp_ip.h>
  92 #include <inet/sctp/sctp_impl.h>
  93 #include <inet/udp_impl.h>
  94 #include <sys/sunddi.h>
  95 
  96 #include <sys/tsol/label.h>
  97 #include <sys/tsol/tnet.h>
  98 
/*
 * Debug-only switch, defined outside this file. When it is non-zero
 * ip_output_sw_cksum_v6() leaves the SCTP checksum field zeroed instead of
 * computing it.
 */
#ifdef  DEBUG
extern boolean_t skip_sctp_cksum;
#endif
 102 
 103 int
 104 ip_output_simple_v6(mblk_t *mp, ip_xmit_attr_t *ixa)
 105 {
 106         ip6_t           *ip6h;
 107         in6_addr_t      firsthop; /* In IP header */
 108         in6_addr_t      dst;    /* End of source route, or ip6_dst if none */
 109         ire_t           *ire;
 110         in6_addr_t      setsrc;
 111         int             error;
 112         ill_t           *ill = NULL;
 113         dce_t           *dce = NULL;
 114         nce_t           *nce;
 115         iaflags_t       ixaflags = ixa->ixa_flags;
 116         ip_stack_t      *ipst = ixa->ixa_ipst;
 117         uint8_t         *nexthdrp;
 118         boolean_t       repeat = B_FALSE;
 119         boolean_t       multirt = B_FALSE;
 120         uint_t          ifindex;
 121         int64_t         now;
 122 
 123         ip6h = (ip6_t *)mp->b_rptr;
 124         ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
 125 
 126         ASSERT(ixa->ixa_nce == NULL);
 127 
 128         ixa->ixa_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
 129         ASSERT(ixa->ixa_pktlen == msgdsize(mp));
 130         if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ixa->ixa_ip_hdr_length,
 131             &nexthdrp)) {
 132                 /* Malformed packet */
 133                 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
 134                 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
 135                 ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
 136                 freemsg(mp);
 137                 return (EINVAL);
 138         }
 139         ixa->ixa_protocol = *nexthdrp;
 140 
 141         /*
 142          * Assumes that source routed packets have already been massaged by
 143          * the ULP (ip_massage_options_v6) and as a result ip6_dst is the next
 144          * hop in the source route. The final destination is used for IPsec
 145          * policy and DCE lookup.
 146          */
 147         firsthop = ip6h->ip6_dst;
 148         dst = ip_get_dst_v6(ip6h, mp, NULL);
 149 
 150 repeat_ire:
 151         error = 0;
 152         setsrc = ipv6_all_zeros;
 153         ire = ip_select_route_v6(&firsthop, ip6h->ip6_src, ixa, NULL, &setsrc,
 154             &error, &multirt);
 155         ASSERT(ire != NULL);    /* IRE_NOROUTE if none found */
 156         if (error != 0) {
 157                 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
 158                 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
 159                 ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
 160                 freemsg(mp);
 161                 goto done;
 162         }
 163 
 164         if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) {
 165                 /* ire_ill might be NULL hence need to skip some code */
 166                 if (ixaflags & IXAF_SET_SOURCE)
 167                         ip6h->ip6_src = ipv6_loopback;
 168                 ixa->ixa_fragsize = IP_MAXPACKET;
 169                 ire->ire_ob_pkt_count++;
 170                 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
 171                 /* No dce yet; use default one */
 172                 error = (ire->ire_sendfn)(ire, mp, ip6h, ixa,
 173                     &ipst->ips_dce_default->dce_ident);
 174                 goto done;
 175         }
 176 
 177         /* Note that ip6_dst is only used for IRE_MULTICAST */
 178         nce = ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst);
 179         if (nce == NULL) {
 180                 /* Allocation failure? */
 181                 ip_drop_output("ire_to_nce", mp, ill);
 182                 freemsg(mp);
 183                 error = ENOBUFS;
 184                 goto done;
 185         }
 186         if (nce->nce_is_condemned) {
 187                 nce_t *nce1;
 188 
 189                 nce1 = ire_handle_condemned_nce(nce, ire, NULL, ip6h, B_TRUE);
 190                 nce_refrele(nce);
 191                 if (nce1 == NULL) {
 192                         if (!repeat) {
 193                                 /* Try finding a better IRE */
 194                                 repeat = B_TRUE;
 195                                 ire_refrele(ire);
 196                                 goto repeat_ire;
 197                         }
 198                         /* Tried twice - drop packet */
 199                         BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
 200                         ip_drop_output("No nce", mp, ill);
 201                         freemsg(mp);
 202                         error = ENOBUFS;
 203                         goto done;
 204                 }
 205                 nce = nce1;
 206         }
 207         /*
 208          * For multicast with multirt we have a flag passed back from
 209          * ire_lookup_multi_ill_v6 since we don't have an IRE for each
 210          * possible multicast address.
 211          * We also need a flag for multicast since we can't check
 212          * whether RTF_MULTIRT is set in ixa_ire for multicast.
 213          */
 214         if (multirt) {
 215                 ixa->ixa_postfragfn = ip_postfrag_multirt_v6;
 216                 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
 217         } else {
 218                 ixa->ixa_postfragfn = ire->ire_postfragfn;
 219                 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
 220         }
 221         ASSERT(ixa->ixa_nce == NULL);
 222         ixa->ixa_nce = nce;
 223 
 224         /*
 225          * Check for a dce_t with a path mtu.
 226          */
 227         ifindex = 0;
 228         if (IN6_IS_ADDR_LINKSCOPE(&dst))
 229                 ifindex = nce->nce_common->ncec_ill->ill_phyint->phyint_ifindex;
 230 
 231         dce = dce_lookup_v6(&dst, ifindex, ipst, NULL);
 232         ASSERT(dce != NULL);
 233 
 234         if (!(ixaflags & IXAF_PMTU_DISCOVERY)) {
 235                 ixa->ixa_fragsize = IPV6_MIN_MTU;
 236         } else if (dce->dce_flags & DCEF_PMTU) {
 237                 /*
 238                  * To avoid a periodic timer to increase the path MTU we
 239                  * look at dce_last_change_time each time we send a packet.
 240                  */
 241                 now = ddi_get_lbolt64();
 242                 if (TICK_TO_SEC(now) - dce->dce_last_change_time >
 243                     ipst->ips_ip_pathmtu_interval) {
 244                         /*
 245                          * Older than 20 minutes. Drop the path MTU information.
 246                          */
 247                         mutex_enter(&dce->dce_lock);
 248                         dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
 249                         dce->dce_last_change_time = TICK_TO_SEC(now);
 250                         mutex_exit(&dce->dce_lock);
 251                         dce_increment_generation(dce);
 252                         ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
 253                 } else {
 254                         uint_t fragsize;
 255 
 256                         fragsize = ip_get_base_mtu(nce->nce_ill, ire);
 257                         if (fragsize > dce->dce_pmtu)
 258                                 fragsize = dce->dce_pmtu;
 259                         ixa->ixa_fragsize = fragsize;
 260                 }
 261         } else {
 262                 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
 263         }
 264 
 265         /*
 266          * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp
 267          * interface for source address selection.
 268          */
 269         ill = ire_nexthop_ill(ire);
 270 
 271         if (ixaflags & IXAF_SET_SOURCE) {
 272                 in6_addr_t      src;
 273 
 274                 /*
 275                  * We use the final destination to get
 276                  * correct selection for source routed packets
 277                  */
 278 
 279                 /* If unreachable we have no ill but need some source */
 280                 if (ill == NULL) {
 281                         src = ipv6_loopback;
 282                         error = 0;
 283                 } else {
 284                         error = ip_select_source_v6(ill, &setsrc, &dst,
 285                             ixa->ixa_zoneid, ipst, B_FALSE,
 286                             ixa->ixa_src_preferences, &src, NULL, NULL);
 287                 }
 288                 if (error != 0) {
 289                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
 290                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
 291                         ip_drop_output("ipIfStatsOutDiscards - no source",
 292                             mp, ill);
 293                         freemsg(mp);
 294                         goto done;
 295                 }
 296                 ip6h->ip6_src = src;
 297         } else if (ixaflags & IXAF_VERIFY_SOURCE) {
 298                 /* Check if the IP source is assigned to the host. */
 299                 if (!ip_verify_src(mp, ixa, NULL)) {
 300                         /* Don't send a packet with a source that isn't ours */
 301                         BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
 302                         BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
 303                         ip_drop_output("ipIfStatsOutDiscards - invalid source",
 304                             mp, ill);
 305                         freemsg(mp);
 306                         error = EADDRNOTAVAIL;
 307                         goto done;
 308                 }
 309         }
 310 
 311         /*
 312          * Check against global IPsec policy to set the AH/ESP attributes.
 313          * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate.
 314          */
 315         if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
 316                 ASSERT(ixa->ixa_ipsec_policy == NULL);
 317                 mp = ip_output_attach_policy(mp, NULL, ip6h, NULL, ixa);
 318                 if (mp == NULL) {
 319                         /* MIB and ip_drop_packet already done */
 320                         return (EHOSTUNREACH);  /* IPsec policy failure */
 321                 }
 322         }
 323 
 324         if (ill != NULL) {
 325                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
 326         } else {
 327                 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
 328         }
 329 
 330         /*
 331          * We update the statistics on the most specific IRE i.e., the first
 332          * one we found.
 333          * We don't have an IRE when we fragment, hence ire_ob_pkt_count
 334          * can only count the use prior to fragmentation. However the MIB
 335          * counters on the ill will be incremented in post fragmentation.
 336          */
 337         ire->ire_ob_pkt_count++;
 338 
 339         /*
 340          * Based on ire_type and ire_flags call one of:
 341          *      ire_send_local_v6 - for IRE_LOCAL and IRE_LOOPBACK
 342          *      ire_send_multirt_v6 - if RTF_MULTIRT
 343          *      ire_send_noroute_v6 - if RTF_REJECT or RTF_BLACHOLE
 344          *      ire_send_multicast_v6 - for IRE_MULTICAST
 345          *      ire_send_wire_v6 - for the rest.
 346          */
 347         error = (ire->ire_sendfn)(ire, mp, ip6h, ixa, &dce->dce_ident);
 348 done:
 349         ire_refrele(ire);
 350         if (dce != NULL)
 351                 dce_refrele(dce);
 352         if (ill != NULL)
 353                 ill_refrele(ill);
 354         if (ixa->ixa_nce != NULL)
 355                 nce_refrele(ixa->ixa_nce);
 356         ixa->ixa_nce = NULL;
 357         return (error);
 358 }
 359 
 360 /*
 361  * ire_sendfn() functions.
 362  * These functions use the following xmit_attr:
 363  *  - ixa_fragsize - read to determine whether or not to fragment
 364  *  - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec
 365  *  - ixa_ipsec_*  are used inside IPsec
 366  *  - IXAF_LOOPBACK_COPY - for multicast
 367  */
 368 
 369 
 370 /*
 371  * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK
 372  *
 373  * The checks for restrict_interzone_loopback are done in ire_route_recursive.
 374  */
 375 /* ARGSUSED4 */
 376 int
 377 ire_send_local_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
 378     ip_xmit_attr_t *ixa, uint32_t *identp)
 379 {
 380         ip6_t           *ip6h = (ip6_t *)iph_arg;
 381         ip_stack_t      *ipst = ixa->ixa_ipst;
 382         ill_t           *ill = ire->ire_ill;
 383         ip_recv_attr_t  iras;   /* NOTE: No bzero for performance */
 384         uint_t          pktlen = ixa->ixa_pktlen;
 385 
 386         /*
 387          * No fragmentation, no nce, and no application of IPsec.
 388          *
 389          *
 390          * Note different order between IP provider and FW_HOOKS than in
 391          * send_wire case.
 392          */
 393 
 394         /*
 395          * DTrace this as ip:::send.  A packet blocked by FW_HOOKS will fire the
 396          * send probe, but not the receive probe.
 397          */
 398         DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
 399             ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h,
 400             int, 1);
 401 
 402         DTRACE_PROBE4(ip6__loopback__out__start,
 403             ill_t *, NULL, ill_t *, ill,
 404             ip6_t *, ip6h, mblk_t *, mp);
 405 
 406         if (HOOKS6_INTERESTED_LOOPBACK_OUT(ipst)) {
 407                 int     error;
 408 
 409                 FW_HOOKS(ipst->ips_ip6_loopback_out_event,
 410                     ipst->ips_ipv6firewall_loopback_out,
 411                     NULL, ill, ip6h, mp, mp, 0, ipst, error);
 412 
 413                 DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp);
 414                 if (mp == NULL)
 415                         return (error);
 416 
 417                 /*
 418                  * Even if the destination was changed by the filter we use the
 419                  * forwarding decision that was made based on the address
 420                  * in ip_output/ip_set_destination.
 421                  */
 422                 /* Length could be different */
 423                 ip6h = (ip6_t *)mp->b_rptr;
 424                 pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
 425         }
 426 
 427         /*
 428          * If a callback is enabled then we need to know the
 429          * source and destination zoneids for the packet. We already
 430          * have those handy.
 431          */
 432         if (ipst->ips_ip6_observe.he_interested) {
 433                 zoneid_t szone, dzone;
 434                 zoneid_t stackzoneid;
 435 
 436                 stackzoneid = netstackid_to_zoneid(
 437                     ipst->ips_netstack->netstack_stackid);
 438 
 439                 if (stackzoneid == GLOBAL_ZONEID) {
 440                         /* Shared-IP zone */
 441                         dzone = ire->ire_zoneid;
 442                         szone = ixa->ixa_zoneid;
 443                 } else {
 444                         szone = dzone = stackzoneid;
 445                 }
 446                 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst);
 447         }
 448 
 449         /* Handle lo0 stats */
 450         ipst->ips_loopback_packets++;
 451 
 452         /*
 453          * Update output mib stats. Note that we can't move into the icmp
 454          * sender (icmp_output etc) since they don't know the ill and the
 455          * stats are per ill.
 456          */
 457         if (ixa->ixa_protocol == IPPROTO_ICMPV6) {
 458                 icmp6_t         *icmp6;
 459 
 460                 icmp6 = (icmp6_t *)((uchar_t *)ip6h + ixa->ixa_ip_hdr_length);
 461                 icmp_update_out_mib_v6(ill, icmp6);
 462         }
 463 
 464         DTRACE_PROBE4(ip6__loopback__in__start,
 465             ill_t *, ill, ill_t *, NULL,
 466             ip6_t *, ip6h, mblk_t *, mp);
 467 
 468         if (HOOKS6_INTERESTED_LOOPBACK_IN(ipst)) {
 469                 int     error;
 470 
 471                 FW_HOOKS(ipst->ips_ip6_loopback_in_event,
 472                     ipst->ips_ipv6firewall_loopback_in,
 473                     ill, NULL, ip6h, mp, mp, 0, ipst, error);
 474 
 475                 DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp);
 476                 if (mp == NULL)
 477                         return (error);
 478 
 479                 /*
 480                  * Even if the destination was changed by the filter we use the
 481                  * forwarding decision that was made based on the address
 482                  * in ip_output/ip_set_destination.
 483                  */
 484                 /* Length could be different */
 485                 ip6h = (ip6_t *)mp->b_rptr;
 486                 pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
 487         }
 488 
 489         DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
 490             ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h,
 491             int, 1);
 492 
 493         /* Map ixa to ira including IPsec policies */
 494         ipsec_out_to_in(ixa, ill, &iras);
 495         iras.ira_pktlen = pktlen;
 496 
 497         ire->ire_ib_pkt_count++;
 498         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
 499         UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen);
 500 
 501         /* Destined to ire_zoneid - use that for fanout */
 502         iras.ira_zoneid = ire->ire_zoneid;
 503 
 504         if (is_system_labeled()) {
 505                 iras.ira_flags |= IRAF_SYSTEM_LABELED;
 506 
 507                 /*
 508                  * This updates ira_cred, ira_tsl and ira_free_flags based
 509                  * on the label. We don't expect this to ever fail for
 510                  * loopback packets, so we silently drop the packet should it
 511                  * fail.
 512                  */
 513                 if (!tsol_get_pkt_label(mp, IPV6_VERSION, &iras)) {
 514                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 515                         ip_drop_input("tsol_get_pkt_label", mp, ill);
 516                         freemsg(mp);
 517                         return (0);
 518                 }
 519                 ASSERT(iras.ira_tsl != NULL);
 520 
 521                 /* tsol_get_pkt_label sometimes does pullupmsg */
 522                 ip6h = (ip6_t *)mp->b_rptr;
 523         }
 524 
 525         ip_fanout_v6(mp, ip6h, &iras);
 526 
 527         /* We moved any IPsec refs from ixa to iras */
 528         ira_cleanup(&iras, B_FALSE);
 529         return (0);
 530 }
 531 
 532 static void
 533 multirt_check_v6(ire_t *ire, ip6_t *ip6h, ip_xmit_attr_t *ixa)
 534 {
 535         ip_stack_t *ipst = ixa->ixa_ipst;
 536 
 537         /* Limit the TTL on multirt packets. Do this even if IPV6_HOPLIMIT */
 538         if (ire->ire_type & IRE_MULTICAST) {
 539                 if (ip6h->ip6_hops > 1) {
 540                         ip2dbg(("ire_send_multirt_v6: forcing multicast "
 541                             "multirt TTL to 1 (was %d)\n", ip6h->ip6_hops));
 542                         ip6h->ip6_hops = 1;
 543                 }
 544                 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
 545         } else if ((ipst->ips_ip_multirt_ttl > 0) &&
 546             (ip6h->ip6_hops > ipst->ips_ip_multirt_ttl)) {
 547                 ip6h->ip6_hops = ipst->ips_ip_multirt_ttl;
 548                 /*
 549                  * Need to ensure we don't increase the ttl should we go through
 550                  * ire_send_multicast.
 551                  */
 552                 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
 553         }
 554 
 555         /* For IPv6 this also needs to insert a fragment header */
 556         ixa->ixa_flags |= IXAF_IPV6_ADD_FRAGHDR;
 557 }
 558 
 559 /*
 560  * ire_sendfn for IRE_MULTICAST
 561  *
 562  * Note that we do path MTU discovery by default for IPv6 multicast. But
 563  * since unconnected UDP and RAW sockets don't set IXAF_PMTU_DISCOVERY
 564  * only connected sockets get this by default.
 565  */
 566 int
 567 ire_send_multicast_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
 568     ip_xmit_attr_t *ixa, uint32_t *identp)
 569 {
 570         ip6_t           *ip6h = (ip6_t *)iph_arg;
 571         ip_stack_t      *ipst = ixa->ixa_ipst;
 572         ill_t           *ill = ire->ire_ill;
 573         iaflags_t       ixaflags = ixa->ixa_flags;
 574 
 575         /*
 576          * The IRE_MULTICAST is the same whether or not multirt is in use.
 577          * Hence we need special-case code.
 578          */
 579         if (ixaflags & IXAF_MULTIRT_MULTICAST)
 580                 multirt_check_v6(ire, ip6h, ixa);
 581 
 582         /*
 583          * Check if anything in ip_input_v6 wants a copy of the transmitted
 584          * packet (after IPsec and fragmentation)
 585          *
 586          * 1. Multicast routers always need a copy unless SO_DONTROUTE is set
 587          *    RSVP and the rsvp daemon is an example of a
 588          *    protocol and user level process that
 589          *    handles it's own routing. Hence, it uses the
 590          *    SO_DONTROUTE option to accomplish this.
 591          * 2. If the sender has set IP_MULTICAST_LOOP, then we just
 592          *    check whether there are any receivers for the group on the ill
 593          *    (ignoring the zoneid).
 594          * 3. If IP_MULTICAST_LOOP is not set, then we check if there are
 595          *    any members in other shared-IP zones.
 596          *    If such members exist, then we indicate that the sending zone
 597          *    shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP
 598          *    behavior.
 599          *
 600          * When we loopback we skip hardware checksum to make sure loopback
 601          * copy is checksumed.
 602          *
 603          * Note that ire_ill is the upper in the case of IPMP.
 604          */
 605         ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM);
 606         if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 &&
 607             !(ixaflags & IXAF_DONTROUTE)) {
 608                 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
 609         } else if (ixaflags & IXAF_MULTICAST_LOOP) {
 610                 /*
 611                  * If this zone or any other zone has members then loopback
 612                  * a copy.
 613                  */
 614                 if (ill_hasmembers_v6(ill, &ip6h->ip6_dst))
 615                         ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
 616         } else if (ipst->ips_netstack->netstack_numzones > 1) {
 617                 /*
 618                  * This zone should not have a copy. But there are some other
 619                  * zones which might have members.
 620                  */
 621                 if (ill_hasmembers_otherzones_v6(ill, &ip6h->ip6_dst,
 622                     ixa->ixa_zoneid)) {
 623                         ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET;
 624                         ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid;
 625                         ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
 626                 }
 627         }
 628 
 629         /*
 630          * Unless IPV6_HOPLIMIT or ire_send_multirt_v6 already set a ttl,
 631          * force the ttl to the IP_MULTICAST_TTL value
 632          */
 633         if (!(ixaflags & IXAF_NO_TTL_CHANGE)) {
 634                 ip6h->ip6_hops = ixa->ixa_multicast_ttl;
 635         }
 636 
 637         return (ire_send_wire_v6(ire, mp, ip6h, ixa, identp));
 638 }
 639 
 640 /*
 641  * ire_sendfn for IREs with RTF_MULTIRT
 642  */
 643 int
 644 ire_send_multirt_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
 645     ip_xmit_attr_t *ixa, uint32_t *identp)
 646 {
 647         ip6_t           *ip6h = (ip6_t *)iph_arg;
 648 
 649         multirt_check_v6(ire, ip6h, ixa);
 650 
 651         if (ire->ire_type & IRE_MULTICAST)
 652                 return (ire_send_multicast_v6(ire, mp, ip6h, ixa, identp));
 653         else
 654                 return (ire_send_wire_v6(ire, mp, ip6h, ixa, identp));
 655 }
 656 
 657 /*
 658  * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE
 659  */
 660 /* ARGSUSED4 */
 661 int
 662 ire_send_noroute_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
 663     ip_xmit_attr_t *ixa, uint32_t *identp)
 664 {
 665         ip6_t           *ip6h = (ip6_t *)iph_arg;
 666         ip_stack_t      *ipst = ixa->ixa_ipst;
 667         ill_t           *ill;
 668         ip_recv_attr_t  iras;
 669         boolean_t       dummy;
 670 
 671         BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
 672 
 673         if (ire->ire_type & IRE_NOROUTE) {
 674                 /* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */
 675                 ip_rts_change_v6(RTM_MISS, &ip6h->ip6_dst, 0, 0, 0, 0, 0, 0,
 676                     RTA_DST, ipst);
 677         }
 678 
 679         if (ire->ire_flags & RTF_BLACKHOLE) {
 680                 ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL);
 681                 freemsg(mp);
 682                 /* No error even for local senders - silent blackhole */
 683                 return (0);
 684         }
 685         ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL);
 686 
 687         /*
 688          * We need an ill_t for the ip_recv_attr_t even though this packet
 689          * was never received and icmp_unreachable doesn't currently use
 690          * ira_ill.
 691          */
 692         ill = ill_lookup_on_name("lo0", B_FALSE,
 693             !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst);
 694         if (ill == NULL) {
 695                 freemsg(mp);
 696                 return (EHOSTUNREACH);
 697         }
 698 
 699         bzero(&iras, sizeof (iras));
 700         /* Map ixa to ira including IPsec policies */
 701         ipsec_out_to_in(ixa, ill, &iras);
 702 
 703         icmp_unreachable_v6(mp, ICMP6_DST_UNREACH_NOROUTE, B_FALSE, &iras);
 704         /* We moved any IPsec refs from ixa to iras */
 705         ira_cleanup(&iras, B_FALSE);
 706 
 707         ill_refrele(ill);
 708         return (EHOSTUNREACH);
 709 }
 710 
 711 /*
 712  * Calculate a checksum ignoring any hardware capabilities
 713  *
 714  * Returns B_FALSE if the packet was too short for the checksum. Caller
 715  * should free and do stats.
 716  */
 717 static boolean_t
 718 ip_output_sw_cksum_v6(mblk_t *mp, ip6_t *ip6h, ip_xmit_attr_t *ixa)
 719 {
 720         ip_stack_t      *ipst = ixa->ixa_ipst;
 721         uint_t          pktlen = ixa->ixa_pktlen;
 722         uint16_t        *cksump;
 723         uint32_t        cksum;
 724         uint8_t         protocol = ixa->ixa_protocol;
 725         uint16_t        ip_hdr_length = ixa->ixa_ip_hdr_length;
 726 
 727 #define iphs    ((uint16_t *)ip6h)
 728 
 729         /* Just in case it contained garbage */
 730         DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
 731 
 732         /*
 733          * Calculate ULP checksum
 734          */
 735         if (protocol == IPPROTO_TCP) {
 736                 cksump = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_length);
 737                 cksum = IP_TCP_CSUM_COMP;
 738         } else if (protocol == IPPROTO_UDP) {
 739                 cksump = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_length);
 740                 cksum = IP_UDP_CSUM_COMP;
 741         } else if (protocol == IPPROTO_SCTP) {
 742                 sctp_hdr_t      *sctph;
 743 
 744                 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
 745                 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
 746                 /*
 747                  * Zero out the checksum field to ensure proper
 748                  * checksum calculation.
 749                  */
 750                 sctph->sh_chksum = 0;
 751 #ifdef  DEBUG
 752                 if (!skip_sctp_cksum)
 753 #endif
 754                         sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
 755                 return (B_TRUE);
 756         } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
 757                 /*
 758                  * icmp has placed length and routing
 759                  * header adjustment in the checksum field.
 760                  */
 761                 cksump = (uint16_t *)(((uint8_t *)ip6h) + ip_hdr_length +
 762                     ixa->ixa_raw_cksum_offset);
 763                 cksum = htons(protocol);
 764         } else if (protocol == IPPROTO_ICMPV6) {
 765                 cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length);
 766                 cksum = IP_ICMPV6_CSUM_COMP;    /* Pseudo-header cksum */
 767         } else {
 768                 return (B_TRUE);
 769         }
 770 
 771         /* ULP puts the checksum field is in the first mblk */
 772         ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);
 773 
 774         /*
 775          * We accumulate the pseudo header checksum in cksum.
 776          * This is pretty hairy code, so watch close.  One
 777          * thing to keep in mind is that UDP and TCP have
 778          * stored their respective datagram lengths in their
 779          * checksum fields.  This lines things up real nice.
 780          */
 781         cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] +
 782             iphs[8] + iphs[9] + iphs[10] + iphs[11] +
 783             iphs[12] + iphs[13] + iphs[14] + iphs[15] +
 784             iphs[16] + iphs[17] + iphs[18] + iphs[19];
 785         cksum = IP_CSUM(mp, ip_hdr_length, cksum);
 786 
 787         /*
 788          * For UDP/IPv6 a zero UDP checksum is not allowed.
 789          * Change to 0xffff
 790          */
 791         if (protocol == IPPROTO_UDP && cksum == 0)
 792                 *cksump = ~cksum;
 793         else
 794                 *cksump = cksum;
 795 
 796         IP6_STAT(ipst, ip6_out_sw_cksum);
 797         IP6_STAT_UPDATE(ipst, ip6_out_sw_cksum_bytes, pktlen);
 798 
 799         /* No IP header checksum for IPv6 */
 800 
 801         return (B_TRUE);
 802 #undef  iphs
 803 }
 804 
/*
 * There are drivers that can't do partial checksum for ICMPv6.
 * While this tunable is non-zero, ip_output_cksum_v6() does not hand
 * ICMPv6 packets to partial hardware checksum offload.
 */
int nxge_cksum_workaround = 1;
 807 
 808 /*
 809  * Calculate the ULP checksum - try to use hardware.
 810  * In the case of MULTIRT or multicast the
 811  * IXAF_NO_HW_CKSUM is set in which case we use software.
 812  *
 813  * Returns B_FALSE if the packet was too short for the checksum. Caller
 814  * should free and do stats.
 815  */
 816 static boolean_t
 817 ip_output_cksum_v6(iaflags_t ixaflags, mblk_t *mp, ip6_t *ip6h,
 818     ip_xmit_attr_t *ixa, ill_t *ill)
 819 {
 820         uint_t          pktlen = ixa->ixa_pktlen;
 821         uint16_t        *cksump;
 822         uint16_t        hck_flags;
 823         uint32_t        cksum;
 824         uint8_t         protocol = ixa->ixa_protocol;
 825         uint16_t        ip_hdr_length = ixa->ixa_ip_hdr_length;
 826 
 827 #define iphs    ((uint16_t *)ip6h)
 828 
 829         if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
 830             !dohwcksum) {
 831                 return (ip_output_sw_cksum_v6(mp, ip6h, ixa));
 832         }
 833 
 834         /*
 835          * Calculate ULP checksum. Note that we don't use cksump and cksum
 836          * if the ill has FULL support.
 837          */
 838         if (protocol == IPPROTO_TCP) {
 839                 cksump = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_length);
 840                 cksum = IP_TCP_CSUM_COMP;       /* Pseudo-header cksum */
 841         } else if (protocol == IPPROTO_UDP) {
 842                 cksump = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_length);
 843                 cksum = IP_UDP_CSUM_COMP;       /* Pseudo-header cksum */
 844         } else if (protocol == IPPROTO_SCTP) {
 845                 sctp_hdr_t      *sctph;
 846 
 847                 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
 848                 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
 849                 /*
 850                  * Zero out the checksum field to ensure proper
 851                  * checksum calculation.
 852                  */
 853                 sctph->sh_chksum = 0;
 854 #ifdef  DEBUG
 855                 if (!skip_sctp_cksum)
 856 #endif
 857                         sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
 858                 goto ip_hdr_cksum;
 859         } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
 860                 /*
 861                  * icmp has placed length and routing
 862                  * header adjustment in the checksum field.
 863                  */
 864                 cksump = (uint16_t *)(((uint8_t *)ip6h) + ip_hdr_length +
 865                     ixa->ixa_raw_cksum_offset);
 866                 cksum = htons(protocol);
 867         } else if (protocol == IPPROTO_ICMPV6) {
 868                 cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length);
 869                 cksum = IP_ICMPV6_CSUM_COMP;    /* Pseudo-header cksum */
 870         } else {
 871         ip_hdr_cksum:
 872                 /* No IP header checksum for IPv6 */
 873                 return (B_TRUE);
 874         }
 875 
 876         /* ULP puts the checksum field is in the first mblk */
 877         ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);
 878 
 879         /*
 880          * Underlying interface supports hardware checksum offload for
 881          * the payload; leave the payload checksum for the hardware to
 882          * calculate.  N.B: We only need to set up checksum info on the
 883          * first mblk.
 884          */
 885         hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags;
 886 
 887         DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
 888         if (hck_flags & HCKSUM_INET_FULL_V6) {
 889                 /*
 890                  * Hardware calculates pseudo-header, header and the
 891                  * payload checksums, so clear the checksum field in
 892                  * the protocol header.
 893                  */
 894                 *cksump = 0;
 895                 DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;
 896                 return (B_TRUE);
 897         }
 898         if (((hck_flags) & HCKSUM_INET_PARTIAL) &&
 899             (protocol != IPPROTO_ICMPV6 || !nxge_cksum_workaround)) {
 900                 /*
 901                  * Partial checksum offload has been enabled.  Fill
 902                  * the checksum field in the protocol header with the
 903                  * pseudo-header checksum value.
 904                  *
 905                  * We accumulate the pseudo header checksum in cksum.
 906                  * This is pretty hairy code, so watch close.  One
 907                  * thing to keep in mind is that UDP and TCP have
 908                  * stored their respective datagram lengths in their
 909                  * checksum fields.  This lines things up real nice.
 910                  */
 911                 cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] +
 912                     iphs[8] + iphs[9] + iphs[10] + iphs[11] +
 913                     iphs[12] + iphs[13] + iphs[14] + iphs[15] +
 914                     iphs[16] + iphs[17] + iphs[18] + iphs[19];
 915                 cksum += *(cksump);
 916                 cksum = (cksum & 0xFFFF) + (cksum >> 16);
 917                 *(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
 918 
 919                 /*
 920                  * Offsets are relative to beginning of IP header.
 921                  */
 922                 DB_CKSUMSTART(mp) = ip_hdr_length;
 923                 DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ip6h;
 924                 DB_CKSUMEND(mp) = pktlen;
 925                 DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;
 926                 return (B_TRUE);
 927         }
 928         /* Hardware capabilities include neither full nor partial IPv6 */
 929         return (ip_output_sw_cksum_v6(mp, ip6h, ixa));
 930 #undef  iphs
 931 }
 932 
 933 /*
 934  * ire_sendfn for offlink and onlink destinations.
 935  * Also called from the multicast, and multirt send functions.
 936  *
 937  * Assumes that the caller has a hold on the ire.
 938  *
 939  * This function doesn't care if the IRE just became condemned since that
 940  * can happen at any time.
 941  */
 942 /* ARGSUSED */
 943 int
 944 ire_send_wire_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
 945     ip_xmit_attr_t *ixa, uint32_t *identp)
 946 {
 947         ip_stack_t      *ipst = ixa->ixa_ipst;
 948         ip6_t           *ip6h = (ip6_t *)iph_arg;
 949         iaflags_t       ixaflags = ixa->ixa_flags;
 950         ill_t           *ill;
 951         uint32_t        pktlen = ixa->ixa_pktlen;
 952 
 953         ASSERT(ixa->ixa_nce != NULL);
 954         ill = ixa->ixa_nce->nce_ill;
 955 
 956         /*
 957          * Update output mib stats. Note that we can't move into the icmp
 958          * sender (icmp_output etc) since they don't know the ill and the
 959          * stats are per ill.
 960          *
 961          * With IPMP we record the stats on the upper ill.
 962          */
 963         if (ixa->ixa_protocol == IPPROTO_ICMPV6) {
 964                 icmp6_t         *icmp6;
 965 
 966                 icmp6 = (icmp6_t *)((uchar_t *)ip6h + ixa->ixa_ip_hdr_length);
 967                 icmp_update_out_mib_v6(ixa->ixa_nce->nce_common->ncec_ill,
 968                     icmp6);
 969         }
 970 
 971         if (ixaflags & IXAF_DONTROUTE)
 972                 ip6h->ip6_hops = 1;
 973 
 974         /*
 975          * This might set b_band, thus the IPsec and fragmentation
 976          * code in IP ensures that b_band is updated in the first mblk.
 977          */
 978         if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
 979                 /* ip_process translates an IS_UNDER_IPMP */
 980                 mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill);
 981                 if (mp == NULL) {
 982                         /* ip_drop_packet and MIB done */
 983                         return (0);     /* Might just be delayed */
 984                 }
 985         }
 986 
 987         /*
 988          * To handle IPsec/iptun's labeling needs we need to tag packets
 989          * while we still have ixa_tsl
 990          */
 991         if (is_system_labeled() && ixa->ixa_tsl != NULL &&
 992             (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 ||
 993             ill->ill_mactype == DL_IPV6)) {
 994                 cred_t *newcr;
 995 
 996                 newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl,
 997                     KM_NOSLEEP);
 998                 if (newcr == NULL) {
 999                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1000                         ip_drop_output("ipIfStatsOutDiscards - newcr",
1001                             mp, ill);
1002                         freemsg(mp);
1003                         return (ENOBUFS);
1004                 }
1005                 mblk_setcred(mp, newcr, NOPID);
1006                 crfree(newcr);  /* mblk_setcred did its own crhold */
1007         }
1008 
1009         /*
1010          * IXAF_IPV6_ADD_FRAGHDR is set for CGTP so that we will add a
1011          * fragment header without fragmenting. CGTP on the receiver will
1012          * filter duplicates on the ident field.
1013          */
1014         if (pktlen > ixa->ixa_fragsize ||
1015             (ixaflags & (IXAF_IPSEC_SECURE|IXAF_IPV6_ADD_FRAGHDR))) {
1016                 uint32_t ident;
1017 
1018                 if (ixaflags & IXAF_IPSEC_SECURE)
1019                         pktlen += ipsec_out_extra_length(ixa);
1020 
1021                 if (pktlen > IP_MAXPACKET)
1022                         return (EMSGSIZE);
1023 
1024                 if (ixaflags & IXAF_SET_ULP_CKSUM) {
1025                         /*
1026                          * Compute ULP checksum using software
1027                          */
1028                         if (!ip_output_sw_cksum_v6(mp, ip6h, ixa)) {
1029                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1030                                 ip_drop_output("ipIfStatsOutDiscards", mp, ill);
1031                                 freemsg(mp);
1032                                 return (EINVAL);
1033                         }
1034                         /* Avoid checksum again below if we only add fraghdr */
1035                         ixaflags &= ~IXAF_SET_ULP_CKSUM;
1036                 }
1037 
1038                 /*
1039                  * If we need a fragment header, pick the ident and insert
1040                  * the header before IPsec to we have a place to store
1041                  * the ident value.
1042                  */
1043                 if ((ixaflags & IXAF_IPV6_ADD_FRAGHDR) ||
1044                     pktlen > ixa->ixa_fragsize) {
1045                         /*
1046                          * If this packet would generate a icmp_frag_needed
1047                          * message, we need to handle it before we do the IPsec
1048                          * processing. Otherwise, we need to strip the IPsec
1049                          * headers before we send up the message to the ULPs
1050                          * which becomes messy and difficult.
1051                          */
1052                         if ((pktlen > ixa->ixa_fragsize) &&
1053                             (ixaflags & IXAF_DONTFRAG)) {
1054                                 /* Generate ICMP and return error */
1055                                 ip_recv_attr_t  iras;
1056 
1057                                 DTRACE_PROBE4(ip6__fragsize__fail,
1058                                     uint_t, pktlen, uint_t, ixa->ixa_fragsize,
1059                                     uint_t, ixa->ixa_pktlen,
1060                                     uint_t, ixa->ixa_pmtu);
1061 
1062                                 bzero(&iras, sizeof (iras));
1063                                 /* Map ixa to ira including IPsec policies */
1064                                 ipsec_out_to_in(ixa, ill, &iras);
1065 
1066                                 ip_drop_output("ICMP6_PKT_TOO_BIG", mp, ill);
1067                                 icmp_pkt2big_v6(mp, ixa->ixa_fragsize, B_TRUE,
1068                                     &iras);
1069                                 /* We moved any IPsec refs from ixa to iras */
1070                                 ira_cleanup(&iras, B_FALSE);
1071                                 return (EMSGSIZE);
1072                         }
1073                         DTRACE_PROBE4(ip6__fragsize__ok, uint_t, pktlen,
1074                             uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
1075                             uint_t, ixa->ixa_pmtu);
1076                         /*
1077                          * Assign an ident value for this packet. There could
1078                          * be other threads targeting the same destination, so
1079                          * we have to arrange for a atomic increment.
1080                          * Normally ixa_extra_ident is 0, but in the case of
1081                          * LSO it will be the number of TCP segments  that the
1082                          * driver/hardware will extraly construct.
1083                          *
1084                          * Note that cl_inet_ipident has only been used for
1085                          * IPv4. We don't use it here.
1086                          */
1087                         ident = atomic_add_32_nv(identp, ixa->ixa_extra_ident +
1088                             1);
1089                         ixa->ixa_ident = ident;      /* In case we do IPsec */
1090                 }
1091                 if (ixaflags & IXAF_IPSEC_SECURE) {
1092                         /*
1093                          * Pass in sufficient information so that
1094                          * IPsec can determine whether to fragment, and
1095                          * which function to call after fragmentation.
1096                          */
1097                         return (ipsec_out_process(mp, ixa));
1098                 }
1099 
1100                 mp = ip_fraghdr_add_v6(mp, ident, ixa);
1101                 if (mp == NULL) {
1102                         /* MIB and ip_drop_output already done */
1103                         return (ENOMEM);
1104                 }
1105                 ASSERT(pktlen == ixa->ixa_pktlen);
1106                 pktlen += sizeof (ip6_frag_t);
1107 
1108                 if (pktlen > ixa->ixa_fragsize) {
1109                         return (ip_fragment_v6(mp, ixa->ixa_nce, ixaflags,
1110                             pktlen, ixa->ixa_fragsize,
1111                             ixa->ixa_xmit_hint, ixa->ixa_zoneid,
1112                             ixa->ixa_no_loop_zoneid, ixa->ixa_postfragfn,
1113                             &ixa->ixa_cookie));
1114                 }
1115         }
1116         if (ixaflags & IXAF_SET_ULP_CKSUM) {
1117                 /* Compute ULP checksum and IP header checksum */
1118                 /* An IS_UNDER_IPMP ill is ok here */
1119                 if (!ip_output_cksum_v6(ixaflags, mp, ip6h, ixa, ill)) {
1120                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1121                         ip_drop_output("ipIfStatsOutDiscards", mp, ill);
1122                         freemsg(mp);
1123                         return (EINVAL);
1124                 }
1125         }
1126         return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags,
1127             pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
1128             ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie));
1129 }
1130 
1131 /*
1132  * Post fragmentation function for RTF_MULTIRT routes.
1133  * Since IRE_MULTICASTs might have RTF_MULTIRT, this function
1134  * checks IXAF_LOOPBACK_COPY.
1135  *
1136  * If no packet is sent due to failures then we return an errno, but if at
1137  * least one succeeded we return zero.
1138  */
1139 int
1140 ip_postfrag_multirt_v6(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
1141     uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
1142     uintptr_t *ixacookie)
1143 {
1144         irb_t           *irb;
1145         ip6_t           *ip6h = (ip6_t *)mp->b_rptr;
1146         ire_t           *ire;
1147         ire_t           *ire1;
1148         mblk_t          *mp1;
1149         nce_t           *nce1;
1150         ill_t           *ill = nce->nce_ill;
1151         ill_t           *ill1;
1152         ip_stack_t      *ipst = ill->ill_ipst;
1153         int             error = 0;
1154         int             num_sent = 0;
1155         int             err;
1156         uint_t          ire_type;
1157         in6_addr_t      nexthop;
1158 
1159         ASSERT(!(ixaflags & IXAF_IS_IPV4));
1160 
1161         /* Check for IXAF_LOOPBACK_COPY */
1162         if (ixaflags & IXAF_LOOPBACK_COPY) {
1163                 mblk_t *mp1;
1164 
1165                 mp1 = copymsg(mp);
1166                 if (mp1 == NULL) {
1167                         /* Failed to deliver the loopback copy. */
1168                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1169                         ip_drop_output("ipIfStatsOutDiscards", mp, ill);
1170                         error = ENOBUFS;
1171                 } else {
1172                         ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
1173                             nolzid);
1174                 }
1175         }
1176 
1177         /*
1178          * Loop over RTF_MULTIRT for ip6_dst in the same bucket. Send
1179          * a copy to each one.
1180          * Use the nce (nexthop) and ip6_dst to find the ire.
1181          *
1182          * MULTIRT is not designed to work with shared-IP zones thus we don't
1183          * need to pass a zoneid or a label to the IRE lookup.
1184          */
1185         if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, &ip6h->ip6_dst)) {
1186                 /* Broadcast and multicast case */
1187                 ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, 0, 0, NULL,
1188                     ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
1189         } else {
1190                 /* Unicast case */
1191                 ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, &nce->nce_addr,
1192                     0, NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL);
1193         }
1194 
1195         if (ire == NULL ||
1196             (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
1197             !(ire->ire_flags & RTF_MULTIRT)) {
1198                 /* Drop */
1199                 ip_drop_output("ip_postfrag_multirt didn't find route",
1200                     mp, nce->nce_ill);
1201                 if (ire != NULL)
1202                         ire_refrele(ire);
1203                 return (ENETUNREACH);
1204         }
1205 
1206         irb = ire->ire_bucket;
1207         irb_refhold(irb);
1208         for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
1209                 if (IRE_IS_CONDEMNED(ire1) ||
1210                     !(ire1->ire_flags & RTF_MULTIRT))
1211                         continue;
1212 
1213                 /* Note: When IPv6 uses radix tree we don't need this check */
1214                 if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &ire1->ire_addr_v6))
1215                         continue;
1216 
1217                 /* Do the ire argument one after the loop */
1218                 if (ire1 == ire)
1219                         continue;
1220 
1221                 ill1 = ire_nexthop_ill(ire1);
1222                 if (ill1 == NULL) {
1223                         /*
1224                          * This ire might not have been picked by
1225                          * ire_route_recursive, in which case ire_dep might
1226                          * not have been setup yet.
1227                          * We kick ire_route_recursive to try to resolve
1228                          * starting at ire1.
1229                          */
1230                         ire_t *ire2;
1231                         uint_t match_flags = MATCH_IRE_DSTONLY;
1232 
1233                         if (ire1->ire_ill != NULL)
1234                                 match_flags |= MATCH_IRE_ILL;
1235                         ire2 = ire_route_recursive_impl_v6(ire1,
1236                             &ire1->ire_addr_v6, ire1->ire_type, ire1->ire_ill,
1237                             ire1->ire_zoneid, NULL, match_flags,
1238                             IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
1239                         if (ire2 != NULL)
1240                                 ire_refrele(ire2);
1241                         ill1 = ire_nexthop_ill(ire1);
1242                 }
1243                 if (ill1 == NULL) {
1244                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1245                         ip_drop_output("ipIfStatsOutDiscards - no ill",
1246                             mp, ill);
1247                         error = ENETUNREACH;
1248                         continue;
1249                 }
1250                 /* Pick the addr and type to use for ndp_nce_init */
1251                 if (nce->nce_common->ncec_flags & NCE_F_MCAST) {
1252                         ire_type = IRE_MULTICAST;
1253                         nexthop = ip6h->ip6_dst;
1254                 } else {
1255                         ire_type = ire1->ire_type;   /* Doesn't matter */
1256                         nexthop = ire1->ire_gateway_addr_v6;
1257                 }
1258 
1259                 /* If IPMP meta or under, then we just drop */
1260                 if (ill1->ill_grp != NULL) {
1261                         BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
1262                         ip_drop_output("ipIfStatsOutDiscards - IPMP",
1263                             mp, ill1);
1264                         ill_refrele(ill1);
1265                         error = ENETUNREACH;
1266                         continue;
1267                 }
1268 
1269                 nce1 = ndp_nce_init(ill1, &nexthop, ire_type);
1270                 if (nce1 == NULL) {
1271                         BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
1272                         ip_drop_output("ipIfStatsOutDiscards - no nce",
1273                             mp, ill1);
1274                         ill_refrele(ill1);
1275                         error = ENOBUFS;
1276                         continue;
1277                 }
1278                 mp1 = copymsg(mp);
1279                 if (mp1 == NULL) {
1280                         BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
1281                         ip_drop_output("ipIfStatsOutDiscards", mp, ill1);
1282                         nce_refrele(nce1);
1283                         ill_refrele(ill1);
1284                         error = ENOBUFS;
1285                         continue;
1286                 }
1287                 /* Preserve HW checksum for this copy */
1288                 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
1289                 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
1290                 DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
1291                 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
1292                 DB_LSOMSS(mp1) = DB_LSOMSS(mp);
1293 
1294                 ire1->ire_ob_pkt_count++;
1295                 err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone,
1296                     0, ixacookie);
1297                 if (err == 0)
1298                         num_sent++;
1299                 else
1300                         error = err;
1301                 nce_refrele(nce1);
1302                 ill_refrele(ill1);
1303         }
1304         irb_refrele(irb);
1305         ire_refrele(ire);
1306         /* Finally, the main one */
1307         err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
1308             ixacookie);
1309         if (err == 0)
1310                 num_sent++;
1311         else
1312                 error = err;
1313         if (num_sent > 0)
1314                 return (0);
1315         else
1316                 return (error);
1317 }