1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2018 Joyent, Inc.
  14  */
  15 
  16 /*
  17  * NAT engine and 1-1 mappings.  The rules in vxlnat_rules.c are only consulted
  18  * if the 1-1 map (kept here) misses or if the outbound lookup (vnetid,
  19  * protocol, src-IP, dst-IP, src-port, dst-port) misses.
  20  *
  21  * The plan is for inbound to hit conn_ts, whose conn_private points to
  22  * entries here.  The conn_recv* functions live here too (for now).
  23  */
  24 
  25 #include <sys/types.h>
  26 #include <sys/socket.h>
  27 #include <sys/ksynch.h>
  28 #include <sys/ksocket.h>
  29 #include <sys/kmem.h>
  30 #include <sys/stream.h>
  31 #include <sys/strsubr.h>
  32 #include <sys/strsun.h>
  33 #include <sys/sysmacros.h>
  34 #include <sys/debug.h>
  35 #include <sys/dtrace.h>
  36 #include <sys/errno.h>
  37 #include <sys/tihdr.h>
  38 #include <netinet/in.h>
  39 #include <netinet/udp.h>
  40 #include <inet/ip.h>
  41 #include <inet/ip6.h>
  42 #include <inet/tcp_impl.h>
  43 #include <inet/udp_impl.h>
  44 #include <inet/tcp.h>
  45 
  46 #include <inet/vxlnat_impl.h>
  47 
  48 static boolean_t vxlnat_vxlan_input(ksocket_t, mblk_t *, size_t, int, void *);
  49 static mblk_t *vxlnat_fixed_fixv4(mblk_t *mp, vxlnat_fixed_t *fixed,
  50     boolean_t to_private);
  51 
  52 /*
  53  * Initialized to NULL, read/write protected by vxlnat_mutex.
  54  * Receive functions shouldn't have to access this directly.
  55  */
  56 ksocket_t vxlnat_underlay;
  57 ire_t *vxlnat_underlay_ire;
  58 
  59 void
  60 vxlnat_closesock(void)
  61 {
  62         ASSERT(MUTEX_HELD(&vxlnat_mutex));
  63         if (vxlnat_underlay_ire != NULL) {
  64                 ire_refrele(vxlnat_underlay_ire);
  65                 vxlnat_underlay_ire = NULL;
  66         }
  67         if (vxlnat_underlay != NULL) {
  68                 (void) ksocket_close(vxlnat_underlay, zone_kcred());
  69                 vxlnat_underlay = NULL;
  70         }
  71 }
  72 
  73 static int
  74 vxlnat_opensock(in6_addr_t *underlay_ip)
  75 {
  76         int rc, val;
  77         /* Assume rest is initialized to 0s. */
  78         struct sockaddr_in6 sin6 = {AF_INET6, BE_16(IPPORT_VXLAN)};
  79         ip_stack_t *ipst = vxlnat_netstack->netstack_ip;
  80 
  81         ASSERT(MUTEX_HELD(&vxlnat_mutex));
  82         /* Open... */
  83         rc = ksocket_socket(&vxlnat_underlay, AF_INET6, SOCK_DGRAM, 0,
  84             KSOCKET_SLEEP, zone_kcred());
  85         if (rc != 0)
  86                 return (rc);
  87 
  88         /* Bind... */
  89         sin6.sin6_addr = *underlay_ip;
  90         rc = ksocket_bind(vxlnat_underlay, (struct sockaddr *)(&sin6),
  91             sizeof (sin6), zone_kcred());
  92         if (rc != 0) {
  93                 vxlnat_closesock();
  94                 return (rc);
  95         }
  96 
  97         /* Use source-port hashing when sending packets out VXLAN... */
  98         val = UDP_HASH_VXLAN;
  99         rc = ksocket_setsockopt(vxlnat_underlay, IPPROTO_UDP,
 100             UDP_SRCPORT_HASH, &val, sizeof (val), kcred);
 101         if (rc != 0) {
 102                 vxlnat_closesock();
 103                 return (rc);
 104         }
 105 
 106         /*
 107          * Grab the IRE for underlay address.
 108          */
 109         ASSERT3P(vxlnat_underlay_ire, ==, NULL);
 110         vxlnat_underlay_ire = (IN6_IS_ADDR_V4MAPPED(underlay_ip)) ?
 111             ire_ftable_lookup_simple_v4(underlay_ip->_S6_un._S6_u32[3],
 112             0, ipst, NULL) :
 113             ire_ftable_lookup_simple_v6(underlay_ip, 0, ipst, NULL);
 114         if (vxlnat_underlay_ire == NULL) {
 115                 DTRACE_PROBE1(vxlnat__opensock__ire__fail, in6_addr_t *,
 116                     underlay_ip);
 117                 vxlnat_closesock();
 118                 return (EADDRNOTAVAIL);
 119         }
 120 
 121         /* Once we return from this, start eating data. */
 122         rc = ksocket_krecv_set(vxlnat_underlay, vxlnat_vxlan_input, NULL);
 123         if (rc != 0) {
 124                 vxlnat_closesock();
 125         }
 126 
 127         return (rc);
 128 }
 129 
 130 /*
 131  * Establish a VXLAN-listening kernel socket.
 132  * XXX KEBE ASKS ==> Support more than one VXLAN address?
 133  */
 134 /* ARGSUSED */
 135 int
 136 vxlnat_vxlan_addr(in6_addr_t *underlay_ip)
 137 {
 138         int rc;
 139 
 140         ASSERT(MUTEX_HELD(&vxlnat_mutex));
 141         /* For now, we make this a one-underlay-address-only solution. */
 142         vxlnat_closesock();
 143         rc = vxlnat_opensock(underlay_ip);
 144         return (rc);
 145 }
 146 
 147 /*
 148  * Free a remote VXLAN destination.
 149  */
 150 void
 151 vxlnat_remote_free(vxlnat_remote_t *remote)
 152 {
 153         ASSERT0(remote->vxnrem_refcount);
 154 
 155         kmem_free(remote, sizeof (*remote));
 156 }
 157 
 158 /*
 159  * Like other unlink functions, assume the appropriate lock is held.
 160  */
 161 void
 162 vxlnat_remote_unlink(vxlnat_remote_t *remote)
 163 {
 164         vxlnat_vnet_t *vnet = remote->vxnrem_vnet;
 165 
 166         ASSERT3P(vnet, !=, NULL);
 167         ASSERT(MUTEX_HELD(&vnet->vxnv_remote_lock));
 168 
 169         /* First unlink so nobody else can find me */
 170         avl_remove(&vnet->vxnv_remotes, remote);
 171 
 172         /*
 173          * We still hold a vnet reference, so races shouldn't be a problem.
 174          * Still, for added safety, NULL it out first.
 175          */
 176         remote->vxnrem_vnet = NULL;  /* Condemn this entry. */
 177         VXNV_REFRELE(vnet);
 178         VXNREM_REFRELE(remote); /* Internment release. */
 179 }
 180 
 181 /*
 182  * Find or create a remote VXLAN destination.
 183  */
 184 static vxlnat_remote_t *
 185 vxlnat_get_remote(vxlnat_vnet_t *vnet, in6_addr_t *remote_addr,
 186     boolean_t create_on_miss)
 187 {
 188         vxlnat_remote_t *remote, searcher;
 189         avl_index_t where;
 190 
 191         searcher.vxnrem_addr = *remote_addr;
 192         mutex_enter(&vnet->vxnv_remote_lock);
 193         remote = avl_find(&vnet->vxnv_remotes, &searcher, &where);
 194         if (remote == NULL && create_on_miss) {
 195                 /* Not as critical if we can't allocate here. */
 196                 remote = kmem_zalloc(sizeof (*remote),
 197                     KM_NOSLEEP | KM_NORMALPRI);
 198                 if (remote != NULL) {
 199                         remote->vxnrem_addr = *remote_addr;
 200                         remote->vxnrem_refcount = 1; /* Internment reference. */
 201                         VXNV_REFHOLD(vnet);
 202                         remote->vxnrem_vnet = vnet;
 203                         /* Rest is filled in by caller. */
 204                         avl_insert(&vnet->vxnv_remotes, remote, where);
 205                 }
 206         }
 207         if (remote != NULL)
 208                 VXNREM_REFHOLD(remote);
 209         mutex_exit(&vnet->vxnv_remote_lock);
 210         return (remote);
 211 }
 212 
 213 /*
 214  * Cache inbound packet information in the vnet's remotes section.
 215  *
 216  * NOTE: This function assumes a trustworthy underlay network.  If the
 217  * underlay isn't trustworthy, this function should be renamed, and reduced to
 218  * a "strip and reality-check the ethernet header" function.
 219  *
 220  * Caller has stripped any pre-ethernet data from mp.  We return mp
 221  * stripped down to its IP header.
 222  */
 223 static mblk_t *
 224 vxlnat_cache_remote(mblk_t *mp, struct sockaddr_in6 *underlay_src,
 225     vxlnat_vnet_t *vnet)
 226 {
 227         struct ether_vlan_header *evh;
 228         struct ether_header *eh;
 229         vxlnat_remote_t *remote;
 230         uint16_t vlan, ethertype;
 231         ether_addr_t remote_ether;
 232         ipha_t *ipha;
 233         ip6_t *ip6h;
 234         in6_addr_t remote_addr;
 235 
 236         /* Assume (for now) we have at least a VLAN header's worth of data. */
 237         if (MBLKL(mp) < sizeof (*evh)) {
 238                 /* XXX KEBE ASKS - should we be more forgiving? */
 239                 DTRACE_PROBE1(vxlnat__in__drop__etherhdr, mblk_t *, mp);
 240                 freemsg(mp);
 241                 return (NULL);
 242         }
 243 
 244         eh = (struct ether_header *)mp->b_rptr;
 245         ethertype = ntohs(eh->ether_type);
 246         ether_copy(&eh->ether_shost, &remote_ether);
 247         if (ethertype == ETHERTYPE_VLAN) {
 248                 evh = (struct ether_vlan_header *)eh;
 249                 /* Keep it in network order... */
 250                 vlan = evh->ether_tci;
 251                 ethertype = ntohs(evh->ether_type);
 252                 ASSERT(vlan != 0);
 253                 mp->b_rptr += sizeof (*evh);
 254         } else {
 255                 evh = NULL;
 256                 vlan = 0;
 257                 mp->b_rptr += sizeof (*eh);
 258         }
 259         if (ethertype != ETHERTYPE_IP && ethertype != ETHERTYPE_IPV6) {
 260                 /*
 261                  * XXX KEBE SAYS for now, don't handle non-IP packets.
 262                  * This includes ARP.
 263                  */
 264                 DTRACE_PROBE1(vxlnat__in__drop__nonip, mblk_t *, mp);
 265                 freemsg(mp);
 266                 return (NULL);
 267         }
 268 
 269         /* Handle case of split ether + IP headers. */
 270         if (MBLKL(mp) < sizeof (ipha_t)) {
 271                 mblk_t *freemp;
 272                 
 273                 if (MBLKL(mp) > 0 || mp->b_cont == NULL) {
 274                         /* The IP header is split ACROSS MBLKS! Bail for now. */
 275                         DTRACE_PROBE1(vxlnat__in__drop__splitip, mblk_t *, mp);
 276                         freemsg(mp);
 277                         return (NULL);
 278                 }
 279                 freemp = mp;
 280                 mp = mp->b_cont;
 281                 freeb(freemp);
 282         }
 283         /* LINTED -- alignment... */
 284         ipha = (ipha_t *)mp->b_rptr;
 285 
 286         if (IPH_HDR_VERSION(ipha) == IPV4_VERSION) {
 287                 if (ethertype != ETHERTYPE_IP) {
 288                         /* XXX KEBE ASKS - should we be more forgiving? */
 289                         DTRACE_PROBE1(vxlnat__in__drop__etherhdr4,
 290                             mblk_t *, mp);
 291                         freemsg(mp);
 292                         return (NULL);
 293                 }
 294                 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
 295                     &remote_addr);
 296         } else {
 297                 if (ethertype != ETHERTYPE_IPV6 ||
 298                     IPH_HDR_VERSION(ipha) != IPV6_VERSION ||
 299                     MBLKL(mp) < sizeof (ip6_t)) {    
 300                         /* XXX KEBE ASKS - should we be more forgiving? */
 301                         DTRACE_PROBE1(vxlnat__in__drop__etherhdr6,
 302                             mblk_t *, mp);
 303                         freemsg(mp);
 304                         return (NULL);
 305                 }
 306                 ip6h = (ip6_t *)ipha;
 307                 remote_addr = ip6h->ip6_src;
 308         }
 309 
 310         /* Find remote and replace OR create new remote. */
 311         remote = vxlnat_get_remote(vnet, &remote_addr, B_TRUE);
 312         if (remote != NULL) {
 313                 /*
 314                  * See if this entry needs fixing or filling-in.  This might
 315                  * get a bit racy with read-only threads that actually
 316                  * transmit, but it only means dropped-packets in the worst
 317                  * case.
 318                  *
 319                  * It's THIS PART that inspires the warning about trusting the
 320                  * underlay network.
 321                  *
 322                  * XXX KEBE ASKS -- should we just replace things w/o checking?
 323                  */
 324                 /* Replace the ethernet address? */
 325                 if (ether_cmp(&remote->vxnrem_ether, &remote_ether) != 0)
 326                         ether_copy(&remote_ether, &remote->vxnrem_ether);
 327                 /*
 328                  * Replace the underlay? NOTE: Fix if/when underlay becomes
 329                  * IPv6.
 330                  */
 331                 if (!IN6_ARE_ADDR_EQUAL(&remote->vxnrem_uaddr,
 332                     &underlay_src->sin6_addr)) {
 333                         remote->vxnrem_uaddr = underlay_src->sin6_addr;
 334                 }
 335                 /* Replace the vlan ID. Maintain network order... */
 336                 if (remote->vxnrem_vlan != vlan)
 337                         remote->vxnrem_vlan = vlan;
 338         }
 339         /*
 340          * Else just continue and pray for better luck on another packet or
 341          * on the return flight.  It is IP, we can Just Drop It (TM)...
 342          */
 343 
 344         /* We're done with the remote entry now. */
 345         VXNREM_REFRELE(remote);
 346 
 347         /* Advance rptr to the inner IP header and proceed. */
 348         mp->b_rptr = (uint8_t *)ipha;
 349         return (mp);
 350 }
 351 
 352 /*
 353  * Extract transport-level information to find a NAT flow.
 354  * Consume mp and return B_FALSE if there's a problem.  Fill in "ports"
 355  * and "protocol" and return B_TRUE if there's not.
 356  */
 357 static boolean_t
 358 vxlnat_grab_transport(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, uint32_t *ports,
 359     uint8_t *protocol, uint8_t **nexthdr_ptr)
 360 {
 361         uint8_t *nexthdr;
 362 
 363         /* Punt on IPv6 for now... */
 364         if (ip6h != NULL) {
 365                 freemsg(mp);
 366                 return (B_FALSE);
 367         }
 368 
 369         ASSERT(ipha != NULL);
 370         *protocol = ipha->ipha_protocol;
 371         nexthdr = ((uint8_t *)ipha + IPH_HDR_LENGTH(ipha));
 372         *nexthdr_ptr = nexthdr; /* Get this out of the way now. */
 373         if (nexthdr > mp->b_wptr) {
 374                 DTRACE_PROBE1(vxlnat__in__drop__trnexthdr, mblk_t *, mp);
 375                 freemsg(mp);
 376                 return (B_FALSE);
 377         }
 378         switch (*protocol) {
 379         case IPPROTO_TCP: {
 380                 tcpha_t *tcph = (tcpha_t *)nexthdr;
 381 
 382                 if (nexthdr + sizeof (*tcph) > mp->b_wptr) {
 383                         DTRACE_PROBE1(vxlnat__in__drop__tcpnexthdr, mblk_t *,
 384                             mp);
 385                         freemsg(mp);
 386                         return (B_FALSE);
 387                 }
 388                 *ports = *((uint32_t *)tcph);
 389                 /* XXX KEBE SAYS - grab other metadata here NOW? */
 390                 break;
 391         }
 392         case IPPROTO_UDP: {
 393                 udpha_t *udph = (udpha_t *)nexthdr;
 394 
 395                 if (nexthdr + sizeof (*udph) > mp->b_wptr) {
 396                         DTRACE_PROBE1(vxlnat__in__drop__udpnexthdr, mblk_t *,
 397                             mp);
 398                         freemsg(mp);
 399                         return (B_FALSE);
 400                 }
 401                 *ports = *((uint32_t *)udph);
 402                 /*
 403                  * XXX KEBE SAYS - not as much as TCP, but grab other metadata
 404                  * here NOW?
 405                  */
 406                 break;
 407         }
 408         case IPPROTO_ICMP: {
 409                 icmph_t *icmph = (icmph_t *)nexthdr;
 410 
 411                 if (nexthdr + sizeof (*icmph) > mp->b_wptr) {
 412                         DTRACE_PROBE1(vxlnat__in__drop__icmpnexthdr, mblk_t *,
 413                             mp);
 414                         freemsg(mp);
 415                         return (B_FALSE);
 416                 }
 417                 /* XXX KEBE SAYS sort out ICMP header... */
 418                 switch (icmph->icmph_type) {
 419                 case ICMP_ECHO_REQUEST:
 420                 case ICMP_TIME_STAMP_REQUEST:
 421                 case ICMP_TIME_EXCEEDED:
 422                 case ICMP_INFO_REQUEST:
 423                 case ICMP_ADDRESS_MASK_REPLY:
 424                         /* All ones we can sorta cope with... */
 425                         break;
 426                 default:
 427                         DTRACE_PROBE2(vxlnat__in__drop__icmptype, int,
 428                             icmph->icmph_type, mblk_t *, mp);
 429                         freemsg(mp);
 430                         return (B_FALSE);
 431                 }
 432                 /* NOTE: as of now, will switch position depending on endian. */
 433                 *ports = icmph->icmph_echo_ident;
 434                 break;
 435         }
 436         default:
 437                 *ports = 0;
 438                 break;
 439         }
 440 
 441         return (B_TRUE);
 442 }
 443 
 444 /*
 445  * This is the evaluate-packet vs. NAT flow state function.
 446  * This function does NOT alter "mp".
 447  */
 448 static boolean_t
 449 vxlnat_verify_natstate(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
 450     vxlnat_flow_t *flow, uint8_t *nexthdr)
 451 {
 452         /* XXX KEBE SAYS FILL ME IN! */
 453         return (B_FALSE);
 454 }
 455 
 456 /*
 457  * Inspect the packet and find ports & protos (or ICMP types & codes)
 458  * and see if we have an established NAT flow.
 459  *
 460  * XXX KEBE WONDERS if the transmission path will more closely resemble
 461  * vxlnat_one_vxlan_fixed() because of ipha_ident issues or not...
 462  *
 463  * B_TRUE means the packet was handled, and we shouldn't continue processing
 464  * (even if "was handled" means droppage).
 465  */
 466 static boolean_t
 467 vxlnat_one_vxlan_flow(vxlnat_vnet_t *vnet, mblk_t *mp, ipha_t *ipha,
 468     ip6_t *ip6h)
 469 {
 470         vxlnat_flow_t *flow, searcher;
 471         uint8_t *nexthdr;
 472 
 473         /*
 474          * XXX KEBE WONDERS, should we return vxlnat_flow_t instead if we
 475          * miss?  That way, we only need to find the ports/protocol ONCE.
 476          */
 477 
 478         if (ip6h != NULL) {
 479                 /* Eventually, grab addresses for "searcher". */
 480                 return (B_FALSE);       /* Bail on IPv6 for now... */
 481         } else {
 482                 ASSERT(ipha != NULL);
 483                 searcher.vxnfl_isv4 = B_TRUE;   /* Required? */
 484                 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
 485                     &searcher.vxnfl_src);
 486                 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_dst),
 487                     &searcher.vxnfl_dst);
 488         }
 489 
 490         if (!vxlnat_grab_transport(mp, ipha, ip6h, &searcher.vxnfl_ports,
 491             &searcher.vxnfl_protocol, &nexthdr)) {
 492                 DTRACE_PROBE1(vxlnat__in__flowgrab, mblk_t *, mp);
 493                 freemsg(mp);
 494                 return (B_TRUE);
 495         }
 496         
 497 
 498         /*
 499          * XXX KEBE SAYS Eventually put the rw&find in an IPv4-only block,
 500          * because IPv6 (if we NAT it like IPv4) will have its own table/tree.
 501          */
 502         rw_enter(&vnet->vxnv_flowv4_lock, RW_READER);
 503         flow = avl_find(&vnet->vxnv_flows_v4, &searcher, NULL);
 504         if (flow != NULL)
 505                 VXNFL_REFHOLD(flow);
 506         rw_exit(&vnet->vxnv_flowv4_lock);
 507 
 508         if (flow == NULL)
 509                 return (B_FALSE);       /* Let caller handle things. */
 510 
 511         if (!vxlnat_verify_natstate(mp, ipha, ip6h, flow, nexthdr)) {
 512                 freemsg(mp);    /* XXX KEBE SAYS FOR NOW... */
 513         } else {
 514                 /* XXX KEBE SAYS PROCESS... */
 515         }
 516 
 517         VXNFL_REFRELE(flow);
 518         return (B_TRUE);
 519 }
 520 
 521 /*
 522  * We have a new packet that seems to require a new NAT flow.  Construct that
 523  * flow now, and intern it as both a conn_t in IP *and* in the vnet's
 524  * appropriate vxnv_flows* tree.  Return NULL if we have a problem.
 525  */
 526 static vxlnat_flow_t *
 527 vxlnat_new_flow(vxlnat_rule_t *rule, in6_addr_t *inner_src, in6_addr_t *dst,
 528     uint32_t ports, uint8_t protocol)
 529 {
 530         vxlnat_vnet_t *vnet = rule->vxnr_vnet;
 531         vxlnat_flow_t *flow, *oldflow;
 532         avl_tree_t *flowtree;
 533         krwlock_t *flowlock;
 534         avl_index_t where;
 535 
 536         flow = kmem_alloc(sizeof (*flow), KM_NOSLEEP | KM_NORMALPRI);
 537         if (flow == NULL)
 538                 return (NULL);
 539 
 540         flow->vxnfl_dst = *dst;
 541         flow->vxnfl_src = *inner_src;
 542         flow->vxnfl_ports = ports;
 543         flow->vxnfl_protocol = protocol;
 544         flow->vxnfl_refcount = 2; /* One for internment, one for caller. */
 545         /* Assume no mixed-IP-version mappings for now. */
 546         if (IN6_IS_ADDR_V4MAPPED(inner_src)) {
 547                 ASSERT(IN6_IS_ADDR_V4MAPPED(dst));
 548                 flow->vxnfl_isv4 = B_TRUE;
 549                 flowtree = &vnet->vxnv_flows_v4;
 550                 flowlock = &vnet->vxnv_flowv4_lock;
 551         } else {
 552                 ASSERT(!IN6_IS_ADDR_V4MAPPED(dst));
 553                 flow->vxnfl_isv4 = B_FALSE;
 554                 /* XXX KEBE SAYS we don't do IPv6 for now. */
 555                 DTRACE_PROBE2(vxlnat__flow__newv6, in6_addr_t *, inner_src,
 556                     in6_addr_t *, dst);
 557                 kmem_free(flow, sizeof (*flow));
 558                 return (NULL);
 559         }
 560         VXNR_REFHOLD(rule);     /* For the flow itself... */
 561         flow->vxnfl_rule = rule;
 562 
 563         rw_enter(flowlock, RW_WRITER);
 564         oldflow = (vxlnat_flow_t *)avl_find(flowtree, flow, &where);
 565         if (oldflow != NULL) {
 566                 /*
 567                  * Hmmm, someone put one in while we were dinking around.
 568                  * XXX KEBE SAYS return the old one, refheld, for now.
 569                  */
 570                 VXNR_REFRELE(rule);
 571                 kmem_free(flow, sizeof (*flow));
 572                 VXNFL_REFHOLD(oldflow);
 573                 flow = oldflow;
 574         } else {
 575                 avl_insert(flowtree, flow, where);
 576                 /*
 577                  * Do conn_t magic here, except for the conn_t activation.  I
 578                  * am aware of holding the rwlock-as-write here.  We may need
 579                  * to move this outside the rwlock hold, and
 580                  * reacquire-on-failure.
 581                  */
 582                 if (!vxlnat_new_conn(flow)) {
 583                         ASSERT(flow->vxnfl_connp == NULL);
 584                         avl_remove(flowtree, flow);
 585                         VXNR_REFRELE(flow->vxnfl_rule);
 586                         kmem_free(flow, sizeof (*flow));
 587                         flow = NULL;
 588                 }
 589         }
 590         rw_exit(flowlock);
 591         
 592         /* We just created this one, activate it. */
 593         if (oldflow == NULL && flow != NULL)
 594                 vxlnat_activate_conn(flow);
 595 
 596         return (flow);
 597 }
 598 
 599 void
 600 vxlnat_flow_free(vxlnat_flow_t *flow)
 601 {
 602         ASSERT(flow->vxnfl_refcount == 0);
 603 
 604         /* XXX KEBE SAYS FILL ME IN?! */
 605         /* XXX KEBE ASKS ipcl_hash_remove()? */
 606 
 607         flow->vxnfl_connp->conn_priv = NULL; /* Sufficient? */
 608         CONN_DEC_REF(flow->vxnfl_connp);
 609         VXNR_REFRELE(flow->vxnfl_rule);
 610         kmem_free(flow, sizeof (*flow));
 611 }
 612 
 613 static boolean_t
 614 vxlnat_verify_initial(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
 615     uint32_t ports, uint8_t protocol, uint8_t *nexthdr)
 616 {
 617         /* XXX KEBE SAYS FILL ME IN! */
 618         freemsg(mp);
 619         return (B_FALSE);
 620 }
 621 
 622 /*
 623  * If we reach here, we need to find a NAT rule, and see if we can/should
 624  * CREATE a new NAT flow, or whether or not we should drop, maybe even
 625  * returning an ICMP message of some sort.
 626  *
 627  * B_TRUE means the packet was handled, and we shouldn't continue processing
 628  * (even if "was handled" means droppage).
 629  */
 630 static boolean_t
 631 vxlnat_one_vxlan_rule(vxlnat_vnet_t *vnet, mblk_t *mp, ipha_t *ipha,
 632     ip6_t *ip6h)
 633 {
 634         vxlnat_rule_t *rule;
 635         vxlnat_flow_t *flow;
 636         in6_addr_t v4m_src, v4m_dst, *inner_src, *dst;
 637         uint32_t ports;
 638         uint8_t protocol;
 639         uint8_t *nexthdr;
 640 
 641         /* XXX handle IPv6 later, assigning inner_src and dst to ip6_t addrs. */
 642         if (ip6h != NULL)
 643                 return (B_FALSE);
 644 
 645         ASSERT3P(ipha, !=, NULL);
 646         inner_src = &v4m_src;
 647         dst = &v4m_dst;
 648         IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src), inner_src);
 649         IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_dst), dst);
 650 
 651         mutex_enter(&vnet->vxnv_rule_lock);
 652         rule = list_head(&vnet->vxnv_rules);
 653 
 654         /*
 655          * search for a match in the nat rules
 656          * XXX investigate perf issues with with respect to list_t size
 657          * XXX KEBE SAYS rewrite when we start doing IPv6 to use "inner_src"
 658          * and "dst". 
 659          */
 660         while (rule != NULL) {
 661                 ipaddr_t ipaddr;
 662                 uint32_t netmask = 0xffffffff;
 663                 uint8_t prefix = rule->vxnr_prefix - 96;
 664 
 665                 /* calculate the v4 netmask */
 666                 netmask <<= (32 - prefix);
 667                 netmask = htonl(netmask);
 668 
 669                 IN6_V4MAPPED_TO_IPADDR(&rule->vxnr_myaddr, ipaddr);
 670                 /* XXX ASSERT vlanid? */
 671                 if ((ipaddr & netmask) == (ipha->ipha_src & netmask)) {
 672                         VXNR_REFHOLD(rule);
 673                         break;
 674                 }
 675 
 676                 rule = list_next(&vnet->vxnv_rules, rule);
 677         }
 678 
 679         mutex_exit(&vnet->vxnv_rule_lock);
 680 
 681         if (rule == NULL)
 682                 return (B_FALSE);
 683 
 684         /* process packet */
 685 
 686         /*
 687          * Grab transport header, and figure out if we can proceed.
 688          *
 689          * NOTE: vxlnat_grab_transport() will free/consume mp if it fails,
 690          * because we want to isolate non-flow-starters without having them
 691          * create new flows.  This means we return B_TRUE (consumed mp) on
 692          * failure. 
 693          */
 694         if (!vxlnat_grab_transport(mp, ipha, ip6h, &ports, &protocol, &nexthdr))
 695                 return (B_TRUE); /* see above... */
 696         if (!vxlnat_verify_initial(mp, ipha, ip6h, ports, protocol, nexthdr))
 697                 return (B_TRUE);
 698         
 699 
 700         flow = vxlnat_new_flow(rule, inner_src, dst, ports, protocol);
 701         if (flow != NULL) {
 702                 /*
 703                  * Call same function that vxlnat_one_vxlan_flow() uses
 704                  * to remap & transmit the packet out the external side.
 705                  *
 706                  * NOTE:  We've already checked the initial-packet-
 707                  * qualification, so unlike the main datapath, we don't
 708                  * need to call vxlnat_verify_natstate()
 709                  */
 710 
 711                  /* XXX KEBE SAYS PROCESS... */
 712                 
 713                 VXNFL_REFRELE(flow);
 714                 return (B_TRUE);
 715         }
 716 
 717         return (B_FALSE);
 718 }
 719 
 720 /*
 721  * See if the inbound VXLAN packet hits a 1-1/fixed mapping, and process if it
 722  * does.  B_TRUE means the packet was handled, and we shouldn't continue
 723  * processing (even if "was handled" means droppage).
 724  */
 725 static boolean_t
 726 vxlnat_one_vxlan_fixed(vxlnat_vnet_t *vnet, mblk_t *mp, ipha_t *ipha,
 727     ip6_t *ip6h)
 728 {
 729         vxlnat_fixed_t *fixed, fsearch;
 730         mblk_t *newmp;
 731         ire_t *outbound_ire;
 732         /* Use C99's initializers for fun & profit. */
 733         ip_recv_attr_t iras = { IRAF_IS_IPV4 | IRAF_VERIFIED_SRC };
 734 
 735         if (ipha != NULL) {
 736                 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
 737                     &fsearch.vxnf_addr);
 738         } else {
 739                 /* vxlnat_cache_remote() did reality checks... */
 740                 ASSERT(ipha == NULL && ip6h != NULL);
 741                 fsearch.vxnf_addr = ip6h->ip6_src;
 742         }
 743 
 744         rw_enter(&vnet->vxnv_fixed_lock, RW_READER);
 745         fixed = avl_find(&vnet->vxnv_fixed_ips, &fsearch, NULL);
 746         if (fixed != NULL)
 747                 VXNF_REFHOLD(fixed);
 748         rw_exit(&vnet->vxnv_fixed_lock);
 749         if (fixed == NULL)
 750                 return (B_FALSE);       /* Try another method of processing. */
 751 
 752         newmp = NULL;
 753         /*
 754          * XXX KEBE ASKS --> Do an MTU check NOW?!  That way, we have
 755          * pre-natted data.  One gotcha, external dests may have
 756          * different PathMTUs so see below about EMSGSIZE...
 757          *
 758          * For now, let the post-NAT crunch through
 759          * ire_recv_forward_v4() take care of all of that.
 760          */
 761 
 762         if (ipha != NULL)
 763                 newmp = vxlnat_fixed_fixv4(mp, fixed, B_FALSE);
 764         else {
 765                 freemsg(mp); /* XXX handle ip6h */
 766                 return (B_TRUE);
 767         }
 768 
 769         if (newmp == NULL)
 770                 return (B_TRUE);        /* mp eaten by vxlnat_fixed_fixv4() */
 771 
 772 
 773         ASSERT3P(ipha, ==, newmp->b_rptr);
 774         /* XXX KEBE ASKS, IRR_ALLOCATE okay?!? */
 775         /* XXX KEBE SAYS XMIT HINT! */
 776         outbound_ire = ire_route_recursive_dstonly_v4(ipha->ipha_dst,
 777             IRR_ALLOCATE, 0, vxlnat_netstack->netstack_ip);
 778         VERIFY3P(outbound_ire, !=, NULL);
 779         if (outbound_ire->ire_type == IRE_NOROUTE) {
 780                 /* Bail! */
 781                 DTRACE_PROBE2(vxlnat__in__drop__fixedire, ipaddr_t,
 782                     ipha->ipha_dst, mblk_t *, mp);
 783                 VXNF_REFRELE(fixed);
 784                 freemsg(mp);
 785                 return (B_TRUE);
 786         }
 787 
 788         iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
 789         if (iras.ira_ip_hdr_length > sizeof (ipha_t))
 790                 iras.ira_flags |= IRAF_IPV4_OPTIONS;
 791         iras.ira_xmit_hint = 0; /* XXX KEBE SAYS FIX ME! */
 792         iras.ira_zoneid = outbound_ire->ire_zoneid;
 793         iras.ira_pktlen = ntohs(ipha->ipha_length);
 794         iras.ira_protocol = ipha->ipha_protocol;
 795         /* XXX KEBE ASKS rifindex & ruifindex ?!? */
 796         /*
 797          * NOTE: AT LEAST ira_ill needs ILLF_ROUTER set, as
 798          * well as the ill for the external NIC (where
 799          * off-link destinations live).  For fixed, ira_ill
 800          * should be the ill of the external source.
 801          */
 802         iras.ira_rill = vxlnat_underlay_ire->ire_ill;
 803         iras.ira_ill = fixed->vxnf_ire->ire_ill;
 804         /* XXX KEBE ASKS cred & cpid ? */
 805         iras.ira_verified_src = ipha->ipha_src;
 806         /* XXX KEBE SAYS don't sweat IPsec stuff. */
 807         /* XXX KEBE SAYS ALSO don't sweat l2src & mhip */
 808 
 809         /* Okay, we're good! Let's pretend we're forwarding. */
 810         ire_recv_forward_v4(outbound_ire, mp, ipha, &iras);
 811         ire_refrele(outbound_ire);
 812 
 813         return (B_TRUE);
 814 }
 815 
 816 /*
 817  * Process exactly one VXLAN packet.
 818  */
 819 static void
 820 vxlnat_one_vxlan(mblk_t *mp, struct sockaddr_in6 *underlay_src)
 821 {
 822         vxlan_hdr_t *vxh;
 823         vxlnat_vnet_t *vnet;
 824         ipha_t *ipha;
 825         ip6_t *ip6h;
 826 
 827         if (MBLKL(mp) < sizeof (*vxh)) {
 828                 /* XXX KEBE ASKS -- should we be more forgiving? */
 829                 DTRACE_PROBE1(vxlnat__in__drop__vxlsize, mblk_t *, mp);
 830                 freemsg(mp);
 831                 return;
 832         }
 833         vxh = (vxlan_hdr_t *)mp->b_rptr;
 834 
 835         /* If we start using more than just the one flag, fix it. */
 836         if (vxh->vxlan_flags != VXLAN_F_VDI_WIRE) {
 837                 DTRACE_PROBE1(vxlnat__in__drop__VDI, mblk_t *, mp);
 838                 freemsg(mp);
 839                 return;
 840         }
 841 
 842         /* Remember, we key off of what's on the wire. */
 843         vnet = vxlnat_get_vnet(VXLAN_ID_WIRE32(vxh->vxlan_id), B_FALSE);
 844         if (vnet == NULL) {
 845                 DTRACE_PROBE1(vxlnat__in__drop__vnetid, uint32_t,
 846                     VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)));
 847                 freemsg(mp);
 848                 return;
 849         }
 850 
 851         DTRACE_PROBE2(vxlnat__in__vnet, uint32_t,
 852             VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)),
 853             vxlnat_vnet_t, vnet);
 854 
 855         /*
 856          * Arrived-from-vxlan processing steps:
 857          * 1.) Locate the ethernet header and check/update/add-into remotes.
 858          * 2.) Search 1-1s, process if hit.
 859          * 3.) Search flows, process if hit.
 860          * 4.) Search rules, create new flow (or not) if hit.
 861          * 5.) Drop the packet.
 862          */
 863 
 864         /* 1.) Locate the ethernet header and check/update/add-into remotes. */
 865         mp->b_rptr += sizeof (*vxh);
 866         while (MBLKL(mp) == 0) {
 867                 mblk_t *oldmp = mp;
 868 
 869                 mp = mp->b_cont;
 870                 freeb(oldmp);
 871         }
 872         mp = vxlnat_cache_remote(mp, underlay_src, vnet);
 873         if (mp == NULL)
 874                 goto bail_no_free;
 875 
 876         /* Let's cache the IP header here... */
 877         ipha = (ipha_t *)mp->b_rptr;
 878         switch (IPH_HDR_VERSION(ipha)) {
 879         case IPV4_VERSION:
 880                 ip6h = NULL;
 881                 break;
 882         case IPV6_VERSION:
 883                 ip6h = (ip6_t *)ipha;
 884                 ipha = NULL;
 885                 break;
 886         default:
 887                 DTRACE_PROBE2(vxlnat__in__drop__ipvers, int,
 888                     IPH_HDR_VERSION(ipha), mblk_t *, mp);
 889                 goto bail_and_free;
 890         }
 891 
 892         /* 2.) Search 1-1s, process if hit. */
 893         if (vxlnat_one_vxlan_fixed(vnet, mp, ipha, ip6h))
 894                 goto bail_no_free;      /* Success means mp was consumed. */
 895 
 896         /* 3.) Search flows, process if hit. */
 897         if (vxlnat_one_vxlan_flow(vnet, mp, ipha, ip6h))
 898                 goto bail_no_free;      /* Success means mp was consumed. */
 899 
 900         /* 4.) Search rules, create new flow (or not) if hit. */
 901         if (vxlnat_one_vxlan_rule(vnet, mp, ipha, ip6h))
 902                 goto bail_no_free;      /* Success means mp was consumed. */
 903 
 904         /* 5.) Nothing, drop the packet. */
 905 
 906         DTRACE_PROBE2(vxlnat__in___drop__nohits, vxlnat_vnet_t *, vnet,
 907             mblk_t *, mp);
 908 
 909 bail_and_free:
 910         freemsg(mp);
 911 bail_no_free:
 912         VXNV_REFRELE(vnet);
 913 }
 914 /*
 915  * ONLY return B_FALSE if we get a packet-clogging event.
 916  */
 917 /* ARGSUSED */
 918 static boolean_t
 919 vxlnat_vxlan_input(ksocket_t insock, mblk_t *chain, size_t msgsize, int oob,
 920     void *ignored)
 921 {
 922         mblk_t *mp, *nextmp;
 923 
 924         /*
 925          * XXX KEBE ASKS --> move hold & release outside of loop?
 926          * If so, hold rwlock here.
 927          */
 928 
 929         for (mp = chain; mp != NULL; mp = nextmp) {
 930                 struct T_unitdata_ind *tudi;
 931                 struct sockaddr_in6 *sin6;
 932 
 933                 nextmp = mp->b_next;
 934                 if (DB_TYPE(mp) != M_PROTO || mp->b_cont == NULL) {
 935                         DTRACE_PROBE1(vxlnat__in__drop__mblk, mblk_t *, mp);
 936                         freemsg(mp);
 937                         continue;
 938                 }
 939 
 940                 /* LINTED -- aligned */
 941                 tudi = (struct T_unitdata_ind *)mp->b_rptr;
 942                 if (tudi->PRIM_type != T_UNITDATA_IND) {
 943                         DTRACE_PROBE1(vxlnat__in__drop__TPI, mblk_t *, mp);
 944                         freemsg(mp);
 945                         continue;
 946                 }
 947                 /* LINTED -- aligned */
 948                 sin6 = (struct sockaddr_in6 *)(mp->b_rptr + tudi->SRC_offset);
 949                 VERIFY(sin6->sin6_family == AF_INET6);
 950                 VERIFY(tudi->SRC_length >= sizeof (*sin6));
 951 
 952                 vxlnat_one_vxlan(mp->b_cont, sin6);
 953                 freeb(mp);
 954         }
 955 
 956         return (B_TRUE);
 957 }
 958 
 959 /*
 960  * Use RFC 1141's technique (with a check for -0).
 961  *
 962  * newsum = oldsum - (new16a + old16a - new16b + old16b ...);
 963  *
 964  * NOTE: "oldsum" is right off the wire in wire-native order.
 965  * NOTE2: "old" and "new" ALSO point to things in wire-native order.
 966  * NOTE3:  THIS MUST TAKE A MULTIPLE OF 2 BYTES (i.e. uint16_t array).
 967  * NOTE4: The 32-bit running sum means we can't take len > 64k.
 968  */
 969 uint16_t
 970 vxlnat_cksum_adjust(uint16_t oldsum, uint16_t *old, uint16_t *new, uint_t len)
 971 {
 972         uint32_t newsum = ntohs(oldsum);
 973 
 974         ASSERT((len & 0x1) == 0);
 975         while (len != 0) {
 976                 newsum -= ntohs(*new);
 977                 newsum += ntohs(*old);
 978                 len -= 2;
 979                 old++;
 980                 new++;
 981         }
 982         newsum += (newsum >> 16) & 0xffff;
 983 
 984         return (newsum == 0xffff ? 0 : htons(newsum));
 985 }
 986 
 987 /*
 988  * Fix inner headers on an ICMP packet.
 989  *
 990  * XXX KEBE SAYS FOR NOW, just do addresses for 1-1/fixed.  When we do
 991  * flows, include old_port/new_port as well.
 992  */
 993 static mblk_t *
 994 vxlnat_fix_icmp_inner_v4(mblk_t *mp, icmph_t *icmph, ipaddr_t old_one,
 995     ipaddr_t new_one, boolean_t to_private)
 996 {
 997         mblk_t *newmp;
 998         ipha_t *inner_ipha;
 999         ipaddr_t *new_ones_place;
1000 
1001         if ((uint8_t *)(icmph + 1) + sizeof (ipha_t) > mp->b_wptr) {
1002                 /* Pay the pullup tax. */
1003                 newmp = msgpullup(mp, -1);
1004                 freemsg(mp);
1005                 if (newmp == NULL) {
1006                         DTRACE_PROBE1(vxlnat__fixicmp__pullupfail, void *,
1007                             NULL);
1008                         return (NULL);
1009                 }
1010                 if (MBLKL(newmp) < 2 * sizeof (ipha_t) + sizeof (icmph_t)) {
1011                         /* Wow! Too-tiny ICMP packet. */
1012                         DTRACE_PROBE1(vxlnat__fixicmp__tootiny, mblk_t *,
1013                             newmp);
1014                         freeb(newmp);
1015                         return (NULL);
1016                 }
1017                 mp = newmp;
1018                 /* Temporarily use inner_ipha for the outer one. */
1019                 inner_ipha = (ipha_t *)mp->b_rptr;
1020                 icmph = (icmph_t *)(mp->b_rptr + IPH_HDR_LENGTH(inner_ipha));
1021         }
1022         inner_ipha = (ipha_t *)(icmph + 1);
1023         new_ones_place = to_private ?
1024             &inner_ipha->ipha_src : &inner_ipha->ipha_dst;
1025         if (*new_ones_place != old_one) {
1026                 /* Either I'm buggy or the packet is. */
1027                 DTRACE_PROBE2(vxlnat__fixicmp__badinneraddr, ipaddr_t,
1028                     old_one, ipaddr_t, *new_ones_place);
1029                 freeb(mp);
1030                 return (NULL);
1031         }
1032         *new_ones_place = new_one;
1033 
1034         /* Adjust ICMP checksum... */
1035         icmph->icmph_checksum = vxlnat_cksum_adjust(icmph->icmph_checksum,
1036             (uint16_t *)&old_one, (uint16_t *)&new_one, sizeof (ipaddr_t));
1037 
1038         /*
1039          * XXX KEBE ASKS, recompute *inner-packet* checksums?  Let's not for
1040          * now, but consider this Fair Warning (or some other VH album...).
1041          */
1042         return (mp);
1043 }
1044 
1045 /*
1046  * Take a 1-1/fixed IPv4 packet and convert it for transmission out the
1047  * appropriate end. "to_private" is what it says on the tin.
1048  * ALWAYS consumes "mp", regardless of return value.
1049  */
1050 static mblk_t *
1051 vxlnat_fixed_fixv4(mblk_t *mp, vxlnat_fixed_t *fixed, boolean_t to_private)
1052 {
1053         ipaddr_t new_one, old_one;
1054         ipaddr_t *new_ones_place;
1055         ipha_t *ipha = (ipha_t *)mp->b_rptr;
1056         uint8_t *nexthdr, *end_wptr;
1057 
1058         if (to_private) {
1059                 IN6_V4MAPPED_TO_IPADDR(&fixed->vxnf_addr, new_one);
1060                 new_ones_place = &ipha->ipha_dst;
1061         } else {
1062                 IN6_V4MAPPED_TO_IPADDR(&fixed->vxnf_pubaddr, new_one);
1063                 new_ones_place = &ipha->ipha_src;
1064         }
1065 
1066         old_one = *new_ones_place;
1067         *new_ones_place = new_one;
1068 
1069         /*
1070          * Recompute the IP header checksum, and check for the TCP or UDP
1071          * checksum as well, as they'll need recomputing as well.
1072          */
1073 
1074         /* First, the IPv4 header itself. */
1075         ipha->ipha_hdr_checksum = vxlnat_cksum_adjust(ipha->ipha_hdr_checksum,
1076             (uint16_t *)&old_one, (uint16_t *)&new_one, sizeof (ipaddr_t));
1077 
1078         nexthdr = (uint8_t *)ipha + IPH_HDR_LENGTH(ipha);
1079         if (nexthdr >= mp->b_wptr) {
1080                 nexthdr = mp->b_cont->b_rptr +
1081                     (MBLKL(mp) - IPH_HDR_LENGTH(ipha));
1082                 end_wptr = mp->b_cont->b_wptr;
1083         } else {
1084                 end_wptr = mp->b_wptr;
1085         }
1086 
1087         switch (ipha->ipha_protocol) {
1088         case IPPROTO_TCP: {
1089                 tcpha_t *tcph = (tcpha_t *)nexthdr;
1090 
1091                 if (nexthdr + sizeof (*tcph) >= end_wptr) {
1092                         /* Bail for now. */
1093                         DTRACE_PROBE1(vxlnat__fix__tcp__mblkspan, mblk_t *,
1094                             mp);
1095                         freemsg(mp);
1096                         return (NULL);
1097                 }
1098                 tcph->tha_sum = vxlnat_cksum_adjust(tcph->tha_sum,
1099                     (uint16_t *)&old_one, (uint16_t *)&new_one,
1100                     sizeof (ipaddr_t));
1101                 break;  /* Out of switch. */
1102         }
1103         case IPPROTO_UDP: {
1104                 udpha_t *udph = (udpha_t *)nexthdr;
1105 
1106                 if (nexthdr + sizeof (*udph) >= end_wptr) {
1107                         /* Bail for now. */
1108                         DTRACE_PROBE1(vxlnat__fix__udp__mblkspan, mblk_t *,
1109                             mp);
1110                         freemsg(mp);
1111                         return (NULL);
1112                 }
1113                 udph->uha_checksum = vxlnat_cksum_adjust(udph->uha_checksum,
1114                     (uint16_t *)&old_one, (uint16_t *)&new_one,
1115                     sizeof (ipaddr_t));
1116                 break;  /* Out of switch. */
1117         }
1118         case IPPROTO_ICMP: {
1119                 icmph_t *icmph = (icmph_t *)nexthdr;
1120 
1121                 /*
1122                  * We need to check the case of ICMP messages that contain
1123                  * IP packets.  We will need to at least change the addresses,
1124                  * and *maybe* the checksums too if necessary.
1125                  *
1126                  * This may replicate some of icmp_inbound_v4(), alas.
1127                  */
1128                 if (nexthdr + sizeof (*icmph) >= end_wptr) {
1129                         mblk_t *newmp;
1130                         /*
1131                          * Unlike the others, we're going to pay the pullup
1132                          * tax here.
1133                          */
1134                         newmp = msgpullup(mp, -1);
1135                         freemsg(mp);
1136                         if (newmp == NULL) {
1137                                 DTRACE_PROBE1(vxlnat__icmp__pullupfail, void *,
1138                                     NULL);
1139                                 return (NULL);
1140                         }
1141                         mp = newmp;
1142                         ipha = (ipha_t *)(mp->b_rptr);
1143                         nexthdr = (uint8_t *)ipha + IPH_HDR_LENGTH(ipha);
1144                         icmph = (icmph_t *)nexthdr;
1145                 }
1146 
1147                 switch (icmph->icmph_type) {
1148                 case ICMP_ADDRESS_MASK_REPLY:
1149                 case ICMP_ADDRESS_MASK_REQUEST:
1150                 case ICMP_TIME_STAMP_REPLY:
1151                 case ICMP_TIME_STAMP_REQUEST:
1152                 case ICMP_ECHO_REQUEST:
1153                 case ICMP_ECHO_REPLY:
1154                         /* These merely need to get passed along. */
1155                         break;
1156                 case ICMP_ROUTER_ADVERTISEMENT:
1157                 case ICMP_ROUTER_SOLICITATION:
1158                         /* These shouldn't be traversing a NAT at all. Drop. */
1159                         DTRACE_PROBE1(vxlnat__icmp__cantpass, int,
1160                             icmph->icmph_type);
1161                         freemsg(mp);
1162                         return (NULL);
1163                 case ICMP_PARAM_PROBLEM:
1164                 case ICMP_TIME_EXCEEDED:
1165                 case ICMP_DEST_UNREACHABLE:
1166                         /* These include inner-IP headers we need to adjust. */
1167                         mp = vxlnat_fix_icmp_inner_v4(mp, icmph, old_one,
1168                             new_one, to_private);
1169                         break;
1170                 default:
1171                         /* Pass along to receiver, but warn. */
1172                         DTRACE_PROBE1(vxlnat__icmp__unknown, int,
1173                             icmph->icmph_type);
1174                         break;
1175                 }
1176         }
1177         /* Otherwise we can't make any other assumptions for now... */
1178         default:
1179                 break;
1180         }
1181 
1182         return (mp);
1183 }
1184 
1185 vxlnat_remote_t *
1186 vxlnat_xmit_vxlanv4(mblk_t *mp, in6_addr_t *overlay_dst,
1187     vxlnat_remote_t *remote, uint8_t *myether, vxlnat_vnet_t *vnet)
1188 {
1189         struct sockaddr_in6 sin6 = {AF_INET6};
1190         struct msghdr msghdr = {NULL};
1191         mblk_t *vlan_mp;
1192         extern uint_t vxlan_alloc_size, vxlan_noalloc_min;
1193         vxlan_hdr_t *vxh;
1194         struct ether_vlan_header *evh;
1195         int rc;
1196         cred_t *cred;
1197 
1198         if (remote == NULL || remote->vxnrem_vnet == NULL) {
1199                 DTRACE_PROBE1(vxlnat__xmit__vxlanv4, vxlnat_remote_t *, remote);
1200                 /* Release the condemned remote. */
1201                 if (remote != NULL)
1202                         VXNREM_REFRELE(remote);
1203 
1204                 /* See if we have a remote ready to use... */
1205                 remote = vxlnat_get_remote(vnet, overlay_dst, B_FALSE);
1206 
1207                 if (remote == NULL) {
1208                         /*
1209                          * We need to do the moral equivalent of PF_KEY
1210                          * ACQUIRE or overlay's queue-resolve so that we can
1211                          * have someone in user-space send me a remote.  Until
1212                          * then, drop the reference if condemned, free the
1213                          * message, and return NULL.
1214                          */
1215 
1216                         freemsg(mp);
1217                         return (NULL);
1218                 }
1219         }
1220         ASSERT(vnet == remote->vxnrem_vnet);
1221 
1222         if (DB_REF(mp) > 1 || mp->b_rptr - vxlan_noalloc_min < DB_BASE(mp)) {
1223                 vlan_mp = allocb(vxlan_alloc_size, BPRI_HI);
1224                 if (vlan_mp == NULL) {
1225                         DTRACE_PROBE1(vxlnat__xmit__vxlanv4__allocfail,
1226                             vxlnat_remote_t *, remote);
1227                         freemsg(mp);
1228                         /* Just drop the packet, but don't tell caller. */
1229                         return (remote);
1230                 }
1231                 vlan_mp->b_wptr = DB_LIM(vlan_mp);
1232                 vlan_mp->b_rptr = vlan_mp->b_wptr;
1233                 vlan_mp->b_cont = mp;
1234         } else {
1235                 vlan_mp = mp;
1236         }
1237         vlan_mp->b_rptr -= sizeof (*vxh) + sizeof (*evh);
1238         vxh = (vxlan_hdr_t *)vlan_mp->b_rptr;
1239         vxh->vxlan_flags = VXLAN_F_VDI_WIRE;
1240         vxh->vxlan_id = vnet->vxnv_vnetid;        /* Already in wire-order. */
1241 
1242         /* Fill in the Ethernet header. */
1243         evh = (struct ether_vlan_header *)(vxh + 1);
1244         ether_copy(&remote->vxnrem_ether, &evh->ether_dhost);
1245         ether_copy(myether, &evh->ether_shost);
1246         evh->ether_tpid = htons(ETHERTYPE_VLAN);
1247         evh->ether_tci = remote->vxnrem_vlan;
1248         evh->ether_type = htons(ETHERTYPE_IP);
1249 
1250         msghdr.msg_name = (struct sockaddr_storage *)&sin6;
1251         msghdr.msg_namelen = sizeof (sin6);
1252         /* Address family and other zeroing already done up top. */
1253         sin6.sin6_port = htons(IPPORT_VXLAN);
1254         sin6.sin6_addr = remote->vxnrem_uaddr;
1255         
1256         /*
1257          * cred_t dance is because we may be getting this straight from
1258          * interrupt context.
1259          */
1260         cred = zone_get_kcred(netstack_get_zoneid(vxlnat_netstack));
1261         if (cred == NULL) {
1262                 DTRACE_PROBE1(vxlnat__xmit__vxlan4__credfail, 
1263                     vxlnat_remote_t *, remote);
1264                 freemsg(vlan_mp);
1265         }
1266         /*
1267          * Use MSG_DONTWAIT to avoid blocks, esp. if we're getting this
1268          * straight from interrupt context.
1269          */
1270         rc = ksocket_sendmblk(vxlnat_underlay, &msghdr, MSG_DONTWAIT, &vlan_mp,
1271             cred);
1272         crfree(cred);
1273         if (rc != 0) {
1274                 DTRACE_PROBE2(vxlnat__xmit__vxlan4__sendfail, int, rc,
1275                     vxlnat_remote_t *, remote);
1276                 freemsg(vlan_mp);
1277         }
1278         return (remote);
1279 }
1280 
1281 /*
1282  * New ire_{recv,send}fn implementations if we're doing 1-1 mappings.
1283  */
1284 int
1285 vxlnat_fixed_ire_send_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
1286     ip_xmit_attr_t *ixa, uint32_t *identp)
1287 {
1288         /* XXX KEBE SAYS FILL ME IN, but for now... */
1289         freemsg(mp);
1290         return (EOPNOTSUPP);
1291 }
1292 
1293 void
1294 vxlnat_fixed_ire_recv_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
1295     ip_recv_attr_t *ira)
1296 {
1297         /* XXX KEBE SAYS FILL ME IN, but for now... */
1298         freemsg(mp);
1299 }
1300 
1301 /*
1302  * I believe the common case for this will be from self-generated ICMP
1303  * messages.  Other same-netstack-originated traffic will also come through
1304  * here (one internal reaching what turns out to be another internal).
1305  */
1306 int
1307 vxlnat_fixed_ire_send_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1308     ip_xmit_attr_t *ixa, uint32_t *identp)
1309 {
1310         ip_recv_attr_t iras;    /* NOTE: No bzero because we pay more later */
1311         ipha_t *ipha = (ipha_t *)iph_arg;
1312 
1313         /*
1314          * XXX KEBE ASKS, any DTrace probes or other instrumentation that
1315          * perhaps should be set?
1316          */
1317 
1318         /* Map ixa to ira. */
1319         iras.ira_pktlen = ixa->ixa_pktlen;
1320         /* XXX KEBE ASKS more?!? */
1321 
1322         /*
1323          * In normal TCP/IP processing, this shortcuts the IP header checksum
1324          * AND POSSIBLY THE ULP checksum cases.  Since this is likely to head
1325          * back into the internal network, we need to recompute things again.
1326          */
1327         if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
1328                 freemsg(mp);
1329                 return (EMSGSIZE);
1330         }
1331 #if 0
1332         /* XXX KEBE ASKS Special-case ICMP here? */
1333         if (ipha->ipha_protocol == IPPROTO_ICMP) {
1334                 icmph_t *icmph;
1335 
1336                 icmph = (icmph_t *)((uint8_t *)ipha + IPH_HDR_LENGTH(ipha));
1337                 if ((uint8_t *)icmph >= mp->b_wptr) {
1338                         freemsg(mp);
1339                         return (EMSGSIZE);
1340                 }
1341                 icmph->icmph_checksum = 0;
1342                 icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
1343         }
1344 #endif
1345 
1346         vxlnat_fixed_ire_recv_v4(ire, mp, iph_arg, &iras);
1347 
1348         return (0);
1349 }
1350 
1351 void
1352 vxlnat_fixed_ire_recv_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1353     ip_recv_attr_t *ira)
1354 {
1355         vxlnat_fixed_t *fixed;
1356         vxlnat_vnet_t *vnet;
1357         ipha_t *ipha = (ipha_t *)iph_arg;
1358         int newmtu;
1359 
1360         /* Make a note for DAD that this address is in use */
1361         ire->ire_last_used_time = LBOLT_FASTPATH;
1362 
1363         /* Only target the IRE_LOCAL with the right zoneid. */
1364         ira->ira_zoneid = ire->ire_zoneid;
1365 
1366         /*
1367          * XXX KEBE ASKS, any DTrace probes or other instrumentation that
1368          * perhaps should be set?
1369          */
1370 
1371         /*
1372          * Reality check some things.
1373          */
1374         fixed = (vxlnat_fixed_t *)ire->ire_dep_sib_next;
1375         vnet = fixed->vxnf_vnet;
1376 
1377         ASSERT3P(ire, ==, fixed->vxnf_ire);
1378 
1379         if (IRE_IS_CONDEMNED(ire) || vnet == NULL)
1380                 goto detach_ire_and_bail;
1381 
1382         /*
1383          * Not a common-case, but a possible one.  If our underlay MTU is
1384          * smaller than the external MTU, it is possible that we will have a
1385          * size mismatch and therefore need to either fragment at the VXLAN
1386          * layer (VXLAN UDP packet sent as two or more IP fragments) OR
1387          * if IPH_DF is set, send an ICMP_NEEDS_FRAGMENTATION back to the
1388          * sender.  Perform the check here BEFORE we NAT the packet.
1389          */
1390         ASSERT(vxlnat_underlay_ire->ire_ill != NULL);
1391         newmtu = vxlnat_underlay_ire->ire_ill->ill_mtu - sizeof (ipha_t) -
1392             sizeof (udpha_t) - sizeof (vxlan_hdr_t) -
1393             sizeof (struct ether_vlan_header);
1394         if ((ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_DF) &&
1395             ntohs(ipha->ipha_length) > newmtu) {
1396                 icmp_frag_needed(mp, newmtu, ira);
1397                 /* We're done.  Assume icmp_frag_needed() consumed mp. */
1398                 return;
1399         }
1400 
1401         /*
1402          * So we're here, and since we have a refheld IRE, we have a refheld
1403          * fixed and vnet. Do some of what ip_input_local_v4() does (inbound
1404          * checksum?  some ira checks?), but otherwise, swap the destination
1405          * address as mapped in "fixed", recompute any checksums, and send it
1406          * along its merry way (with a ttl decement too) to a VXLAN
1407          * destination.
1408          */
1409         mp = vxlnat_fixed_fixv4(mp, fixed, B_TRUE);
1410         if (mp == NULL)
1411                 return; /* Assume it's been freed & dtraced already. */
1412 
1413         /*
1414          * Otherwise, we're ready to transmit this packet over the vxlan
1415          * socket.
1416          */
1417         fixed->vxnf_remote = vxlnat_xmit_vxlanv4(mp, &fixed->vxnf_addr,
1418             fixed->vxnf_remote, fixed->vxnf_myether, vnet);
1419         if (fixed->vxnf_remote == NULL) {
1420                 /* XXX KEBE ASKS, DTrace probe here?  Or in-function? */
1421                 DTRACE_PROBE2(vxlnat__fixed__xmitdrop,
1422                     in6_addr_t *, &fixed->vxnf_addr,
1423                     uint32_t, VXLAN_ID_NTOH(vnet->vxnv_vnetid));
1424         }
1425         return;
1426 
1427 detach_ire_and_bail:
1428         /* Oh no, something's condemned.  Drop the IRE now. */
1429         ire->ire_recvfn = ire_recv_local_v4;
1430         ire->ire_dep_sib_next = NULL;
1431         VXNF_REFRELE(fixed);
1432         /* Pass the packet back... */
1433         ire_recv_local_v4(ire, mp, iph_arg, ira);
1434         return;
1435 }