1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2018 Joyent, Inc.
  14  */
  15 
/*
 * NAT engine.  The 1-1 (fixed) mappings are kept here.  The rules in
 * vxlnat_rules.c are only consulted if the 1-1 map (kept here) misses or if
 * the outbound lookup (vnetid, protocol, src-IP, dst-IP, src-port, dst-port)
 * misses.
 *
 * The plan is for inbound to hit conn_ts, whose conn_private points to
 * entries here.  The conn_recv* functions live here too (for now).
 */
  24 
  25 #include <sys/types.h>
  26 #include <sys/socket.h>
  27 #include <sys/ksynch.h>
  28 #include <sys/ksocket.h>
  29 #include <sys/kmem.h>
  30 #include <sys/stream.h>
  31 #include <sys/strsubr.h>
  32 #include <sys/strsun.h>
  33 #include <sys/sysmacros.h>
  34 #include <sys/debug.h>
  35 #include <sys/dtrace.h>
  36 #include <sys/errno.h>
  37 #include <sys/tihdr.h>
  38 #include <netinet/in.h>
  39 #include <netinet/udp.h>
  40 #include <inet/ip.h>
  41 #include <inet/ip6.h>
  42 #include <inet/udp_impl.h>
  43 #include <inet/tcp.h>
  44 
  45 #include <inet/vxlnat_impl.h>
  46 
  47 static boolean_t vxlnat_vxlan_input(ksocket_t, mblk_t *, size_t, int, void *);
  48 static mblk_t *vxlnat_fixed_fixv4(mblk_t *mp, vxlnat_fixed_t *fixed,
  49     boolean_t to_private);
  50 
  51 /*
  52  * Initialized to NULL, read/write protected by vxlnat_mutex.
  53  * Receive functions shouldn't have to access this directly.
  54  */
  55 ksocket_t vxlnat_underlay;
  56 ire_t *vxlnat_underlay_ire;
  57 
  58 void
  59 vxlnat_closesock(void)
  60 {
  61         ASSERT(MUTEX_HELD(&vxlnat_mutex));
  62         if (vxlnat_underlay_ire != NULL) {
  63                 ire_refrele(vxlnat_underlay_ire);
  64                 vxlnat_underlay_ire = NULL;
  65         }
  66         if (vxlnat_underlay != NULL) {
  67                 (void) ksocket_close(vxlnat_underlay, zone_kcred());
  68                 vxlnat_underlay = NULL;
  69         }
  70 }
  71 
  72 static int
  73 vxlnat_opensock(in6_addr_t *underlay_ip)
  74 {
  75         int rc, val;
  76         /* Assume rest is initialized to 0s. */
  77         struct sockaddr_in6 sin6 = {AF_INET6, BE_16(IPPORT_VXLAN)};
  78         ip_stack_t *ipst = vxlnat_netstack->netstack_ip;
  79 
  80         ASSERT(MUTEX_HELD(&vxlnat_mutex));
  81         /* Open... */
  82         rc = ksocket_socket(&vxlnat_underlay, AF_INET6, SOCK_DGRAM, 0,
  83             KSOCKET_SLEEP, zone_kcred());
  84         if (rc != 0)
  85                 return (rc);
  86 
  87         /* Bind... */
  88         sin6.sin6_addr = *underlay_ip;
  89         rc = ksocket_bind(vxlnat_underlay, (struct sockaddr *)(&sin6),
  90             sizeof (sin6), zone_kcred());
  91         if (rc != 0) {
  92                 vxlnat_closesock();
  93                 return (rc);
  94         }
  95 
  96         /* Use source-port hashing when sending packets out VXLAN... */
  97         val = UDP_HASH_VXLAN;
  98         rc = ksocket_setsockopt(vxlnat_underlay, IPPROTO_UDP,
  99             UDP_SRCPORT_HASH, &val, sizeof (val), kcred);
 100         if (rc != 0) {
 101                 vxlnat_closesock();
 102                 return (rc);
 103         }
 104 
 105         /*
 106          * Grab the IRE for underlay address.
 107          */
 108         ASSERT3P(vxlnat_underlay_ire, ==, NULL);
 109         vxlnat_underlay_ire = (IN6_IS_ADDR_V4MAPPED(underlay_ip)) ?
 110             ire_ftable_lookup_simple_v4(underlay_ip->_S6_un._S6_u32[3],
 111             0, ipst, NULL) :
 112             ire_ftable_lookup_simple_v6(underlay_ip, 0, ipst, NULL);
 113         if (vxlnat_underlay_ire == NULL) {
 114                 DTRACE_PROBE1(vxlnat__opensock__ire__fail, in6_addr_t *,
 115                     underlay_ip);
 116                 vxlnat_closesock();
 117                 return (EADDRNOTAVAIL);
 118         }
 119 
 120         /* Once we return from this, start eating data. */
 121         rc = ksocket_krecv_set(vxlnat_underlay, vxlnat_vxlan_input, NULL);
 122         if (rc != 0) {
 123                 vxlnat_closesock();
 124         }
 125 
 126         return (rc);
 127 }
 128 
 129 /*
 130  * Establish a VXLAN-listening kernel socket.
 131  * XXX KEBE ASKS ==> Support more than one VXLAN address?
 132  */
 133 /* ARGSUSED */
 134 int
 135 vxlnat_vxlan_addr(in6_addr_t *underlay_ip)
 136 {
 137         int rc;
 138 
 139         ASSERT(MUTEX_HELD(&vxlnat_mutex));
 140         /* For now, we make this a one-underlay-address-only solution. */
 141         vxlnat_closesock();
 142         rc = vxlnat_opensock(underlay_ip);
 143         return (rc);
 144 }
 145 
 146 /*
 147  * Free a remote VXLAN destination.
 148  */
 149 void
 150 vxlnat_remote_free(vxlnat_remote_t *remote)
 151 {
 152         ASSERT0(remote->vxnrem_refcount);
 153 
 154         kmem_free(remote, sizeof (*remote));
 155 }
 156 
 157 /*
 158  * Like other unlink functions, assume the appropriate lock is held.
 159  */
 160 void
 161 vxlnat_remote_unlink(vxlnat_remote_t *remote)
 162 {
 163         vxlnat_vnet_t *vnet = remote->vxnrem_vnet;
 164 
 165         ASSERT3P(vnet, !=, NULL);
 166         ASSERT(MUTEX_HELD(&vnet->vxnv_remote_lock));
 167 
 168         /* First unlink so nobody else can find me */
 169         avl_remove(&vnet->vxnv_remotes, remote);
 170 
 171         /*
 172          * We still hold a vnet reference, so races shouldn't be a problem.
 173          * Still, for added safety, NULL it out first.
 174          */
 175         remote->vxnrem_vnet = NULL;  /* Condemn this entry. */
 176         VXNV_REFRELE(vnet);
 177         VXNREM_REFRELE(remote); /* Internment release. */
 178 }
 179 
 180 /*
 181  * Find or create a remote VXLAN destination.
 182  */
 183 static vxlnat_remote_t *
 184 vxlnat_get_remote(vxlnat_vnet_t *vnet, in6_addr_t *remote_addr,
 185     boolean_t create_on_miss)
 186 {
 187         vxlnat_remote_t *remote, searcher;
 188         avl_index_t where;
 189 
 190         searcher.vxnrem_addr = *remote_addr;
 191         mutex_enter(&vnet->vxnv_remote_lock);
 192         remote = avl_find(&vnet->vxnv_remotes, &searcher, &where);
 193         if (remote == NULL && create_on_miss) {
 194                 /* Not as critical if we can't allocate here. */
 195                 remote = kmem_zalloc(sizeof (*remote),
 196                     KM_NOSLEEP | KM_NORMALPRI);
 197                 if (remote != NULL) {
 198                         remote->vxnrem_addr = *remote_addr;
 199                         remote->vxnrem_refcount = 1; /* Internment reference. */
 200                         VXNV_REFHOLD(vnet);
 201                         remote->vxnrem_vnet = vnet;
 202                         /* Rest is filled in by caller. */
 203                         avl_insert(&vnet->vxnv_remotes, remote, where);
 204                 }
 205         }
 206         if (remote != NULL)
 207                 VXNREM_REFHOLD(remote);
 208         mutex_exit(&vnet->vxnv_remote_lock);
 209         return (remote);
 210 }
 211 
 212 /*
 213  * Cache inbound packet information in the vnet's remotes section.
 214  *
 215  * NOTE: This function assumes a trustworthy underlay network.  If the
 216  * underlay isn't trustworthy, this function should be renamed, and reduced to
 217  * a "strip and reality-check the ethernet header" function.
 218  *
 219  * Caller has stripped any pre-ethernet data from mp.  We return mp
 220  * stripped down to its IP header.
 221  */
 222 static mblk_t *
 223 vxlnat_cache_remote(mblk_t *mp, struct sockaddr_in6 *underlay_src,
 224     vxlnat_vnet_t *vnet)
 225 {
 226         struct ether_vlan_header *evh;
 227         struct ether_header *eh;
 228         vxlnat_remote_t *remote;
 229         uint16_t vlan, ethertype;
 230         ether_addr_t remote_ether;
 231         ipha_t *ipha;
 232         ip6_t *ip6h;
 233         in6_addr_t remote_addr;
 234 
 235         /* Assume (for now) we have at least a VLAN header's worth of data. */
 236         if (MBLKL(mp) < sizeof (*evh)) {
 237                 /* XXX KEBE ASKS - should we be more forgiving? */
 238                 DTRACE_PROBE1(vxlnat__in__drop__etherhdr, mblk_t *, mp);
 239                 freemsg(mp);
 240                 return (NULL);
 241         }
 242 
 243         eh = (struct ether_header *)mp->b_rptr;
 244         ethertype = ntohs(eh->ether_type);
 245         ether_copy(&eh->ether_shost, &remote_ether);
 246         if (ethertype == ETHERTYPE_VLAN) {
 247                 evh = (struct ether_vlan_header *)eh;
 248                 /* Keep it in network order... */
 249                 vlan = evh->ether_tci;
 250                 ethertype = ntohs(evh->ether_type);
 251                 ASSERT(vlan != 0);
 252                 mp->b_rptr += sizeof (*evh);
 253         } else {
 254                 evh = NULL;
 255                 vlan = 0;
 256                 mp->b_rptr += sizeof (*eh);
 257         }
 258         if (ethertype != ETHERTYPE_IP && ethertype != ETHERTYPE_IPV6) {
 259                 /*
 260                  * XXX KEBE SAYS for now, don't handle non-IP packets.
 261                  * This includes ARP.
 262                  */
 263                 DTRACE_PROBE1(vxlnat__in__drop__nonip, mblk_t *, mp);
 264                 freemsg(mp);
 265                 return (NULL);
 266         }
 267 
 268         /* Handle case of split ether + IP headers. */
 269         if (MBLKL(mp) < sizeof (ipha_t)) {
 270                 mblk_t *freemp;
 271                 
 272                 if (MBLKL(mp) > 0 || mp->b_cont == NULL) {
 273                         /* The IP header is split ACROSS MBLKS! Bail for now. */
 274                         DTRACE_PROBE1(vxlnat__in__drop__splitip, mblk_t *, mp);
 275                         freemsg(mp);
 276                         return (NULL);
 277                 }
 278                 freemp = mp;
 279                 mp = mp->b_cont;
 280                 freeb(freemp);
 281         }
 282         /* LINTED -- alignment... */
 283         ipha = (ipha_t *)mp->b_rptr;
 284 
 285         if (IPH_HDR_VERSION(ipha) == IPV4_VERSION) {
 286                 if (ethertype != ETHERTYPE_IP) {
 287                         /* XXX KEBE ASKS - should we be more forgiving? */
 288                         DTRACE_PROBE1(vxlnat__in__drop__etherhdr4,
 289                             mblk_t *, mp);
 290                         freemsg(mp);
 291                         return (NULL);
 292                 }
 293                 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
 294                     &remote_addr);
 295         } else {
 296                 if (ethertype != ETHERTYPE_IPV6 ||
 297                     IPH_HDR_VERSION(ipha) != IPV6_VERSION ||
 298                     MBLKL(mp) < sizeof (ip6_t)) {    
 299                         /* XXX KEBE ASKS - should we be more forgiving? */
 300                         DTRACE_PROBE1(vxlnat__in__drop__etherhdr6,
 301                             mblk_t *, mp);
 302                         freemsg(mp);
 303                         return (NULL);
 304                 }
 305                 ip6h = (ip6_t *)ipha;
 306                 remote_addr = ip6h->ip6_src;
 307         }
 308 
 309         /* Find remote and replace OR create new remote. */
 310         remote = vxlnat_get_remote(vnet, &remote_addr, B_TRUE);
 311         if (remote != NULL) {
 312                 /*
 313                  * See if this entry needs fixing or filling-in.  This might
 314                  * get a bit racy with read-only threads that actually
 315                  * transmit, but it only means dropped-packets in the worst
 316                  * case.
 317                  *
 318                  * It's THIS PART that inspires the warning about trusting the
 319                  * underlay network.
 320                  *
 321                  * XXX KEBE ASKS -- should we just replace things w/o checking?
 322                  */
 323                 /* Replace the ethernet address? */
 324                 if (ether_cmp(&remote->vxnrem_ether, &remote_ether) != 0)
 325                         ether_copy(&remote_ether, &remote->vxnrem_ether);
 326                 /*
 327                  * Replace the underlay? NOTE: Fix if/when underlay becomes
 328                  * IPv6.
 329                  */
 330                 if (!IN6_ARE_ADDR_EQUAL(&remote->vxnrem_uaddr,
 331                     &underlay_src->sin6_addr)) {
 332                         remote->vxnrem_uaddr = underlay_src->sin6_addr;
 333                 }
 334                 /* Replace the vlan ID. Maintain network order... */
 335                 if (remote->vxnrem_vlan != vlan)
 336                         remote->vxnrem_vlan = vlan;
 337         }
 338         /*
 339          * Else just continue and pray for better luck on another packet or
 340          * on the return flight.  It is IP, we can Just Drop It (TM)...
 341          */
 342 
 343         /* We're done with the remote entry now. */
 344         VXNREM_REFRELE(remote);
 345 
 346         /* Advance rptr to the inner IP header and proceed. */
 347         mp->b_rptr = (uint8_t *)ipha;
 348         return (mp);
 349 }
 350 
 351 /*
 352  * See if the inbound VXLAN packet hits a 1-1/fixed mapping, and process if it
 353  * does.  B_TRUE means the packet was handled, and we shouldn't continue
 354  * processing (even if "was handled" means droppage).
 355  */
 356 static boolean_t
 357 vxlnat_one_vxlan_fixed(vxlnat_vnet_t *vnet, mblk_t *mp, ipha_t *ipha,
 358     ip6_t *ip6h)
 359 {
 360         vxlnat_fixed_t *fixed, fsearch;
 361         mblk_t *newmp;
 362         ire_t *outbound_ire;
 363         /* Use C99's initializers for fun & profit. */
 364         ip_recv_attr_t iras = { IRAF_IS_IPV4 | IRAF_VERIFIED_SRC };
 365 
 366         if (ipha != NULL) {
 367                 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
 368                     &fsearch.vxnf_addr);
 369         } else {
 370                 /* vxlnat_cache_remote() did reality checks... */
 371                 ASSERT(ipha == NULL && ip6h != NULL);
 372                 fsearch.vxnf_addr = ip6h->ip6_src;
 373         }
 374 
 375         rw_enter(&vnet->vxnv_fixed_lock, RW_READER);
 376         fixed = avl_find(&vnet->vxnv_fixed_ips, &fsearch, NULL);
 377         if (fixed != NULL)
 378                 VXNF_REFHOLD(fixed);
 379         rw_exit(&vnet->vxnv_fixed_lock);
 380         if (fixed == NULL)
 381                 return (B_FALSE);       /* Try another method of processing. */
 382 
 383         newmp = NULL;
 384         /*
 385          * XXX KEBE ASKS --> Do an MTU check NOW?!  That way, we have
 386          * pre-natted data.  One gotcha, external dests may have
 387          * different PathMTUs so see below about EMSGSIZE...
 388          *
 389          * For now, let the post-NAT crunch through
 390          * ire_recv_forward_v4() take care of all of that.
 391          */
 392 
 393         if (ipha != NULL)
 394                 newmp = vxlnat_fixed_fixv4(mp, fixed, B_FALSE);
 395         else {
 396                 freemsg(mp); /* XXX handle ip6h */
 397                 return (B_TRUE);
 398         }
 399 
 400         if (newmp == NULL)
 401                 return (B_TRUE);        /* mp eaten by vxlnat_fixed_fixv4() */
 402 
 403 
 404         ASSERT3P(ipha, ==, newmp->b_rptr);
 405         /* XXX KEBE ASKS, IRR_ALLOCATE okay?!? */
 406         /* XXX KEBE SAYS XMIT HINT! */
 407         outbound_ire = ire_route_recursive_dstonly_v4(ipha->ipha_dst,
 408             IRR_ALLOCATE, 0, vxlnat_netstack->netstack_ip);
 409         VERIFY3P(outbound_ire, !=, NULL);
 410         if (outbound_ire->ire_type == IRE_NOROUTE) {
 411                 /* Bail! */
 412                 DTRACE_PROBE2(vxlnat__in__drop__fixedire, ipaddr_t,
 413                     ipha->ipha_dst, mblk_t *, mp);
 414                 VXNF_REFRELE(fixed);
 415                 freemsg(mp);
 416                 return (B_TRUE);
 417         }
 418 
 419         iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
 420         if (iras.ira_ip_hdr_length > sizeof (ipha_t))
 421                 iras.ira_flags |= IRAF_IPV4_OPTIONS;
 422         iras.ira_xmit_hint = 0; /* XXX KEBE SAYS FIX ME! */
 423         iras.ira_zoneid = outbound_ire->ire_zoneid;
 424         iras.ira_pktlen = ntohs(ipha->ipha_length);
 425         iras.ira_protocol = ipha->ipha_protocol;
 426         /* XXX KEBE ASKS rifindex & ruifindex ?!? */
 427         /*
 428          * NOTE: AT LEAST ira_ill needs ILLF_ROUTER set, as
 429          * well as the ill for the external NIC (where
 430          * off-link destinations live).  For fixed, ira_ill
 431          * should be the ill of the external source.
 432          */
 433         iras.ira_rill = vxlnat_underlay_ire->ire_ill;
 434         iras.ira_ill = fixed->vxnf_ire->ire_ill;
 435         /* XXX KEBE ASKS cred & cpid ? */
 436         iras.ira_verified_src = ipha->ipha_src;
 437         /* XXX KEBE SAYS don't sweat IPsec stuff. */
 438         /* XXX KEBE SAYS ALSO don't sweat l2src & mhip */
 439 
 440         /* Okay, we're good! Let's pretend we're forwarding. */
 441         ire_recv_forward_v4(outbound_ire, mp, ipha, &iras);
 442         ire_refrele(outbound_ire);
 443 
 444         return (B_TRUE);
 445 }
 446 
 447 /*
 448  * Process exactly one VXLAN packet.
 449  */
 450 static void
 451 vxlnat_one_vxlan(mblk_t *mp, struct sockaddr_in6 *underlay_src)
 452 {
 453         vxlan_hdr_t *vxh;
 454         vxlnat_vnet_t *vnet;
 455         ipha_t *ipha;
 456         ip6_t *ip6h;
 457 
 458         if (MBLKL(mp) < sizeof (*vxh)) {
 459                 /* XXX KEBE ASKS -- should we be more forgiving? */
 460                 DTRACE_PROBE1(vxlnat__in__drop__vxlsize, mblk_t *, mp);
 461                 freemsg(mp);
 462                 return;
 463         }
 464         vxh = (vxlan_hdr_t *)mp->b_rptr;
 465 
 466         /* If we start using more than just the one flag, fix it. */
 467         if (vxh->vxlan_flags != VXLAN_F_VDI_WIRE) {
 468                 DTRACE_PROBE1(vxlnat__in__drop__VDI, mblk_t *, mp);
 469                 freemsg(mp);
 470                 return;
 471         }
 472 
 473         /* Remember, we key off of what's on the wire. */
 474         vnet = vxlnat_get_vnet(VXLAN_ID_WIRE32(vxh->vxlan_id), B_FALSE);
 475         if (vnet == NULL) {
 476                 DTRACE_PROBE1(vxlnat__in__drop__vnetid, uint32_t,
 477                     VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)));
 478                 freemsg(mp);
 479                 return;
 480         }
 481 
 482         DTRACE_PROBE2(vxlnat__in__vnet, uint32_t,
 483             VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)),
 484             vxlnat_vnet_t, vnet);
 485 
 486         /*
 487          * Off-vxlan processing steps:
 488          * 1.) Locate the ethernet header and check/update/add-into remotes.
 489          * 2.) Search 1-1s, process if hit.
 490          * 3.) Search flows, process if hit.
 491          * 4.) Search rules, create new flow (or not) if hit.
 492          * 5.) Drop the packets.
 493          */
 494 
 495         /* 1.) Locate the ethernet header and check/update/add-into remotes. */
 496         mp->b_rptr += sizeof (*vxh);
 497         while (MBLKL(mp) == 0) {
 498                 mblk_t *oldmp = mp;
 499 
 500                 mp = mp->b_cont;
 501                 freeb(oldmp);
 502         }
 503         mp = vxlnat_cache_remote(mp, underlay_src, vnet);
 504         if (mp == NULL)
 505                 goto bail_no_free;
 506 
 507         /* Let's cache the IP header here... */
 508         ipha = (ipha_t *)mp->b_rptr;
 509         switch (IPH_HDR_VERSION(ipha)) {
 510         case IPV4_VERSION:
 511                 ip6h = NULL;
 512                 break;
 513         case IPV6_VERSION:
 514                 ip6h = (ip6_t *)ipha;
 515                 ipha = NULL;
 516                 break;
 517         default:
 518                 DTRACE_PROBE2(vxlnat__in__drop__ipvers, int,
 519                     IPH_HDR_VERSION(ipha), mblk_t *, mp);
 520                 goto bail_and_free;
 521         }
 522 
 523         /* 2.) Search 1-1s, process if hit. */
 524         if (vxlnat_one_vxlan_fixed(vnet, mp, ipha, ip6h))
 525                 goto bail_no_free;      /* Success means mp was consumed. */
 526 
 527 #ifdef notyet
 528         /* 3.) Search flows, process if hit. */
 529         if (vxlnat_one_vxlan_flow(vnet, mp, ipha, ip6h))
 530                 goto bail_no_free;      /* Success means mp was consumed. */
 531 
 532         /* 4.) Search rules, create new flow (or not) if hit. */
 533         if (vxlnat_one_vxlan_rule(vnet, mp, ipha, ip6h))
 534                 goto bail_no_free;      /* Success means mp was consumed. */
 535 #endif
 536 
 537         /* 5.) Nothing, drop the packet. */
 538 
 539         DTRACE_PROBE2(vxlnat__in___drop__nohits, vxlnat_vnet_t *, vnet,
 540             mblk_t *, mp);
 541 
 542 bail_and_free:
 543         freemsg(mp);
 544 bail_no_free:
 545         VXNV_REFRELE(vnet);
 546 }
 547 /*
 548  * ONLY return B_FALSE if we get a packet-clogging event.
 549  */
 550 /* ARGSUSED */
 551 static boolean_t
 552 vxlnat_vxlan_input(ksocket_t insock, mblk_t *chain, size_t msgsize, int oob,
 553     void *ignored)
 554 {
 555         mblk_t *mp, *nextmp;
 556 
 557         /*
 558          * XXX KEBE ASKS --> move hold & release outside of loop?
 559          * If so, hold rwlock here.
 560          */
 561 
 562         for (mp = chain; mp != NULL; mp = nextmp) {
 563                 struct T_unitdata_ind *tudi;
 564                 struct sockaddr_in6 *sin6;
 565 
 566                 nextmp = mp->b_next;
 567                 if (DB_TYPE(mp) != M_PROTO || mp->b_cont == NULL) {
 568                         DTRACE_PROBE1(vxlnat__in__drop__mblk, mblk_t *, mp);
 569                         freemsg(mp);
 570                         continue;
 571                 }
 572 
 573                 /* LINTED -- aligned */
 574                 tudi = (struct T_unitdata_ind *)mp->b_rptr;
 575                 if (tudi->PRIM_type != T_UNITDATA_IND) {
 576                         DTRACE_PROBE1(vxlnat__in__drop__TPI, mblk_t *, mp);
 577                         freemsg(mp);
 578                         continue;
 579                 }
 580                 /* LINTED -- aligned */
 581                 sin6 = (struct sockaddr_in6 *)(mp->b_rptr + tudi->SRC_offset);
 582                 VERIFY(sin6->sin6_family == AF_INET6);
 583                 VERIFY(tudi->SRC_length >= sizeof (*sin6));
 584 
 585                 vxlnat_one_vxlan(mp->b_cont, sin6);
 586                 freeb(mp);
 587         }
 588 
 589         return (B_TRUE);
 590 }
 591 
 592 /*
 593  * Use RFC 1141's technique (with a check for -0).
 594  *
 595  * newsum = oldsum - (new16a + old16a - new16b + old16b ...);
 596  *
 597  * NOTE: "oldsum" is right off the wire in wire-native order.
 598  * NOTE2: "old" and "new" ALSO point to things in wire-native order.
 599  * NOTE3:  THIS MUST TAKE A MULTIPLE OF 2 BYTES (i.e. uint16_t array).
 600  * NOTE4: The 32-bit running sum means we can't take len > 64k.
 601  */
 602 uint16_t
 603 vxlnat_cksum_adjust(uint16_t oldsum, uint16_t *old, uint16_t *new, uint_t len)
 604 {
 605         uint32_t newsum = ntohs(oldsum);
 606 
 607         ASSERT((len & 0x1) == 0);
 608         while (len != 0) {
 609                 newsum -= ntohs(*new);
 610                 newsum += ntohs(*old);
 611                 len -= 2;
 612                 old++;
 613                 new++;
 614         }
 615         newsum += (newsum >> 16) & 0xffff;
 616 
 617         return (newsum == 0xffff ? 0 : htons(newsum));
 618 }
 619 
 620 /*
 621  * Fix inner headers on an ICMP packet.
 622  *
 623  * XXX KEBE SAYS FOR NOW, just do addresses for 1-1/fixed.  When we do
 624  * flows, include old_port/new_port as well.
 625  */
 626 static mblk_t *
 627 vxlnat_fix_icmp_inner_v4(mblk_t *mp, icmph_t *icmph, ipaddr_t old_one,
 628     ipaddr_t new_one, boolean_t to_private)
 629 {
 630         mblk_t *newmp;
 631         ipha_t *inner_ipha;
 632         ipaddr_t *new_ones_place;
 633 
 634         if ((uint8_t *)(icmph + 1) + sizeof (ipha_t) > mp->b_wptr) {
 635                 /* Pay the pullup tax. */
 636                 newmp = msgpullup(mp, -1);
 637                 freemsg(mp);
 638                 if (newmp == NULL) {
 639                         DTRACE_PROBE1(vxlnat__fixicmp__pullupfail, void *,
 640                             NULL);
 641                         return (NULL);
 642                 }
 643                 if (MBLKL(newmp) < 2 * sizeof (ipha_t) + sizeof (icmph_t)) {
 644                         /* Wow! Too-tiny ICMP packet. */
 645                         DTRACE_PROBE1(vxlnat__fixicmp__tootiny, mblk_t *,
 646                             newmp);
 647                         freeb(newmp);
 648                         return (NULL);
 649                 }
 650                 mp = newmp;
 651                 /* Temporarily use inner_ipha for the outer one. */
 652                 inner_ipha = (ipha_t *)mp->b_rptr;
 653                 icmph = (icmph_t *)(mp->b_rptr + IPH_HDR_LENGTH(inner_ipha));
 654         }
 655         inner_ipha = (ipha_t *)(icmph + 1);
 656         new_ones_place = to_private ?
 657             &inner_ipha->ipha_src : &inner_ipha->ipha_dst;
 658         if (*new_ones_place != old_one) {
 659                 /* Either I'm buggy or the packet is. */
 660                 DTRACE_PROBE2(vxlnat__fixicmp__badinneraddr, ipaddr_t,
 661                     old_one, ipaddr_t, *new_ones_place);
 662                 freeb(mp);
 663                 return (NULL);
 664         }
 665         *new_ones_place = new_one;
 666 
 667         /* Adjust ICMP checksum... */
 668         icmph->icmph_checksum = vxlnat_cksum_adjust(icmph->icmph_checksum,
 669             (uint16_t *)&old_one, (uint16_t *)&new_one, sizeof (ipaddr_t));
 670 
 671         /*
 672          * XXX KEBE ASKS, recompute *inner-packet* checksums?  Let's not for
 673          * now, but consider this Fair Warning (or some other VH album...).
 674          */
 675         return (mp);
 676 }
 677 
 678 /*
 679  * Take a 1-1/fixed IPv4 packet and convert it for transmission out the
 680  * appropriate end. "to_private" is what it says on the tin.
 681  * ALWAYS consumes "mp", regardless of return value.
 682  */
 683 static mblk_t *
 684 vxlnat_fixed_fixv4(mblk_t *mp, vxlnat_fixed_t *fixed, boolean_t to_private)
 685 {
 686         ipaddr_t new_one, old_one;
 687         ipaddr_t *new_ones_place;
 688         ipha_t *ipha = (ipha_t *)mp->b_rptr;
 689         uint8_t *nexthdr, *end_wptr;
 690 
 691         if (to_private) {
 692                 IN6_V4MAPPED_TO_IPADDR(&fixed->vxnf_addr, new_one);
 693                 new_ones_place = &ipha->ipha_dst;
 694         } else {
 695                 IN6_V4MAPPED_TO_IPADDR(&fixed->vxnf_pubaddr, new_one);
 696                 new_ones_place = &ipha->ipha_src;
 697         }
 698 
 699         old_one = *new_ones_place;
 700         *new_ones_place = new_one;
 701 
 702         /*
 703          * Recompute the IP header checksum, and check for the TCP or UDP
 704          * checksum as well, as they'll need recomputing as well.
 705          */
 706 
 707         /* First, the IPv4 header itself. */
 708         ipha->ipha_hdr_checksum = vxlnat_cksum_adjust(ipha->ipha_hdr_checksum,
 709             (uint16_t *)&old_one, (uint16_t *)&new_one, sizeof (ipaddr_t));
 710 
 711         nexthdr = (uint8_t *)ipha + IPH_HDR_LENGTH(ipha);
 712         if (nexthdr >= mp->b_wptr) {
 713                 nexthdr = mp->b_cont->b_rptr +
 714                     (MBLKL(mp) - IPH_HDR_LENGTH(ipha));
 715                 end_wptr = mp->b_cont->b_wptr;
 716         } else {
 717                 end_wptr = mp->b_wptr;
 718         }
 719 
 720         switch (ipha->ipha_protocol) {
 721         case IPPROTO_TCP: {
 722                 tcpha_t *tcph = (tcpha_t *)nexthdr;
 723 
 724                 if (nexthdr + sizeof (*tcph) >= end_wptr) {
 725                         /* Bail for now. */
 726                         DTRACE_PROBE1(vxlnat__fix__tcp__mblkspan, mblk_t *,
 727                             mp);
 728                         freemsg(mp);
 729                         return (NULL);
 730                 }
 731                 tcph->tha_sum = vxlnat_cksum_adjust(tcph->tha_sum,
 732                     (uint16_t *)&old_one, (uint16_t *)&new_one,
 733                     sizeof (ipaddr_t));
 734                 break;  /* Out of switch. */
 735         }
 736         case IPPROTO_UDP: {
 737                 udpha_t *udph = (udpha_t *)nexthdr;
 738 
 739                 if (nexthdr + sizeof (*udph) >= end_wptr) {
 740                         /* Bail for now. */
 741                         DTRACE_PROBE1(vxlnat__fix__udp__mblkspan, mblk_t *,
 742                             mp);
 743                         freemsg(mp);
 744                         return (NULL);
 745                 }
 746                 udph->uha_checksum = vxlnat_cksum_adjust(udph->uha_checksum,
 747                     (uint16_t *)&old_one, (uint16_t *)&new_one,
 748                     sizeof (ipaddr_t));
 749                 break;  /* Out of switch. */
 750         }
 751         case IPPROTO_ICMP: {
 752                 icmph_t *icmph = (icmph_t *)nexthdr;
 753 
 754                 /*
 755                  * We need to check the case of ICMP messages that contain
 756                  * IP packets.  We will need to at least change the addresses,
 757                  * and *maybe* the checksums too if necessary.
 758                  *
 759                  * This may replicate some of icmp_inbound_v4(), alas.
 760                  */
 761                 if (nexthdr + sizeof (*icmph) >= end_wptr) {
 762                         mblk_t *newmp;
 763                         /*
 764                          * Unlike the others, we're going to pay the pullup
 765                          * tax here.
 766                          */
 767                         newmp = msgpullup(mp, -1);
 768                         freemsg(mp);
 769                         if (newmp == NULL) {
 770                                 DTRACE_PROBE1(vxlnat__icmp__pullupfail, void *,
 771                                     NULL);
 772                                 return (NULL);
 773                         }
 774                         mp = newmp;
 775                         ipha = (ipha_t *)(mp->b_rptr);
 776                         nexthdr = (uint8_t *)ipha + IPH_HDR_LENGTH(ipha);
 777                         icmph = (icmph_t *)nexthdr;
 778                 }
 779 
 780                 switch (icmph->icmph_type) {
 781                 case ICMP_ADDRESS_MASK_REPLY:
 782                 case ICMP_ADDRESS_MASK_REQUEST:
 783                 case ICMP_TIME_STAMP_REPLY:
 784                 case ICMP_TIME_STAMP_REQUEST:
 785                 case ICMP_ECHO_REQUEST:
 786                 case ICMP_ECHO_REPLY:
 787                         /* These merely need to get passed along. */
 788                         break;
 789                 case ICMP_ROUTER_ADVERTISEMENT:
 790                 case ICMP_ROUTER_SOLICITATION:
 791                         /* These shouldn't be traversing a NAT at all. Drop. */
 792                         DTRACE_PROBE1(vxlnat__icmp__cantpass, int,
 793                             icmph->icmph_type);
 794                         freemsg(mp);
 795                         return (NULL);
 796                 case ICMP_PARAM_PROBLEM:
 797                 case ICMP_TIME_EXCEEDED:
 798                 case ICMP_DEST_UNREACHABLE:
 799                         /* These include inner-IP headers we need to adjust. */
 800                         mp = vxlnat_fix_icmp_inner_v4(mp, icmph, old_one,
 801                             new_one, to_private);
 802                         break;
 803                 default:
 804                         /* Pass along to receiver, but warn. */
 805                         DTRACE_PROBE1(vxlnat__icmp__unknown, int,
 806                             icmph->icmph_type);
 807                         break;
 808                 }
 809         }
 810         /* Otherwise we can't make any other assumptions for now... */
 811         default:
 812                 break;
 813         }
 814 
 815         return (mp);
 816 }
 817 
 818 vxlnat_remote_t *
 819 vxlnat_xmit_vxlanv4(mblk_t *mp, in6_addr_t *overlay_dst,
 820     vxlnat_remote_t *remote, uint8_t *myether, vxlnat_vnet_t *vnet)
 821 {
 822         struct sockaddr_in6 sin6 = {AF_INET6};
 823         struct msghdr msghdr = {NULL};
 824         mblk_t *vlan_mp;
 825         extern uint_t vxlan_alloc_size, vxlan_noalloc_min;
 826         vxlan_hdr_t *vxh;
 827         struct ether_vlan_header *evh;
 828         int rc;
 829         cred_t *cred;
 830 
 831         if (remote == NULL || remote->vxnrem_vnet == NULL) {
 832                 DTRACE_PROBE1(vxlnat__xmit__vxlanv4, vxlnat_remote_t *, remote);
 833                 /* Release the condemned remote. */
 834                 if (remote != NULL)
 835                         VXNREM_REFRELE(remote);
 836 
 837                 /* See if we have a remote ready to use... */
 838                 remote = vxlnat_get_remote(vnet, overlay_dst, B_FALSE);
 839 
 840                 if (remote == NULL) {
 841                         /*
 842                          * We need to do the moral equivalent of PF_KEY
 843                          * ACQUIRE or overlay's queue-resolve so that we can
 844                          * have someone in user-space send me a remote.  Until
 845                          * then, drop the reference if condemned, free the
 846                          * message, and return NULL.
 847                          */
 848 
 849                         freemsg(mp);
 850                         return (NULL);
 851                 }
 852         }
 853         ASSERT(vnet == remote->vxnrem_vnet);
 854 
 855         if (DB_REF(mp) > 1 || mp->b_rptr - vxlan_noalloc_min < DB_BASE(mp)) {
 856                 vlan_mp = allocb(vxlan_alloc_size, BPRI_HI);
 857                 if (vlan_mp == NULL) {
 858                         DTRACE_PROBE1(vxlnat__xmit__vxlanv4__allocfail,
 859                             vxlnat_remote_t *, remote);
 860                         freemsg(mp);
 861                         /* Just drop the packet, but don't tell caller. */
 862                         return (remote);
 863                 }
 864                 vlan_mp->b_wptr = DB_LIM(vlan_mp);
 865                 vlan_mp->b_rptr = vlan_mp->b_wptr;
 866                 vlan_mp->b_cont = mp;
 867         } else {
 868                 vlan_mp = mp;
 869         }
 870         vlan_mp->b_rptr -= sizeof (*vxh) + sizeof (*evh);
 871         vxh = (vxlan_hdr_t *)vlan_mp->b_rptr;
 872         vxh->vxlan_flags = VXLAN_F_VDI_WIRE;
 873         vxh->vxlan_id = vnet->vxnv_vnetid;        /* Already in wire-order. */
 874 
 875         /* Fill in the Ethernet header. */
 876         evh = (struct ether_vlan_header *)(vxh + 1);
 877         ether_copy(&remote->vxnrem_ether, &evh->ether_dhost);
 878         ether_copy(myether, &evh->ether_shost);
 879         evh->ether_tpid = htons(ETHERTYPE_VLAN);
 880         evh->ether_tci = remote->vxnrem_vlan;
 881         evh->ether_type = htons(ETHERTYPE_IP);
 882 
 883         msghdr.msg_name = (struct sockaddr_storage *)&sin6;
 884         msghdr.msg_namelen = sizeof (sin6);
 885         /* Address family and other zeroing already done up top. */
 886         sin6.sin6_port = htons(IPPORT_VXLAN);
 887         sin6.sin6_addr = remote->vxnrem_uaddr;
 888         
 889         /*
 890          * cred_t dance is because we may be getting this straight from
 891          * interrupt context.
 892          */
 893         cred = zone_get_kcred(netstack_get_zoneid(vxlnat_netstack));
 894         if (cred == NULL) {
 895                 DTRACE_PROBE1(vxlnat__xmit__vxlan4__credfail, 
 896                     vxlnat_remote_t *, remote);
 897                 freemsg(vlan_mp);
 898         }
 899         /*
 900          * Use MSG_DONTWAIT to avoid blocks, esp. if we're getting this
 901          * straight from interrupt context.
 902          */
 903         rc = ksocket_sendmblk(vxlnat_underlay, &msghdr, MSG_DONTWAIT, &vlan_mp,
 904             cred);
 905         crfree(cred);
 906         if (rc != 0) {
 907                 DTRACE_PROBE2(vxlnat__xmit__vxlan4__sendfail, int, rc,
 908                     vxlnat_remote_t *, remote);
 909                 freemsg(vlan_mp);
 910         }
 911         return (remote);
 912 }
 913 
 914 /*
 915  * New ire_{recv,send}fn implementations if we're doing 1-1 mappings.
 916  */
 917 int
 918 vxlnat_fixed_ire_send_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
 919     ip_xmit_attr_t *ixa, uint32_t *identp)
 920 {
 921         /* XXX KEBE SAYS FILL ME IN, but for now... */
 922         freemsg(mp);
 923         return (EOPNOTSUPP);
 924 }
 925 
 926 void
 927 vxlnat_fixed_ire_recv_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
 928     ip_recv_attr_t *ira)
 929 {
 930         /* XXX KEBE SAYS FILL ME IN, but for now... */
 931         freemsg(mp);
 932 }
 933 
 934 /*
 935  * I believe the common case for this will be from self-generated ICMP
 936  * messages.  Other same-netstack-originated traffic will also come through
 937  * here (one internal reaching what turns out to be another internal).
 938  */
 939 int
 940 vxlnat_fixed_ire_send_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
 941     ip_xmit_attr_t *ixa, uint32_t *identp)
 942 {
 943         ip_recv_attr_t iras;    /* NOTE: No bzero because we pay more later */
 944         ipha_t *ipha = (ipha_t *)iph_arg;
 945 
 946         /*
 947          * XXX KEBE ASKS, any DTrace probes or other instrumentation that
 948          * perhaps should be set?
 949          */
 950 
 951         /* Map ixa to ira. */
 952         iras.ira_pktlen = ixa->ixa_pktlen;
 953         /* XXX KEBE ASKS more?!? */
 954 
 955         /*
 956          * In normal TCP/IP processing, this shortcuts the IP header checksum
 957          * AND POSSIBLY THE ULP checksum cases.  Since this is likely to head
 958          * back into the internal network, we need to recompute things again.
 959          */
 960         if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
 961                 freemsg(mp);
 962                 return (EMSGSIZE);
 963         }
 964 #if 0
 965         /* XXX KEBE ASKS Special-case ICMP here? */
 966         if (ipha->ipha_protocol == IPPROTO_ICMP) {
 967                 icmph_t *icmph;
 968 
 969                 icmph = (icmph_t *)((uint8_t *)ipha + IPH_HDR_LENGTH(ipha));
 970                 if ((uint8_t *)icmph >= mp->b_wptr) {
 971                         freemsg(mp);
 972                         return (EMSGSIZE);
 973                 }
 974                 icmph->icmph_checksum = 0;
 975                 icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
 976         }
 977 #endif
 978 
 979         vxlnat_fixed_ire_recv_v4(ire, mp, iph_arg, &iras);
 980 
 981         return (0);
 982 }
 983 
 984 void
 985 vxlnat_fixed_ire_recv_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
 986     ip_recv_attr_t *ira)
 987 {
 988         vxlnat_fixed_t *fixed;
 989         vxlnat_vnet_t *vnet;
 990         ipha_t *ipha = (ipha_t *)iph_arg;
 991         int newmtu;
 992 
 993         /* Make a note for DAD that this address is in use */
 994         ire->ire_last_used_time = LBOLT_FASTPATH;
 995 
 996         /* Only target the IRE_LOCAL with the right zoneid. */
 997         ira->ira_zoneid = ire->ire_zoneid;
 998 
 999         /*
1000          * XXX KEBE ASKS, any DTrace probes or other instrumentation that
1001          * perhaps should be set?
1002          */
1003 
1004         /*
1005          * Reality check some things.
1006          */
1007         fixed = (vxlnat_fixed_t *)ire->ire_dep_sib_next;
1008         vnet = fixed->vxnf_vnet;
1009 
1010         ASSERT3P(ire, ==, fixed->vxnf_ire);
1011 
1012         if (IRE_IS_CONDEMNED(ire) || vnet == NULL)
1013                 goto detach_ire_and_bail;
1014 
1015         /*
1016          * Not a common-case, but a possible one.  If our underlay MTU is
1017          * smaller than the external MTU, it is possible that we will have a
1018          * size mismatch and therefore need to either fragment at the VXLAN
1019          * layer (VXLAN UDP packet sent as two or more IP fragments) OR
1020          * if IPH_DF is set, send an ICMP_NEEDS_FRAGMENTATION back to the
1021          * sender.  Perform the check here BEFORE we NAT the packet.
1022          */
1023         ASSERT(vxlnat_underlay_ire->ire_ill != NULL);
1024         newmtu = vxlnat_underlay_ire->ire_ill->ill_mtu - sizeof (ipha_t) -
1025             sizeof (udpha_t) - sizeof (vxlan_hdr_t) -
1026             sizeof (struct ether_vlan_header);
1027         if ((ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_DF) &&
1028             ntohs(ipha->ipha_length) > newmtu) {
1029                 icmp_frag_needed(mp, newmtu, ira);
1030                 /* We're done.  Assume icmp_frag_needed() consumed mp. */
1031                 return;
1032         }
1033 
1034         /*
1035          * So we're here, and since we have a refheld IRE, we have a refheld
1036          * fixed and vnet. Do some of what ip_input_local_v4() does (inbound
1037          * checksum?  some ira checks?), but otherwise, swap the destination
1038          * address as mapped in "fixed", recompute any checksums, and send it
1039          * along its merry way (with a ttl decement too) to a VXLAN
1040          * destination.
1041          */
1042         mp = vxlnat_fixed_fixv4(mp, fixed, B_TRUE);
1043         if (mp == NULL)
1044                 return; /* Assume it's been freed & dtraced already. */
1045 
1046         /*
1047          * Otherwise, we're ready to transmit this packet over the vxlan
1048          * socket.
1049          */
1050         fixed->vxnf_remote = vxlnat_xmit_vxlanv4(mp, &fixed->vxnf_addr,
1051             fixed->vxnf_remote, fixed->vxnf_myether, vnet);
1052         if (fixed->vxnf_remote == NULL) {
1053                 /* XXX KEBE ASKS, DTrace probe here?  Or in-function? */
1054                 DTRACE_PROBE2(vxlnat__fixed__xmitdrop,
1055                     in6_addr_t *, &fixed->vxnf_addr,
1056                     uint32_t, VXLAN_ID_NTOH(vnet->vxnv_vnetid));
1057         }
1058         return;
1059 
1060 detach_ire_and_bail:
1061         /* Oh no, something's condemned.  Drop the IRE now. */
1062         ire->ire_recvfn = ire_recv_local_v4;
1063         ire->ire_dep_sib_next = NULL;
1064         VXNF_REFRELE(fixed);
1065         /* Pass the packet back... */
1066         ire_recv_local_v4(ire, mp, iph_arg, ira);
1067         return;
1068 }