1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2018 Joyent, Inc.
  14  */
  15 
  16 /*
 * NAT engine.  The 1-1 (fixed) mappings are kept here.  The rules in
 * vxlnat_rules.c are only consulted if the 1-1 map (kept here) misses or if
 * the outbound lookup (vnetid, protocol, src-IP, dst-IP, src-port, dst-port)
 * misses.
  20  *
  21  * The plan is for inbound to hit conn_ts, whose conn_private points to
  22  * entries here.  The conn_recv* functions live here too (for now).
  23  */
  24 
  25 #include <sys/types.h>
  26 #include <sys/socket.h>
  27 #include <sys/ksynch.h>
  28 #include <sys/ksocket.h>
  29 #include <sys/kmem.h>
  30 #include <sys/stream.h>
  31 #include <sys/strsubr.h>
  32 #include <sys/strsun.h>
  33 #include <sys/sysmacros.h>
  34 #include <sys/debug.h>
  35 #include <sys/dtrace.h>
  36 #include <sys/errno.h>
  37 #include <sys/tihdr.h>
  38 #include <netinet/in.h>
  39 #include <netinet/udp.h>
  40 #include <inet/ip.h>
  41 #include <inet/ip6.h>
  42 #include <inet/udp_impl.h>
  43 #include <inet/tcp.h>
  44 
  45 #include <inet/vxlnat_impl.h>
  46 
  47 static boolean_t vxlnat_vxlan_input(ksocket_t, mblk_t *, size_t, int, void *);
  48 static mblk_t *vxlnat_fixed_fixv4(mblk_t *mp, vxlnat_fixed_t *fixed,
  49     boolean_t to_private);
  50 
  51 /*
  52  * Initialized to NULL, read/write protected by vxlnat_mutex.
  53  * Receive functions shouldn't have to access this directly.
  54  */
  55 ksocket_t vxlnat_underlay;
  56 ire_t *vxlnat_underlay_ire;
  57 
  58 void
  59 vxlnat_closesock(void)
  60 {
  61         ASSERT(MUTEX_HELD(&vxlnat_mutex));
  62         if (vxlnat_underlay_ire != NULL) {
  63                 ire_refrele(vxlnat_underlay_ire);
  64                 vxlnat_underlay_ire = NULL;
  65         }
  66         if (vxlnat_underlay != NULL) {
  67                 (void) ksocket_close(vxlnat_underlay, zone_kcred());
  68                 vxlnat_underlay = NULL;
  69         }
  70 }
  71 
  72 static int
  73 vxlnat_opensock(in6_addr_t *underlay_ip)
  74 {
  75         int rc, val;
  76         /* Assume rest is initialized to 0s. */
  77         struct sockaddr_in6 sin6 = {AF_INET6, BE_16(IPPORT_VXLAN)};
  78         ip_stack_t *ipst = vxlnat_netstack->netstack_ip;
  79 
  80         ASSERT(MUTEX_HELD(&vxlnat_mutex));
  81         /* Open... */
  82         rc = ksocket_socket(&vxlnat_underlay, AF_INET6, SOCK_DGRAM, 0,
  83             KSOCKET_SLEEP, zone_kcred());
  84         if (rc != 0)
  85                 return (rc);
  86 
  87         /* Bind... */
  88         sin6.sin6_addr = *underlay_ip;
  89         rc = ksocket_bind(vxlnat_underlay, (struct sockaddr *)(&sin6),
  90             sizeof (sin6), zone_kcred());
  91         if (rc != 0) {
  92                 vxlnat_closesock();
  93                 return (rc);
  94         }
  95 
  96         /* Use source-port hashing when sending packets out VXLAN... */
  97         val = UDP_HASH_VXLAN;
  98         rc = ksocket_setsockopt(vxlnat_underlay, IPPROTO_UDP,
  99             UDP_SRCPORT_HASH, &val, sizeof (val), kcred);
 100         if (rc != 0) {
 101                 vxlnat_closesock();
 102                 return (rc);
 103         }
 104 
 105         /*
 106          * Grab the IRE for underlay address.
 107          */
 108         ASSERT3P(vxlnat_underlay_ire, ==, NULL);
 109         vxlnat_underlay_ire = (IN6_IS_ADDR_V4MAPPED(underlay_ip)) ?
 110             ire_ftable_lookup_simple_v4(underlay_ip->_S6_un._S6_u32[3],
 111             0, ipst, NULL) :
 112             ire_ftable_lookup_simple_v6(underlay_ip, 0, ipst, NULL);
 113         if (vxlnat_underlay_ire == NULL) {
 114                 DTRACE_PROBE1(vxlnat__opensock__ire__fail, in6_addr_t *,
 115                     underlay_ip);
 116                 vxlnat_closesock();
 117                 return (EADDRNOTAVAIL);
 118         }
 119 
 120         /* Once we return from this, start eating data. */
 121         rc = ksocket_krecv_set(vxlnat_underlay, vxlnat_vxlan_input, NULL);
 122         if (rc != 0) {
 123                 vxlnat_closesock();
 124         }
 125 
 126         return (rc);
 127 }
 128 
 129 /*
 130  * Establish a VXLAN-listening kernel socket.
 131  * XXX KEBE ASKS ==> Support more than one VXLAN address?
 132  */
 133 /* ARGSUSED */
 134 int
 135 vxlnat_vxlan_addr(in6_addr_t *underlay_ip)
 136 {
 137         int rc;
 138 
 139         ASSERT(MUTEX_HELD(&vxlnat_mutex));
 140         /* For now, we make this a one-underlay-address-only solution. */
 141         vxlnat_closesock();
 142         rc = vxlnat_opensock(underlay_ip);
 143         return (rc);
 144 }
 145 
 146 /*
 147  * Free a remote VXLAN destination.
 148  */
 149 void
 150 vxlnat_remote_free(vxlnat_remote_t *remote)
 151 {
 152         ASSERT0(remote->vxnrem_refcount);
 153 
 154         kmem_free(remote, sizeof (*remote));
 155 }
 156 
 157 /*
 158  * Like other unlink functions, assume the appropriate lock is held.
 159  */
 160 void
 161 vxlnat_remote_unlink(vxlnat_remote_t *remote)
 162 {
 163         vxlnat_vnet_t *vnet = remote->vxnrem_vnet;
 164 
 165         ASSERT3P(vnet, !=, NULL);
 166         ASSERT(MUTEX_HELD(&vnet->vxnv_remote_lock));
 167 
 168         /* First unlink so nobody else can find me */
 169         avl_remove(&vnet->vxnv_remotes, remote);
 170 
 171         /*
 172          * We still hold a vnet reference, so races shouldn't be a problem.
 173          * Still, for added safety, NULL it out first.
 174          */
 175         remote->vxnrem_vnet = NULL;  /* Condemn this entry. */
 176         VXNV_REFRELE(vnet);
 177         VXNREM_REFRELE(remote); /* Internment release. */
 178 }
 179 
 180 /*
 181  * Find or create a remote VXLAN destination.
 182  */
 183 static vxlnat_remote_t *
 184 vxlnat_get_remote(vxlnat_vnet_t *vnet, in6_addr_t *remote_addr,
 185     boolean_t create_on_miss)
 186 {
 187         vxlnat_remote_t *remote, searcher;
 188         avl_index_t where;
 189 
 190         searcher.vxnrem_addr = *remote_addr;
 191         mutex_enter(&vnet->vxnv_remote_lock);
 192         remote = avl_find(&vnet->vxnv_remotes, &searcher, &where);
 193         if (remote == NULL && create_on_miss) {
 194                 /* Not as critical if we can't allocate here. */
 195                 remote = kmem_zalloc(sizeof (*remote),
 196                     KM_NOSLEEP | KM_NORMALPRI);
 197                 if (remote != NULL) {
 198                         remote->vxnrem_addr = *remote_addr;
 199                         remote->vxnrem_refcount = 1; /* Internment reference. */
 200                         VXNV_REFHOLD(vnet);
 201                         remote->vxnrem_vnet = vnet;
 202                         /* Rest is filled in by caller. */
 203                         avl_insert(&vnet->vxnv_remotes, remote, where);
 204                 }
 205         }
 206         if (remote != NULL)
 207                 VXNREM_REFHOLD(remote);
 208         mutex_exit(&vnet->vxnv_remote_lock);
 209         return (remote);
 210 }
 211 
 212 /*
 213  * Cache inbound packet information in the vnet's remotes section.
 214  *
 215  * NOTE: This function assumes a trustworthy underlay network.  If the
 216  * underlay isn't trustworthy, this function should be renamed, and reduced to
 217  * a "strip and reality-check the ethernet header" function.
 218  *
 219  * Caller has stripped any pre-ethernet data from mp.  We return mp
 220  * stripped down to its IP header.
 221  */
 222 static mblk_t *
 223 vxlnat_cache_remote(mblk_t *mp, struct sockaddr_in6 *underlay_src,
 224     vxlnat_vnet_t *vnet)
 225 {
 226         struct ether_vlan_header *evh;
 227         struct ether_header *eh;
 228         vxlnat_remote_t *remote;
 229         uint16_t vlan, ethertype;
 230         ether_addr_t remote_ether;
 231         ipha_t *ipha;
 232         ip6_t *ip6h;
 233         in6_addr_t remote_addr;
 234 
 235         /* Assume (for now) we have at least a VLAN header's worth of data. */
 236         if (MBLKL(mp) < sizeof (*evh)) {
 237                 /* XXX KEBE ASKS - should we be more forgiving? */
 238                 DTRACE_PROBE1(vxlnat__in__drop__etherhdr, mblk_t *, mp);
 239                 freemsg(mp);
 240                 return (NULL);
 241         }
 242 
 243         eh = (struct ether_header *)mp->b_rptr;
 244         ethertype = ntohs(eh->ether_type);
 245         ether_copy(&eh->ether_shost, &remote_ether);
 246         if (ethertype == ETHERTYPE_VLAN) {
 247                 evh = (struct ether_vlan_header *)eh;
 248                 /* Keep it in network order... */
 249                 vlan = evh->ether_tci;
 250                 ethertype = ntohs(evh->ether_type);
 251                 ASSERT(vlan != 0);
 252                 mp->b_rptr += sizeof (*evh);
 253         } else {
 254                 evh = NULL;
 255                 vlan = 0;
 256                 mp->b_rptr += sizeof (*eh);
 257         }
 258         if (ethertype != ETHERTYPE_IP && ethertype != ETHERTYPE_IPV6) {
 259                 /*
 260                  * XXX KEBE SAYS for now, don't handle non-IP packets.
 261                  * This includes ARP.
 262                  */
 263                 DTRACE_PROBE1(vxlnat__in__drop__nonip, mblk_t *, mp);
 264                 freemsg(mp);
 265                 return (NULL);
 266         }
 267 
 268         /* Handle case of split ether + IP headers. */
 269         if (MBLKL(mp) < sizeof (ipha_t)) {
 270                 mblk_t *freemp;
 271                 
 272                 if (MBLKL(mp) > 0 || mp->b_cont == NULL) {
 273                         /* The IP header is split ACROSS MBLKS! Bail for now. */
 274                         DTRACE_PROBE1(vxlnat__in__drop__splitip, mblk_t *, mp);
 275                         freemsg(mp);
 276                         return (NULL);
 277                 }
 278                 freemp = mp;
 279                 mp = mp->b_cont;
 280                 freeb(freemp);
 281         }
 282         /* LINTED -- alignment... */
 283         ipha = (ipha_t *)mp->b_rptr;
 284 
 285         if (IPH_HDR_VERSION(ipha) == IPV4_VERSION) {
 286                 if (ethertype != ETHERTYPE_IP) {
 287                         /* XXX KEBE ASKS - should we be more forgiving? */
 288                         DTRACE_PROBE1(vxlnat__in__drop__etherhdr4,
 289                             mblk_t *, mp);
 290                         freemsg(mp);
 291                         return (NULL);
 292                 }
 293                 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
 294                     &remote_addr);
 295         } else {
 296                 if (ethertype != ETHERTYPE_IPV6 ||
 297                     IPH_HDR_VERSION(ipha) != IPV6_VERSION ||
 298                     MBLKL(mp) < sizeof (ip6_t)) {    
 299                         /* XXX KEBE ASKS - should we be more forgiving? */
 300                         DTRACE_PROBE1(vxlnat__in__drop__etherhdr6,
 301                             mblk_t *, mp);
 302                         freemsg(mp);
 303                         return (NULL);
 304                 }
 305                 ip6h = (ip6_t *)ipha;
 306                 remote_addr = ip6h->ip6_src;
 307         }
 308 
 309         /* Find remote and replace OR create new remote. */
 310         remote = vxlnat_get_remote(vnet, &remote_addr, B_TRUE);
 311         if (remote != NULL) {
 312                 /*
 313                  * See if this entry needs fixing or filling-in.  This might
 314                  * get a bit racy with read-only threads that actually
 315                  * transmit, but it only means dropped-packets in the worst
 316                  * case.
 317                  *
 318                  * It's THIS PART that inspires the warning about trusting the
 319                  * underlay network.
 320                  *
 321                  * XXX KEBE ASKS -- should we just replace things w/o checking?
 322                  */
 323                 /* Replace the ethernet address? */
 324                 if (ether_cmp(&remote->vxnrem_ether, &remote_ether) != 0)
 325                         ether_copy(&remote_ether, &remote->vxnrem_ether);
 326                 /*
 327                  * Replace the underlay? NOTE: Fix if/when underlay becomes
 328                  * IPv6.
 329                  */
 330                 if (!IN6_ARE_ADDR_EQUAL(&remote->vxnrem_uaddr,
 331                     &underlay_src->sin6_addr)) {
 332                         remote->vxnrem_uaddr = underlay_src->sin6_addr;
 333                 }
 334                 /* Replace the vlan ID. Maintain network order... */
 335                 if (remote->vxnrem_vlan != vlan)
 336                         remote->vxnrem_vlan = vlan;
 337         }
 338         /*
 339          * Else just continue and pray for better luck on another packet or
 340          * on the return flight.  It is IP, we can Just Drop It (TM)...
 341          */
 342 
 343         /* We're done with the remote entry now. */
 344         VXNREM_REFRELE(remote);
 345 
 346         /* Advance rptr to the inner IP header and proceed. */
 347         mp->b_rptr = (uint8_t *)ipha;
 348         return (mp);
 349 }
 350 
 351 /*
 352  * Process exactly one VXLAN packet.
 353  */
 354 static void
 355 vxlnat_one_vxlan(mblk_t *mp, struct sockaddr_in6 *underlay_src)
 356 {
 357         vxlan_hdr_t *vxh;
 358         vxlnat_vnet_t *vnet;
 359         ipha_t *ipha;
 360         ip6_t *ip6h;
 361         vxlnat_fixed_t *fixed, fsearch;
 362 
 363         if (MBLKL(mp) < sizeof (*vxh)) {
 364                 /* XXX KEBE ASKS -- should we be more forgiving? */
 365                 DTRACE_PROBE1(vxlnat__in__drop__vxlsize, mblk_t *, mp);
 366                 freemsg(mp);
 367                 return;
 368         }
 369         vxh = (vxlan_hdr_t *)mp->b_rptr;
 370 
 371         /* If we start using more than just the one flag, fix it. */
 372         if (vxh->vxlan_flags != VXLAN_F_VDI_WIRE) {
 373                 DTRACE_PROBE1(vxlnat__in__drop__VDI, mblk_t *, mp);
 374                 freemsg(mp);
 375                 return;
 376         }
 377 
 378         /* Remember, we key off of what's on the wire. */
 379         vnet = vxlnat_get_vnet(VXLAN_ID_WIRE32(vxh->vxlan_id), B_FALSE);
 380         if (vnet == NULL) {
 381                 DTRACE_PROBE1(vxlnat__in__drop__vnetid, uint32_t,
 382                     VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)));
 383                 freemsg(mp);
 384                 return;
 385         }
 386 
 387         DTRACE_PROBE2(vxlnat__in__vnet, uint32_t,
 388             VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)),
 389             vxlnat_vnet_t, vnet);
 390 
 391         /*
 392          * Off-vxlan processing steps:
 393          * 1.) Locate the ethernet header and check/update/add-into remotes.
 394          * 2.) Search 1-1s, process if hit.
 395          * 3.) Search flows, process if hit.
 396          * 4.) Search rules, create new flow (or not) if hit.
 397          * 5.) Drop the packets.
 398          */
 399 
 400         /* 1.) Locate the ethernet header and check/update/add-into remotes. */
 401         mp->b_rptr += sizeof (*vxh);
 402         while (MBLKL(mp) == 0) {
 403                 mblk_t *oldmp = mp;
 404 
 405                 mp = mp->b_cont;
 406                 freeb(oldmp);
 407         }
 408         mp = vxlnat_cache_remote(mp, underlay_src, vnet);
 409         if (mp == NULL) {
 410                 VXNV_REFRELE(vnet);
 411                 return;
 412         }
 413 
 414         /* 2.) Search 1-1s, process if hit. */
 415         ipha = (ipha_t *)mp->b_rptr;
 416         if (IPH_HDR_VERSION(ipha) == IPV4_VERSION) {
 417                 ip6h = NULL;
 418                 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
 419                     &fsearch.vxnf_addr);
 420         } else {
 421                 /* vxlnat_cache_remote() did reality checks... */
 422                 ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
 423                 ip6h = (ip6_t *)ipha;
 424                 ipha = NULL;
 425                 fsearch.vxnf_addr = ip6h->ip6_src;
 426         }
 427         rw_enter(&vnet->vxnv_fixed_lock, RW_READER);
 428         fixed = avl_find(&vnet->vxnv_fixed_ips, &fsearch, NULL);
 429         if (fixed != NULL)
 430                 VXNF_REFHOLD(fixed);
 431         rw_exit(&vnet->vxnv_fixed_lock);
 432         if (fixed != NULL) {
 433                 mblk_t *newmp = NULL;
 434 
 435                 /*
 436                  * XXX KEBE ASKS --> Do MTU check NOW?!  That way, we have
 437                  * pre-natted data.  One gotcha, external dests may have
 438                  * different PathMTUs so see below about EMSGSIZE...
 439                  */
 440 
 441                 /* XXX KEBE SAYS -- FILL ME IN... but for now: */
 442                 if (ipha != NULL)
 443                         newmp = vxlnat_fixed_fixv4(mp, fixed, B_FALSE);
 444                 else
 445                         freemsg(mp); /* XXX handle ip6h */
 446 
 447                 if (newmp != NULL) {
 448                         ire_t *outbound_ire;
 449                         /* Use C99's initializers for fun & profit. */
 450                         ip_recv_attr_t iras =
 451                             { IRAF_IS_IPV4 | IRAF_VERIFIED_SRC };
 452 
 453                         ASSERT3P(ipha, !=, NULL);
 454                         ASSERT3P(ipha, ==, newmp->b_rptr);
 455                         /* XXX KEBE ASKS, IRR_ALLOCATE okay?!? */
 456                         outbound_ire = ire_route_recursive_dstonly_v4(
 457                             ipha->ipha_dst, IRR_ALLOCATE,
 458                             0 /* XXX KEBE SAYS XMIT HINT! */,
 459                             vxlnat_netstack->netstack_ip);
 460                         VERIFY3P(outbound_ire, !=, NULL);
 461                         if (outbound_ire->ire_type == IRE_NOROUTE) {
 462                                 /* Bail! */
 463                                 VXNF_REFRELE(fixed);
 464                                 VXNV_REFRELE(vnet);
 465                                 return;
 466                         }
 467 
 468                         iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
 469                         if (iras.ira_ip_hdr_length > sizeof (ipha_t))
 470                                 iras.ira_flags |= IRAF_IPV4_OPTIONS;
 471                         iras.ira_xmit_hint = 0; /* XXX KEBE SAYS FIX ME! */
 472                         iras.ira_zoneid = outbound_ire->ire_zoneid;
 473                         iras.ira_pktlen = ntohs(ipha->ipha_length);
 474                         iras.ira_protocol = ipha->ipha_protocol;
 475                         /* XXX KEBE ASKS rifindex & ruifindex ?!? */
 476                         /*
 477                          * NOTE: AT LEAST ira_ill needs ILLF_ROUTER set, as
 478                          * well as the ill for the external NIC (where
 479                          * off-link destinations live).  For fixed, ira_ill
 480                          * should be the ill of the external source.
 481                          */
 482                         iras.ira_rill = vxlnat_underlay_ire->ire_ill;
 483                         iras.ira_ill = fixed->vxnf_ire->ire_ill;
 484                         /* XXX KEBE ASKS cred & cpid ? */
 485                         iras.ira_verified_src = ipha->ipha_src;
 486                         /* XXX KEBE SAYS don't sweat IPsec stuff. */
 487                         /* XXX KEBE SAYS ALSO don't sweat l2src & mhip */
 488 
 489                         /* Okay, we're good! Let's pretend we're forwarding. */
 490                         ire_recv_forward_v4(outbound_ire, mp, ipha, &iras);
 491                         ire_refrele(outbound_ire);
 492                 }
 493 
 494                 /* All done... */
 495                 VXNF_REFRELE(fixed);
 496                 VXNV_REFRELE(vnet);
 497                 return;
 498         }
 499 
 500         /* XXX KEBE SAYS BUILD STEPS 3-4. */
 501 
 502         /* 5.) Nothing, drop the packet. */
 503         /* XXX KEBE ASKS DIAGNOSTIC? */
 504         VXNV_REFRELE(vnet);
 505         freemsg(mp);
 506 }
 507 /*
 508  * ONLY return B_FALSE if we get a packet-clogging event.
 509  */
 510 /* ARGSUSED */
 511 static boolean_t
 512 vxlnat_vxlan_input(ksocket_t insock, mblk_t *chain, size_t msgsize, int oob,
 513     void *ignored)
 514 {
 515         mblk_t *mp, *nextmp;
 516 
 517         /*
 518          * XXX KEBE ASKS --> move hold & release outside of loop?
 519          * If so, hold rwlock here.
 520          */
 521 
 522         for (mp = chain; mp != NULL; mp = nextmp) {
 523                 struct T_unitdata_ind *tudi;
 524                 struct sockaddr_in6 *sin6;
 525 
 526                 nextmp = mp->b_next;
 527                 if (DB_TYPE(mp) != M_PROTO || mp->b_cont == NULL) {
 528                         DTRACE_PROBE1(vxlnat__in__drop__mblk, mblk_t *, mp);
 529                         freemsg(mp);
 530                         continue;
 531                 }
 532 
 533                 /* LINTED -- aligned */
 534                 tudi = (struct T_unitdata_ind *)mp->b_rptr;
 535                 if (tudi->PRIM_type != T_UNITDATA_IND) {
 536                         DTRACE_PROBE1(vxlnat__in__drop__TPI, mblk_t *, mp);
 537                         freemsg(mp);
 538                         continue;
 539                 }
 540                 /* LINTED -- aligned */
 541                 sin6 = (struct sockaddr_in6 *)(mp->b_rptr + tudi->SRC_offset);
 542                 VERIFY(sin6->sin6_family == AF_INET6);
 543                 VERIFY(tudi->SRC_length >= sizeof (*sin6));
 544 
 545                 vxlnat_one_vxlan(mp->b_cont, sin6);
 546                 freeb(mp);
 547         }
 548 
 549         return (B_TRUE);
 550 }
 551 
 552 /*
 553  * Use RFC 1141's technique (with a check for -0).
 554  *
 555  * newsum = oldsum - (new16a + old16a - new16b + old16b ...);
 556  *
 557  * NOTE: "oldsum" is right off the wire in wire-native order.
 558  * NOTE2: "old" and "new" ALSO point to things in wire-native order.
 559  * NOTE3:  THIS MUST TAKE A MULTIPLE OF 2 BYTES (i.e. uint16_t array).
 560  * NOTE4: The 32-bit running sum means we can't take len > 64k.
 561  */
 562 uint16_t
 563 vxlnat_cksum_adjust(uint16_t oldsum, uint16_t *old, uint16_t *new, uint_t len)
 564 {
 565         uint32_t newsum = ntohs(oldsum);
 566 
 567         ASSERT((len & 0x1) == 0);
 568         while (len != 0) {
 569                 newsum -= ntohs(*new);
 570                 newsum += ntohs(*old);
 571                 len -= 2;
 572                 old++;
 573                 new++;
 574         }
 575         newsum += (newsum >> 16) & 0xffff;
 576 
 577         return (newsum == 0xffff ? 0 : htons(newsum));
 578 }
 579 
 580 /*
 581  * Fix inner headers on an ICMP packet.
 582  *
 583  * XXX KEBE SAYS FOR NOW, just do addresses for 1-1/fixed.  When we do
 584  * flows, include old_port/new_port as well.
 585  */
 586 static mblk_t *
 587 vxlnat_fix_icmp_inner_v4(mblk_t *mp, icmph_t *icmph, ipaddr_t old_one,
 588     ipaddr_t new_one, boolean_t to_private)
 589 {
 590         mblk_t *newmp;
 591         ipha_t *inner_ipha;
 592         ipaddr_t *new_ones_place;
 593 
 594         if ((uint8_t *)(icmph + 1) + sizeof (ipha_t) > mp->b_wptr) {
 595                 /* Pay the pullup tax. */
 596                 newmp = msgpullup(mp, -1);
 597                 freemsg(mp);
 598                 if (newmp == NULL) {
 599                         DTRACE_PROBE1(vxlnat__fixicmp__pullupfail, void *,
 600                             NULL);
 601                         return (NULL);
 602                 }
 603                 if (MBLKL(newmp) < 2 * sizeof (ipha_t) + sizeof (icmph_t)) {
 604                         /* Wow! Too-tiny ICMP packet. */
 605                         DTRACE_PROBE1(vxlnat__fixicmp__tootiny, mblk_t *,
 606                             newmp);
 607                         freeb(newmp);
 608                         return (NULL);
 609                 }
 610                 mp = newmp;
 611                 /* Temporarily use inner_ipha for the outer one. */
 612                 inner_ipha = (ipha_t *)mp->b_rptr;
 613                 icmph = (icmph_t *)(mp->b_rptr + IPH_HDR_LENGTH(inner_ipha));
 614         }
 615         inner_ipha = (ipha_t *)(icmph + 1);
 616         new_ones_place = to_private ?
 617             &inner_ipha->ipha_src : &inner_ipha->ipha_dst;
 618         if (*new_ones_place != old_one) {
 619                 /* Either I'm buggy or the packet is. */
 620                 DTRACE_PROBE2(vxlnat__fixicmp__badinneraddr, ipaddr_t,
 621                     old_one, ipaddr_t, *new_ones_place);
 622                 freeb(mp);
 623                 return (NULL);
 624         }
 625         *new_ones_place = new_one;
 626 
 627         /* Adjust ICMP checksum... */
 628         icmph->icmph_checksum = vxlnat_cksum_adjust(icmph->icmph_checksum,
 629             (uint16_t *)&old_one, (uint16_t *)&new_one, sizeof (ipaddr_t));
 630 
 631         /*
 632          * XXX KEBE ASKS, recompute *inner-packet* checksums?  Let's not for
 633          * now, but consider this Fair Warning (or some other VH album...).
 634          */
 635         return (mp);
 636 }
 637 
 638 /*
 639  * Take a 1-1/fixed IPv4 packet and convert it for transmission out the
 640  * appropriate end. "to_private" is what it says on the tin.
 641  */
 642 static mblk_t *
 643 vxlnat_fixed_fixv4(mblk_t *mp, vxlnat_fixed_t *fixed, boolean_t to_private)
 644 {
 645         ipaddr_t new_one, old_one;
 646         ipaddr_t *new_ones_place;
 647         ipha_t *ipha = (ipha_t *)mp->b_rptr;
 648         uint8_t *nexthdr, *end_wptr;
 649 
 650         if (to_private) {
 651                 IN6_V4MAPPED_TO_IPADDR(&fixed->vxnf_addr, new_one);
 652                 new_ones_place = &ipha->ipha_dst;
 653         } else {
 654                 IN6_V4MAPPED_TO_IPADDR(&fixed->vxnf_pubaddr, new_one);
 655                 new_ones_place = &ipha->ipha_src;
 656         }
 657 
 658         old_one = *new_ones_place;
 659         *new_ones_place = new_one;
 660 
 661         /*
 662          * Recompute the IP header checksum, and check for the TCP or UDP
 663          * checksum as well, as they'll need recomputing as well.
 664          */
 665 
 666         /* First, the IPv4 header itself. */
 667         ipha->ipha_hdr_checksum = vxlnat_cksum_adjust(ipha->ipha_hdr_checksum,
 668             (uint16_t *)&old_one, (uint16_t *)&new_one, sizeof (ipaddr_t));
 669 
 670         nexthdr = (uint8_t *)ipha + IPH_HDR_LENGTH(ipha);
 671         if (nexthdr >= mp->b_wptr) {
 672                 nexthdr = mp->b_cont->b_rptr +
 673                     (MBLKL(mp) - IPH_HDR_LENGTH(ipha));
 674                 end_wptr = mp->b_cont->b_wptr;
 675         } else {
 676                 end_wptr = mp->b_wptr;
 677         }
 678 
 679         switch (ipha->ipha_protocol) {
 680         case IPPROTO_TCP: {
 681                 tcpha_t *tcph = (tcpha_t *)nexthdr;
 682 
 683                 if (nexthdr + sizeof (*tcph) >= end_wptr) {
 684                         /* Bail for now. */
 685                         DTRACE_PROBE1(vxlnat__fix__tcp__mblkspan, mblk_t *,
 686                             mp);
 687                         freemsg(mp);
 688                         return (NULL);
 689                 }
 690                 tcph->tha_sum = vxlnat_cksum_adjust(tcph->tha_sum,
 691                     (uint16_t *)&old_one, (uint16_t *)&new_one,
 692                     sizeof (ipaddr_t));
 693                 break;  /* Out of switch. */
 694         }
 695         case IPPROTO_UDP: {
 696                 udpha_t *udph = (udpha_t *)nexthdr;
 697 
 698                 if (nexthdr + sizeof (*udph) >= end_wptr) {
 699                         /* Bail for now. */
 700                         DTRACE_PROBE1(vxlnat__fix__udp__mblkspan, mblk_t *,
 701                             mp);
 702                         freemsg(mp);
 703                         return (NULL);
 704                 }
 705                 udph->uha_checksum = vxlnat_cksum_adjust(udph->uha_checksum,
 706                     (uint16_t *)&old_one, (uint16_t *)&new_one,
 707                     sizeof (ipaddr_t));
 708                 break;  /* Out of switch. */
 709         }
 710         case IPPROTO_ICMP: {
 711                 icmph_t *icmph = (icmph_t *)nexthdr;
 712 
 713                 /*
 714                  * We need to check the case of ICMP messages that contain
 715                  * IP packets.  We will need to at least change the addresses,
 716                  * and *maybe* the checksums too if necessary.
 717                  *
 718                  * This may replicate some of icmp_inbound_v4(), alas.
 719                  */
 720                 if (nexthdr + sizeof (*icmph) >= end_wptr) {
 721                         mblk_t *newmp;
 722                         /*
 723                          * Unlike the others, we're going to pay the pullup
 724                          * tax here.
 725                          */
 726                         newmp = msgpullup(mp, -1);
 727                         freemsg(mp);
 728                         if (newmp == NULL) {
 729                                 DTRACE_PROBE1(vxlnat__icmp__pullupfail, void *,
 730                                     NULL);
 731                                 return (NULL);
 732                         }
 733                         mp = newmp;
 734                         ipha = (ipha_t *)(mp->b_rptr);
 735                         nexthdr = (uint8_t *)ipha + IPH_HDR_LENGTH(ipha);
 736                         icmph = (icmph_t *)nexthdr;
 737                 }
 738 
 739                 switch (icmph->icmph_type) {
 740                 case ICMP_ADDRESS_MASK_REPLY:
 741                 case ICMP_ADDRESS_MASK_REQUEST:
 742                 case ICMP_TIME_STAMP_REPLY:
 743                 case ICMP_TIME_STAMP_REQUEST:
 744                 case ICMP_ECHO_REQUEST:
 745                 case ICMP_ECHO_REPLY:
 746                         /* These merely need to get passed along. */
 747                         break;
 748                 case ICMP_ROUTER_ADVERTISEMENT:
 749                 case ICMP_ROUTER_SOLICITATION:
 750                         /* These shouldn't be traversing a NAT at all. Drop. */
 751                         DTRACE_PROBE1(vxlnat__icmp__cantpass, int,
 752                             icmph->icmph_type);
 753                         freemsg(mp);
 754                         return (NULL);
 755                 case ICMP_PARAM_PROBLEM:
 756                 case ICMP_TIME_EXCEEDED:
 757                 case ICMP_DEST_UNREACHABLE:
 758                         /* These include inner-IP headers we need to adjust. */
 759                         mp = vxlnat_fix_icmp_inner_v4(mp, icmph, old_one,
 760                             new_one, to_private);
 761                         break;
 762                 default:
 763                         /* Pass along to receiver, but warn. */
 764                         DTRACE_PROBE1(vxlnat__icmp__unknown, int,
 765                             icmph->icmph_type);
 766                         break;
 767                 }
 768         }
 769         /* Otherwise we can't make any other assumptions for now... */
 770         default:
 771                 break;
 772         }
 773 
 774         return (mp);
 775 }
 776 
 777 vxlnat_remote_t *
 778 vxlnat_xmit_vxlanv4(mblk_t *mp, in6_addr_t *overlay_dst,
 779     vxlnat_remote_t *remote, uint8_t *myether, vxlnat_vnet_t *vnet)
 780 {
 781         struct sockaddr_in6 sin6 = {AF_INET6};
 782         struct msghdr msghdr = {NULL};
 783         mblk_t *vlan_mp;
 784         extern uint_t vxlan_alloc_size, vxlan_noalloc_min;
 785         vxlan_hdr_t *vxh;
 786         struct ether_vlan_header *evh;
 787         int rc;
 788         cred_t *cred;
 789 
 790         if (remote == NULL || remote->vxnrem_vnet == NULL) {
 791                 DTRACE_PROBE1(vxlnat__xmit__vxlanv4, vxlnat_remote_t *, remote);
 792                 /* Release the condemned remote. */
 793                 if (remote != NULL)
 794                         VXNREM_REFRELE(remote);
 795 
 796                 /* See if we have a remote ready to use... */
 797                 remote = vxlnat_get_remote(vnet, overlay_dst, B_FALSE);
 798 
 799                 if (remote == NULL) {
 800                         /*
 801                          * We need to do the moral equivalent of PF_KEY
 802                          * ACQUIRE or overlay's queue-resolve so that we can
 803                          * have someone in user-space send me a remote.  Until
 804                          * then, drop the reference if condemned, free the
 805                          * message, and return NULL.
 806                          */
 807 
 808                         freemsg(mp);
 809                         return (NULL);
 810                 }
 811         }
 812         ASSERT(vnet == remote->vxnrem_vnet);
 813 
 814         if (DB_REF(mp) > 1 || mp->b_rptr - vxlan_noalloc_min < DB_BASE(mp)) {
 815                 vlan_mp = allocb(vxlan_alloc_size, BPRI_HI);
 816                 if (vlan_mp == NULL) {
 817                         DTRACE_PROBE1(vxlnat__xmit__vxlanv4__allocfail,
 818                             vxlnat_remote_t *, remote);
 819                         freemsg(mp);
 820                         /* Just drop the packet, but don't tell caller. */
 821                         return (remote);
 822                 }
 823                 vlan_mp->b_wptr = DB_LIM(vlan_mp);
 824                 vlan_mp->b_rptr = vlan_mp->b_wptr;
 825                 vlan_mp->b_cont = mp;
 826         } else {
 827                 vlan_mp = mp;
 828         }
 829         vlan_mp->b_rptr -= sizeof (*vxh) + sizeof (*evh);
 830         vxh = (vxlan_hdr_t *)vlan_mp->b_rptr;
 831         vxh->vxlan_flags = VXLAN_F_VDI_WIRE;
 832         vxh->vxlan_id = vnet->vxnv_vnetid;        /* Already in wire-order. */
 833 
 834         /* Fill in the Ethernet header. */
 835         evh = (struct ether_vlan_header *)(vxh + 1);
 836         ether_copy(&remote->vxnrem_ether, &evh->ether_dhost);
 837         ether_copy(myether, &evh->ether_shost);
 838         evh->ether_tpid = htons(ETHERTYPE_VLAN);
 839         evh->ether_tci = remote->vxnrem_vlan;
 840         evh->ether_type = htons(ETHERTYPE_IP);
 841 
 842         msghdr.msg_name = (struct sockaddr_storage *)&sin6;
 843         msghdr.msg_namelen = sizeof (sin6);
 844         /* Address family and other zeroing already done up top. */
 845         sin6.sin6_port = htons(IPPORT_VXLAN);
 846         sin6.sin6_addr = remote->vxnrem_uaddr;
 847         
 848         /*
 849          * cred_t dance is because we may be getting this straight from
 850          * interrupt context.
 851          */
 852         cred = zone_get_kcred(netstack_get_zoneid(vxlnat_netstack));
 853         if (cred == NULL) {
 854                 DTRACE_PROBE1(vxlnat__xmit__vxlan4__credfail, 
 855                     vxlnat_remote_t *, remote);
 856                 freemsg(vlan_mp);
 857         }
 858         /*
 859          * Use MSG_DONTWAIT to avoid blocks, esp. if we're getting this
 860          * straight from interrupt context.
 861          */
 862         rc = ksocket_sendmblk(vxlnat_underlay, &msghdr, MSG_DONTWAIT, &vlan_mp,
 863             cred);
 864         crfree(cred);
 865         if (rc != 0) {
 866                 DTRACE_PROBE2(vxlnat__xmit__vxlan4__sendfail, int, rc,
 867                     vxlnat_remote_t *, remote);
 868                 freemsg(vlan_mp);
 869         }
 870         return (remote);
 871 }
 872 
 873 /*
 874  * New ire_{recv,send}fn implementations if we're doing 1-1 mappings.
 875  */
 876 int
 877 vxlnat_fixed_ire_send_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
 878     ip_xmit_attr_t *ixa, uint32_t *identp)
 879 {
 880         /* XXX KEBE SAYS FILL ME IN, but for now... */
 881         freemsg(mp);
 882         return (EOPNOTSUPP);
 883 }
 884 
 885 void
 886 vxlnat_fixed_ire_recv_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
 887     ip_recv_attr_t *ira)
 888 {
 889         /* XXX KEBE SAYS FILL ME IN, but for now... */
 890         freemsg(mp);
 891 }
 892 
 893 /*
 894  * I believe the common case for this will be from self-generated ICMP
 895  * messages.  Other same-netstack-originated traffic will also come through
 896  * here (one internal reaching what turns out to be another internal).
 897  */
 898 int
 899 vxlnat_fixed_ire_send_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
 900     ip_xmit_attr_t *ixa, uint32_t *identp)
 901 {
 902         ip_recv_attr_t iras;    /* NOTE: No bzero because we pay more later */
 903         ipha_t *ipha = (ipha_t *)iph_arg;
 904 
 905         /*
 906          * XXX KEBE ASKS, any DTrace probes or other instrumentation that
 907          * perhaps should be set?
 908          */
 909 
 910         /* Map ixa to ira. */
 911         iras.ira_pktlen = ixa->ixa_pktlen;
 912         /* XXX KEBE ASKS more?!? */
 913 
 914         /*
 915          * In normal TCP/IP processing, this shortcuts the IP header checksum
 916          * AND POSSIBLY THE ULP checksum cases.  Since this is likely to head
 917          * back into the internal network, we need to recompute things again.
 918          */
 919         if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
 920                 freemsg(mp);
 921                 return (EMSGSIZE);
 922         }
 923 #if 0
 924         /* XXX KEBE ASKS Special-case ICMP here? */
 925         if (ipha->ipha_protocol == IPPROTO_ICMP) {
 926                 icmph_t *icmph;
 927 
 928                 icmph = (icmph_t *)((uint8_t *)ipha + IPH_HDR_LENGTH(ipha));
 929                 if ((uint8_t *)icmph >= mp->b_wptr) {
 930                         freemsg(mp);
 931                         return (EMSGSIZE);
 932                 }
 933                 icmph->icmph_checksum = 0;
 934                 icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
 935         }
 936 #endif
 937 
 938         vxlnat_fixed_ire_recv_v4(ire, mp, iph_arg, &iras);
 939 
 940         return (0);
 941 }
 942 
 943 void
 944 vxlnat_fixed_ire_recv_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
 945     ip_recv_attr_t *ira)
 946 {
 947         vxlnat_fixed_t *fixed;
 948         vxlnat_vnet_t *vnet;
 949         ipha_t *ipha = (ipha_t *)iph_arg;
 950         int newmtu;
 951 
 952         /* Make a note for DAD that this address is in use */
 953         ire->ire_last_used_time = LBOLT_FASTPATH;
 954 
 955         /* Only target the IRE_LOCAL with the right zoneid. */
 956         ira->ira_zoneid = ire->ire_zoneid;
 957 
 958         /*
 959          * XXX KEBE ASKS, any DTrace probes or other instrumentation that
 960          * perhaps should be set?
 961          */
 962 
 963         /*
 964          * Reality check some things.
 965          */
 966         fixed = (vxlnat_fixed_t *)ire->ire_dep_sib_next;
 967         vnet = fixed->vxnf_vnet;
 968 
 969         ASSERT3P(ire, ==, fixed->vxnf_ire);
 970 
 971         if (IRE_IS_CONDEMNED(ire) || vnet == NULL)
 972                 goto detach_ire_and_bail;
 973 
 974         /*
 975          * Not a common-case, but a possible one.  If our underlay MTU is
 976          * smaller than the external MTU, it is possible that we will have a
 977          * size mismatch and therefore need to either fragment at the VXLAN
 978          * layer (VXLAN UDP packet sent as two or more IP fragments) OR
 979          * if IPH_DF is set, send an ICMP_NEEDS_FRAGMENTATION back to the
 980          * sender.  Perform the check here BEFORE we NAT the packet.
 981          */
 982         ASSERT(vxlnat_underlay_ire->ire_ill != NULL);
 983         newmtu = vxlnat_underlay_ire->ire_ill->ill_mtu - sizeof (ipha_t) -
 984             sizeof (udpha_t) - sizeof (vxlan_hdr_t) -
 985             sizeof (struct ether_vlan_header);
 986         if ((ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_DF) &&
 987             ntohs(ipha->ipha_length) > newmtu) {
 988                 icmp_frag_needed(mp, newmtu, ira);
 989                 /* We're done.  Assume icmp_frag_needed() consumed mp. */
 990                 return;
 991         }
 992 
 993         /*
 994          * So we're here, and since we have a refheld IRE, we have a refheld
 995          * fixed and vnet. Do some of what ip_input_local_v4() does (inbound
 996          * checksum?  some ira checks?), but otherwise, swap the destination
 997          * address as mapped in "fixed", recompute any checksums, and send it
 998          * along its merry way (with a ttl decement too) to a VXLAN
 999          * destination.
1000          */
1001         mp = vxlnat_fixed_fixv4(mp, fixed, B_TRUE);
1002         if (mp == NULL)
1003                 return; /* Assume it's been freed & dtraced already. */
1004 
1005         /*
1006          * Otherwise, we're ready to transmit this packet over the vxlan
1007          * socket.
1008          */
1009         fixed->vxnf_remote = vxlnat_xmit_vxlanv4(mp, &fixed->vxnf_addr,
1010             fixed->vxnf_remote, fixed->vxnf_myether, vnet);
1011         if (fixed->vxnf_remote == NULL) {
1012                 /* XXX KEBE ASKS, DTrace probe here?  Or in-function? */
1013                 DTRACE_PROBE2(vxlnat__fixed__xmitdrop,
1014                     in6_addr_t *, &fixed->vxnf_addr,
1015                     uint32_t, VXLAN_ID_NTOH(vnet->vxnv_vnetid));
1016         }
1017         return;
1018 
1019 detach_ire_and_bail:
1020         /* Oh no, something's condemned.  Drop the IRE now. */
1021         ire->ire_recvfn = ire_recv_local_v4;
1022         ire->ire_dep_sib_next = NULL;
1023         VXNF_REFRELE(fixed);
1024         /* Pass the packet back... */
1025         ire_recv_local_v4(ire, mp, iph_arg, ira);
1026         return;
1027 }