1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2018 Joyent, Inc.
14 */
15
/*
 * NAT engine and its 1-1 (fixed) mappings.  The rules in vxlnat_rules.c are
 * only consulted if the 1-1 map (kept here) misses or if the outbound lookup
 * (vnetid, protocol, src-IP, dst-IP, src-port, dst-port) misses.
 *
 * The plan is for inbound to hit conn_ts, whose conn_private points to
 * entries here.  The conn_recv* functions live here too (for now).
 */
24
25 #include <sys/types.h>
26 #include <sys/socket.h>
27 #include <sys/ksynch.h>
28 #include <sys/ksocket.h>
29 #include <sys/kmem.h>
30 #include <sys/stream.h>
31 #include <sys/strsubr.h>
32 #include <sys/strsun.h>
33 #include <sys/sysmacros.h>
34 #include <sys/debug.h>
35 #include <sys/dtrace.h>
36 #include <sys/errno.h>
37 #include <sys/tihdr.h>
38 #include <netinet/in.h>
39 #include <netinet/udp.h>
40 #include <inet/ip.h>
41 #include <inet/ip6.h>
42 #include <inet/udp_impl.h>
43 #include <inet/tcp.h>
44
45 #include <inet/vxlnat_impl.h>
46
47 static boolean_t vxlnat_vxlan_input(ksocket_t, mblk_t *, size_t, int, void *);
48 static mblk_t *vxlnat_fixed_fixv4(mblk_t *mp, vxlnat_fixed_t *fixed,
49 boolean_t to_private);
50
51 /*
52 * Initialized to NULL, read/write protected by vxlnat_mutex.
53 * Receive functions shouldn't have to access this directly.
54 */
55 ksocket_t vxlnat_underlay;
56 ire_t *vxlnat_underlay_ire;
57
58 void
59 vxlnat_closesock(void)
60 {
61 ASSERT(MUTEX_HELD(&vxlnat_mutex));
62 if (vxlnat_underlay_ire != NULL) {
63 ire_refrele(vxlnat_underlay_ire);
64 vxlnat_underlay_ire = NULL;
65 }
66 if (vxlnat_underlay != NULL) {
67 (void) ksocket_close(vxlnat_underlay, zone_kcred());
68 vxlnat_underlay = NULL;
69 }
70 }
71
72 static int
73 vxlnat_opensock(in6_addr_t *underlay_ip)
74 {
75 int rc, val;
76 /* Assume rest is initialized to 0s. */
77 struct sockaddr_in6 sin6 = {AF_INET6, BE_16(IPPORT_VXLAN)};
78 ip_stack_t *ipst = vxlnat_netstack->netstack_ip;
79
80 ASSERT(MUTEX_HELD(&vxlnat_mutex));
81 /* Open... */
82 rc = ksocket_socket(&vxlnat_underlay, AF_INET6, SOCK_DGRAM, 0,
83 KSOCKET_SLEEP, zone_kcred());
84 if (rc != 0)
85 return (rc);
86
87 /* Bind... */
88 sin6.sin6_addr = *underlay_ip;
89 rc = ksocket_bind(vxlnat_underlay, (struct sockaddr *)(&sin6),
90 sizeof (sin6), zone_kcred());
91 if (rc != 0) {
92 vxlnat_closesock();
93 return (rc);
94 }
95
96 /* Use source-port hashing when sending packets out VXLAN... */
97 val = UDP_HASH_VXLAN;
98 rc = ksocket_setsockopt(vxlnat_underlay, IPPROTO_UDP,
99 UDP_SRCPORT_HASH, &val, sizeof (val), kcred);
100 if (rc != 0) {
101 vxlnat_closesock();
102 return (rc);
103 }
104
105 /*
106 * Grab the IRE for underlay address.
107 */
108 ASSERT3P(vxlnat_underlay_ire, ==, NULL);
109 vxlnat_underlay_ire = (IN6_IS_ADDR_V4MAPPED(underlay_ip)) ?
110 ire_ftable_lookup_simple_v4(underlay_ip->_S6_un._S6_u32[3],
111 0, ipst, NULL) :
112 ire_ftable_lookup_simple_v6(underlay_ip, 0, ipst, NULL);
113 if (vxlnat_underlay_ire == NULL) {
114 DTRACE_PROBE1(vxlnat__opensock__ire__fail, in6_addr_t *,
115 underlay_ip);
116 vxlnat_closesock();
117 return (EADDRNOTAVAIL);
118 }
119
120 /* Once we return from this, start eating data. */
121 rc = ksocket_krecv_set(vxlnat_underlay, vxlnat_vxlan_input, NULL);
122 if (rc != 0) {
123 vxlnat_closesock();
124 }
125
126 return (rc);
127 }
128
129 /*
130 * Establish a VXLAN-listening kernel socket.
131 * XXX KEBE ASKS ==> Support more than one VXLAN address?
132 */
133 /* ARGSUSED */
134 int
135 vxlnat_vxlan_addr(in6_addr_t *underlay_ip)
136 {
137 int rc;
138
139 ASSERT(MUTEX_HELD(&vxlnat_mutex));
140 /* For now, we make this a one-underlay-address-only solution. */
141 vxlnat_closesock();
142 rc = vxlnat_opensock(underlay_ip);
143 return (rc);
144 }
145
146 /*
147 * Free a remote VXLAN destination.
148 */
149 void
150 vxlnat_remote_free(vxlnat_remote_t *remote)
151 {
152 ASSERT0(remote->vxnrem_refcount);
153
154 kmem_free(remote, sizeof (*remote));
155 }
156
157 /*
158 * Like other unlink functions, assume the appropriate lock is held.
159 */
160 void
161 vxlnat_remote_unlink(vxlnat_remote_t *remote)
162 {
163 vxlnat_vnet_t *vnet = remote->vxnrem_vnet;
164
165 ASSERT3P(vnet, !=, NULL);
166 ASSERT(MUTEX_HELD(&vnet->vxnv_remote_lock));
167
168 /* First unlink so nobody else can find me */
169 avl_remove(&vnet->vxnv_remotes, remote);
170
171 /*
172 * We still hold a vnet reference, so races shouldn't be a problem.
173 * Still, for added safety, NULL it out first.
174 */
175 remote->vxnrem_vnet = NULL; /* Condemn this entry. */
176 VXNV_REFRELE(vnet);
177 VXNREM_REFRELE(remote); /* Internment release. */
178 }
179
180 /*
181 * Find or create a remote VXLAN destination.
182 */
183 static vxlnat_remote_t *
184 vxlnat_get_remote(vxlnat_vnet_t *vnet, in6_addr_t *remote_addr,
185 boolean_t create_on_miss)
186 {
187 vxlnat_remote_t *remote, searcher;
188 avl_index_t where;
189
190 searcher.vxnrem_addr = *remote_addr;
191 mutex_enter(&vnet->vxnv_remote_lock);
192 remote = avl_find(&vnet->vxnv_remotes, &searcher, &where);
193 if (remote == NULL && create_on_miss) {
194 /* Not as critical if we can't allocate here. */
195 remote = kmem_zalloc(sizeof (*remote),
196 KM_NOSLEEP | KM_NORMALPRI);
197 if (remote != NULL) {
198 remote->vxnrem_addr = *remote_addr;
199 remote->vxnrem_refcount = 1; /* Internment reference. */
200 VXNV_REFHOLD(vnet);
201 remote->vxnrem_vnet = vnet;
202 /* Rest is filled in by caller. */
203 avl_insert(&vnet->vxnv_remotes, remote, where);
204 }
205 }
206 if (remote != NULL)
207 VXNREM_REFHOLD(remote);
208 mutex_exit(&vnet->vxnv_remote_lock);
209 return (remote);
210 }
211
212 /*
213 * Cache inbound packet information in the vnet's remotes section.
214 *
215 * NOTE: This function assumes a trustworthy underlay network. If the
216 * underlay isn't trustworthy, this function should be renamed, and reduced to
217 * a "strip and reality-check the ethernet header" function.
218 *
219 * Caller has stripped any pre-ethernet data from mp. We return mp
220 * stripped down to its IP header.
221 */
222 static mblk_t *
223 vxlnat_cache_remote(mblk_t *mp, struct sockaddr_in6 *underlay_src,
224 vxlnat_vnet_t *vnet)
225 {
226 struct ether_vlan_header *evh;
227 struct ether_header *eh;
228 vxlnat_remote_t *remote;
229 uint16_t vlan, ethertype;
230 ether_addr_t remote_ether;
231 ipha_t *ipha;
232 ip6_t *ip6h;
233 in6_addr_t remote_addr;
234
235 /* Assume (for now) we have at least a VLAN header's worth of data. */
236 if (MBLKL(mp) < sizeof (*evh)) {
237 /* XXX KEBE ASKS - should we be more forgiving? */
238 DTRACE_PROBE1(vxlnat__in__drop__etherhdr, mblk_t *, mp);
239 freemsg(mp);
240 return (NULL);
241 }
242
243 eh = (struct ether_header *)mp->b_rptr;
244 ethertype = ntohs(eh->ether_type);
245 ether_copy(&eh->ether_shost, &remote_ether);
246 if (ethertype == ETHERTYPE_VLAN) {
247 evh = (struct ether_vlan_header *)eh;
248 /* Keep it in network order... */
249 vlan = evh->ether_tci;
250 ethertype = ntohs(evh->ether_type);
251 ASSERT(vlan != 0);
252 mp->b_rptr += sizeof (*evh);
253 } else {
254 evh = NULL;
255 vlan = 0;
256 mp->b_rptr += sizeof (*eh);
257 }
258 if (ethertype != ETHERTYPE_IP && ethertype != ETHERTYPE_IPV6) {
259 /*
260 * XXX KEBE SAYS for now, don't handle non-IP packets.
261 * This includes ARP.
262 */
263 DTRACE_PROBE1(vxlnat__in__drop__nonip, mblk_t *, mp);
264 freemsg(mp);
265 return (NULL);
266 }
267
268 /* Handle case of split ether + IP headers. */
269 if (MBLKL(mp) < sizeof (ipha_t)) {
270 mblk_t *freemp;
271
272 if (MBLKL(mp) > 0 || mp->b_cont == NULL) {
273 /* The IP header is split ACROSS MBLKS! Bail for now. */
274 DTRACE_PROBE1(vxlnat__in__drop__splitip, mblk_t *, mp);
275 freemsg(mp);
276 return (NULL);
277 }
278 freemp = mp;
279 mp = mp->b_cont;
280 freeb(freemp);
281 }
282 /* LINTED -- alignment... */
283 ipha = (ipha_t *)mp->b_rptr;
284
285 if (IPH_HDR_VERSION(ipha) == IPV4_VERSION) {
286 if (ethertype != ETHERTYPE_IP) {
287 /* XXX KEBE ASKS - should we be more forgiving? */
288 DTRACE_PROBE1(vxlnat__in__drop__etherhdr4,
289 mblk_t *, mp);
290 freemsg(mp);
291 return (NULL);
292 }
293 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
294 &remote_addr);
295 } else {
296 if (ethertype != ETHERTYPE_IPV6 ||
297 IPH_HDR_VERSION(ipha) != IPV6_VERSION ||
298 MBLKL(mp) < sizeof (ip6_t)) {
299 /* XXX KEBE ASKS - should we be more forgiving? */
300 DTRACE_PROBE1(vxlnat__in__drop__etherhdr6,
301 mblk_t *, mp);
302 freemsg(mp);
303 return (NULL);
304 }
305 ip6h = (ip6_t *)ipha;
306 remote_addr = ip6h->ip6_src;
307 }
308
309 /* Find remote and replace OR create new remote. */
310 remote = vxlnat_get_remote(vnet, &remote_addr, B_TRUE);
311 if (remote != NULL) {
312 /*
313 * See if this entry needs fixing or filling-in. This might
314 * get a bit racy with read-only threads that actually
315 * transmit, but it only means dropped-packets in the worst
316 * case.
317 *
318 * It's THIS PART that inspires the warning about trusting the
319 * underlay network.
320 *
321 * XXX KEBE ASKS -- should we just replace things w/o checking?
322 */
323 /* Replace the ethernet address? */
324 if (ether_cmp(&remote->vxnrem_ether, &remote_ether) != 0)
325 ether_copy(&remote_ether, &remote->vxnrem_ether);
326 /*
327 * Replace the underlay? NOTE: Fix if/when underlay becomes
328 * IPv6.
329 */
330 if (!IN6_ARE_ADDR_EQUAL(&remote->vxnrem_uaddr,
331 &underlay_src->sin6_addr)) {
332 remote->vxnrem_uaddr = underlay_src->sin6_addr;
333 }
334 /* Replace the vlan ID. Maintain network order... */
335 if (remote->vxnrem_vlan != vlan)
336 remote->vxnrem_vlan = vlan;
337 }
338 /*
339 * Else just continue and pray for better luck on another packet or
340 * on the return flight. It is IP, we can Just Drop It (TM)...
341 */
342
343 /* We're done with the remote entry now. */
344 VXNREM_REFRELE(remote);
345
346 /* Advance rptr to the inner IP header and proceed. */
347 mp->b_rptr = (uint8_t *)ipha;
348 return (mp);
349 }
350
351 /*
352 * See if the inbound VXLAN packet hits a 1-1/fixed mapping, and process if it
353 * does. B_TRUE means the packet was handled, and we shouldn't continue
354 * processing (even if "was handled" means droppage).
355 */
356 static boolean_t
357 vxlnat_one_vxlan_fixed(vxlnat_vnet_t *vnet, mblk_t *mp, ipha_t *ipha,
358 ip6_t *ip6h)
359 {
360 vxlnat_fixed_t *fixed, fsearch;
361 mblk_t *newmp;
362 ire_t *outbound_ire;
363 /* Use C99's initializers for fun & profit. */
364 ip_recv_attr_t iras = { IRAF_IS_IPV4 | IRAF_VERIFIED_SRC };
365
366 if (ipha != NULL) {
367 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
368 &fsearch.vxnf_addr);
369 } else {
370 /* vxlnat_cache_remote() did reality checks... */
371 ASSERT(ipha == NULL && ip6h != NULL);
372 fsearch.vxnf_addr = ip6h->ip6_src;
373 }
374
375 rw_enter(&vnet->vxnv_fixed_lock, RW_READER);
376 fixed = avl_find(&vnet->vxnv_fixed_ips, &fsearch, NULL);
377 if (fixed != NULL)
378 VXNF_REFHOLD(fixed);
379 rw_exit(&vnet->vxnv_fixed_lock);
380 if (fixed == NULL)
381 return (B_FALSE); /* Try another method of processing. */
382
383 newmp = NULL;
384 /*
385 * XXX KEBE ASKS --> Do an MTU check NOW?! That way, we have
386 * pre-natted data. One gotcha, external dests may have
387 * different PathMTUs so see below about EMSGSIZE...
388 *
389 * For now, let the post-NAT crunch through
390 * ire_recv_forward_v4() take care of all of that.
391 */
392
393 if (ipha != NULL)
394 newmp = vxlnat_fixed_fixv4(mp, fixed, B_FALSE);
395 else {
396 freemsg(mp); /* XXX handle ip6h */
397 return (B_TRUE);
398 }
399
400 if (newmp == NULL)
401 return (B_TRUE); /* mp eaten by vxlnat_fixed_fixv4() */
402
403
404 ASSERT3P(ipha, ==, newmp->b_rptr);
405 /* XXX KEBE ASKS, IRR_ALLOCATE okay?!? */
406 /* XXX KEBE SAYS XMIT HINT! */
407 outbound_ire = ire_route_recursive_dstonly_v4(ipha->ipha_dst,
408 IRR_ALLOCATE, 0, vxlnat_netstack->netstack_ip);
409 VERIFY3P(outbound_ire, !=, NULL);
410 if (outbound_ire->ire_type == IRE_NOROUTE) {
411 /* Bail! */
412 DTRACE_PROBE2(vxlnat__in__drop__fixedire, ipaddr_t,
413 ipha->ipha_dst, mblk_t *, mp);
414 VXNF_REFRELE(fixed);
415 freemsg(mp);
416 return (B_TRUE);
417 }
418
419 iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
420 if (iras.ira_ip_hdr_length > sizeof (ipha_t))
421 iras.ira_flags |= IRAF_IPV4_OPTIONS;
422 iras.ira_xmit_hint = 0; /* XXX KEBE SAYS FIX ME! */
423 iras.ira_zoneid = outbound_ire->ire_zoneid;
424 iras.ira_pktlen = ntohs(ipha->ipha_length);
425 iras.ira_protocol = ipha->ipha_protocol;
426 /* XXX KEBE ASKS rifindex & ruifindex ?!? */
427 /*
428 * NOTE: AT LEAST ira_ill needs ILLF_ROUTER set, as
429 * well as the ill for the external NIC (where
430 * off-link destinations live). For fixed, ira_ill
431 * should be the ill of the external source.
432 */
433 iras.ira_rill = vxlnat_underlay_ire->ire_ill;
434 iras.ira_ill = fixed->vxnf_ire->ire_ill;
435 /* XXX KEBE ASKS cred & cpid ? */
436 iras.ira_verified_src = ipha->ipha_src;
437 /* XXX KEBE SAYS don't sweat IPsec stuff. */
438 /* XXX KEBE SAYS ALSO don't sweat l2src & mhip */
439
440 /* Okay, we're good! Let's pretend we're forwarding. */
441 ire_recv_forward_v4(outbound_ire, mp, ipha, &iras);
442 ire_refrele(outbound_ire);
443
444 return (B_TRUE);
445 }
446
447 /*
448 * Process exactly one VXLAN packet.
449 */
450 static void
451 vxlnat_one_vxlan(mblk_t *mp, struct sockaddr_in6 *underlay_src)
452 {
453 vxlan_hdr_t *vxh;
454 vxlnat_vnet_t *vnet;
455 ipha_t *ipha;
456 ip6_t *ip6h;
457
458 if (MBLKL(mp) < sizeof (*vxh)) {
459 /* XXX KEBE ASKS -- should we be more forgiving? */
460 DTRACE_PROBE1(vxlnat__in__drop__vxlsize, mblk_t *, mp);
461 freemsg(mp);
462 return;
463 }
464 vxh = (vxlan_hdr_t *)mp->b_rptr;
465
466 /* If we start using more than just the one flag, fix it. */
467 if (vxh->vxlan_flags != VXLAN_F_VDI_WIRE) {
468 DTRACE_PROBE1(vxlnat__in__drop__VDI, mblk_t *, mp);
469 freemsg(mp);
470 return;
471 }
472
473 /* Remember, we key off of what's on the wire. */
474 vnet = vxlnat_get_vnet(VXLAN_ID_WIRE32(vxh->vxlan_id), B_FALSE);
475 if (vnet == NULL) {
476 DTRACE_PROBE1(vxlnat__in__drop__vnetid, uint32_t,
477 VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)));
478 freemsg(mp);
479 return;
480 }
481
482 DTRACE_PROBE2(vxlnat__in__vnet, uint32_t,
483 VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)),
484 vxlnat_vnet_t, vnet);
485
486 /*
487 * Off-vxlan processing steps:
488 * 1.) Locate the ethernet header and check/update/add-into remotes.
489 * 2.) Search 1-1s, process if hit.
490 * 3.) Search flows, process if hit.
491 * 4.) Search rules, create new flow (or not) if hit.
492 * 5.) Drop the packets.
493 */
494
495 /* 1.) Locate the ethernet header and check/update/add-into remotes. */
496 mp->b_rptr += sizeof (*vxh);
497 while (MBLKL(mp) == 0) {
498 mblk_t *oldmp = mp;
499
500 mp = mp->b_cont;
501 freeb(oldmp);
502 }
503 mp = vxlnat_cache_remote(mp, underlay_src, vnet);
504 if (mp == NULL)
505 goto bail_no_free;
506
507 /* Let's cache the IP header here... */
508 ipha = (ipha_t *)mp->b_rptr;
509 switch (IPH_HDR_VERSION(ipha)) {
510 case IPV4_VERSION:
511 ip6h = NULL;
512 break;
513 case IPV6_VERSION:
514 ip6h = (ip6_t *)ipha;
515 ipha = NULL;
516 break;
517 default:
518 DTRACE_PROBE2(vxlnat__in__drop__ipvers, int,
519 IPH_HDR_VERSION(ipha), mblk_t *, mp);
520 goto bail_and_free;
521 }
522
523 /* 2.) Search 1-1s, process if hit. */
524 if (vxlnat_one_vxlan_fixed(vnet, mp, ipha, ip6h))
525 goto bail_no_free; /* Success means mp was consumed. */
526
527 #ifdef notyet
528 /* 3.) Search flows, process if hit. */
529 if (vxlnat_one_vxlan_flow(vnet, mp, ipha, ip6h))
530 goto bail_no_free; /* Success means mp was consumed. */
531
532 /* 4.) Search rules, create new flow (or not) if hit. */
533 if (vxlnat_one_vxlan_rule(vnet, mp, ipha, ip6h))
534 goto bail_no_free; /* Success means mp was consumed. */
535 #endif
536
537 /* 5.) Nothing, drop the packet. */
538
539 DTRACE_PROBE2(vxlnat__in___drop__nohits, vxlnat_vnet_t *, vnet,
540 mblk_t *, mp);
541
542 bail_and_free:
543 freemsg(mp);
544 bail_no_free:
545 VXNV_REFRELE(vnet);
546 }
547 /*
548 * ONLY return B_FALSE if we get a packet-clogging event.
549 */
550 /* ARGSUSED */
551 static boolean_t
552 vxlnat_vxlan_input(ksocket_t insock, mblk_t *chain, size_t msgsize, int oob,
553 void *ignored)
554 {
555 mblk_t *mp, *nextmp;
556
557 /*
558 * XXX KEBE ASKS --> move hold & release outside of loop?
559 * If so, hold rwlock here.
560 */
561
562 for (mp = chain; mp != NULL; mp = nextmp) {
563 struct T_unitdata_ind *tudi;
564 struct sockaddr_in6 *sin6;
565
566 nextmp = mp->b_next;
567 if (DB_TYPE(mp) != M_PROTO || mp->b_cont == NULL) {
568 DTRACE_PROBE1(vxlnat__in__drop__mblk, mblk_t *, mp);
569 freemsg(mp);
570 continue;
571 }
572
573 /* LINTED -- aligned */
574 tudi = (struct T_unitdata_ind *)mp->b_rptr;
575 if (tudi->PRIM_type != T_UNITDATA_IND) {
576 DTRACE_PROBE1(vxlnat__in__drop__TPI, mblk_t *, mp);
577 freemsg(mp);
578 continue;
579 }
580 /* LINTED -- aligned */
581 sin6 = (struct sockaddr_in6 *)(mp->b_rptr + tudi->SRC_offset);
582 VERIFY(sin6->sin6_family == AF_INET6);
583 VERIFY(tudi->SRC_length >= sizeof (*sin6));
584
585 vxlnat_one_vxlan(mp->b_cont, sin6);
586 freeb(mp);
587 }
588
589 return (B_TRUE);
590 }
591
/*
 * Incrementally update an Internet checksum after some 16-bit words covered
 * by it have changed (RFC 1141's technique, written in the RFC 1624 form so
 * that nothing is ever subtracted, plus a check for -0):
 *
 *	newsum = oldsum + old16[0] + ~new16[0] + old16[1] + ~new16[1] + ...
 *
 * evaluated in one's-complement arithmetic.
 *
 * oldsum	checksum field as found in the packet
 * old, new	the words before and after the change
 * len		length of old/new in BYTES
 *
 * Returns the updated checksum, ready to be stored back into the packet.
 * A result of 0xffff (-0) is returned as 0.
 *
 * NOTE: "oldsum" is right off the wire in wire-native order.
 * NOTE2: "old" and "new" ALSO point to things in wire-native order.
 * NOTE3: THIS MUST TAKE A MULTIPLE OF 2 BYTES (i.e. uint16_t array).  A
 * trailing odd byte is ignored.
 * NOTE4: The 32-bit running sum means we can't take len > 64k.
 */
uint16_t
vxlnat_cksum_adjust(uint16_t oldsum, uint16_t *old, uint16_t *new, uint_t len)
{
	uint32_t newsum = ntohs(oldsum);

	ASSERT((len & 0x1) == 0);
	for (; len >= 2; len -= 2, old++, new++) {
		/*
		 * Adding the one's complement of the new word is the same
		 * as subtracting it, without ever going below zero.
		 */
		newsum += ntohs(*old);
		newsum += (uint16_t)~ntohs(*new);
	}

	/* Fold until no carry is left; one pass is not always enough. */
	while ((newsum >> 16) != 0)
		newsum = (newsum & 0xffff) + (newsum >> 16);

	return (newsum == 0xffff ? 0 : htons((uint16_t)newsum));
}
619
620 /*
621 * Fix inner headers on an ICMP packet.
622 *
623 * XXX KEBE SAYS FOR NOW, just do addresses for 1-1/fixed. When we do
624 * flows, include old_port/new_port as well.
625 */
626 static mblk_t *
627 vxlnat_fix_icmp_inner_v4(mblk_t *mp, icmph_t *icmph, ipaddr_t old_one,
628 ipaddr_t new_one, boolean_t to_private)
629 {
630 mblk_t *newmp;
631 ipha_t *inner_ipha;
632 ipaddr_t *new_ones_place;
633
634 if ((uint8_t *)(icmph + 1) + sizeof (ipha_t) > mp->b_wptr) {
635 /* Pay the pullup tax. */
636 newmp = msgpullup(mp, -1);
637 freemsg(mp);
638 if (newmp == NULL) {
639 DTRACE_PROBE1(vxlnat__fixicmp__pullupfail, void *,
640 NULL);
641 return (NULL);
642 }
643 if (MBLKL(newmp) < 2 * sizeof (ipha_t) + sizeof (icmph_t)) {
644 /* Wow! Too-tiny ICMP packet. */
645 DTRACE_PROBE1(vxlnat__fixicmp__tootiny, mblk_t *,
646 newmp);
647 freeb(newmp);
648 return (NULL);
649 }
650 mp = newmp;
651 /* Temporarily use inner_ipha for the outer one. */
652 inner_ipha = (ipha_t *)mp->b_rptr;
653 icmph = (icmph_t *)(mp->b_rptr + IPH_HDR_LENGTH(inner_ipha));
654 }
655 inner_ipha = (ipha_t *)(icmph + 1);
656 new_ones_place = to_private ?
657 &inner_ipha->ipha_src : &inner_ipha->ipha_dst;
658 if (*new_ones_place != old_one) {
659 /* Either I'm buggy or the packet is. */
660 DTRACE_PROBE2(vxlnat__fixicmp__badinneraddr, ipaddr_t,
661 old_one, ipaddr_t, *new_ones_place);
662 freeb(mp);
663 return (NULL);
664 }
665 *new_ones_place = new_one;
666
667 /* Adjust ICMP checksum... */
668 icmph->icmph_checksum = vxlnat_cksum_adjust(icmph->icmph_checksum,
669 (uint16_t *)&old_one, (uint16_t *)&new_one, sizeof (ipaddr_t));
670
671 /*
672 * XXX KEBE ASKS, recompute *inner-packet* checksums? Let's not for
673 * now, but consider this Fair Warning (or some other VH album...).
674 */
675 return (mp);
676 }
677
678 /*
679 * Take a 1-1/fixed IPv4 packet and convert it for transmission out the
680 * appropriate end. "to_private" is what it says on the tin.
681 * ALWAYS consumes "mp", regardless of return value.
682 */
683 static mblk_t *
684 vxlnat_fixed_fixv4(mblk_t *mp, vxlnat_fixed_t *fixed, boolean_t to_private)
685 {
686 ipaddr_t new_one, old_one;
687 ipaddr_t *new_ones_place;
688 ipha_t *ipha = (ipha_t *)mp->b_rptr;
689 uint8_t *nexthdr, *end_wptr;
690
691 if (to_private) {
692 IN6_V4MAPPED_TO_IPADDR(&fixed->vxnf_addr, new_one);
693 new_ones_place = &ipha->ipha_dst;
694 } else {
695 IN6_V4MAPPED_TO_IPADDR(&fixed->vxnf_pubaddr, new_one);
696 new_ones_place = &ipha->ipha_src;
697 }
698
699 old_one = *new_ones_place;
700 *new_ones_place = new_one;
701
702 /*
703 * Recompute the IP header checksum, and check for the TCP or UDP
704 * checksum as well, as they'll need recomputing as well.
705 */
706
707 /* First, the IPv4 header itself. */
708 ipha->ipha_hdr_checksum = vxlnat_cksum_adjust(ipha->ipha_hdr_checksum,
709 (uint16_t *)&old_one, (uint16_t *)&new_one, sizeof (ipaddr_t));
710
711 nexthdr = (uint8_t *)ipha + IPH_HDR_LENGTH(ipha);
712 if (nexthdr >= mp->b_wptr) {
713 nexthdr = mp->b_cont->b_rptr +
714 (MBLKL(mp) - IPH_HDR_LENGTH(ipha));
715 end_wptr = mp->b_cont->b_wptr;
716 } else {
717 end_wptr = mp->b_wptr;
718 }
719
720 switch (ipha->ipha_protocol) {
721 case IPPROTO_TCP: {
722 tcpha_t *tcph = (tcpha_t *)nexthdr;
723
724 if (nexthdr + sizeof (*tcph) >= end_wptr) {
725 /* Bail for now. */
726 DTRACE_PROBE1(vxlnat__fix__tcp__mblkspan, mblk_t *,
727 mp);
728 freemsg(mp);
729 return (NULL);
730 }
731 tcph->tha_sum = vxlnat_cksum_adjust(tcph->tha_sum,
732 (uint16_t *)&old_one, (uint16_t *)&new_one,
733 sizeof (ipaddr_t));
734 break; /* Out of switch. */
735 }
736 case IPPROTO_UDP: {
737 udpha_t *udph = (udpha_t *)nexthdr;
738
739 if (nexthdr + sizeof (*udph) >= end_wptr) {
740 /* Bail for now. */
741 DTRACE_PROBE1(vxlnat__fix__udp__mblkspan, mblk_t *,
742 mp);
743 freemsg(mp);
744 return (NULL);
745 }
746 udph->uha_checksum = vxlnat_cksum_adjust(udph->uha_checksum,
747 (uint16_t *)&old_one, (uint16_t *)&new_one,
748 sizeof (ipaddr_t));
749 break; /* Out of switch. */
750 }
751 case IPPROTO_ICMP: {
752 icmph_t *icmph = (icmph_t *)nexthdr;
753
754 /*
755 * We need to check the case of ICMP messages that contain
756 * IP packets. We will need to at least change the addresses,
757 * and *maybe* the checksums too if necessary.
758 *
759 * This may replicate some of icmp_inbound_v4(), alas.
760 */
761 if (nexthdr + sizeof (*icmph) >= end_wptr) {
762 mblk_t *newmp;
763 /*
764 * Unlike the others, we're going to pay the pullup
765 * tax here.
766 */
767 newmp = msgpullup(mp, -1);
768 freemsg(mp);
769 if (newmp == NULL) {
770 DTRACE_PROBE1(vxlnat__icmp__pullupfail, void *,
771 NULL);
772 return (NULL);
773 }
774 mp = newmp;
775 ipha = (ipha_t *)(mp->b_rptr);
776 nexthdr = (uint8_t *)ipha + IPH_HDR_LENGTH(ipha);
777 icmph = (icmph_t *)nexthdr;
778 }
779
780 switch (icmph->icmph_type) {
781 case ICMP_ADDRESS_MASK_REPLY:
782 case ICMP_ADDRESS_MASK_REQUEST:
783 case ICMP_TIME_STAMP_REPLY:
784 case ICMP_TIME_STAMP_REQUEST:
785 case ICMP_ECHO_REQUEST:
786 case ICMP_ECHO_REPLY:
787 /* These merely need to get passed along. */
788 break;
789 case ICMP_ROUTER_ADVERTISEMENT:
790 case ICMP_ROUTER_SOLICITATION:
791 /* These shouldn't be traversing a NAT at all. Drop. */
792 DTRACE_PROBE1(vxlnat__icmp__cantpass, int,
793 icmph->icmph_type);
794 freemsg(mp);
795 return (NULL);
796 case ICMP_PARAM_PROBLEM:
797 case ICMP_TIME_EXCEEDED:
798 case ICMP_DEST_UNREACHABLE:
799 /* These include inner-IP headers we need to adjust. */
800 mp = vxlnat_fix_icmp_inner_v4(mp, icmph, old_one,
801 new_one, to_private);
802 break;
803 default:
804 /* Pass along to receiver, but warn. */
805 DTRACE_PROBE1(vxlnat__icmp__unknown, int,
806 icmph->icmph_type);
807 break;
808 }
809 }
810 /* Otherwise we can't make any other assumptions for now... */
811 default:
812 break;
813 }
814
815 return (mp);
816 }
817
818 vxlnat_remote_t *
819 vxlnat_xmit_vxlanv4(mblk_t *mp, in6_addr_t *overlay_dst,
820 vxlnat_remote_t *remote, uint8_t *myether, vxlnat_vnet_t *vnet)
821 {
822 struct sockaddr_in6 sin6 = {AF_INET6};
823 struct msghdr msghdr = {NULL};
824 mblk_t *vlan_mp;
825 extern uint_t vxlan_alloc_size, vxlan_noalloc_min;
826 vxlan_hdr_t *vxh;
827 struct ether_vlan_header *evh;
828 int rc;
829 cred_t *cred;
830
831 if (remote == NULL || remote->vxnrem_vnet == NULL) {
832 DTRACE_PROBE1(vxlnat__xmit__vxlanv4, vxlnat_remote_t *, remote);
833 /* Release the condemned remote. */
834 if (remote != NULL)
835 VXNREM_REFRELE(remote);
836
837 /* See if we have a remote ready to use... */
838 remote = vxlnat_get_remote(vnet, overlay_dst, B_FALSE);
839
840 if (remote == NULL) {
841 /*
842 * We need to do the moral equivalent of PF_KEY
843 * ACQUIRE or overlay's queue-resolve so that we can
844 * have someone in user-space send me a remote. Until
845 * then, drop the reference if condemned, free the
846 * message, and return NULL.
847 */
848
849 freemsg(mp);
850 return (NULL);
851 }
852 }
853 ASSERT(vnet == remote->vxnrem_vnet);
854
855 if (DB_REF(mp) > 1 || mp->b_rptr - vxlan_noalloc_min < DB_BASE(mp)) {
856 vlan_mp = allocb(vxlan_alloc_size, BPRI_HI);
857 if (vlan_mp == NULL) {
858 DTRACE_PROBE1(vxlnat__xmit__vxlanv4__allocfail,
859 vxlnat_remote_t *, remote);
860 freemsg(mp);
861 /* Just drop the packet, but don't tell caller. */
862 return (remote);
863 }
864 vlan_mp->b_wptr = DB_LIM(vlan_mp);
865 vlan_mp->b_rptr = vlan_mp->b_wptr;
866 vlan_mp->b_cont = mp;
867 } else {
868 vlan_mp = mp;
869 }
870 vlan_mp->b_rptr -= sizeof (*vxh) + sizeof (*evh);
871 vxh = (vxlan_hdr_t *)vlan_mp->b_rptr;
872 vxh->vxlan_flags = VXLAN_F_VDI_WIRE;
873 vxh->vxlan_id = vnet->vxnv_vnetid; /* Already in wire-order. */
874
875 /* Fill in the Ethernet header. */
876 evh = (struct ether_vlan_header *)(vxh + 1);
877 ether_copy(&remote->vxnrem_ether, &evh->ether_dhost);
878 ether_copy(myether, &evh->ether_shost);
879 evh->ether_tpid = htons(ETHERTYPE_VLAN);
880 evh->ether_tci = remote->vxnrem_vlan;
881 evh->ether_type = htons(ETHERTYPE_IP);
882
883 msghdr.msg_name = (struct sockaddr_storage *)&sin6;
884 msghdr.msg_namelen = sizeof (sin6);
885 /* Address family and other zeroing already done up top. */
886 sin6.sin6_port = htons(IPPORT_VXLAN);
887 sin6.sin6_addr = remote->vxnrem_uaddr;
888
889 /*
890 * cred_t dance is because we may be getting this straight from
891 * interrupt context.
892 */
893 cred = zone_get_kcred(netstack_get_zoneid(vxlnat_netstack));
894 if (cred == NULL) {
895 DTRACE_PROBE1(vxlnat__xmit__vxlan4__credfail,
896 vxlnat_remote_t *, remote);
897 freemsg(vlan_mp);
898 }
899 /*
900 * Use MSG_DONTWAIT to avoid blocks, esp. if we're getting this
901 * straight from interrupt context.
902 */
903 rc = ksocket_sendmblk(vxlnat_underlay, &msghdr, MSG_DONTWAIT, &vlan_mp,
904 cred);
905 crfree(cred);
906 if (rc != 0) {
907 DTRACE_PROBE2(vxlnat__xmit__vxlan4__sendfail, int, rc,
908 vxlnat_remote_t *, remote);
909 freemsg(vlan_mp);
910 }
911 return (remote);
912 }
913
914 /*
915 * New ire_{recv,send}fn implementations if we're doing 1-1 mappings.
916 */
917 int
918 vxlnat_fixed_ire_send_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
919 ip_xmit_attr_t *ixa, uint32_t *identp)
920 {
921 /* XXX KEBE SAYS FILL ME IN, but for now... */
922 freemsg(mp);
923 return (EOPNOTSUPP);
924 }
925
926 void
927 vxlnat_fixed_ire_recv_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
928 ip_recv_attr_t *ira)
929 {
930 /* XXX KEBE SAYS FILL ME IN, but for now... */
931 freemsg(mp);
932 }
933
934 /*
935 * I believe the common case for this will be from self-generated ICMP
936 * messages. Other same-netstack-originated traffic will also come through
937 * here (one internal reaching what turns out to be another internal).
938 */
939 int
940 vxlnat_fixed_ire_send_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
941 ip_xmit_attr_t *ixa, uint32_t *identp)
942 {
943 ip_recv_attr_t iras; /* NOTE: No bzero because we pay more later */
944 ipha_t *ipha = (ipha_t *)iph_arg;
945
946 /*
947 * XXX KEBE ASKS, any DTrace probes or other instrumentation that
948 * perhaps should be set?
949 */
950
951 /* Map ixa to ira. */
952 iras.ira_pktlen = ixa->ixa_pktlen;
953 /* XXX KEBE ASKS more?!? */
954
955 /*
956 * In normal TCP/IP processing, this shortcuts the IP header checksum
957 * AND POSSIBLY THE ULP checksum cases. Since this is likely to head
958 * back into the internal network, we need to recompute things again.
959 */
960 if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
961 freemsg(mp);
962 return (EMSGSIZE);
963 }
964 #if 0
965 /* XXX KEBE ASKS Special-case ICMP here? */
966 if (ipha->ipha_protocol == IPPROTO_ICMP) {
967 icmph_t *icmph;
968
969 icmph = (icmph_t *)((uint8_t *)ipha + IPH_HDR_LENGTH(ipha));
970 if ((uint8_t *)icmph >= mp->b_wptr) {
971 freemsg(mp);
972 return (EMSGSIZE);
973 }
974 icmph->icmph_checksum = 0;
975 icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
976 }
977 #endif
978
979 vxlnat_fixed_ire_recv_v4(ire, mp, iph_arg, &iras);
980
981 return (0);
982 }
983
984 void
985 vxlnat_fixed_ire_recv_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
986 ip_recv_attr_t *ira)
987 {
988 vxlnat_fixed_t *fixed;
989 vxlnat_vnet_t *vnet;
990 ipha_t *ipha = (ipha_t *)iph_arg;
991 int newmtu;
992
993 /* Make a note for DAD that this address is in use */
994 ire->ire_last_used_time = LBOLT_FASTPATH;
995
996 /* Only target the IRE_LOCAL with the right zoneid. */
997 ira->ira_zoneid = ire->ire_zoneid;
998
999 /*
1000 * XXX KEBE ASKS, any DTrace probes or other instrumentation that
1001 * perhaps should be set?
1002 */
1003
1004 /*
1005 * Reality check some things.
1006 */
1007 fixed = (vxlnat_fixed_t *)ire->ire_dep_sib_next;
1008 vnet = fixed->vxnf_vnet;
1009
1010 ASSERT3P(ire, ==, fixed->vxnf_ire);
1011
1012 if (IRE_IS_CONDEMNED(ire) || vnet == NULL)
1013 goto detach_ire_and_bail;
1014
1015 /*
1016 * Not a common-case, but a possible one. If our underlay MTU is
1017 * smaller than the external MTU, it is possible that we will have a
1018 * size mismatch and therefore need to either fragment at the VXLAN
1019 * layer (VXLAN UDP packet sent as two or more IP fragments) OR
1020 * if IPH_DF is set, send an ICMP_NEEDS_FRAGMENTATION back to the
1021 * sender. Perform the check here BEFORE we NAT the packet.
1022 */
1023 ASSERT(vxlnat_underlay_ire->ire_ill != NULL);
1024 newmtu = vxlnat_underlay_ire->ire_ill->ill_mtu - sizeof (ipha_t) -
1025 sizeof (udpha_t) - sizeof (vxlan_hdr_t) -
1026 sizeof (struct ether_vlan_header);
1027 if ((ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_DF) &&
1028 ntohs(ipha->ipha_length) > newmtu) {
1029 icmp_frag_needed(mp, newmtu, ira);
1030 /* We're done. Assume icmp_frag_needed() consumed mp. */
1031 return;
1032 }
1033
1034 /*
1035 * So we're here, and since we have a refheld IRE, we have a refheld
1036 * fixed and vnet. Do some of what ip_input_local_v4() does (inbound
1037 * checksum? some ira checks?), but otherwise, swap the destination
1038 * address as mapped in "fixed", recompute any checksums, and send it
1039 * along its merry way (with a ttl decement too) to a VXLAN
1040 * destination.
1041 */
1042 mp = vxlnat_fixed_fixv4(mp, fixed, B_TRUE);
1043 if (mp == NULL)
1044 return; /* Assume it's been freed & dtraced already. */
1045
1046 /*
1047 * Otherwise, we're ready to transmit this packet over the vxlan
1048 * socket.
1049 */
1050 fixed->vxnf_remote = vxlnat_xmit_vxlanv4(mp, &fixed->vxnf_addr,
1051 fixed->vxnf_remote, fixed->vxnf_myether, vnet);
1052 if (fixed->vxnf_remote == NULL) {
1053 /* XXX KEBE ASKS, DTrace probe here? Or in-function? */
1054 DTRACE_PROBE2(vxlnat__fixed__xmitdrop,
1055 in6_addr_t *, &fixed->vxnf_addr,
1056 uint32_t, VXLAN_ID_NTOH(vnet->vxnv_vnetid));
1057 }
1058 return;
1059
1060 detach_ire_and_bail:
1061 /* Oh no, something's condemned. Drop the IRE now. */
1062 ire->ire_recvfn = ire_recv_local_v4;
1063 ire->ire_dep_sib_next = NULL;
1064 VXNF_REFRELE(fixed);
1065 /* Pass the packet back... */
1066 ire_recv_local_v4(ire, mp, iph_arg, ira);
1067 return;
1068 }