1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2018 Joyent, Inc.
14 */
15
16 /*
 * NAT engine.  The 1-1 (fixed) mappings are kept here.  The rules in
 * vxlnat_rules.c are only consulted if the 1-1 map (kept here) misses or if
 * the outbound lookup (vnetid, protocol, src-IP, dst-IP, src-port, dst-port)
 * misses.
20 *
21 * The plan is for inbound to hit conn_ts, whose conn_private points to
22 * entries here. The conn_recv* functions live here too (for now).
23 */
24
25 #include <sys/types.h>
26 #include <sys/socket.h>
27 #include <sys/ksynch.h>
28 #include <sys/ksocket.h>
29 #include <sys/kmem.h>
30 #include <sys/stream.h>
31 #include <sys/strsubr.h>
32 #include <sys/strsun.h>
33 #include <sys/sysmacros.h>
34 #include <sys/debug.h>
35 #include <sys/dtrace.h>
36 #include <sys/errno.h>
37 #include <sys/tihdr.h>
38 #include <netinet/in.h>
39 #include <netinet/udp.h>
40 #include <inet/ip.h>
41 #include <inet/ip6.h>
42 #include <inet/udp_impl.h>
43 #include <inet/tcp.h>
44
45 #include <inet/vxlnat_impl.h>
46
47 static boolean_t vxlnat_vxlan_input(ksocket_t, mblk_t *, size_t, int, void *);
48 static mblk_t *vxlnat_fixed_fixv4(mblk_t *mp, vxlnat_fixed_t *fixed,
49 boolean_t to_private);
50
51 /*
 * The underlay VXLAN socket and the IRE for the address it is bound to.
 * Both are initialized to NULL, and are set and cleared with vxlnat_mutex
 * held.  Receive functions shouldn't have to access the socket directly.
54 */
55 ksocket_t vxlnat_underlay;
56 ire_t *vxlnat_underlay_ire;
57
58 void
59 vxlnat_closesock(void)
60 {
61 ASSERT(MUTEX_HELD(&vxlnat_mutex));
62 if (vxlnat_underlay_ire != NULL) {
63 ire_refrele(vxlnat_underlay_ire);
64 vxlnat_underlay_ire = NULL;
65 }
66 if (vxlnat_underlay != NULL) {
67 (void) ksocket_close(vxlnat_underlay, zone_kcred());
68 vxlnat_underlay = NULL;
69 }
70 }
71
72 static int
73 vxlnat_opensock(in6_addr_t *underlay_ip)
74 {
75 int rc, val;
76 /* Assume rest is initialized to 0s. */
77 struct sockaddr_in6 sin6 = {AF_INET6, BE_16(IPPORT_VXLAN)};
78 ip_stack_t *ipst = vxlnat_netstack->netstack_ip;
79
80 ASSERT(MUTEX_HELD(&vxlnat_mutex));
81 /* Open... */
82 rc = ksocket_socket(&vxlnat_underlay, AF_INET6, SOCK_DGRAM, 0,
83 KSOCKET_SLEEP, zone_kcred());
84 if (rc != 0)
85 return (rc);
86
87 /* Bind... */
88 sin6.sin6_addr = *underlay_ip;
89 rc = ksocket_bind(vxlnat_underlay, (struct sockaddr *)(&sin6),
90 sizeof (sin6), zone_kcred());
91 if (rc != 0) {
92 vxlnat_closesock();
93 return (rc);
94 }
95
96 /* Use source-port hashing when sending packets out VXLAN... */
97 val = UDP_HASH_VXLAN;
98 rc = ksocket_setsockopt(vxlnat_underlay, IPPROTO_UDP,
99 UDP_SRCPORT_HASH, &val, sizeof (val), kcred);
100 if (rc != 0) {
101 vxlnat_closesock();
102 return (rc);
103 }
104
105 /*
106 * Grab the IRE for underlay address.
107 */
108 ASSERT3P(vxlnat_underlay_ire, ==, NULL);
109 vxlnat_underlay_ire = (IN6_IS_ADDR_V4MAPPED(underlay_ip)) ?
110 ire_ftable_lookup_simple_v4(underlay_ip->_S6_un._S6_u32[3],
111 0, ipst, NULL) :
112 ire_ftable_lookup_simple_v6(underlay_ip, 0, ipst, NULL);
113 if (vxlnat_underlay_ire == NULL) {
114 DTRACE_PROBE1(vxlnat__opensock__ire__fail, in6_addr_t *,
115 underlay_ip);
116 vxlnat_closesock();
117 return (EADDRNOTAVAIL);
118 }
119
120 /* Once we return from this, start eating data. */
121 rc = ksocket_krecv_set(vxlnat_underlay, vxlnat_vxlan_input, NULL);
122 if (rc != 0) {
123 vxlnat_closesock();
124 }
125
126 return (rc);
127 }
128
129 /*
130 * Establish a VXLAN-listening kernel socket.
131 * XXX KEBE ASKS ==> Support more than one VXLAN address?
132 */
133 /* ARGSUSED */
134 int
135 vxlnat_vxlan_addr(in6_addr_t *underlay_ip)
136 {
137 int rc;
138
139 ASSERT(MUTEX_HELD(&vxlnat_mutex));
140 /* For now, we make this a one-underlay-address-only solution. */
141 vxlnat_closesock();
142 rc = vxlnat_opensock(underlay_ip);
143 return (rc);
144 }
145
146 /*
147 * Free a remote VXLAN destination.
148 */
149 void
150 vxlnat_remote_free(vxlnat_remote_t *remote)
151 {
152 ASSERT0(remote->vxnrem_refcount);
153
154 kmem_free(remote, sizeof (*remote));
155 }
156
157 /*
158 * Like other unlink functions, assume the appropriate lock is held.
159 */
160 void
161 vxlnat_remote_unlink(vxlnat_remote_t *remote)
162 {
163 vxlnat_vnet_t *vnet = remote->vxnrem_vnet;
164
165 ASSERT3P(vnet, !=, NULL);
166 ASSERT(MUTEX_HELD(&vnet->vxnv_remote_lock));
167
168 /* First unlink so nobody else can find me */
169 avl_remove(&vnet->vxnv_remotes, remote);
170
171 /*
172 * We still hold a vnet reference, so races shouldn't be a problem.
173 * Still, for added safety, NULL it out first.
174 */
175 remote->vxnrem_vnet = NULL; /* Condemn this entry. */
176 VXNV_REFRELE(vnet);
177 VXNREM_REFRELE(remote); /* Internment release. */
178 }
179
180 /*
181 * Find or create a remote VXLAN destination.
182 */
183 static vxlnat_remote_t *
184 vxlnat_get_remote(vxlnat_vnet_t *vnet, in6_addr_t *remote_addr,
185 boolean_t create_on_miss)
186 {
187 vxlnat_remote_t *remote, searcher;
188 avl_index_t where;
189
190 searcher.vxnrem_addr = *remote_addr;
191 mutex_enter(&vnet->vxnv_remote_lock);
192 remote = avl_find(&vnet->vxnv_remotes, &searcher, &where);
193 if (remote == NULL && create_on_miss) {
194 /* Not as critical if we can't allocate here. */
195 remote = kmem_zalloc(sizeof (*remote),
196 KM_NOSLEEP | KM_NORMALPRI);
197 if (remote != NULL) {
198 remote->vxnrem_addr = *remote_addr;
199 remote->vxnrem_refcount = 1; /* Internment reference. */
200 VXNV_REFHOLD(vnet);
201 remote->vxnrem_vnet = vnet;
202 /* Rest is filled in by caller. */
203 avl_insert(&vnet->vxnv_remotes, remote, where);
204 }
205 }
206 if (remote != NULL)
207 VXNREM_REFHOLD(remote);
208 mutex_exit(&vnet->vxnv_remote_lock);
209 return (remote);
210 }
211
212 /*
213 * Cache inbound packet information in the vnet's remotes section.
214 *
215 * NOTE: This function assumes a trustworthy underlay network. If the
216 * underlay isn't trustworthy, this function should be renamed, and reduced to
217 * a "strip and reality-check the ethernet header" function.
218 *
219 * Caller has stripped any pre-ethernet data from mp. We return mp
220 * stripped down to its IP header.
221 */
222 static mblk_t *
223 vxlnat_cache_remote(mblk_t *mp, struct sockaddr_in6 *underlay_src,
224 vxlnat_vnet_t *vnet)
225 {
226 struct ether_vlan_header *evh;
227 struct ether_header *eh;
228 vxlnat_remote_t *remote;
229 uint16_t vlan, ethertype;
230 ether_addr_t remote_ether;
231 ipha_t *ipha;
232 ip6_t *ip6h;
233 in6_addr_t remote_addr;
234
235 /* Assume (for now) we have at least a VLAN header's worth of data. */
236 if (MBLKL(mp) < sizeof (*evh)) {
237 /* XXX KEBE ASKS - should we be more forgiving? */
238 DTRACE_PROBE1(vxlnat__in__drop__etherhdr, mblk_t *, mp);
239 freemsg(mp);
240 return (NULL);
241 }
242
243 eh = (struct ether_header *)mp->b_rptr;
244 ethertype = ntohs(eh->ether_type);
245 ether_copy(&eh->ether_shost, &remote_ether);
246 if (ethertype == ETHERTYPE_VLAN) {
247 evh = (struct ether_vlan_header *)eh;
248 /* Keep it in network order... */
249 vlan = evh->ether_tci;
250 ethertype = ntohs(evh->ether_type);
251 ASSERT(vlan != 0);
252 mp->b_rptr += sizeof (*evh);
253 } else {
254 evh = NULL;
255 vlan = 0;
256 mp->b_rptr += sizeof (*eh);
257 }
258 if (ethertype != ETHERTYPE_IP && ethertype != ETHERTYPE_IPV6) {
259 /*
260 * XXX KEBE SAYS for now, don't handle non-IP packets.
261 * This includes ARP.
262 */
263 DTRACE_PROBE1(vxlnat__in__drop__nonip, mblk_t *, mp);
264 freemsg(mp);
265 return (NULL);
266 }
267
268 /* Handle case of split ether + IP headers. */
269 if (MBLKL(mp) < sizeof (ipha_t)) {
270 mblk_t *freemp;
271
272 if (MBLKL(mp) > 0 || mp->b_cont == NULL) {
273 /* The IP header is split ACROSS MBLKS! Bail for now. */
274 DTRACE_PROBE1(vxlnat__in__drop__splitip, mblk_t *, mp);
275 freemsg(mp);
276 return (NULL);
277 }
278 freemp = mp;
279 mp = mp->b_cont;
280 freeb(freemp);
281 }
282 /* LINTED -- alignment... */
283 ipha = (ipha_t *)mp->b_rptr;
284
285 if (IPH_HDR_VERSION(ipha) == IPV4_VERSION) {
286 if (ethertype != ETHERTYPE_IP) {
287 /* XXX KEBE ASKS - should we be more forgiving? */
288 DTRACE_PROBE1(vxlnat__in__drop__etherhdr4,
289 mblk_t *, mp);
290 freemsg(mp);
291 return (NULL);
292 }
293 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
294 &remote_addr);
295 } else {
296 if (ethertype != ETHERTYPE_IPV6 ||
297 IPH_HDR_VERSION(ipha) != IPV6_VERSION ||
298 MBLKL(mp) < sizeof (ip6_t)) {
299 /* XXX KEBE ASKS - should we be more forgiving? */
300 DTRACE_PROBE1(vxlnat__in__drop__etherhdr6,
301 mblk_t *, mp);
302 freemsg(mp);
303 return (NULL);
304 }
305 ip6h = (ip6_t *)ipha;
306 remote_addr = ip6h->ip6_src;
307 }
308
309 /* Find remote and replace OR create new remote. */
310 remote = vxlnat_get_remote(vnet, &remote_addr, B_TRUE);
311 if (remote != NULL) {
312 /*
313 * See if this entry needs fixing or filling-in. This might
314 * get a bit racy with read-only threads that actually
315 * transmit, but it only means dropped-packets in the worst
316 * case.
317 *
318 * It's THIS PART that inspires the warning about trusting the
319 * underlay network.
320 *
321 * XXX KEBE ASKS -- should we just replace things w/o checking?
322 */
323 /* Replace the ethernet address? */
324 if (ether_cmp(&remote->vxnrem_ether, &remote_ether) != 0)
325 ether_copy(&remote_ether, &remote->vxnrem_ether);
326 /*
327 * Replace the underlay? NOTE: Fix if/when underlay becomes
328 * IPv6.
329 */
330 if (!IN6_ARE_ADDR_EQUAL(&remote->vxnrem_uaddr,
331 &underlay_src->sin6_addr)) {
332 remote->vxnrem_uaddr = underlay_src->sin6_addr;
333 }
334 /* Replace the vlan ID. Maintain network order... */
335 if (remote->vxnrem_vlan != vlan)
336 remote->vxnrem_vlan = vlan;
337 }
338 /*
339 * Else just continue and pray for better luck on another packet or
340 * on the return flight. It is IP, we can Just Drop It (TM)...
341 */
342
343 /* We're done with the remote entry now. */
344 VXNREM_REFRELE(remote);
345
346 /* Advance rptr to the inner IP header and proceed. */
347 mp->b_rptr = (uint8_t *)ipha;
348 return (mp);
349 }
350
351 /*
352 * Process exactly one VXLAN packet.
353 */
354 static void
355 vxlnat_one_vxlan(mblk_t *mp, struct sockaddr_in6 *underlay_src)
356 {
357 vxlan_hdr_t *vxh;
358 vxlnat_vnet_t *vnet;
359 ipha_t *ipha;
360 ip6_t *ip6h;
361 vxlnat_fixed_t *fixed, fsearch;
362
363 if (MBLKL(mp) < sizeof (*vxh)) {
364 /* XXX KEBE ASKS -- should we be more forgiving? */
365 DTRACE_PROBE1(vxlnat__in__drop__vxlsize, mblk_t *, mp);
366 freemsg(mp);
367 return;
368 }
369 vxh = (vxlan_hdr_t *)mp->b_rptr;
370
371 /* If we start using more than just the one flag, fix it. */
372 if (vxh->vxlan_flags != VXLAN_F_VDI_WIRE) {
373 DTRACE_PROBE1(vxlnat__in__drop__VDI, mblk_t *, mp);
374 freemsg(mp);
375 return;
376 }
377
378 /* Remember, we key off of what's on the wire. */
379 vnet = vxlnat_get_vnet(VXLAN_ID_WIRE32(vxh->vxlan_id), B_FALSE);
380 if (vnet == NULL) {
381 DTRACE_PROBE1(vxlnat__in__drop__vnetid, uint32_t,
382 VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)));
383 freemsg(mp);
384 return;
385 }
386
387 DTRACE_PROBE2(vxlnat__in__vnet, uint32_t,
388 VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)),
389 vxlnat_vnet_t, vnet);
390
391 /*
392 * Off-vxlan processing steps:
393 * 1.) Locate the ethernet header and check/update/add-into remotes.
394 * 2.) Search 1-1s, process if hit.
395 * 3.) Search flows, process if hit.
396 * 4.) Search rules, create new flow (or not) if hit.
397 * 5.) Drop the packets.
398 */
399
400 /* 1.) Locate the ethernet header and check/update/add-into remotes. */
401 mp->b_rptr += sizeof (*vxh);
402 while (MBLKL(mp) == 0) {
403 mblk_t *oldmp = mp;
404
405 mp = mp->b_cont;
406 freeb(oldmp);
407 }
408 mp = vxlnat_cache_remote(mp, underlay_src, vnet);
409 if (mp == NULL) {
410 VXNV_REFRELE(vnet);
411 return;
412 }
413
414 /* 2.) Search 1-1s, process if hit. */
415 ipha = (ipha_t *)mp->b_rptr;
416 if (IPH_HDR_VERSION(ipha) == IPV4_VERSION) {
417 ip6h = NULL;
418 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
419 &fsearch.vxnf_addr);
420 } else {
421 /* vxlnat_cache_remote() did reality checks... */
422 ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
423 ip6h = (ip6_t *)ipha;
424 ipha = NULL;
425 fsearch.vxnf_addr = ip6h->ip6_src;
426 }
427 rw_enter(&vnet->vxnv_fixed_lock, RW_READER);
428 fixed = avl_find(&vnet->vxnv_fixed_ips, &fsearch, NULL);
429 if (fixed != NULL)
430 VXNF_REFHOLD(fixed);
431 rw_exit(&vnet->vxnv_fixed_lock);
432 if (fixed != NULL) {
433 mblk_t *newmp = NULL;
434
435 /*
436 * XXX KEBE ASKS --> Do MTU check NOW?! That way, we have
437 * pre-natted data. One gotcha, external dests may have
438 * different PathMTUs so see below about EMSGSIZE...
439 */
440
441 /* XXX KEBE SAYS -- FILL ME IN... but for now: */
442 if (ipha != NULL)
443 newmp = vxlnat_fixed_fixv4(mp, fixed, B_FALSE);
444 else
445 freemsg(mp); /* XXX handle ip6h */
446
447 if (newmp != NULL) {
448 ire_t *outbound_ire;
449 /* Use C99's initializers for fun & profit. */
450 ip_recv_attr_t iras =
451 { IRAF_IS_IPV4 | IRAF_VERIFIED_SRC };
452
453 ASSERT3P(ipha, !=, NULL);
454 ASSERT3P(ipha, ==, newmp->b_rptr);
455 /* XXX KEBE ASKS, IRR_ALLOCATE okay?!? */
456 outbound_ire = ire_route_recursive_dstonly_v4(
457 ipha->ipha_dst, IRR_ALLOCATE,
458 0 /* XXX KEBE SAYS XMIT HINT! */,
459 vxlnat_netstack->netstack_ip);
460 VERIFY3P(outbound_ire, !=, NULL);
461 if (outbound_ire->ire_type == IRE_NOROUTE) {
462 /* Bail! */
463 VXNF_REFRELE(fixed);
464 VXNV_REFRELE(vnet);
465 return;
466 }
467
468 iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
469 if (iras.ira_ip_hdr_length > sizeof (ipha_t))
470 iras.ira_flags |= IRAF_IPV4_OPTIONS;
471 iras.ira_xmit_hint = 0; /* XXX KEBE SAYS FIX ME! */
472 iras.ira_zoneid = outbound_ire->ire_zoneid;
473 iras.ira_pktlen = ntohs(ipha->ipha_length);
474 iras.ira_protocol = ipha->ipha_protocol;
475 /* XXX KEBE ASKS rifindex & ruifindex ?!? */
476 /*
477 * NOTE: AT LEAST ira_ill needs ILLF_ROUTER set, as
478 * well as the ill for the external NIC (where
479 * off-link destinations live). For fixed, ira_ill
480 * should be the ill of the external source.
481 */
482 iras.ira_rill = vxlnat_underlay_ire->ire_ill;
483 iras.ira_ill = fixed->vxnf_ire->ire_ill;
484 /* XXX KEBE ASKS cred & cpid ? */
485 iras.ira_verified_src = ipha->ipha_src;
486 /* XXX KEBE SAYS don't sweat IPsec stuff. */
487 /* XXX KEBE SAYS ALSO don't sweat l2src & mhip */
488
489 /* Okay, we're good! Let's pretend we're forwarding. */
490 ire_recv_forward_v4(outbound_ire, mp, ipha, &iras);
491 ire_refrele(outbound_ire);
492 }
493
494 /* All done... */
495 VXNF_REFRELE(fixed);
496 VXNV_REFRELE(vnet);
497 return;
498 }
499
500 /* XXX KEBE SAYS BUILD STEPS 3-4. */
501
502 /* 5.) Nothing, drop the packet. */
503 /* XXX KEBE ASKS DIAGNOSTIC? */
504 VXNV_REFRELE(vnet);
505 freemsg(mp);
506 }
507 /*
508 * ONLY return B_FALSE if we get a packet-clogging event.
509 */
510 /* ARGSUSED */
511 static boolean_t
512 vxlnat_vxlan_input(ksocket_t insock, mblk_t *chain, size_t msgsize, int oob,
513 void *ignored)
514 {
515 mblk_t *mp, *nextmp;
516
517 /*
518 * XXX KEBE ASKS --> move hold & release outside of loop?
519 * If so, hold rwlock here.
520 */
521
522 for (mp = chain; mp != NULL; mp = nextmp) {
523 struct T_unitdata_ind *tudi;
524 struct sockaddr_in6 *sin6;
525
526 nextmp = mp->b_next;
527 if (DB_TYPE(mp) != M_PROTO || mp->b_cont == NULL) {
528 DTRACE_PROBE1(vxlnat__in__drop__mblk, mblk_t *, mp);
529 freemsg(mp);
530 continue;
531 }
532
533 /* LINTED -- aligned */
534 tudi = (struct T_unitdata_ind *)mp->b_rptr;
535 if (tudi->PRIM_type != T_UNITDATA_IND) {
536 DTRACE_PROBE1(vxlnat__in__drop__TPI, mblk_t *, mp);
537 freemsg(mp);
538 continue;
539 }
540 /* LINTED -- aligned */
541 sin6 = (struct sockaddr_in6 *)(mp->b_rptr + tudi->SRC_offset);
542 VERIFY(sin6->sin6_family == AF_INET6);
543 VERIFY(tudi->SRC_length >= sizeof (*sin6));
544
545 vxlnat_one_vxlan(mp->b_cont, sin6);
546 freeb(mp);
547 }
548
549 return (B_TRUE);
550 }
551
/*
 * Incrementally update an Internet (one's-complement) checksum after some
 * 16-bit words covered by it changed, in the manner of RFC 1141 / RFC 1624
 * (with a check for -0):
 *
 *	newsum = oldsum + old16a - new16a + old16b - new16b ...
 *
 * evaluated in one's-complement arithmetic.  Each subtraction is done by
 * adding the one's complement (0xffff - x) of the new word, so the 32-bit
 * accumulator only ever grows, and the carries are folded back in until none
 * remain.  (A single fold after unsigned subtraction gets some inputs wrong.)
 *
 * oldsum: the existing checksum, right off the wire in wire-native order.
 * old, new: arrays of len bytes holding the replaced and the replacement
 *	words, ALSO in wire-native order.
 * len: byte count of old and new; THIS MUST BE A MULTIPLE OF 2 BYTES (i.e.
 *	they are uint16_t arrays).  The 32-bit running sum means we can't
 *	take len > 64k.
 *
 * Returns the updated checksum in wire-native order.  A result of 0xffff
 * (negative zero) is returned as 0.
 */
uint16_t
vxlnat_cksum_adjust(uint16_t oldsum, uint16_t *old, uint16_t *new, uint_t len)
{
	uint32_t newsum = ntohs(oldsum);

	ASSERT((len & 0x1) == 0);
	while (len != 0) {
		newsum += ntohs(*old);
		/* One's-complement subtraction: add the complement. */
		newsum += 0xffffU - ntohs(*new);
		len -= 2;
		old++;
		new++;
	}

	/* Fold the carries back in; one pass is not always enough. */
	while ((newsum >> 16) != 0)
		newsum = (newsum & 0xffff) + (newsum >> 16);

	return (newsum == 0xffff ? 0 : htons((uint16_t)newsum));
}
579
580 /*
581 * Fix inner headers on an ICMP packet.
582 *
583 * XXX KEBE SAYS FOR NOW, just do addresses for 1-1/fixed. When we do
584 * flows, include old_port/new_port as well.
585 */
586 static mblk_t *
587 vxlnat_fix_icmp_inner_v4(mblk_t *mp, icmph_t *icmph, ipaddr_t old_one,
588 ipaddr_t new_one, boolean_t to_private)
589 {
590 mblk_t *newmp;
591 ipha_t *inner_ipha;
592 ipaddr_t *new_ones_place;
593
594 if ((uint8_t *)(icmph + 1) + sizeof (ipha_t) > mp->b_wptr) {
595 /* Pay the pullup tax. */
596 newmp = msgpullup(mp, -1);
597 freemsg(mp);
598 if (newmp == NULL) {
599 DTRACE_PROBE1(vxlnat__fixicmp__pullupfail, void *,
600 NULL);
601 return (NULL);
602 }
603 if (MBLKL(newmp) < 2 * sizeof (ipha_t) + sizeof (icmph_t)) {
604 /* Wow! Too-tiny ICMP packet. */
605 DTRACE_PROBE1(vxlnat__fixicmp__tootiny, mblk_t *,
606 newmp);
607 freeb(newmp);
608 return (NULL);
609 }
610 mp = newmp;
611 /* Temporarily use inner_ipha for the outer one. */
612 inner_ipha = (ipha_t *)mp->b_rptr;
613 icmph = (icmph_t *)(mp->b_rptr + IPH_HDR_LENGTH(inner_ipha));
614 }
615 inner_ipha = (ipha_t *)(icmph + 1);
616 new_ones_place = to_private ?
617 &inner_ipha->ipha_src : &inner_ipha->ipha_dst;
618 if (*new_ones_place != old_one) {
619 /* Either I'm buggy or the packet is. */
620 DTRACE_PROBE2(vxlnat__fixicmp__badinneraddr, ipaddr_t,
621 old_one, ipaddr_t, *new_ones_place);
622 freeb(mp);
623 return (NULL);
624 }
625 *new_ones_place = new_one;
626
627 /* Adjust ICMP checksum... */
628 icmph->icmph_checksum = vxlnat_cksum_adjust(icmph->icmph_checksum,
629 (uint16_t *)&old_one, (uint16_t *)&new_one, sizeof (ipaddr_t));
630
631 /*
632 * XXX KEBE ASKS, recompute *inner-packet* checksums? Let's not for
633 * now, but consider this Fair Warning (or some other VH album...).
634 */
635 return (mp);
636 }
637
638 /*
639 * Take a 1-1/fixed IPv4 packet and convert it for transmission out the
640 * appropriate end. "to_private" is what it says on the tin.
641 */
642 static mblk_t *
643 vxlnat_fixed_fixv4(mblk_t *mp, vxlnat_fixed_t *fixed, boolean_t to_private)
644 {
645 ipaddr_t new_one, old_one;
646 ipaddr_t *new_ones_place;
647 ipha_t *ipha = (ipha_t *)mp->b_rptr;
648 uint8_t *nexthdr, *end_wptr;
649
650 if (to_private) {
651 IN6_V4MAPPED_TO_IPADDR(&fixed->vxnf_addr, new_one);
652 new_ones_place = &ipha->ipha_dst;
653 } else {
654 IN6_V4MAPPED_TO_IPADDR(&fixed->vxnf_pubaddr, new_one);
655 new_ones_place = &ipha->ipha_src;
656 }
657
658 old_one = *new_ones_place;
659 *new_ones_place = new_one;
660
661 /*
662 * Recompute the IP header checksum, and check for the TCP or UDP
663 * checksum as well, as they'll need recomputing as well.
664 */
665
666 /* First, the IPv4 header itself. */
667 ipha->ipha_hdr_checksum = vxlnat_cksum_adjust(ipha->ipha_hdr_checksum,
668 (uint16_t *)&old_one, (uint16_t *)&new_one, sizeof (ipaddr_t));
669
670 nexthdr = (uint8_t *)ipha + IPH_HDR_LENGTH(ipha);
671 if (nexthdr >= mp->b_wptr) {
672 nexthdr = mp->b_cont->b_rptr +
673 (MBLKL(mp) - IPH_HDR_LENGTH(ipha));
674 end_wptr = mp->b_cont->b_wptr;
675 } else {
676 end_wptr = mp->b_wptr;
677 }
678
679 switch (ipha->ipha_protocol) {
680 case IPPROTO_TCP: {
681 tcpha_t *tcph = (tcpha_t *)nexthdr;
682
683 if (nexthdr + sizeof (*tcph) >= end_wptr) {
684 /* Bail for now. */
685 DTRACE_PROBE1(vxlnat__fix__tcp__mblkspan, mblk_t *,
686 mp);
687 freemsg(mp);
688 return (NULL);
689 }
690 tcph->tha_sum = vxlnat_cksum_adjust(tcph->tha_sum,
691 (uint16_t *)&old_one, (uint16_t *)&new_one,
692 sizeof (ipaddr_t));
693 break; /* Out of switch. */
694 }
695 case IPPROTO_UDP: {
696 udpha_t *udph = (udpha_t *)nexthdr;
697
698 if (nexthdr + sizeof (*udph) >= end_wptr) {
699 /* Bail for now. */
700 DTRACE_PROBE1(vxlnat__fix__udp__mblkspan, mblk_t *,
701 mp);
702 freemsg(mp);
703 return (NULL);
704 }
705 udph->uha_checksum = vxlnat_cksum_adjust(udph->uha_checksum,
706 (uint16_t *)&old_one, (uint16_t *)&new_one,
707 sizeof (ipaddr_t));
708 break; /* Out of switch. */
709 }
710 case IPPROTO_ICMP: {
711 icmph_t *icmph = (icmph_t *)nexthdr;
712
713 /*
714 * We need to check the case of ICMP messages that contain
715 * IP packets. We will need to at least change the addresses,
716 * and *maybe* the checksums too if necessary.
717 *
718 * This may replicate some of icmp_inbound_v4(), alas.
719 */
720 if (nexthdr + sizeof (*icmph) >= end_wptr) {
721 mblk_t *newmp;
722 /*
723 * Unlike the others, we're going to pay the pullup
724 * tax here.
725 */
726 newmp = msgpullup(mp, -1);
727 freemsg(mp);
728 if (newmp == NULL) {
729 DTRACE_PROBE1(vxlnat__icmp__pullupfail, void *,
730 NULL);
731 return (NULL);
732 }
733 mp = newmp;
734 ipha = (ipha_t *)(mp->b_rptr);
735 nexthdr = (uint8_t *)ipha + IPH_HDR_LENGTH(ipha);
736 icmph = (icmph_t *)nexthdr;
737 }
738
739 switch (icmph->icmph_type) {
740 case ICMP_ADDRESS_MASK_REPLY:
741 case ICMP_ADDRESS_MASK_REQUEST:
742 case ICMP_TIME_STAMP_REPLY:
743 case ICMP_TIME_STAMP_REQUEST:
744 case ICMP_ECHO_REQUEST:
745 case ICMP_ECHO_REPLY:
746 /* These merely need to get passed along. */
747 break;
748 case ICMP_ROUTER_ADVERTISEMENT:
749 case ICMP_ROUTER_SOLICITATION:
750 /* These shouldn't be traversing a NAT at all. Drop. */
751 DTRACE_PROBE1(vxlnat__icmp__cantpass, int,
752 icmph->icmph_type);
753 freemsg(mp);
754 return (NULL);
755 case ICMP_PARAM_PROBLEM:
756 case ICMP_TIME_EXCEEDED:
757 case ICMP_DEST_UNREACHABLE:
758 /* These include inner-IP headers we need to adjust. */
759 mp = vxlnat_fix_icmp_inner_v4(mp, icmph, old_one,
760 new_one, to_private);
761 break;
762 default:
763 /* Pass along to receiver, but warn. */
764 DTRACE_PROBE1(vxlnat__icmp__unknown, int,
765 icmph->icmph_type);
766 break;
767 }
768 }
769 /* Otherwise we can't make any other assumptions for now... */
770 default:
771 break;
772 }
773
774 return (mp);
775 }
776
777 vxlnat_remote_t *
778 vxlnat_xmit_vxlanv4(mblk_t *mp, in6_addr_t *overlay_dst,
779 vxlnat_remote_t *remote, uint8_t *myether, vxlnat_vnet_t *vnet)
780 {
781 struct sockaddr_in6 sin6 = {AF_INET6};
782 struct msghdr msghdr = {NULL};
783 mblk_t *vlan_mp;
784 extern uint_t vxlan_alloc_size, vxlan_noalloc_min;
785 vxlan_hdr_t *vxh;
786 struct ether_vlan_header *evh;
787 int rc;
788 cred_t *cred;
789
790 if (remote == NULL || remote->vxnrem_vnet == NULL) {
791 DTRACE_PROBE1(vxlnat__xmit__vxlanv4, vxlnat_remote_t *, remote);
792 /* Release the condemned remote. */
793 if (remote != NULL)
794 VXNREM_REFRELE(remote);
795
796 /* See if we have a remote ready to use... */
797 remote = vxlnat_get_remote(vnet, overlay_dst, B_FALSE);
798
799 if (remote == NULL) {
800 /*
801 * We need to do the moral equivalent of PF_KEY
802 * ACQUIRE or overlay's queue-resolve so that we can
803 * have someone in user-space send me a remote. Until
804 * then, drop the reference if condemned, free the
805 * message, and return NULL.
806 */
807
808 freemsg(mp);
809 return (NULL);
810 }
811 }
812 ASSERT(vnet == remote->vxnrem_vnet);
813
814 if (DB_REF(mp) > 1 || mp->b_rptr - vxlan_noalloc_min < DB_BASE(mp)) {
815 vlan_mp = allocb(vxlan_alloc_size, BPRI_HI);
816 if (vlan_mp == NULL) {
817 DTRACE_PROBE1(vxlnat__xmit__vxlanv4__allocfail,
818 vxlnat_remote_t *, remote);
819 freemsg(mp);
820 /* Just drop the packet, but don't tell caller. */
821 return (remote);
822 }
823 vlan_mp->b_wptr = DB_LIM(vlan_mp);
824 vlan_mp->b_rptr = vlan_mp->b_wptr;
825 vlan_mp->b_cont = mp;
826 } else {
827 vlan_mp = mp;
828 }
829 vlan_mp->b_rptr -= sizeof (*vxh) + sizeof (*evh);
830 vxh = (vxlan_hdr_t *)vlan_mp->b_rptr;
831 vxh->vxlan_flags = VXLAN_F_VDI_WIRE;
832 vxh->vxlan_id = vnet->vxnv_vnetid; /* Already in wire-order. */
833
834 /* Fill in the Ethernet header. */
835 evh = (struct ether_vlan_header *)(vxh + 1);
836 ether_copy(&remote->vxnrem_ether, &evh->ether_dhost);
837 ether_copy(myether, &evh->ether_shost);
838 evh->ether_tpid = htons(ETHERTYPE_VLAN);
839 evh->ether_tci = remote->vxnrem_vlan;
840 evh->ether_type = htons(ETHERTYPE_IP);
841
842 msghdr.msg_name = (struct sockaddr_storage *)&sin6;
843 msghdr.msg_namelen = sizeof (sin6);
844 /* Address family and other zeroing already done up top. */
845 sin6.sin6_port = htons(IPPORT_VXLAN);
846 sin6.sin6_addr = remote->vxnrem_uaddr;
847
848 /*
849 * cred_t dance is because we may be getting this straight from
850 * interrupt context.
851 */
852 cred = zone_get_kcred(netstack_get_zoneid(vxlnat_netstack));
853 if (cred == NULL) {
854 DTRACE_PROBE1(vxlnat__xmit__vxlan4__credfail,
855 vxlnat_remote_t *, remote);
856 freemsg(vlan_mp);
857 }
858 /*
859 * Use MSG_DONTWAIT to avoid blocks, esp. if we're getting this
860 * straight from interrupt context.
861 */
862 rc = ksocket_sendmblk(vxlnat_underlay, &msghdr, MSG_DONTWAIT, &vlan_mp,
863 cred);
864 crfree(cred);
865 if (rc != 0) {
866 DTRACE_PROBE2(vxlnat__xmit__vxlan4__sendfail, int, rc,
867 vxlnat_remote_t *, remote);
868 freemsg(vlan_mp);
869 }
870 return (remote);
871 }
872
/*
 * New ire_{recv,send}fn implementations if we're doing 1-1 mappings.
 */

/*
 * IPv6 send function for 1-1 mappings.  Not implemented yet: the packet is
 * freed and EOPNOTSUPP is returned.  None of the other arguments are used.
 */
int
vxlnat_fixed_ire_send_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	/* XXX KEBE SAYS FILL ME IN, but for now... */
	freemsg(mp);
	return (EOPNOTSUPP);
}
884
/*
 * IPv6 receive function for 1-1 mappings.  Not implemented yet: the packet
 * is simply freed.  None of the other arguments are used.
 */
void
vxlnat_fixed_ire_recv_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_recv_attr_t *ira)
{
	/* XXX KEBE SAYS FILL ME IN, but for now... */
	freemsg(mp);
}
892
893 /*
894 * I believe the common case for this will be from self-generated ICMP
895 * messages. Other same-netstack-originated traffic will also come through
896 * here (one internal reaching what turns out to be another internal).
897 */
898 int
899 vxlnat_fixed_ire_send_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
900 ip_xmit_attr_t *ixa, uint32_t *identp)
901 {
902 ip_recv_attr_t iras; /* NOTE: No bzero because we pay more later */
903 ipha_t *ipha = (ipha_t *)iph_arg;
904
905 /*
906 * XXX KEBE ASKS, any DTrace probes or other instrumentation that
907 * perhaps should be set?
908 */
909
910 /* Map ixa to ira. */
911 iras.ira_pktlen = ixa->ixa_pktlen;
912 /* XXX KEBE ASKS more?!? */
913
914 /*
915 * In normal TCP/IP processing, this shortcuts the IP header checksum
916 * AND POSSIBLY THE ULP checksum cases. Since this is likely to head
917 * back into the internal network, we need to recompute things again.
918 */
919 if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
920 freemsg(mp);
921 return (EMSGSIZE);
922 }
923 #if 0
924 /* XXX KEBE ASKS Special-case ICMP here? */
925 if (ipha->ipha_protocol == IPPROTO_ICMP) {
926 icmph_t *icmph;
927
928 icmph = (icmph_t *)((uint8_t *)ipha + IPH_HDR_LENGTH(ipha));
929 if ((uint8_t *)icmph >= mp->b_wptr) {
930 freemsg(mp);
931 return (EMSGSIZE);
932 }
933 icmph->icmph_checksum = 0;
934 icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
935 }
936 #endif
937
938 vxlnat_fixed_ire_recv_v4(ire, mp, iph_arg, &iras);
939
940 return (0);
941 }
942
943 void
944 vxlnat_fixed_ire_recv_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
945 ip_recv_attr_t *ira)
946 {
947 vxlnat_fixed_t *fixed;
948 vxlnat_vnet_t *vnet;
949 ipha_t *ipha = (ipha_t *)iph_arg;
950 int newmtu;
951
952 /* Make a note for DAD that this address is in use */
953 ire->ire_last_used_time = LBOLT_FASTPATH;
954
955 /* Only target the IRE_LOCAL with the right zoneid. */
956 ira->ira_zoneid = ire->ire_zoneid;
957
958 /*
959 * XXX KEBE ASKS, any DTrace probes or other instrumentation that
960 * perhaps should be set?
961 */
962
963 /*
964 * Reality check some things.
965 */
966 fixed = (vxlnat_fixed_t *)ire->ire_dep_sib_next;
967 vnet = fixed->vxnf_vnet;
968
969 ASSERT3P(ire, ==, fixed->vxnf_ire);
970
971 if (IRE_IS_CONDEMNED(ire) || vnet == NULL)
972 goto detach_ire_and_bail;
973
974 /*
975 * Not a common-case, but a possible one. If our underlay MTU is
976 * smaller than the external MTU, it is possible that we will have a
977 * size mismatch and therefore need to either fragment at the VXLAN
978 * layer (VXLAN UDP packet sent as two or more IP fragments) OR
979 * if IPH_DF is set, send an ICMP_NEEDS_FRAGMENTATION back to the
980 * sender. Perform the check here BEFORE we NAT the packet.
981 */
982 ASSERT(vxlnat_underlay_ire->ire_ill != NULL);
983 newmtu = vxlnat_underlay_ire->ire_ill->ill_mtu - sizeof (ipha_t) -
984 sizeof (udpha_t) - sizeof (vxlan_hdr_t) -
985 sizeof (struct ether_vlan_header);
986 if ((ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_DF) &&
987 ntohs(ipha->ipha_length) > newmtu) {
988 icmp_frag_needed(mp, newmtu, ira);
989 /* We're done. Assume icmp_frag_needed() consumed mp. */
990 return;
991 }
992
993 /*
994 * So we're here, and since we have a refheld IRE, we have a refheld
995 * fixed and vnet. Do some of what ip_input_local_v4() does (inbound
996 * checksum? some ira checks?), but otherwise, swap the destination
997 * address as mapped in "fixed", recompute any checksums, and send it
998 * along its merry way (with a ttl decement too) to a VXLAN
999 * destination.
1000 */
1001 mp = vxlnat_fixed_fixv4(mp, fixed, B_TRUE);
1002 if (mp == NULL)
1003 return; /* Assume it's been freed & dtraced already. */
1004
1005 /*
1006 * Otherwise, we're ready to transmit this packet over the vxlan
1007 * socket.
1008 */
1009 fixed->vxnf_remote = vxlnat_xmit_vxlanv4(mp, &fixed->vxnf_addr,
1010 fixed->vxnf_remote, fixed->vxnf_myether, vnet);
1011 if (fixed->vxnf_remote == NULL) {
1012 /* XXX KEBE ASKS, DTrace probe here? Or in-function? */
1013 DTRACE_PROBE2(vxlnat__fixed__xmitdrop,
1014 in6_addr_t *, &fixed->vxnf_addr,
1015 uint32_t, VXLAN_ID_NTOH(vnet->vxnv_vnetid));
1016 }
1017 return;
1018
1019 detach_ire_and_bail:
1020 /* Oh no, something's condemned. Drop the IRE now. */
1021 ire->ire_recvfn = ire_recv_local_v4;
1022 ire->ire_dep_sib_next = NULL;
1023 VXNF_REFRELE(fixed);
1024 /* Pass the packet back... */
1025 ire_recv_local_v4(ire, mp, iph_arg, ira);
1026 return;
1027 }