1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2018 Joyent, Inc.
14 */
15
16 /*
 * NAT engine: 1-1 mappings.  The rules in vxlnat_rules.c are only consulted
18 * if the 1-1 map (kept here) misses or if the outbound lookup (vnetid,
19 * protocol, src-IP, dst-IP, src-port, dst-port) misses.
20 *
 * The plan is for inbound to hit conn_t's, whose conn_private points to
22 * entries here. The conn_recv* functions live here too (for now).
23 */
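
/*
 * For orientation only: one plausible shape for the outbound lookup key
 * described above.  This is a sketch; the real definitions live (or will
 * live) in vxlnat_impl.h and may well differ.
 */
#if 0
typedef struct vxlnat_flow_key {
	uint32_t	vfk_vnetid;	/* Wire-order vnet ID. */
	uint8_t		vfk_protocol;	/* IPPROTO_* */
	in6_addr_t	vfk_src;	/* Inner source (V4-mapped for IPv4). */
	in6_addr_t	vfk_dst;	/* Inner destination. */
	uint16_t	vfk_sport;	/* Network order. */
	uint16_t	vfk_dport;	/* Network order. */
} vxlnat_flow_key_t;
#endif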
24
25 #include <sys/types.h>
26 #include <sys/socket.h>
27 #include <sys/ksynch.h>
28 #include <sys/ksocket.h>
29 #include <sys/kmem.h>
30 #include <sys/stream.h>
31 #include <sys/strsubr.h>
32 #include <sys/strsun.h>
33 #include <sys/sysmacros.h>
34 #include <sys/debug.h>
35 #include <sys/dtrace.h>
36 #include <sys/errno.h>
#include <sys/tihdr.h>
#include <sys/ethernet.h>
38 #include <netinet/in.h>
39 #include <netinet/udp.h>
40 #include <inet/ip.h>
41 #include <inet/ip6.h>
42 #include <inet/udp_impl.h>
43 #include <inet/tcp.h>
44
45 #include <inet/vxlnat_impl.h>
46
47 static boolean_t vxlnat_vxlan_input(ksocket_t, mblk_t *, size_t, int, void *);
48 static mblk_t *vxlnat_fixed_fixv4(mblk_t *mp, vxlnat_fixed_t *fixed,
49 boolean_t to_private);
50
51 /*
52 * Initialized to NULL, read/write protected by vxlnat_mutex.
53 * Receive functions shouldn't have to access this directly.
54 */
55 ksocket_t vxlnat_underlay;
56 ire_t *vxlnat_underlay_ire;
57
58 void
59 vxlnat_closesock(void)
60 {
61 ASSERT(MUTEX_HELD(&vxlnat_mutex));
62 if (vxlnat_underlay_ire != NULL) {
63 ire_refrele(vxlnat_underlay_ire);
64 vxlnat_underlay_ire = NULL;
65 }
66 if (vxlnat_underlay != NULL) {
67 (void) ksocket_close(vxlnat_underlay, zone_kcred());
68 vxlnat_underlay = NULL;
69 }
70 }
71
72 static int
73 vxlnat_opensock(in6_addr_t *underlay_ip)
74 {
75 int rc, val;
76 /* Assume rest is initialized to 0s. */
77 struct sockaddr_in6 sin6 = {AF_INET6, BE_16(IPPORT_VXLAN)};
78 ip_stack_t *ipst = vxlnat_netstack->netstack_ip;
79
80 ASSERT(MUTEX_HELD(&vxlnat_mutex));
81 /* Open... */
82 rc = ksocket_socket(&vxlnat_underlay, AF_INET6, SOCK_DGRAM, 0,
83 KSOCKET_SLEEP, zone_kcred());
84 if (rc != 0)
85 return (rc);
86
87 /* Bind... */
88 sin6.sin6_addr = *underlay_ip;
89 rc = ksocket_bind(vxlnat_underlay, (struct sockaddr *)(&sin6),
90 sizeof (sin6), zone_kcred());
91 if (rc != 0) {
92 vxlnat_closesock();
93 return (rc);
94 }
95
96 /* Use source-port hashing when sending packets out VXLAN... */
97 val = UDP_HASH_VXLAN;
98 rc = ksocket_setsockopt(vxlnat_underlay, IPPROTO_UDP,
99 UDP_SRCPORT_HASH, &val, sizeof (val), kcred);
100 if (rc != 0) {
101 vxlnat_closesock();
102 return (rc);
103 }
104
105 /*
106 * Grab the IRE for underlay address.
107 */
108 ASSERT3P(vxlnat_underlay_ire, ==, NULL);
109 vxlnat_underlay_ire = (IN6_IS_ADDR_V4MAPPED(underlay_ip)) ?
110 ire_ftable_lookup_simple_v4(underlay_ip->_S6_un._S6_u32[3],
111 0, ipst, NULL) :
112 ire_ftable_lookup_simple_v6(underlay_ip, 0, ipst, NULL);
113 if (vxlnat_underlay_ire == NULL) {
114 DTRACE_PROBE1(vxlnat__opensock__ire__fail, in6_addr_t *,
115 underlay_ip);
116 vxlnat_closesock();
117 return (EADDRNOTAVAIL);
118 }
119
120 /* Once we return from this, start eating data. */
121 rc = ksocket_krecv_set(vxlnat_underlay, vxlnat_vxlan_input, NULL);
122 if (rc != 0) {
123 vxlnat_closesock();
124 }
125
126 return (rc);
127 }
128
129 /*
130 * Establish a VXLAN-listening kernel socket.
131 * XXX KEBE ASKS ==> Support more than one VXLAN address?
132 */
133 /* ARGSUSED */
134 int
135 vxlnat_vxlan_addr(in6_addr_t *underlay_ip)
136 {
137 int rc;
138
139 ASSERT(MUTEX_HELD(&vxlnat_mutex));
140 /* For now, we make this a one-underlay-address-only solution. */
141 vxlnat_closesock();
142 rc = vxlnat_opensock(underlay_ip);
143 return (rc);
144 }
145
146 /*
147 * Free a remote VXLAN destination.
148 */
149 void
150 vxlnat_remote_free(vxlnat_remote_t *remote)
151 {
152 ASSERT0(remote->vxnrem_refcount);
153
154 kmem_free(remote, sizeof (*remote));
155 }
156
157 /*
158 * Like other unlink functions, assume the appropriate lock is held.
159 */
160 void
161 vxlnat_remote_unlink(vxlnat_remote_t *remote)
162 {
163 vxlnat_vnet_t *vnet = remote->vxnrem_vnet;
164
165 ASSERT3P(vnet, !=, NULL);
166 ASSERT(MUTEX_HELD(&vnet->vxnv_remote_lock));
167
168 /* First unlink so nobody else can find me */
169 avl_remove(&vnet->vxnv_remotes, remote);
170
171 /*
172 * We still hold a vnet reference, so races shouldn't be a problem.
173 * Still, for added safety, NULL it out first.
174 */
175 remote->vxnrem_vnet = NULL; /* Condemn this entry. */
176 VXNV_REFRELE(vnet);
177 VXNREM_REFRELE(remote); /* Internment release. */
178 }
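
/*
 * Reference model for remotes: each vxlnat_remote_t is created with one
 * "internment" reference owned by the vnet's vxnv_remotes AVL tree.
 * Lookups (vxlnat_get_remote()) take an additional VXNREM_REFHOLD() that
 * the caller must VXNREM_REFRELE() when finished.  Unlinking drops the
 * internment reference, and the entry is freed once the count hits zero.
 */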
179
180 /*
181 * Find or create a remote VXLAN destination.
182 */
183 static vxlnat_remote_t *
184 vxlnat_get_remote(vxlnat_vnet_t *vnet, in6_addr_t *remote_addr,
185 boolean_t create_on_miss)
186 {
187 vxlnat_remote_t *remote, searcher;
188 avl_index_t where;
189
190 searcher.vxnrem_addr = *remote_addr;
191 mutex_enter(&vnet->vxnv_remote_lock);
192 remote = avl_find(&vnet->vxnv_remotes, &searcher, &where);
193 if (remote == NULL && create_on_miss) {
194 /* Not as critical if we can't allocate here. */
195 remote = kmem_zalloc(sizeof (*remote),
196 KM_NOSLEEP | KM_NORMALPRI);
197 if (remote != NULL) {
198 remote->vxnrem_addr = *remote_addr;
199 remote->vxnrem_refcount = 1; /* Internment reference. */
200 VXNV_REFHOLD(vnet);
201 remote->vxnrem_vnet = vnet;
202 /* Rest is filled in by caller. */
203 avl_insert(&vnet->vxnv_remotes, remote, where);
204 }
205 }
206 if (remote != NULL)
207 VXNREM_REFHOLD(remote);
208 mutex_exit(&vnet->vxnv_remote_lock);
209 return (remote);
210 }
211
212 /*
213 * Cache inbound packet information in the vnet's remotes section.
214 *
215 * NOTE: This function assumes a trustworthy underlay network. If the
216 * underlay isn't trustworthy, this function should be renamed, and reduced to
217 * a "strip and reality-check the ethernet header" function.
218 *
219 * Caller has stripped any pre-ethernet data from mp. We return mp
220 * stripped down to its IP header.
221 */
222 static mblk_t *
223 vxlnat_cache_remote(mblk_t *mp, struct sockaddr_in6 *underlay_src,
224 vxlnat_vnet_t *vnet)
225 {
226 struct ether_vlan_header *evh;
227 struct ether_header *eh;
228 vxlnat_remote_t *remote;
229 uint16_t vlan, ethertype;
230 ether_addr_t remote_ether;
231 ipha_t *ipha;
232 ip6_t *ip6h;
233 in6_addr_t remote_addr;
234
235 /* Assume (for now) we have at least a VLAN header's worth of data. */
236 if (MBLKL(mp) < sizeof (*evh)) {
237 /* XXX KEBE ASKS - should we be more forgiving? */
238 DTRACE_PROBE1(vxlnat__in__drop__etherhdr, mblk_t *, mp);
239 freemsg(mp);
240 return (NULL);
241 }
242
243 eh = (struct ether_header *)mp->b_rptr;
244 ethertype = ntohs(eh->ether_type);
245 ether_copy(&eh->ether_shost, &remote_ether);
246 if (ethertype == ETHERTYPE_VLAN) {
247 evh = (struct ether_vlan_header *)eh;
248 /* Keep it in network order... */
249 vlan = evh->ether_tci;
250 ethertype = ntohs(evh->ether_type);
251 ASSERT(vlan != 0);
252 mp->b_rptr += sizeof (*evh);
253 } else {
254 evh = NULL;
255 vlan = 0;
256 mp->b_rptr += sizeof (*eh);
257 }
258 if (ethertype != ETHERTYPE_IP && ethertype != ETHERTYPE_IPV6) {
259 /*
260 * XXX KEBE SAYS for now, don't handle non-IP packets.
261 * This includes ARP.
262 */
263 DTRACE_PROBE1(vxlnat__in__drop__nonip, mblk_t *, mp);
264 freemsg(mp);
265 return (NULL);
266 }
267
268 /* Handle case of split ether + IP headers. */
269 if (MBLKL(mp) < sizeof (ipha_t)) {
270 mblk_t *freemp;
271
272 if (MBLKL(mp) > 0 || mp->b_cont == NULL) {
			/*
			 * Either the IP header is split ACROSS MBLKS or the
			 * packet is truncated.  Bail for now.
			 */
274 DTRACE_PROBE1(vxlnat__in__drop__splitip, mblk_t *, mp);
275 freemsg(mp);
276 return (NULL);
277 }
278 freemp = mp;
279 mp = mp->b_cont;
280 freeb(freemp);
281 }
282 /* LINTED -- alignment... */
283 ipha = (ipha_t *)mp->b_rptr;
284
285 if (IPH_HDR_VERSION(ipha) == IPV4_VERSION) {
286 if (ethertype != ETHERTYPE_IP) {
287 /* XXX KEBE ASKS - should we be more forgiving? */
288 DTRACE_PROBE1(vxlnat__in__drop__etherhdr4,
289 mblk_t *, mp);
290 freemsg(mp);
291 return (NULL);
292 }
293 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
294 &remote_addr);
295 } else {
296 if (ethertype != ETHERTYPE_IPV6 ||
297 IPH_HDR_VERSION(ipha) != IPV6_VERSION ||
298 MBLKL(mp) < sizeof (ip6_t)) {
299 /* XXX KEBE ASKS - should we be more forgiving? */
300 DTRACE_PROBE1(vxlnat__in__drop__etherhdr6,
301 mblk_t *, mp);
302 freemsg(mp);
303 return (NULL);
304 }
305 ip6h = (ip6_t *)ipha;
306 remote_addr = ip6h->ip6_src;
307 }
308
309 /* Find remote and replace OR create new remote. */
310 remote = vxlnat_get_remote(vnet, &remote_addr, B_TRUE);
311 if (remote != NULL) {
312 /*
313 * See if this entry needs fixing or filling-in. This might
314 * get a bit racy with read-only threads that actually
315 * transmit, but it only means dropped-packets in the worst
316 * case.
317 *
318 * It's THIS PART that inspires the warning about trusting the
319 * underlay network.
320 *
321 * XXX KEBE ASKS -- should we just replace things w/o checking?
322 */
323 /* Replace the ethernet address? */
324 if (ether_cmp(&remote->vxnrem_ether, &remote_ether) != 0)
325 ether_copy(&remote_ether, &remote->vxnrem_ether);
326 /*
327 * Replace the underlay? NOTE: Fix if/when underlay becomes
328 * IPv6.
329 */
330 if (!IN6_ARE_ADDR_EQUAL(&remote->vxnrem_uaddr,
331 &underlay_src->sin6_addr)) {
332 remote->vxnrem_uaddr = underlay_src->sin6_addr;
333 }
334 /* Replace the vlan ID. Maintain network order... */
335 if (remote->vxnrem_vlan != vlan)
336 remote->vxnrem_vlan = vlan;
		/* We're done with the remote entry now. */
		VXNREM_REFRELE(remote);
	}
	/*
	 * Else just continue and pray for better luck on another packet or
	 * on the return flight.  It is IP; we can Just Drop It (TM)...
	 */
345
346 /* Advance rptr to the inner IP header and proceed. */
347 mp->b_rptr = (uint8_t *)ipha;
348 return (mp);
349 }
350
351 /*
352 * Inspect the packet and find ports & protos (or ICMP types & codes)
353 * and see if we have an established NAT flow.
354 *
355 * XXX KEBE WONDERS if the transmission path will more closely resemble
356 * vxlnat_one_vxlan_fixed() because of ipha_ident issues or not...
357 *
358 * B_TRUE means the packet was handled, and we shouldn't continue processing
359 * (even if "was handled" means droppage).
360 */
361 static boolean_t
362 vxlnat_one_vxlan_flow(vxlnat_vnet_t *vnet, mblk_t *mp, ipha_t *ipha,
363 ip6_t *ip6h)
364 {
365 /* XXX KEBE SAYS FILL ME IN. */
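	/*
	 * A minimal sketch (assumptions only) of the key extraction this
	 * will need: the protocol plus the first four bytes of the ULP
	 * header (src/dst ports, or ICMP type/code/checksum).  It assumes
	 * the ULP header is contiguous in this mblk.
	 */
#if 0
	uint8_t protocol;
	uint16_t *ports;

	if (ipha != NULL) {
		protocol = ipha->ipha_protocol;
		ports = (uint16_t *)(mp->b_rptr + IPH_HDR_LENGTH(ipha));
	} else {
		protocol = ip6h->ip6_nxt;	/* XXX extension headers... */
		ports = (uint16_t *)(ip6h + 1);
	}
	/* For TCP/UDP, ports[0] is the source port and ports[1] the dest. */
#endif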
366 /* For now... */
367 return (B_FALSE);
368 }
369
370 /*
371 * If we reach here, we need to find a NAT rule, and see if we can/should
372 * CREATE a new NAT flow, or whether or not we should drop, maybe even
373 * returning an ICMP message of some sort.
374 *
375 * B_TRUE means the packet was handled, and we shouldn't continue processing
376 * (even if "was handled" means droppage).
377 */
378 static boolean_t
379 vxlnat_one_vxlan_rule(vxlnat_vnet_t *vnet, mblk_t *mp, ipha_t *ipha,
380 ip6_t *ip6h)
381 {
382 vxlnat_rule_t *rule;
383
384 /* XXX handle IPv6 later */
385 if (ip6h != NULL)
386 return (B_FALSE);
387
388 ASSERT3P(ipha, !=, NULL);
389
390 mutex_enter(&vnet->vxnv_rule_lock);
391 rule = list_head(&vnet->vxnv_rules);
392
393 /*
394 * search for a match in the nat rules
	 * XXX investigate perf issues with respect to list_t size
396 */
397 while (rule != NULL) {
398 ipaddr_t ipaddr;
399 uint32_t netmask = 0xffffffff;
		/*
		 * vxnr_prefix is an IPv6 prefix length on a V4-mapped
		 * address, so subtract 96 to get the IPv4 prefix length
		 * (e.g. a /120 rule yields a /24, netmask 0xffffff00).
		 */
		uint8_t prefix = rule->vxnr_prefix - 96;

		/* Calculate the v4 netmask. */
		netmask <<= (32 - prefix);
		netmask = htonl(netmask);
405
406 IN6_V4MAPPED_TO_IPADDR(&rule->vxnr_myaddr, ipaddr);
407 /* XXX ASSERT vlanid? */
408 if ((ipaddr & netmask) == (ipha->ipha_src & netmask)) {
409 VXNR_REFHOLD(rule);
410 break;
411 }
412
413 rule = list_next(&vnet->vxnv_rules, rule);
414 }
415
416 mutex_exit(&vnet->vxnv_rule_lock);
417
418 if (rule == NULL)
419 return (B_FALSE);
420
	/*
	 * XXX Process the packet here, presumably by creating a new flow
	 * with something like:
	 *
	 *	static vxlnat_flow_t *
	 *	vxlnat_new_flow(vxlnat_rule_t *rule, in6_addr_t *inner_src,
	 *	    in6_addr_t *dst, uint32_t ports, uint8_t protocol)
	 */
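	/*
	 * A rough sketch of how that could go.  vxlnat_new_flow() does not
	 * exist yet, so everything below is an assumption, not working code.
	 */
#if 0
	vxlnat_flow_t *flow;
	in6_addr_t inner_src, dst;
	uint32_t ports = 0;	/* XXX extract from the ULP header. */

	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &inner_src);
	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &dst);
	flow = vxlnat_new_flow(rule, &inner_src, &dst, ports,
	    ipha->ipha_protocol);
	VXNR_REFRELE(rule);	/* Assume the flow keeps its own rule hold. */
	if (flow != NULL) {
		/* Process mp through the new flow and return B_TRUE. */
	}
#endif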
427
	/* Nothing uses the rule hold yet, so drop it before falling back. */
	VXNR_REFRELE(rule);
	return (B_FALSE);
429 }
430
431 /*
432 * See if the inbound VXLAN packet hits a 1-1/fixed mapping, and process if it
433 * does. B_TRUE means the packet was handled, and we shouldn't continue
434 * processing (even if "was handled" means droppage).
435 */
436 static boolean_t
437 vxlnat_one_vxlan_fixed(vxlnat_vnet_t *vnet, mblk_t *mp, ipha_t *ipha,
438 ip6_t *ip6h)
439 {
440 vxlnat_fixed_t *fixed, fsearch;
441 mblk_t *newmp;
442 ire_t *outbound_ire;
	/* Use C99's designated initializers; unnamed fields start as zero. */
	ip_recv_attr_t iras = {
		.ira_flags = IRAF_IS_IPV4 | IRAF_VERIFIED_SRC
	};
445
446 if (ipha != NULL) {
447 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
448 &fsearch.vxnf_addr);
449 } else {
450 /* vxlnat_cache_remote() did reality checks... */
451 ASSERT(ipha == NULL && ip6h != NULL);
452 fsearch.vxnf_addr = ip6h->ip6_src;
453 }
454
455 rw_enter(&vnet->vxnv_fixed_lock, RW_READER);
456 fixed = avl_find(&vnet->vxnv_fixed_ips, &fsearch, NULL);
457 if (fixed != NULL)
458 VXNF_REFHOLD(fixed);
459 rw_exit(&vnet->vxnv_fixed_lock);
460 if (fixed == NULL)
461 return (B_FALSE); /* Try another method of processing. */
462
463 newmp = NULL;
464 /*
465 * XXX KEBE ASKS --> Do an MTU check NOW?! That way, we have
466 * pre-natted data. One gotcha, external dests may have
467 * different PathMTUs so see below about EMSGSIZE...
468 *
469 * For now, let the post-NAT crunch through
470 * ire_recv_forward_v4() take care of all of that.
471 */
472
	if (ipha != NULL) {
		newmp = vxlnat_fixed_fixv4(mp, fixed, B_FALSE);
	} else {
		/* XXX KEBE SAYS handle ip6h someday. */
		VXNF_REFRELE(fixed);
		freemsg(mp);
		return (B_TRUE);
	}
479
	if (newmp == NULL) {
		/* mp was eaten by vxlnat_fixed_fixv4(). */
		VXNF_REFRELE(fixed);
		return (B_TRUE);
	}

	/* vxlnat_fixed_fixv4() may have pulled mp up into a new mblk. */
	mp = newmp;
	ipha = (ipha_t *)mp->b_rptr;

485 /* XXX KEBE ASKS, IRR_ALLOCATE okay?!? */
486 /* XXX KEBE SAYS XMIT HINT! */
487 outbound_ire = ire_route_recursive_dstonly_v4(ipha->ipha_dst,
488 IRR_ALLOCATE, 0, vxlnat_netstack->netstack_ip);
489 VERIFY3P(outbound_ire, !=, NULL);
490 if (outbound_ire->ire_type == IRE_NOROUTE) {
491 /* Bail! */
492 DTRACE_PROBE2(vxlnat__in__drop__fixedire, ipaddr_t,
493 ipha->ipha_dst, mblk_t *, mp);
494 VXNF_REFRELE(fixed);
495 freemsg(mp);
496 return (B_TRUE);
497 }
498
499 iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
500 if (iras.ira_ip_hdr_length > sizeof (ipha_t))
501 iras.ira_flags |= IRAF_IPV4_OPTIONS;
502 iras.ira_xmit_hint = 0; /* XXX KEBE SAYS FIX ME! */
503 iras.ira_zoneid = outbound_ire->ire_zoneid;
504 iras.ira_pktlen = ntohs(ipha->ipha_length);
505 iras.ira_protocol = ipha->ipha_protocol;
506 /* XXX KEBE ASKS rifindex & ruifindex ?!? */
507 /*
508 * NOTE: AT LEAST ira_ill needs ILLF_ROUTER set, as
509 * well as the ill for the external NIC (where
510 * off-link destinations live). For fixed, ira_ill
511 * should be the ill of the external source.
512 */
513 iras.ira_rill = vxlnat_underlay_ire->ire_ill;
514 iras.ira_ill = fixed->vxnf_ire->ire_ill;
515 /* XXX KEBE ASKS cred & cpid ? */
516 iras.ira_verified_src = ipha->ipha_src;
517 /* XXX KEBE SAYS don't sweat IPsec stuff. */
518 /* XXX KEBE SAYS ALSO don't sweat l2src & mhip */
519
	/* Okay, we're good!  Let's pretend we're forwarding. */
	ire_recv_forward_v4(outbound_ire, mp, ipha, &iras);
	ire_refrele(outbound_ire);
	VXNF_REFRELE(fixed);

	return (B_TRUE);
525 }
526
527 /*
528 * Process exactly one VXLAN packet.
529 */
530 static void
531 vxlnat_one_vxlan(mblk_t *mp, struct sockaddr_in6 *underlay_src)
532 {
533 vxlan_hdr_t *vxh;
534 vxlnat_vnet_t *vnet;
535 ipha_t *ipha;
536 ip6_t *ip6h;
537
538 if (MBLKL(mp) < sizeof (*vxh)) {
539 /* XXX KEBE ASKS -- should we be more forgiving? */
540 DTRACE_PROBE1(vxlnat__in__drop__vxlsize, mblk_t *, mp);
541 freemsg(mp);
542 return;
543 }
544 vxh = (vxlan_hdr_t *)mp->b_rptr;
545
546 /* If we start using more than just the one flag, fix it. */
547 if (vxh->vxlan_flags != VXLAN_F_VDI_WIRE) {
548 DTRACE_PROBE1(vxlnat__in__drop__VDI, mblk_t *, mp);
549 freemsg(mp);
550 return;
551 }
552
553 /* Remember, we key off of what's on the wire. */
554 vnet = vxlnat_get_vnet(VXLAN_ID_WIRE32(vxh->vxlan_id), B_FALSE);
555 if (vnet == NULL) {
556 DTRACE_PROBE1(vxlnat__in__drop__vnetid, uint32_t,
557 VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)));
558 freemsg(mp);
559 return;
560 }
561
562 DTRACE_PROBE2(vxlnat__in__vnet, uint32_t,
563 VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)),
564 vxlnat_vnet_t, vnet);
565
566 /*
567 * Off-vxlan processing steps:
568 * 1.) Locate the ethernet header and check/update/add-into remotes.
569 * 2.) Search 1-1s, process if hit.
570 * 3.) Search flows, process if hit.
571 * 4.) Search rules, create new flow (or not) if hit.
572 * 5.) Drop the packets.
573 */
574
575 /* 1.) Locate the ethernet header and check/update/add-into remotes. */
576 mp->b_rptr += sizeof (*vxh);
577 while (MBLKL(mp) == 0) {
578 mblk_t *oldmp = mp;
579
580 mp = mp->b_cont;
581 freeb(oldmp);
582 }
583 mp = vxlnat_cache_remote(mp, underlay_src, vnet);
584 if (mp == NULL)
585 goto bail_no_free;
586
587 /* Let's cache the IP header here... */
588 ipha = (ipha_t *)mp->b_rptr;
589 switch (IPH_HDR_VERSION(ipha)) {
590 case IPV4_VERSION:
591 ip6h = NULL;
592 break;
593 case IPV6_VERSION:
594 ip6h = (ip6_t *)ipha;
595 ipha = NULL;
596 break;
597 default:
598 DTRACE_PROBE2(vxlnat__in__drop__ipvers, int,
599 IPH_HDR_VERSION(ipha), mblk_t *, mp);
600 goto bail_and_free;
601 }
602
603 /* 2.) Search 1-1s, process if hit. */
604 if (vxlnat_one_vxlan_fixed(vnet, mp, ipha, ip6h))
605 goto bail_no_free; /* Success means mp was consumed. */
606
607 /* 3.) Search flows, process if hit. */
608 if (vxlnat_one_vxlan_flow(vnet, mp, ipha, ip6h))
609 goto bail_no_free; /* Success means mp was consumed. */
610
611 /* 4.) Search rules, create new flow (or not) if hit. */
612 if (vxlnat_one_vxlan_rule(vnet, mp, ipha, ip6h))
613 goto bail_no_free; /* Success means mp was consumed. */
614
615 /* 5.) Nothing, drop the packet. */
616
	DTRACE_PROBE2(vxlnat__in__drop__nohits, vxlnat_vnet_t *, vnet,
618 mblk_t *, mp);
619
620 bail_and_free:
621 freemsg(mp);
622 bail_no_free:
623 VXNV_REFRELE(vnet);
624 }

/*
626 * ONLY return B_FALSE if we get a packet-clogging event.
627 */
628 /* ARGSUSED */
629 static boolean_t
630 vxlnat_vxlan_input(ksocket_t insock, mblk_t *chain, size_t msgsize, int oob,
631 void *ignored)
632 {
633 mblk_t *mp, *nextmp;
634
635 /*
636 * XXX KEBE ASKS --> move hold & release outside of loop?
637 * If so, hold rwlock here.
638 */
639
640 for (mp = chain; mp != NULL; mp = nextmp) {
641 struct T_unitdata_ind *tudi;
642 struct sockaddr_in6 *sin6;
643
644 nextmp = mp->b_next;
645 if (DB_TYPE(mp) != M_PROTO || mp->b_cont == NULL) {
646 DTRACE_PROBE1(vxlnat__in__drop__mblk, mblk_t *, mp);
647 freemsg(mp);
648 continue;
649 }
650
651 /* LINTED -- aligned */
652 tudi = (struct T_unitdata_ind *)mp->b_rptr;
653 if (tudi->PRIM_type != T_UNITDATA_IND) {
654 DTRACE_PROBE1(vxlnat__in__drop__TPI, mblk_t *, mp);
655 freemsg(mp);
656 continue;
657 }
658 /* LINTED -- aligned */
659 sin6 = (struct sockaddr_in6 *)(mp->b_rptr + tudi->SRC_offset);
660 VERIFY(sin6->sin6_family == AF_INET6);
661 VERIFY(tudi->SRC_length >= sizeof (*sin6));
662
663 vxlnat_one_vxlan(mp->b_cont, sin6);
664 freeb(mp);
665 }
666
667 return (B_TRUE);
668 }
669
670 /*
671 * Use RFC 1141's technique (with a check for -0).
672 *
 * newsum = oldsum - (new16a - old16a) - (new16b - old16b) - ...;
674 *
675 * NOTE: "oldsum" is right off the wire in wire-native order.
676 * NOTE2: "old" and "new" ALSO point to things in wire-native order.
677 * NOTE3: THIS MUST TAKE A MULTIPLE OF 2 BYTES (i.e. uint16_t array).
678 * NOTE4: The 32-bit running sum means we can't take len > 64k.
679 */
680 uint16_t
681 vxlnat_cksum_adjust(uint16_t oldsum, uint16_t *old, uint16_t *new, uint_t len)
682 {
683 uint32_t newsum = ntohs(oldsum);
684
685 ASSERT((len & 0x1) == 0);
686 while (len != 0) {
687 newsum -= ntohs(*new);
688 newsum += ntohs(*old);
689 len -= 2;
690 old++;
691 new++;
692 }
693 newsum += (newsum >> 16) & 0xffff;
694
695 return (newsum == 0xffff ? 0 : htons(newsum));
696 }
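
/*
 * Illustration only: once flows exist, a rewrite that changes both an
 * address and a port could compose two adjustments against the same ULP
 * checksum, e.g. (placeholder variable names, not working code):
 */
#if 0
	tcph->tha_sum = vxlnat_cksum_adjust(tcph->tha_sum,
	    (uint16_t *)&old_addr, (uint16_t *)&new_addr, sizeof (ipaddr_t));
	tcph->tha_sum = vxlnat_cksum_adjust(tcph->tha_sum,
	    &old_port, &new_port, sizeof (uint16_t));
#endif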
697
698 /*
699 * Fix inner headers on an ICMP packet.
700 *
701 * XXX KEBE SAYS FOR NOW, just do addresses for 1-1/fixed. When we do
702 * flows, include old_port/new_port as well.
703 */
704 static mblk_t *
705 vxlnat_fix_icmp_inner_v4(mblk_t *mp, icmph_t *icmph, ipaddr_t old_one,
706 ipaddr_t new_one, boolean_t to_private)
707 {
708 mblk_t *newmp;
709 ipha_t *inner_ipha;
710 ipaddr_t *new_ones_place;
711
712 if ((uint8_t *)(icmph + 1) + sizeof (ipha_t) > mp->b_wptr) {
713 /* Pay the pullup tax. */
714 newmp = msgpullup(mp, -1);
715 freemsg(mp);
716 if (newmp == NULL) {
717 DTRACE_PROBE1(vxlnat__fixicmp__pullupfail, void *,
718 NULL);
719 return (NULL);
720 }
721 if (MBLKL(newmp) < 2 * sizeof (ipha_t) + sizeof (icmph_t)) {
722 /* Wow! Too-tiny ICMP packet. */
723 DTRACE_PROBE1(vxlnat__fixicmp__tootiny, mblk_t *,
724 newmp);
725 freeb(newmp);
726 return (NULL);
727 }
728 mp = newmp;
729 /* Temporarily use inner_ipha for the outer one. */
730 inner_ipha = (ipha_t *)mp->b_rptr;
731 icmph = (icmph_t *)(mp->b_rptr + IPH_HDR_LENGTH(inner_ipha));
732 }
733 inner_ipha = (ipha_t *)(icmph + 1);
734 new_ones_place = to_private ?
735 &inner_ipha->ipha_src : &inner_ipha->ipha_dst;
736 if (*new_ones_place != old_one) {
737 /* Either I'm buggy or the packet is. */
738 DTRACE_PROBE2(vxlnat__fixicmp__badinneraddr, ipaddr_t,
739 old_one, ipaddr_t, *new_ones_place);
		freemsg(mp);
741 return (NULL);
742 }
743 *new_ones_place = new_one;
744
745 /* Adjust ICMP checksum... */
746 icmph->icmph_checksum = vxlnat_cksum_adjust(icmph->icmph_checksum,
747 (uint16_t *)&old_one, (uint16_t *)&new_one, sizeof (ipaddr_t));
748
749 /*
750 * XXX KEBE ASKS, recompute *inner-packet* checksums? Let's not for
751 * now, but consider this Fair Warning (or some other VH album...).
752 */
753 return (mp);
754 }
755
756 /*
757 * Take a 1-1/fixed IPv4 packet and convert it for transmission out the
758 * appropriate end. "to_private" is what it says on the tin.
759 * ALWAYS consumes "mp", regardless of return value.
760 */
761 static mblk_t *
762 vxlnat_fixed_fixv4(mblk_t *mp, vxlnat_fixed_t *fixed, boolean_t to_private)
763 {
764 ipaddr_t new_one, old_one;
765 ipaddr_t *new_ones_place;
766 ipha_t *ipha = (ipha_t *)mp->b_rptr;
767 uint8_t *nexthdr, *end_wptr;
768
769 if (to_private) {
770 IN6_V4MAPPED_TO_IPADDR(&fixed->vxnf_addr, new_one);
771 new_ones_place = &ipha->ipha_dst;
772 } else {
773 IN6_V4MAPPED_TO_IPADDR(&fixed->vxnf_pubaddr, new_one);
774 new_ones_place = &ipha->ipha_src;
775 }
776
777 old_one = *new_ones_place;
778 *new_ones_place = new_one;
779
780 /*
781 * Recompute the IP header checksum, and check for the TCP or UDP
782 * checksum as well, as they'll need recomputing as well.
783 */
784
785 /* First, the IPv4 header itself. */
786 ipha->ipha_hdr_checksum = vxlnat_cksum_adjust(ipha->ipha_hdr_checksum,
787 (uint16_t *)&old_one, (uint16_t *)&new_one, sizeof (ipaddr_t));
788
789 nexthdr = (uint8_t *)ipha + IPH_HDR_LENGTH(ipha);
	if (nexthdr >= mp->b_wptr) {
		/* The ULP header starts in the next mblk (if any). */
		if (mp->b_cont == NULL) {
			DTRACE_PROBE1(vxlnat__fix__truncated, mblk_t *, mp);
			freemsg(mp);
			return (NULL);
		}
		nexthdr = mp->b_cont->b_rptr +
		    (IPH_HDR_LENGTH(ipha) - MBLKL(mp));
		end_wptr = mp->b_cont->b_wptr;
	} else {
		end_wptr = mp->b_wptr;
	}
797
798 switch (ipha->ipha_protocol) {
799 case IPPROTO_TCP: {
800 tcpha_t *tcph = (tcpha_t *)nexthdr;
801
		if (nexthdr + sizeof (*tcph) > end_wptr) {
803 /* Bail for now. */
804 DTRACE_PROBE1(vxlnat__fix__tcp__mblkspan, mblk_t *,
805 mp);
806 freemsg(mp);
807 return (NULL);
808 }
809 tcph->tha_sum = vxlnat_cksum_adjust(tcph->tha_sum,
810 (uint16_t *)&old_one, (uint16_t *)&new_one,
811 sizeof (ipaddr_t));
812 break; /* Out of switch. */
813 }
814 case IPPROTO_UDP: {
815 udpha_t *udph = (udpha_t *)nexthdr;
816
		if (nexthdr + sizeof (*udph) > end_wptr) {
818 /* Bail for now. */
819 DTRACE_PROBE1(vxlnat__fix__udp__mblkspan, mblk_t *,
820 mp);
821 freemsg(mp);
822 return (NULL);
823 }
824 udph->uha_checksum = vxlnat_cksum_adjust(udph->uha_checksum,
825 (uint16_t *)&old_one, (uint16_t *)&new_one,
826 sizeof (ipaddr_t));
827 break; /* Out of switch. */
828 }
829 case IPPROTO_ICMP: {
830 icmph_t *icmph = (icmph_t *)nexthdr;
831
832 /*
833 * We need to check the case of ICMP messages that contain
834 * IP packets. We will need to at least change the addresses,
835 * and *maybe* the checksums too if necessary.
836 *
837 * This may replicate some of icmp_inbound_v4(), alas.
838 */
839 if (nexthdr + sizeof (*icmph) >= end_wptr) {
840 mblk_t *newmp;
841 /*
842 * Unlike the others, we're going to pay the pullup
843 * tax here.
844 */
845 newmp = msgpullup(mp, -1);
846 freemsg(mp);
847 if (newmp == NULL) {
848 DTRACE_PROBE1(vxlnat__icmp__pullupfail, void *,
849 NULL);
850 return (NULL);
851 }
852 mp = newmp;
853 ipha = (ipha_t *)(mp->b_rptr);
854 nexthdr = (uint8_t *)ipha + IPH_HDR_LENGTH(ipha);
855 icmph = (icmph_t *)nexthdr;
856 }
857
858 switch (icmph->icmph_type) {
859 case ICMP_ADDRESS_MASK_REPLY:
860 case ICMP_ADDRESS_MASK_REQUEST:
861 case ICMP_TIME_STAMP_REPLY:
862 case ICMP_TIME_STAMP_REQUEST:
863 case ICMP_ECHO_REQUEST:
864 case ICMP_ECHO_REPLY:
865 /* These merely need to get passed along. */
866 break;
867 case ICMP_ROUTER_ADVERTISEMENT:
868 case ICMP_ROUTER_SOLICITATION:
869 /* These shouldn't be traversing a NAT at all. Drop. */
870 DTRACE_PROBE1(vxlnat__icmp__cantpass, int,
871 icmph->icmph_type);
872 freemsg(mp);
873 return (NULL);
874 case ICMP_PARAM_PROBLEM:
875 case ICMP_TIME_EXCEEDED:
876 case ICMP_DEST_UNREACHABLE:
877 /* These include inner-IP headers we need to adjust. */
878 mp = vxlnat_fix_icmp_inner_v4(mp, icmph, old_one,
879 new_one, to_private);
880 break;
881 default:
882 /* Pass along to receiver, but warn. */
883 DTRACE_PROBE1(vxlnat__icmp__unknown, int,
884 icmph->icmph_type);
885 break;
886 }
		break;	/* Out of switch. */
	}
888 /* Otherwise we can't make any other assumptions for now... */
889 default:
890 break;
891 }
892
893 return (mp);
894 }
895
896 vxlnat_remote_t *
897 vxlnat_xmit_vxlanv4(mblk_t *mp, in6_addr_t *overlay_dst,
898 vxlnat_remote_t *remote, uint8_t *myether, vxlnat_vnet_t *vnet)
899 {
900 struct sockaddr_in6 sin6 = {AF_INET6};
901 struct msghdr msghdr = {NULL};
902 mblk_t *vlan_mp;
903 extern uint_t vxlan_alloc_size, vxlan_noalloc_min;
904 vxlan_hdr_t *vxh;
905 struct ether_vlan_header *evh;
906 int rc;
907 cred_t *cred;
908
909 if (remote == NULL || remote->vxnrem_vnet == NULL) {
910 DTRACE_PROBE1(vxlnat__xmit__vxlanv4, vxlnat_remote_t *, remote);
911 /* Release the condemned remote. */
912 if (remote != NULL)
913 VXNREM_REFRELE(remote);
914
915 /* See if we have a remote ready to use... */
916 remote = vxlnat_get_remote(vnet, overlay_dst, B_FALSE);
917
918 if (remote == NULL) {
919 /*
920 * We need to do the moral equivalent of PF_KEY
921 * ACQUIRE or overlay's queue-resolve so that we can
			 * have someone in user-space send us a remote entry.
			 * Until then (the condemned reference was already
			 * dropped above), free the message and return NULL.
925 */
926
927 freemsg(mp);
928 return (NULL);
929 }
930 }
931 ASSERT(vnet == remote->vxnrem_vnet);
932
933 if (DB_REF(mp) > 1 || mp->b_rptr - vxlan_noalloc_min < DB_BASE(mp)) {
934 vlan_mp = allocb(vxlan_alloc_size, BPRI_HI);
935 if (vlan_mp == NULL) {
936 DTRACE_PROBE1(vxlnat__xmit__vxlanv4__allocfail,
937 vxlnat_remote_t *, remote);
938 freemsg(mp);
939 /* Just drop the packet, but don't tell caller. */
940 return (remote);
941 }
942 vlan_mp->b_wptr = DB_LIM(vlan_mp);
943 vlan_mp->b_rptr = vlan_mp->b_wptr;
944 vlan_mp->b_cont = mp;
945 } else {
946 vlan_mp = mp;
947 }
948 vlan_mp->b_rptr -= sizeof (*vxh) + sizeof (*evh);
949 vxh = (vxlan_hdr_t *)vlan_mp->b_rptr;
950 vxh->vxlan_flags = VXLAN_F_VDI_WIRE;
951 vxh->vxlan_id = vnet->vxnv_vnetid; /* Already in wire-order. */
952
953 /* Fill in the Ethernet header. */
954 evh = (struct ether_vlan_header *)(vxh + 1);
955 ether_copy(&remote->vxnrem_ether, &evh->ether_dhost);
956 ether_copy(myether, &evh->ether_shost);
957 evh->ether_tpid = htons(ETHERTYPE_VLAN);
958 evh->ether_tci = remote->vxnrem_vlan;
959 evh->ether_type = htons(ETHERTYPE_IP);
960
961 msghdr.msg_name = (struct sockaddr_storage *)&sin6;
962 msghdr.msg_namelen = sizeof (sin6);
963 /* Address family and other zeroing already done up top. */
964 sin6.sin6_port = htons(IPPORT_VXLAN);
965 sin6.sin6_addr = remote->vxnrem_uaddr;
966
967 /*
968 * cred_t dance is because we may be getting this straight from
969 * interrupt context.
970 */
971 cred = zone_get_kcred(netstack_get_zoneid(vxlnat_netstack));
	if (cred == NULL) {
		DTRACE_PROBE1(vxlnat__xmit__vxlan4__credfail,
		    vxlnat_remote_t *, remote);
		freemsg(vlan_mp);
		/* No cred means no send; drop the packet but keep remote. */
		return (remote);
	}
977 /*
978 * Use MSG_DONTWAIT to avoid blocks, esp. if we're getting this
979 * straight from interrupt context.
980 */
981 rc = ksocket_sendmblk(vxlnat_underlay, &msghdr, MSG_DONTWAIT, &vlan_mp,
982 cred);
983 crfree(cred);
984 if (rc != 0) {
985 DTRACE_PROBE2(vxlnat__xmit__vxlan4__sendfail, int, rc,
986 vxlnat_remote_t *, remote);
987 freemsg(vlan_mp);
988 }
989 return (remote);
990 }
991
992 /*
993 * New ire_{recv,send}fn implementations if we're doing 1-1 mappings.
994 */
995 int
996 vxlnat_fixed_ire_send_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
997 ip_xmit_attr_t *ixa, uint32_t *identp)
998 {
999 /* XXX KEBE SAYS FILL ME IN, but for now... */
1000 freemsg(mp);
1001 return (EOPNOTSUPP);
1002 }
1003
1004 void
1005 vxlnat_fixed_ire_recv_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
1006 ip_recv_attr_t *ira)
1007 {
1008 /* XXX KEBE SAYS FILL ME IN, but for now... */
1009 freemsg(mp);
1010 }
1011
1012 /*
1013 * I believe the common case for this will be from self-generated ICMP
1014 * messages. Other same-netstack-originated traffic will also come through
1015 * here (one internal reaching what turns out to be another internal).
1016 */
1017 int
1018 vxlnat_fixed_ire_send_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1019 ip_xmit_attr_t *ixa, uint32_t *identp)
1020 {
1021 ip_recv_attr_t iras; /* NOTE: No bzero because we pay more later */
1022 ipha_t *ipha = (ipha_t *)iph_arg;
1023
1024 /*
1025 * XXX KEBE ASKS, any DTrace probes or other instrumentation that
1026 * perhaps should be set?
1027 */
1028
1029 /* Map ixa to ira. */
1030 iras.ira_pktlen = ixa->ixa_pktlen;
1031 /* XXX KEBE ASKS more?!? */
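	/*
	 * A sketch of other fields that probably need mapping; these are
	 * assumptions and have not been verified against every consumer
	 * downstream.
	 */
#if 0
	iras.ira_flags = IRAF_IS_IPV4;
	iras.ira_zoneid = ixa->ixa_zoneid;
	iras.ira_ip_hdr_length = ixa->ixa_ip_hdr_length;
	iras.ira_protocol = ipha->ipha_protocol;
	iras.ira_xmit_hint = ixa->ixa_xmit_hint;
#endif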
1032
1033 /*
1034 * In normal TCP/IP processing, this shortcuts the IP header checksum
1035 * AND POSSIBLY THE ULP checksum cases. Since this is likely to head
1036 * back into the internal network, we need to recompute things again.
1037 */
1038 if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
1039 freemsg(mp);
1040 return (EMSGSIZE);
1041 }
1042 #if 0
1043 /* XXX KEBE ASKS Special-case ICMP here? */
1044 if (ipha->ipha_protocol == IPPROTO_ICMP) {
1045 icmph_t *icmph;
1046
1047 icmph = (icmph_t *)((uint8_t *)ipha + IPH_HDR_LENGTH(ipha));
1048 if ((uint8_t *)icmph >= mp->b_wptr) {
1049 freemsg(mp);
1050 return (EMSGSIZE);
1051 }
1052 icmph->icmph_checksum = 0;
1053 icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
1054 }
1055 #endif
1056
1057 vxlnat_fixed_ire_recv_v4(ire, mp, iph_arg, &iras);
1058
1059 return (0);
1060 }
1061
1062 void
1063 vxlnat_fixed_ire_recv_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1064 ip_recv_attr_t *ira)
1065 {
1066 vxlnat_fixed_t *fixed;
1067 vxlnat_vnet_t *vnet;
1068 ipha_t *ipha = (ipha_t *)iph_arg;
1069 int newmtu;
1070
1071 /* Make a note for DAD that this address is in use */
1072 ire->ire_last_used_time = LBOLT_FASTPATH;
1073
1074 /* Only target the IRE_LOCAL with the right zoneid. */
1075 ira->ira_zoneid = ire->ire_zoneid;
1076
1077 /*
1078 * XXX KEBE ASKS, any DTrace probes or other instrumentation that
1079 * perhaps should be set?
1080 */
1081
1082 /*
1083 * Reality check some things.
1084 */
1085 fixed = (vxlnat_fixed_t *)ire->ire_dep_sib_next;
1086 vnet = fixed->vxnf_vnet;
1087
1088 ASSERT3P(ire, ==, fixed->vxnf_ire);
1089
1090 if (IRE_IS_CONDEMNED(ire) || vnet == NULL)
1091 goto detach_ire_and_bail;
1092
1093 /*
1094 * Not a common-case, but a possible one. If our underlay MTU is
1095 * smaller than the external MTU, it is possible that we will have a
1096 * size mismatch and therefore need to either fragment at the VXLAN
1097 * layer (VXLAN UDP packet sent as two or more IP fragments) OR
1098 * if IPH_DF is set, send an ICMP_NEEDS_FRAGMENTATION back to the
1099 * sender. Perform the check here BEFORE we NAT the packet.
1100 */
1101 ASSERT(vxlnat_underlay_ire->ire_ill != NULL);
1102 newmtu = vxlnat_underlay_ire->ire_ill->ill_mtu - sizeof (ipha_t) -
1103 sizeof (udpha_t) - sizeof (vxlan_hdr_t) -
1104 sizeof (struct ether_vlan_header);
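	/*
	 * For example, assuming a standard 1500-byte underlay MTU, this
	 * yields 1500 - 20 (outer IPv4) - 8 (UDP) - 8 (VXLAN) - 18
	 * (inner VLAN-tagged Ethernet) = 1446 bytes for the inner packet.
	 */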
1105 if ((ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_DF) &&
1106 ntohs(ipha->ipha_length) > newmtu) {
1107 icmp_frag_needed(mp, newmtu, ira);
1108 /* We're done. Assume icmp_frag_needed() consumed mp. */
1109 return;
1110 }
1111
1112 /*
1113 * So we're here, and since we have a refheld IRE, we have a refheld
1114 * fixed and vnet. Do some of what ip_input_local_v4() does (inbound
1115 * checksum? some ira checks?), but otherwise, swap the destination
1116 * address as mapped in "fixed", recompute any checksums, and send it
 * along its merry way (with a ttl decrement too) to a VXLAN
1118 * destination.
1119 */
1120 mp = vxlnat_fixed_fixv4(mp, fixed, B_TRUE);
1121 if (mp == NULL)
1122 return; /* Assume it's been freed & dtraced already. */
1123
1124 /*
1125 * Otherwise, we're ready to transmit this packet over the vxlan
1126 * socket.
1127 */
1128 fixed->vxnf_remote = vxlnat_xmit_vxlanv4(mp, &fixed->vxnf_addr,
1129 fixed->vxnf_remote, fixed->vxnf_myether, vnet);
1130 if (fixed->vxnf_remote == NULL) {
1131 /* XXX KEBE ASKS, DTrace probe here? Or in-function? */
1132 DTRACE_PROBE2(vxlnat__fixed__xmitdrop,
1133 in6_addr_t *, &fixed->vxnf_addr,
1134 uint32_t, VXLAN_ID_NTOH(vnet->vxnv_vnetid));
1135 }
1136 return;
1137
1138 detach_ire_and_bail:
1139 /* Oh no, something's condemned. Drop the IRE now. */
1140 ire->ire_recvfn = ire_recv_local_v4;
1141 ire->ire_dep_sib_next = NULL;
1142 VXNF_REFRELE(fixed);
1143 /* Pass the packet back... */
1144 ire_recv_local_v4(ire, mp, iph_arg, ira);
1145 return;
1146 }