1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2018 Joyent, Inc.
14 */
15
16 /*
 * NAT engine.  The 1-1 (fixed) mappings and NAT flows live here.  The rules
 * in vxlnat_rules.c are only consulted if the 1-1 map (kept here) misses or
 * if the outbound flow lookup (vnetid, protocol, src-IP, dst-IP, src-port,
 * dst-port) misses.
20 *
21 * The plan is for inbound to hit conn_ts, whose conn_private points to
22 * entries here. The conn_recv* functions live here too (for now).
23 */
24
25 #include <sys/types.h>
26 #include <sys/socket.h>
27 #include <sys/ksynch.h>
28 #include <sys/ksocket.h>
29 #include <sys/kmem.h>
30 #include <sys/stream.h>
31 #include <sys/strsubr.h>
32 #include <sys/strsun.h>
33 #include <sys/sysmacros.h>
34 #include <sys/debug.h>
35 #include <sys/dtrace.h>
36 #include <sys/errno.h>
37 #include <sys/tihdr.h>
38 #include <netinet/in.h>
39 #include <netinet/udp.h>
40 #include <inet/ip.h>
41 #include <inet/ip6.h>
42 #include <inet/tcp_impl.h>
43 #include <inet/udp_impl.h>
44 #include <inet/tcp.h>
45
46 #include <inet/vxlnat_impl.h>
47
48 static boolean_t vxlnat_vxlan_input(ksocket_t, mblk_t *, size_t, int, void *);
49 static mblk_t *vxlnat_fixed_fixv4(mblk_t *mp, vxlnat_fixed_t *fixed,
50 boolean_t to_private);
51
/*
 * Both initialized to NULL; reads and writes are protected by vxlnat_mutex.
 * vxlnat_underlay_ire caches the IRE for the bound underlay address.
 * Receive functions shouldn't have to access these directly.
 */
56 ksocket_t vxlnat_underlay;
57 ire_t *vxlnat_underlay_ire;
58
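/*
 * Tear down the VXLAN underlay socket and drop the cached underlay IRE.
 * Callers must hold vxlnat_mutex; safe to call when nothing is open.
 */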
59 void
60 vxlnat_closesock(void)
61 {
62 ASSERT(MUTEX_HELD(&vxlnat_mutex));
63 if (vxlnat_underlay_ire != NULL) {
64 ire_refrele(vxlnat_underlay_ire);
65 vxlnat_underlay_ire = NULL;
66 }
67 if (vxlnat_underlay != NULL) {
68 (void) ksocket_close(vxlnat_underlay, zone_kcred());
69 vxlnat_underlay = NULL;
70 }
71 }
72
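/*
 * Open and bind the kernel UDP socket for the VXLAN underlay, enable
 * source-port hashing, cache the IRE for the underlay address, and register
 * vxlnat_vxlan_input() as the receive callback.  Returns 0 or an errno.
 */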
73 static int
74 vxlnat_opensock(in6_addr_t *underlay_ip)
75 {
76 int rc, val;
77 /* Assume rest is initialized to 0s. */
78 struct sockaddr_in6 sin6 = {AF_INET6, BE_16(IPPORT_VXLAN)};
79 ip_stack_t *ipst = vxlnat_netstack->netstack_ip;
80
81 ASSERT(MUTEX_HELD(&vxlnat_mutex));
82 /* Open... */
83 rc = ksocket_socket(&vxlnat_underlay, AF_INET6, SOCK_DGRAM, 0,
84 KSOCKET_SLEEP, zone_kcred());
85 if (rc != 0)
86 return (rc);
87
88 /* Bind... */
89 sin6.sin6_addr = *underlay_ip;
90 rc = ksocket_bind(vxlnat_underlay, (struct sockaddr *)(&sin6),
91 sizeof (sin6), zone_kcred());
92 if (rc != 0) {
93 vxlnat_closesock();
94 return (rc);
95 }
96
97 /* Use source-port hashing when sending packets out VXLAN... */
98 val = UDP_HASH_VXLAN;
99 rc = ksocket_setsockopt(vxlnat_underlay, IPPROTO_UDP,
100 UDP_SRCPORT_HASH, &val, sizeof (val), kcred);
101 if (rc != 0) {
102 vxlnat_closesock();
103 return (rc);
104 }
105
106 /*
107 * Grab the IRE for underlay address.
108 */
109 ASSERT3P(vxlnat_underlay_ire, ==, NULL);
110 vxlnat_underlay_ire = (IN6_IS_ADDR_V4MAPPED(underlay_ip)) ?
111 ire_ftable_lookup_simple_v4(underlay_ip->_S6_un._S6_u32[3],
112 0, ipst, NULL) :
113 ire_ftable_lookup_simple_v6(underlay_ip, 0, ipst, NULL);
114 if (vxlnat_underlay_ire == NULL) {
115 DTRACE_PROBE1(vxlnat__opensock__ire__fail, in6_addr_t *,
116 underlay_ip);
117 vxlnat_closesock();
118 return (EADDRNOTAVAIL);
119 }
120
121 /* Once we return from this, start eating data. */
122 rc = ksocket_krecv_set(vxlnat_underlay, vxlnat_vxlan_input, NULL);
123 if (rc != 0) {
124 vxlnat_closesock();
125 }
126
127 return (rc);
128 }
129
130 /*
131 * Establish a VXLAN-listening kernel socket.
132 * XXX KEBE ASKS ==> Support more than one VXLAN address?
133 */
134 /* ARGSUSED */
135 int
136 vxlnat_vxlan_addr(in6_addr_t *underlay_ip)
137 {
138 int rc;
139
140 ASSERT(MUTEX_HELD(&vxlnat_mutex));
141 /* For now, we make this a one-underlay-address-only solution. */
142 vxlnat_closesock();
143 rc = vxlnat_opensock(underlay_ip);
144 return (rc);
145 }
146
147 /*
148 * Free a remote VXLAN destination.
149 */
150 void
151 vxlnat_remote_free(vxlnat_remote_t *remote)
152 {
153 ASSERT0(remote->vxnrem_refcount);
154
155 kmem_free(remote, sizeof (*remote));
156 }
157
158 /*
159 * Like other unlink functions, assume the appropriate lock is held.
160 */
161 void
162 vxlnat_remote_unlink(vxlnat_remote_t *remote)
163 {
164 vxlnat_vnet_t *vnet = remote->vxnrem_vnet;
165
166 ASSERT3P(vnet, !=, NULL);
167 ASSERT(MUTEX_HELD(&vnet->vxnv_remote_lock));
168
169 /* First unlink so nobody else can find me */
170 avl_remove(&vnet->vxnv_remotes, remote);
171
172 /*
173 * We still hold a vnet reference, so races shouldn't be a problem.
174 * Still, for added safety, NULL it out first.
175 */
176 remote->vxnrem_vnet = NULL; /* Condemn this entry. */
177 VXNV_REFRELE(vnet);
178 VXNREM_REFRELE(remote); /* Internment release. */
179 }
180
181 /*
182 * Find or create a remote VXLAN destination.
183 */
184 static vxlnat_remote_t *
185 vxlnat_get_remote(vxlnat_vnet_t *vnet, in6_addr_t *remote_addr,
186 boolean_t create_on_miss)
187 {
188 vxlnat_remote_t *remote, searcher;
189 avl_index_t where;
190
191 searcher.vxnrem_addr = *remote_addr;
192 mutex_enter(&vnet->vxnv_remote_lock);
193 remote = avl_find(&vnet->vxnv_remotes, &searcher, &where);
194 if (remote == NULL && create_on_miss) {
195 /* Not as critical if we can't allocate here. */
196 remote = kmem_zalloc(sizeof (*remote),
197 KM_NOSLEEP | KM_NORMALPRI);
198 if (remote != NULL) {
199 remote->vxnrem_addr = *remote_addr;
200 remote->vxnrem_refcount = 1; /* Internment reference. */
201 VXNV_REFHOLD(vnet);
202 remote->vxnrem_vnet = vnet;
203 /* Rest is filled in by caller. */
204 avl_insert(&vnet->vxnv_remotes, remote, where);
205 }
206 }
207 if (remote != NULL)
208 VXNREM_REFHOLD(remote);
209 mutex_exit(&vnet->vxnv_remote_lock);
210 return (remote);
211 }
212
213 /*
214 * Cache inbound packet information in the vnet's remotes section.
215 *
216 * NOTE: This function assumes a trustworthy underlay network. If the
217 * underlay isn't trustworthy, this function should be renamed, and reduced to
218 * a "strip and reality-check the ethernet header" function.
219 *
220 * Caller has stripped any pre-ethernet data from mp. We return mp
221 * stripped down to its IP header.
222 */
223 static mblk_t *
224 vxlnat_cache_remote(mblk_t *mp, struct sockaddr_in6 *underlay_src,
225 vxlnat_vnet_t *vnet)
226 {
227 struct ether_vlan_header *evh;
228 struct ether_header *eh;
229 vxlnat_remote_t *remote;
230 uint16_t vlan, ethertype;
231 ether_addr_t remote_ether;
232 ipha_t *ipha;
233 ip6_t *ip6h;
234 in6_addr_t remote_addr;
235
236 /* Assume (for now) we have at least a VLAN header's worth of data. */
237 if (MBLKL(mp) < sizeof (*evh)) {
238 /* XXX KEBE ASKS - should we be more forgiving? */
239 DTRACE_PROBE1(vxlnat__in__drop__etherhdr, mblk_t *, mp);
240 freemsg(mp);
241 return (NULL);
242 }
243
244 eh = (struct ether_header *)mp->b_rptr;
245 ethertype = ntohs(eh->ether_type);
246 ether_copy(&eh->ether_shost, &remote_ether);
247 if (ethertype == ETHERTYPE_VLAN) {
248 evh = (struct ether_vlan_header *)eh;
249 /* Keep it in network order... */
250 vlan = evh->ether_tci;
251 ethertype = ntohs(evh->ether_type);
252 ASSERT(vlan != 0);
253 mp->b_rptr += sizeof (*evh);
254 } else {
255 evh = NULL;
256 vlan = 0;
257 mp->b_rptr += sizeof (*eh);
258 }
259 if (ethertype != ETHERTYPE_IP && ethertype != ETHERTYPE_IPV6) {
260 /*
261 * XXX KEBE SAYS for now, don't handle non-IP packets.
262 * This includes ARP.
263 */
264 DTRACE_PROBE1(vxlnat__in__drop__nonip, mblk_t *, mp);
265 freemsg(mp);
266 return (NULL);
267 }
268
269 /* Handle case of split ether + IP headers. */
270 if (MBLKL(mp) < sizeof (ipha_t)) {
271 mblk_t *freemp;
272
273 if (MBLKL(mp) > 0 || mp->b_cont == NULL) {
274 /* The IP header is split ACROSS MBLKS! Bail for now. */
275 DTRACE_PROBE1(vxlnat__in__drop__splitip, mblk_t *, mp);
276 freemsg(mp);
277 return (NULL);
278 }
279 freemp = mp;
280 mp = mp->b_cont;
281 freeb(freemp);
282 }
283 /* LINTED -- alignment... */
284 ipha = (ipha_t *)mp->b_rptr;
285
286 if (IPH_HDR_VERSION(ipha) == IPV4_VERSION) {
287 if (ethertype != ETHERTYPE_IP) {
288 /* XXX KEBE ASKS - should we be more forgiving? */
289 DTRACE_PROBE1(vxlnat__in__drop__etherhdr4,
290 mblk_t *, mp);
291 freemsg(mp);
292 return (NULL);
293 }
294 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
295 &remote_addr);
296 } else {
297 if (ethertype != ETHERTYPE_IPV6 ||
298 IPH_HDR_VERSION(ipha) != IPV6_VERSION ||
299 MBLKL(mp) < sizeof (ip6_t)) {
300 /* XXX KEBE ASKS - should we be more forgiving? */
301 DTRACE_PROBE1(vxlnat__in__drop__etherhdr6,
302 mblk_t *, mp);
303 freemsg(mp);
304 return (NULL);
305 }
306 ip6h = (ip6_t *)ipha;
307 remote_addr = ip6h->ip6_src;
308 }
309
310 /* Find remote and replace OR create new remote. */
311 remote = vxlnat_get_remote(vnet, &remote_addr, B_TRUE);
312 if (remote != NULL) {
313 /*
314 * See if this entry needs fixing or filling-in. This might
315 * get a bit racy with read-only threads that actually
316 * transmit, but it only means dropped-packets in the worst
317 * case.
318 *
319 * It's THIS PART that inspires the warning about trusting the
320 * underlay network.
321 *
322 * XXX KEBE ASKS -- should we just replace things w/o checking?
323 */
324 /* Replace the ethernet address? */
325 if (ether_cmp(&remote->vxnrem_ether, &remote_ether) != 0)
326 ether_copy(&remote_ether, &remote->vxnrem_ether);
327 /*
328 * Replace the underlay? NOTE: Fix if/when underlay becomes
329 * IPv6.
330 */
331 if (!IN6_ARE_ADDR_EQUAL(&remote->vxnrem_uaddr,
332 &underlay_src->sin6_addr)) {
333 remote->vxnrem_uaddr = underlay_src->sin6_addr;
334 }
335 /* Replace the vlan ID. Maintain network order... */
336 if (remote->vxnrem_vlan != vlan)
337 remote->vxnrem_vlan = vlan;
338 }
339 /*
340 * Else just continue and pray for better luck on another packet or
341 * on the return flight. It is IP, we can Just Drop It (TM)...
342 */
343
	/* We're done with the remote entry (if we found or created one). */
	if (remote != NULL)
		VXNREM_REFRELE(remote);
346
347 /* Advance rptr to the inner IP header and proceed. */
348 mp->b_rptr = (uint8_t *)ipha;
349 return (mp);
350 }
351
352 /*
353 * Extract transport-level information to find a NAT flow.
 * Consume mp and return B_FALSE if there's a problem.  Otherwise fill in
 * "ports" and "protocol" and return B_TRUE.
356 */
357 static boolean_t
358 vxlnat_grab_transport(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, uint32_t *ports,
359 uint8_t *protocol, uint8_t **nexthdr_ptr)
360 {
361 uint8_t *nexthdr;
362
363 /* Punt on IPv6 for now... */
364 if (ip6h != NULL) {
365 freemsg(mp);
366 return (B_FALSE);
367 }
368
369 ASSERT(ipha != NULL);
370 *protocol = ipha->ipha_protocol;
371 nexthdr = ((uint8_t *)ipha + IPH_HDR_LENGTH(ipha));
372 *nexthdr_ptr = nexthdr; /* Get this out of the way now. */
373 if (nexthdr > mp->b_wptr) {
374 DTRACE_PROBE1(vxlnat__in__drop__trnexthdr, mblk_t *, mp);
375 freemsg(mp);
376 return (B_FALSE);
377 }
378 switch (*protocol) {
379 case IPPROTO_TCP: {
380 tcpha_t *tcph = (tcpha_t *)nexthdr;
381
382 if (nexthdr + sizeof (*tcph) > mp->b_wptr) {
383 DTRACE_PROBE1(vxlnat__in__drop__tcpnexthdr, mblk_t *,
384 mp);
385 freemsg(mp);
386 return (B_FALSE);
387 }
388 *ports = *((uint32_t *)tcph);
389 /* XXX KEBE SAYS - grab other metadata here NOW? */
390 break;
391 }
392 case IPPROTO_UDP: {
393 udpha_t *udph = (udpha_t *)nexthdr;
394
395 if (nexthdr + sizeof (*udph) > mp->b_wptr) {
396 DTRACE_PROBE1(vxlnat__in__drop__udpnexthdr, mblk_t *,
397 mp);
398 freemsg(mp);
399 return (B_FALSE);
400 }
401 *ports = *((uint32_t *)udph);
402 /*
403 * XXX KEBE SAYS - not as much as TCP, but grab other metadata
404 * here NOW?
405 */
406 break;
407 }
408 case IPPROTO_ICMP: {
409 icmph_t *icmph = (icmph_t *)nexthdr;
410
411 if (nexthdr + sizeof (*icmph) > mp->b_wptr) {
412 DTRACE_PROBE1(vxlnat__in__drop__icmpnexthdr, mblk_t *,
413 mp);
414 freemsg(mp);
415 return (B_FALSE);
416 }
417 /* XXX KEBE SAYS sort out ICMP header... */
418 switch (icmph->icmph_type) {
419 case ICMP_ECHO_REQUEST:
420 case ICMP_TIME_STAMP_REQUEST:
421 case ICMP_TIME_EXCEEDED:
422 case ICMP_INFO_REQUEST:
423 case ICMP_ADDRESS_MASK_REPLY:
424 /* All ones we can sorta cope with... */
425 break;
426 default:
427 DTRACE_PROBE2(vxlnat__in__drop__icmptype, int,
428 icmph->icmph_type, mblk_t *, mp);
429 freemsg(mp);
430 return (B_FALSE);
431 }
432 /* NOTE: as of now, will switch position depending on endian. */
433 *ports = icmph->icmph_echo_ident;
434 break;
435 }
436 default:
437 *ports = 0;
438 break;
439 }
440
441 return (B_TRUE);
442 }
443
444 /*
445 * This is the evaluate-packet vs. NAT flow state function.
446 * This function does NOT alter "mp".
447 */
448 static boolean_t
449 vxlnat_verify_natstate(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
450 vxlnat_flow_t *flow, uint8_t *nexthdr)
451 {
452 /* XXX KEBE SAYS FILL ME IN! */
453 return (B_FALSE);
454 }
455
456 /*
457 * Inspect the packet and find ports & protos (or ICMP types & codes)
458 * and see if we have an established NAT flow.
459 *
460 * XXX KEBE WONDERS if the transmission path will more closely resemble
461 * vxlnat_one_vxlan_fixed() because of ipha_ident issues or not...
462 *
463 * B_TRUE means the packet was handled, and we shouldn't continue processing
464 * (even if "was handled" means droppage).
465 */
466 static boolean_t
467 vxlnat_one_vxlan_flow(vxlnat_vnet_t *vnet, mblk_t *mp, ipha_t *ipha,
468 ip6_t *ip6h)
469 {
470 vxlnat_flow_t *flow, searcher;
471 uint8_t *nexthdr;
472
473 /*
474 * XXX KEBE WONDERS, should we return vxlnat_flow_t instead if we
475 * miss? That way, we only need to find the ports/protocol ONCE.
476 */
477
478 if (ip6h != NULL) {
479 /* Eventually, grab addresses for "searcher". */
480 return (B_FALSE); /* Bail on IPv6 for now... */
481 } else {
482 ASSERT(ipha != NULL);
483 searcher.vxnfl_isv4 = B_TRUE; /* Required? */
484 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
485 &searcher.vxnfl_src);
486 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_dst),
487 &searcher.vxnfl_dst);
488 }
489
	if (!vxlnat_grab_transport(mp, ipha, ip6h, &searcher.vxnfl_ports,
	    &searcher.vxnfl_protocol, &nexthdr)) {
		/* vxlnat_grab_transport() already consumed mp on failure. */
		DTRACE_PROBE1(vxlnat__in__flowgrab, mblk_t *, mp);
		return (B_TRUE);
	}
497
498 /*
499 * XXX KEBE SAYS Eventually put the rw&find in an IPv4-only block,
500 * because IPv6 (if we NAT it like IPv4) will have its own table/tree.
501 */
502 rw_enter(&vnet->vxnv_flowv4_lock, RW_READER);
503 flow = avl_find(&vnet->vxnv_flows_v4, &searcher, NULL);
504 if (flow != NULL)
505 VXNFL_REFHOLD(flow);
506 rw_exit(&vnet->vxnv_flowv4_lock);
507
508 if (flow == NULL)
509 return (B_FALSE); /* Let caller handle things. */
510
511 if (!vxlnat_verify_natstate(mp, ipha, ip6h, flow, nexthdr)) {
512 freemsg(mp); /* XXX KEBE SAYS FOR NOW... */
513 } else {
514 /* XXX KEBE SAYS PROCESS... */
515 }
516
517 VXNFL_REFRELE(flow);
518 return (B_TRUE);
519 }
520
521 /*
522 * We have a new packet that seems to require a new NAT flow. Construct that
523 * flow now, and intern it as both a conn_t in IP *and* in the vnet's
524 * appropriate vxnv_flows* tree. Return NULL if we have a problem.
525 */
526 static vxlnat_flow_t *
527 vxlnat_new_flow(vxlnat_rule_t *rule, in6_addr_t *inner_src, in6_addr_t *dst,
528 uint32_t ports, uint8_t protocol)
529 {
530 vxlnat_vnet_t *vnet = rule->vxnr_vnet;
531 vxlnat_flow_t *flow, *oldflow;
532 avl_tree_t *flowtree;
533 krwlock_t *flowlock;
534 avl_index_t where;
535
536 flow = kmem_alloc(sizeof (*flow), KM_NOSLEEP | KM_NORMALPRI);
537 if (flow == NULL)
538 return (NULL);
539
540 flow->vxnfl_dst = *dst;
541 flow->vxnfl_src = *inner_src;
542 flow->vxnfl_ports = ports;
543 flow->vxnfl_protocol = protocol;
544 flow->vxnfl_refcount = 2; /* One for internment, one for caller. */
545 /* Assume no mixed-IP-version mappings for now. */
546 if (IN6_IS_ADDR_V4MAPPED(inner_src)) {
547 ASSERT(IN6_IS_ADDR_V4MAPPED(dst));
548 flow->vxnfl_isv4 = B_TRUE;
549 flowtree = &vnet->vxnv_flows_v4;
550 flowlock = &vnet->vxnv_flowv4_lock;
551 } else {
552 ASSERT(!IN6_IS_ADDR_V4MAPPED(dst));
553 flow->vxnfl_isv4 = B_FALSE;
554 /* XXX KEBE SAYS we don't do IPv6 for now. */
555 DTRACE_PROBE2(vxlnat__flow__newv6, in6_addr_t *, inner_src,
556 in6_addr_t *, dst);
557 kmem_free(flow, sizeof (*flow));
558 return (NULL);
559 }
560 VXNR_REFHOLD(rule); /* For the flow itself... */
561 flow->vxnfl_rule = rule;
562
563 rw_enter(flowlock, RW_WRITER);
564 oldflow = (vxlnat_flow_t *)avl_find(flowtree, flow, &where);
565 if (oldflow != NULL) {
566 /*
567 * Hmmm, someone put one in while we were dinking around.
568 * XXX KEBE SAYS return the old one, refheld, for now.
569 */
570 VXNR_REFRELE(rule);
571 kmem_free(flow, sizeof (*flow));
572 VXNFL_REFHOLD(oldflow);
573 flow = oldflow;
574 } else {
575 avl_insert(flowtree, flow, where);
576 /*
577 * Do conn_t magic here, except for the conn_t activation. I
578 * am aware of holding the rwlock-as-write here. We may need
579 * to move this outside the rwlock hold, and
580 * reacquire-on-failure.
581 */
582 if (!vxlnat_new_conn(flow)) {
583 ASSERT(flow->vxnfl_connp == NULL);
584 avl_remove(flowtree, flow);
585 VXNR_REFRELE(flow->vxnfl_rule);
586 kmem_free(flow, sizeof (*flow));
587 flow = NULL;
588 }
589 }
590 rw_exit(flowlock);
591
592 /* We just created this one, activate it. */
593 if (oldflow == NULL && flow != NULL)
594 vxlnat_activate_conn(flow);
595
596 return (flow);
597 }
598
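/*
 * Destroy a NAT flow whose last reference is gone: detach its conn_t,
 * drop the flow's hold on its rule, and free the structure.
 */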
599 void
600 vxlnat_flow_free(vxlnat_flow_t *flow)
601 {
602 ASSERT(flow->vxnfl_refcount == 0);
603
604 /* XXX KEBE SAYS FILL ME IN?! */
605 /* XXX KEBE ASKS ipcl_hash_remove()? */
606
607 flow->vxnfl_connp->conn_priv = NULL; /* Sufficient? */
608 CONN_DEC_REF(flow->vxnfl_connp);
609 VXNR_REFRELE(flow->vxnfl_rule);
610 kmem_free(flow, sizeof (*flow));
611 }
612
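/*
 * Sanity-check a packet that would establish a brand-new NAT flow
 * (presumably, e.g., only a TCP SYN should be allowed to start a TCP flow).
 * XXX Currently a stub that consumes the packet and refuses.
 */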
613 static boolean_t
614 vxlnat_verify_initial(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
615 uint32_t ports, uint8_t protocol, uint8_t *nexthdr)
616 {
617 /* XXX KEBE SAYS FILL ME IN! */
618 freemsg(mp);
619 return (B_FALSE);
620 }
621
622 /*
623 * If we reach here, we need to find a NAT rule, and see if we can/should
624 * CREATE a new NAT flow, or whether or not we should drop, maybe even
625 * returning an ICMP message of some sort.
626 *
627 * B_TRUE means the packet was handled, and we shouldn't continue processing
628 * (even if "was handled" means droppage).
629 */
630 static boolean_t
631 vxlnat_one_vxlan_rule(vxlnat_vnet_t *vnet, mblk_t *mp, ipha_t *ipha,
632 ip6_t *ip6h)
633 {
634 vxlnat_rule_t *rule;
635 vxlnat_flow_t *flow;
636 in6_addr_t v4m_src, v4m_dst, *inner_src, *dst;
637 uint32_t ports;
638 uint8_t protocol;
639 uint8_t *nexthdr;
640
641 /* XXX handle IPv6 later, assigning inner_src and dst to ip6_t addrs. */
642 if (ip6h != NULL)
643 return (B_FALSE);
644
645 ASSERT3P(ipha, !=, NULL);
646 inner_src = &v4m_src;
647 dst = &v4m_dst;
648 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src), inner_src);
649 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_dst), dst);
650
651 mutex_enter(&vnet->vxnv_rule_lock);
652 rule = list_head(&vnet->vxnv_rules);
653
	/*
	 * Search for a match in the NAT rules.
	 * XXX investigate perf issues with respect to list_t size.
	 * XXX KEBE SAYS rewrite when we start doing IPv6 to use "inner_src"
	 * and "dst".
	 */
660 while (rule != NULL) {
661 ipaddr_t ipaddr;
662 uint32_t netmask = 0xffffffff;
663 uint8_t prefix = rule->vxnr_prefix - 96;
664
665 /* calculate the v4 netmask */
666 netmask <<= (32 - prefix);
667 netmask = htonl(netmask);
668
669 IN6_V4MAPPED_TO_IPADDR(&rule->vxnr_myaddr, ipaddr);
670 /* XXX ASSERT vlanid? */
671 if ((ipaddr & netmask) == (ipha->ipha_src & netmask)) {
672 VXNR_REFHOLD(rule);
673 break;
674 }
675
676 rule = list_next(&vnet->vxnv_rules, rule);
677 }
678
679 mutex_exit(&vnet->vxnv_rule_lock);
680
681 if (rule == NULL)
682 return (B_FALSE);
683
684 /* process packet */
685
686 /*
687 * Grab transport header, and figure out if we can proceed.
688 *
689 * NOTE: vxlnat_grab_transport() will free/consume mp if it fails,
690 * because we want to isolate non-flow-starters without having them
691 * create new flows. This means we return B_TRUE (consumed mp) on
692 * failure.
693 */
694 if (!vxlnat_grab_transport(mp, ipha, ip6h, &ports, &protocol, &nexthdr))
695 return (B_TRUE); /* see above... */
696 if (!vxlnat_verify_initial(mp, ipha, ip6h, ports, protocol, nexthdr))
697 return (B_TRUE);
699
700 flow = vxlnat_new_flow(rule, inner_src, dst, ports, protocol);
701 if (flow != NULL) {
702 /*
703 * Call same function that vxlnat_one_vxlan_flow() uses
704 * to remap & transmit the packet out the external side.
705 *
706 * NOTE: We've already checked the initial-packet-
707 * qualification, so unlike the main datapath, we don't
708 * need to call vxlnat_verify_natstate()
709 */
710
711 /* XXX KEBE SAYS PROCESS... */
712
713 VXNFL_REFRELE(flow);
714 return (B_TRUE);
715 }
716
717 return (B_FALSE);
718 }
719
720 /*
721 * See if the inbound VXLAN packet hits a 1-1/fixed mapping, and process if it
722 * does. B_TRUE means the packet was handled, and we shouldn't continue
723 * processing (even if "was handled" means droppage).
724 */
725 static boolean_t
726 vxlnat_one_vxlan_fixed(vxlnat_vnet_t *vnet, mblk_t *mp, ipha_t *ipha,
727 ip6_t *ip6h)
728 {
729 vxlnat_fixed_t *fixed, fsearch;
730 mblk_t *newmp;
731 ire_t *outbound_ire;
732 /* Use C99's initializers for fun & profit. */
733 ip_recv_attr_t iras = { IRAF_IS_IPV4 | IRAF_VERIFIED_SRC };
734
735 if (ipha != NULL) {
736 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
737 &fsearch.vxnf_addr);
738 } else {
739 /* vxlnat_cache_remote() did reality checks... */
740 ASSERT(ipha == NULL && ip6h != NULL);
741 fsearch.vxnf_addr = ip6h->ip6_src;
742 }
743
744 rw_enter(&vnet->vxnv_fixed_lock, RW_READER);
745 fixed = avl_find(&vnet->vxnv_fixed_ips, &fsearch, NULL);
746 if (fixed != NULL)
747 VXNF_REFHOLD(fixed);
748 rw_exit(&vnet->vxnv_fixed_lock);
749 if (fixed == NULL)
750 return (B_FALSE); /* Try another method of processing. */
751
752 newmp = NULL;
753 /*
754 * XXX KEBE ASKS --> Do an MTU check NOW?! That way, we have
755 * pre-natted data. One gotcha, external dests may have
756 * different PathMTUs so see below about EMSGSIZE...
757 *
758 * For now, let the post-NAT crunch through
759 * ire_recv_forward_v4() take care of all of that.
760 */
761
762 if (ipha != NULL)
763 newmp = vxlnat_fixed_fixv4(mp, fixed, B_FALSE);
764 else {
765 freemsg(mp); /* XXX handle ip6h */
766 return (B_TRUE);
767 }
768
	if (newmp == NULL)
		return (B_TRUE);	/* mp eaten by vxlnat_fixed_fixv4() */

	/*
	 * vxlnat_fixed_fixv4() may have pulled the message up (ICMP case),
	 * so resync mp and ipha to whatever it handed back.
	 */
	mp = newmp;
	ipha = (ipha_t *)mp->b_rptr;
774 /* XXX KEBE ASKS, IRR_ALLOCATE okay?!? */
775 /* XXX KEBE SAYS XMIT HINT! */
776 outbound_ire = ire_route_recursive_dstonly_v4(ipha->ipha_dst,
777 IRR_ALLOCATE, 0, vxlnat_netstack->netstack_ip);
778 VERIFY3P(outbound_ire, !=, NULL);
779 if (outbound_ire->ire_type == IRE_NOROUTE) {
780 /* Bail! */
781 DTRACE_PROBE2(vxlnat__in__drop__fixedire, ipaddr_t,
782 ipha->ipha_dst, mblk_t *, mp);
783 VXNF_REFRELE(fixed);
784 freemsg(mp);
785 return (B_TRUE);
786 }
787
788 iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
789 if (iras.ira_ip_hdr_length > sizeof (ipha_t))
790 iras.ira_flags |= IRAF_IPV4_OPTIONS;
791 iras.ira_xmit_hint = 0; /* XXX KEBE SAYS FIX ME! */
792 iras.ira_zoneid = outbound_ire->ire_zoneid;
793 iras.ira_pktlen = ntohs(ipha->ipha_length);
794 iras.ira_protocol = ipha->ipha_protocol;
795 /* XXX KEBE ASKS rifindex & ruifindex ?!? */
796 /*
797 * NOTE: AT LEAST ira_ill needs ILLF_ROUTER set, as
798 * well as the ill for the external NIC (where
799 * off-link destinations live). For fixed, ira_ill
800 * should be the ill of the external source.
801 */
802 iras.ira_rill = vxlnat_underlay_ire->ire_ill;
803 iras.ira_ill = fixed->vxnf_ire->ire_ill;
804 /* XXX KEBE ASKS cred & cpid ? */
805 iras.ira_verified_src = ipha->ipha_src;
806 /* XXX KEBE SAYS don't sweat IPsec stuff. */
807 /* XXX KEBE SAYS ALSO don't sweat l2src & mhip */
808
809 /* Okay, we're good! Let's pretend we're forwarding. */
810 ire_recv_forward_v4(outbound_ire, mp, ipha, &iras);
811 ire_refrele(outbound_ire);
812
813 return (B_TRUE);
814 }
815
816 /*
817 * Process exactly one VXLAN packet.
818 */
819 static void
820 vxlnat_one_vxlan(mblk_t *mp, struct sockaddr_in6 *underlay_src)
821 {
822 vxlan_hdr_t *vxh;
823 vxlnat_vnet_t *vnet;
824 ipha_t *ipha;
825 ip6_t *ip6h;
826
827 if (MBLKL(mp) < sizeof (*vxh)) {
828 /* XXX KEBE ASKS -- should we be more forgiving? */
829 DTRACE_PROBE1(vxlnat__in__drop__vxlsize, mblk_t *, mp);
830 freemsg(mp);
831 return;
832 }
833 vxh = (vxlan_hdr_t *)mp->b_rptr;
834
835 /* If we start using more than just the one flag, fix it. */
836 if (vxh->vxlan_flags != VXLAN_F_VDI_WIRE) {
837 DTRACE_PROBE1(vxlnat__in__drop__VDI, mblk_t *, mp);
838 freemsg(mp);
839 return;
840 }
841
842 /* Remember, we key off of what's on the wire. */
843 vnet = vxlnat_get_vnet(VXLAN_ID_WIRE32(vxh->vxlan_id), B_FALSE);
844 if (vnet == NULL) {
845 DTRACE_PROBE1(vxlnat__in__drop__vnetid, uint32_t,
846 VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)));
847 freemsg(mp);
848 return;
849 }
850
851 DTRACE_PROBE2(vxlnat__in__vnet, uint32_t,
852 VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)),
853 vxlnat_vnet_t, vnet);
854
855 /*
856 * Arrived-from-vxlan processing steps:
857 * 1.) Locate the ethernet header and check/update/add-into remotes.
858 * 2.) Search 1-1s, process if hit.
859 * 3.) Search flows, process if hit.
860 * 4.) Search rules, create new flow (or not) if hit.
861 * 5.) Drop the packet.
862 */
863
864 /* 1.) Locate the ethernet header and check/update/add-into remotes. */
865 mp->b_rptr += sizeof (*vxh);
866 while (MBLKL(mp) == 0) {
867 mblk_t *oldmp = mp;
868
869 mp = mp->b_cont;
870 freeb(oldmp);
871 }
872 mp = vxlnat_cache_remote(mp, underlay_src, vnet);
873 if (mp == NULL)
874 goto bail_no_free;
875
876 /* Let's cache the IP header here... */
877 ipha = (ipha_t *)mp->b_rptr;
878 switch (IPH_HDR_VERSION(ipha)) {
879 case IPV4_VERSION:
880 ip6h = NULL;
881 break;
882 case IPV6_VERSION:
883 ip6h = (ip6_t *)ipha;
884 ipha = NULL;
885 break;
886 default:
887 DTRACE_PROBE2(vxlnat__in__drop__ipvers, int,
888 IPH_HDR_VERSION(ipha), mblk_t *, mp);
889 goto bail_and_free;
890 }
891
892 /* 2.) Search 1-1s, process if hit. */
893 if (vxlnat_one_vxlan_fixed(vnet, mp, ipha, ip6h))
894 goto bail_no_free; /* Success means mp was consumed. */
895
896 /* 3.) Search flows, process if hit. */
897 if (vxlnat_one_vxlan_flow(vnet, mp, ipha, ip6h))
898 goto bail_no_free; /* Success means mp was consumed. */
899
900 /* 4.) Search rules, create new flow (or not) if hit. */
901 if (vxlnat_one_vxlan_rule(vnet, mp, ipha, ip6h))
902 goto bail_no_free; /* Success means mp was consumed. */
903
904 /* 5.) Nothing, drop the packet. */
905
	DTRACE_PROBE2(vxlnat__in__drop__nohits, vxlnat_vnet_t *, vnet,
	    mblk_t *, mp);
908
909 bail_and_free:
910 freemsg(mp);
911 bail_no_free:
912 VXNV_REFRELE(vnet);
913 }

/*
915 * ONLY return B_FALSE if we get a packet-clogging event.
916 */
917 /* ARGSUSED */
918 static boolean_t
919 vxlnat_vxlan_input(ksocket_t insock, mblk_t *chain, size_t msgsize, int oob,
920 void *ignored)
921 {
922 mblk_t *mp, *nextmp;
923
924 /*
925 * XXX KEBE ASKS --> move hold & release outside of loop?
926 * If so, hold rwlock here.
927 */
928
929 for (mp = chain; mp != NULL; mp = nextmp) {
930 struct T_unitdata_ind *tudi;
931 struct sockaddr_in6 *sin6;
932
933 nextmp = mp->b_next;
934 if (DB_TYPE(mp) != M_PROTO || mp->b_cont == NULL) {
935 DTRACE_PROBE1(vxlnat__in__drop__mblk, mblk_t *, mp);
936 freemsg(mp);
937 continue;
938 }
939
940 /* LINTED -- aligned */
941 tudi = (struct T_unitdata_ind *)mp->b_rptr;
942 if (tudi->PRIM_type != T_UNITDATA_IND) {
943 DTRACE_PROBE1(vxlnat__in__drop__TPI, mblk_t *, mp);
944 freemsg(mp);
945 continue;
946 }
947 /* LINTED -- aligned */
948 sin6 = (struct sockaddr_in6 *)(mp->b_rptr + tudi->SRC_offset);
949 VERIFY(sin6->sin6_family == AF_INET6);
950 VERIFY(tudi->SRC_length >= sizeof (*sin6));
951
952 vxlnat_one_vxlan(mp->b_cont, sin6);
953 freeb(mp);
954 }
955
956 return (B_TRUE);
957 }
958
959 /*
960 * Use RFC 1141's technique (with a check for -0).
961 *
 * newsum = oldsum - (new16a - old16a) - (new16b - old16b) - ...;
963 *
964 * NOTE: "oldsum" is right off the wire in wire-native order.
965 * NOTE2: "old" and "new" ALSO point to things in wire-native order.
966 * NOTE3: THIS MUST TAKE A MULTIPLE OF 2 BYTES (i.e. uint16_t array).
967 * NOTE4: The 32-bit running sum means we can't take len > 64k.
968 */
969 uint16_t
970 vxlnat_cksum_adjust(uint16_t oldsum, uint16_t *old, uint16_t *new, uint_t len)
971 {
972 uint32_t newsum = ntohs(oldsum);
973
974 ASSERT((len & 0x1) == 0);
975 while (len != 0) {
976 newsum -= ntohs(*new);
977 newsum += ntohs(*old);
978 len -= 2;
979 old++;
980 new++;
981 }
982 newsum += (newsum >> 16) & 0xffff;
983
984 return (newsum == 0xffff ? 0 : htons(newsum));
985 }
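
/*
 * For example, when vxlnat_fixed_fixv4() rewrites an IPv4 address, it passes
 * the old and new addresses as arrays of two uint16_t's (len == 4) so the
 * IP, TCP, or UDP checksum can be patched in place rather than recomputed
 * over the whole packet.
 */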
986
987 /*
988 * Fix inner headers on an ICMP packet.
989 *
990 * XXX KEBE SAYS FOR NOW, just do addresses for 1-1/fixed. When we do
991 * flows, include old_port/new_port as well.
992 */
993 static mblk_t *
994 vxlnat_fix_icmp_inner_v4(mblk_t *mp, icmph_t *icmph, ipaddr_t old_one,
995 ipaddr_t new_one, boolean_t to_private)
996 {
997 mblk_t *newmp;
998 ipha_t *inner_ipha;
999 ipaddr_t *new_ones_place;
1000
1001 if ((uint8_t *)(icmph + 1) + sizeof (ipha_t) > mp->b_wptr) {
1002 /* Pay the pullup tax. */
1003 newmp = msgpullup(mp, -1);
1004 freemsg(mp);
1005 if (newmp == NULL) {
1006 DTRACE_PROBE1(vxlnat__fixicmp__pullupfail, void *,
1007 NULL);
1008 return (NULL);
1009 }
1010 if (MBLKL(newmp) < 2 * sizeof (ipha_t) + sizeof (icmph_t)) {
1011 /* Wow! Too-tiny ICMP packet. */
1012 DTRACE_PROBE1(vxlnat__fixicmp__tootiny, mblk_t *,
1013 newmp);
1014 freeb(newmp);
1015 return (NULL);
1016 }
1017 mp = newmp;
1018 /* Temporarily use inner_ipha for the outer one. */
1019 inner_ipha = (ipha_t *)mp->b_rptr;
1020 icmph = (icmph_t *)(mp->b_rptr + IPH_HDR_LENGTH(inner_ipha));
1021 }
1022 inner_ipha = (ipha_t *)(icmph + 1);
1023 new_ones_place = to_private ?
1024 &inner_ipha->ipha_src : &inner_ipha->ipha_dst;
1025 if (*new_ones_place != old_one) {
1026 /* Either I'm buggy or the packet is. */
1027 DTRACE_PROBE2(vxlnat__fixicmp__badinneraddr, ipaddr_t,
1028 old_one, ipaddr_t, *new_ones_place);
		/* freemsg, not freeb: mp may still have a b_cont. */
		freemsg(mp);
1030 return (NULL);
1031 }
1032 *new_ones_place = new_one;
1033
1034 /* Adjust ICMP checksum... */
1035 icmph->icmph_checksum = vxlnat_cksum_adjust(icmph->icmph_checksum,
1036 (uint16_t *)&old_one, (uint16_t *)&new_one, sizeof (ipaddr_t));
1037
1038 /*
1039 * XXX KEBE ASKS, recompute *inner-packet* checksums? Let's not for
1040 * now, but consider this Fair Warning (or some other VH album...).
1041 */
1042 return (mp);
1043 }
1044
1045 /*
1046 * Take a 1-1/fixed IPv4 packet and convert it for transmission out the
1047 * appropriate end. "to_private" is what it says on the tin.
1048 * ALWAYS consumes "mp", regardless of return value.
1049 */
1050 static mblk_t *
1051 vxlnat_fixed_fixv4(mblk_t *mp, vxlnat_fixed_t *fixed, boolean_t to_private)
1052 {
1053 ipaddr_t new_one, old_one;
1054 ipaddr_t *new_ones_place;
1055 ipha_t *ipha = (ipha_t *)mp->b_rptr;
1056 uint8_t *nexthdr, *end_wptr;
1057
1058 if (to_private) {
1059 IN6_V4MAPPED_TO_IPADDR(&fixed->vxnf_addr, new_one);
1060 new_ones_place = &ipha->ipha_dst;
1061 } else {
1062 IN6_V4MAPPED_TO_IPADDR(&fixed->vxnf_pubaddr, new_one);
1063 new_ones_place = &ipha->ipha_src;
1064 }
1065
1066 old_one = *new_ones_place;
1067 *new_ones_place = new_one;
1068
1069 /*
1070 * Recompute the IP header checksum, and check for the TCP or UDP
1071 * checksum as well, as they'll need recomputing as well.
1072 */
1073
1074 /* First, the IPv4 header itself. */
1075 ipha->ipha_hdr_checksum = vxlnat_cksum_adjust(ipha->ipha_hdr_checksum,
1076 (uint16_t *)&old_one, (uint16_t *)&new_one, sizeof (ipaddr_t));
1077
1078 nexthdr = (uint8_t *)ipha + IPH_HDR_LENGTH(ipha);
	if (nexthdr >= mp->b_wptr) {
		/* The transport header starts in the next mblk. */
		nexthdr = mp->b_cont->b_rptr +
		    (IPH_HDR_LENGTH(ipha) - MBLKL(mp));
1082 end_wptr = mp->b_cont->b_wptr;
1083 } else {
1084 end_wptr = mp->b_wptr;
1085 }
1086
1087 switch (ipha->ipha_protocol) {
1088 case IPPROTO_TCP: {
1089 tcpha_t *tcph = (tcpha_t *)nexthdr;
1090
1091 if (nexthdr + sizeof (*tcph) >= end_wptr) {
1092 /* Bail for now. */
1093 DTRACE_PROBE1(vxlnat__fix__tcp__mblkspan, mblk_t *,
1094 mp);
1095 freemsg(mp);
1096 return (NULL);
1097 }
1098 tcph->tha_sum = vxlnat_cksum_adjust(tcph->tha_sum,
1099 (uint16_t *)&old_one, (uint16_t *)&new_one,
1100 sizeof (ipaddr_t));
1101 break; /* Out of switch. */
1102 }
1103 case IPPROTO_UDP: {
1104 udpha_t *udph = (udpha_t *)nexthdr;
1105
1106 if (nexthdr + sizeof (*udph) >= end_wptr) {
1107 /* Bail for now. */
1108 DTRACE_PROBE1(vxlnat__fix__udp__mblkspan, mblk_t *,
1109 mp);
1110 freemsg(mp);
1111 return (NULL);
1112 }
1113 udph->uha_checksum = vxlnat_cksum_adjust(udph->uha_checksum,
1114 (uint16_t *)&old_one, (uint16_t *)&new_one,
1115 sizeof (ipaddr_t));
1116 break; /* Out of switch. */
1117 }
1118 case IPPROTO_ICMP: {
1119 icmph_t *icmph = (icmph_t *)nexthdr;
1120
1121 /*
1122 * We need to check the case of ICMP messages that contain
1123 * IP packets. We will need to at least change the addresses,
1124 * and *maybe* the checksums too if necessary.
1125 *
1126 * This may replicate some of icmp_inbound_v4(), alas.
1127 */
1128 if (nexthdr + sizeof (*icmph) >= end_wptr) {
1129 mblk_t *newmp;
1130 /*
1131 * Unlike the others, we're going to pay the pullup
1132 * tax here.
1133 */
1134 newmp = msgpullup(mp, -1);
1135 freemsg(mp);
1136 if (newmp == NULL) {
1137 DTRACE_PROBE1(vxlnat__icmp__pullupfail, void *,
1138 NULL);
1139 return (NULL);
1140 }
1141 mp = newmp;
1142 ipha = (ipha_t *)(mp->b_rptr);
1143 nexthdr = (uint8_t *)ipha + IPH_HDR_LENGTH(ipha);
1144 icmph = (icmph_t *)nexthdr;
1145 }
1146
1147 switch (icmph->icmph_type) {
1148 case ICMP_ADDRESS_MASK_REPLY:
1149 case ICMP_ADDRESS_MASK_REQUEST:
1150 case ICMP_TIME_STAMP_REPLY:
1151 case ICMP_TIME_STAMP_REQUEST:
1152 case ICMP_ECHO_REQUEST:
1153 case ICMP_ECHO_REPLY:
1154 /* These merely need to get passed along. */
1155 break;
1156 case ICMP_ROUTER_ADVERTISEMENT:
1157 case ICMP_ROUTER_SOLICITATION:
1158 /* These shouldn't be traversing a NAT at all. Drop. */
1159 DTRACE_PROBE1(vxlnat__icmp__cantpass, int,
1160 icmph->icmph_type);
1161 freemsg(mp);
1162 return (NULL);
1163 case ICMP_PARAM_PROBLEM:
1164 case ICMP_TIME_EXCEEDED:
1165 case ICMP_DEST_UNREACHABLE:
1166 /* These include inner-IP headers we need to adjust. */
1167 mp = vxlnat_fix_icmp_inner_v4(mp, icmph, old_one,
1168 new_one, to_private);
1169 break;
1170 default:
1171 /* Pass along to receiver, but warn. */
1172 DTRACE_PROBE1(vxlnat__icmp__unknown, int,
1173 icmph->icmph_type);
1174 break;
1175 }
1176 }
1177 /* Otherwise we can't make any other assumptions for now... */
1178 default:
1179 break;
1180 }
1181
1182 return (mp);
1183 }
1184
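/*
 * Prepend VXLAN + VLAN/ethernet headers to "mp" and transmit it over the
 * underlay socket to the remote that corresponds to "overlay_dst".
 * Returns the remote (possibly a freshly looked-up one) so the caller can
 * cache it, or NULL if no usable remote exists yet.
 */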
1185 vxlnat_remote_t *
1186 vxlnat_xmit_vxlanv4(mblk_t *mp, in6_addr_t *overlay_dst,
1187 vxlnat_remote_t *remote, uint8_t *myether, vxlnat_vnet_t *vnet)
1188 {
1189 struct sockaddr_in6 sin6 = {AF_INET6};
1190 struct msghdr msghdr = {NULL};
1191 mblk_t *vlan_mp;
1192 extern uint_t vxlan_alloc_size, vxlan_noalloc_min;
1193 vxlan_hdr_t *vxh;
1194 struct ether_vlan_header *evh;
1195 int rc;
1196 cred_t *cred;
1197
1198 if (remote == NULL || remote->vxnrem_vnet == NULL) {
1199 DTRACE_PROBE1(vxlnat__xmit__vxlanv4, vxlnat_remote_t *, remote);
1200 /* Release the condemned remote. */
1201 if (remote != NULL)
1202 VXNREM_REFRELE(remote);
1203
1204 /* See if we have a remote ready to use... */
1205 remote = vxlnat_get_remote(vnet, overlay_dst, B_FALSE);
1206
1207 if (remote == NULL) {
1208 /*
1209 * We need to do the moral equivalent of PF_KEY
1210 * ACQUIRE or overlay's queue-resolve so that we can
1211 * have someone in user-space send me a remote. Until
1212 * then, drop the reference if condemned, free the
1213 * message, and return NULL.
1214 */
1215
1216 freemsg(mp);
1217 return (NULL);
1218 }
1219 }
1220 ASSERT(vnet == remote->vxnrem_vnet);
1221
1222 if (DB_REF(mp) > 1 || mp->b_rptr - vxlan_noalloc_min < DB_BASE(mp)) {
1223 vlan_mp = allocb(vxlan_alloc_size, BPRI_HI);
1224 if (vlan_mp == NULL) {
1225 DTRACE_PROBE1(vxlnat__xmit__vxlanv4__allocfail,
1226 vxlnat_remote_t *, remote);
1227 freemsg(mp);
1228 /* Just drop the packet, but don't tell caller. */
1229 return (remote);
1230 }
1231 vlan_mp->b_wptr = DB_LIM(vlan_mp);
1232 vlan_mp->b_rptr = vlan_mp->b_wptr;
1233 vlan_mp->b_cont = mp;
1234 } else {
1235 vlan_mp = mp;
1236 }
1237 vlan_mp->b_rptr -= sizeof (*vxh) + sizeof (*evh);
1238 vxh = (vxlan_hdr_t *)vlan_mp->b_rptr;
1239 vxh->vxlan_flags = VXLAN_F_VDI_WIRE;
1240 vxh->vxlan_id = vnet->vxnv_vnetid; /* Already in wire-order. */
1241
1242 /* Fill in the Ethernet header. */
1243 evh = (struct ether_vlan_header *)(vxh + 1);
1244 ether_copy(&remote->vxnrem_ether, &evh->ether_dhost);
1245 ether_copy(myether, &evh->ether_shost);
1246 evh->ether_tpid = htons(ETHERTYPE_VLAN);
1247 evh->ether_tci = remote->vxnrem_vlan;
1248 evh->ether_type = htons(ETHERTYPE_IP);
1249
1250 msghdr.msg_name = (struct sockaddr_storage *)&sin6;
1251 msghdr.msg_namelen = sizeof (sin6);
1252 /* Address family and other zeroing already done up top. */
1253 sin6.sin6_port = htons(IPPORT_VXLAN);
1254 sin6.sin6_addr = remote->vxnrem_uaddr;
1255
1256 /*
1257 * cred_t dance is because we may be getting this straight from
1258 * interrupt context.
1259 */
1260 cred = zone_get_kcred(netstack_get_zoneid(vxlnat_netstack));
	if (cred == NULL) {
		DTRACE_PROBE1(vxlnat__xmit__vxlan4__credfail,
		    vxlnat_remote_t *, remote);
		freemsg(vlan_mp);
		/* Just drop the packet, but don't tell caller. */
		return (remote);
	}
1266 /*
1267 * Use MSG_DONTWAIT to avoid blocks, esp. if we're getting this
1268 * straight from interrupt context.
1269 */
1270 rc = ksocket_sendmblk(vxlnat_underlay, &msghdr, MSG_DONTWAIT, &vlan_mp,
1271 cred);
1272 crfree(cred);
1273 if (rc != 0) {
1274 DTRACE_PROBE2(vxlnat__xmit__vxlan4__sendfail, int, rc,
1275 vxlnat_remote_t *, remote);
1276 freemsg(vlan_mp);
1277 }
1278 return (remote);
1279 }
1280
1281 /*
1282 * New ire_{recv,send}fn implementations if we're doing 1-1 mappings.
1283 */
1284 int
1285 vxlnat_fixed_ire_send_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
1286 ip_xmit_attr_t *ixa, uint32_t *identp)
1287 {
1288 /* XXX KEBE SAYS FILL ME IN, but for now... */
1289 freemsg(mp);
1290 return (EOPNOTSUPP);
1291 }
1292
1293 void
1294 vxlnat_fixed_ire_recv_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
1295 ip_recv_attr_t *ira)
1296 {
1297 /* XXX KEBE SAYS FILL ME IN, but for now... */
1298 freemsg(mp);
1299 }
1300
1301 /*
1302 * I believe the common case for this will be from self-generated ICMP
1303 * messages. Other same-netstack-originated traffic will also come through
1304 * here (one internal reaching what turns out to be another internal).
1305 */
1306 int
1307 vxlnat_fixed_ire_send_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1308 ip_xmit_attr_t *ixa, uint32_t *identp)
1309 {
1310 ip_recv_attr_t iras; /* NOTE: No bzero because we pay more later */
1311 ipha_t *ipha = (ipha_t *)iph_arg;
1312
1313 /*
1314 * XXX KEBE ASKS, any DTrace probes or other instrumentation that
1315 * perhaps should be set?
1316 */
1317
1318 /* Map ixa to ira. */
1319 iras.ira_pktlen = ixa->ixa_pktlen;
1320 /* XXX KEBE ASKS more?!? */
1321
1322 /*
1323 * In normal TCP/IP processing, this shortcuts the IP header checksum
1324 * AND POSSIBLY THE ULP checksum cases. Since this is likely to head
1325 * back into the internal network, we need to recompute things again.
1326 */
1327 if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
1328 freemsg(mp);
1329 return (EMSGSIZE);
1330 }
1331 #if 0
1332 /* XXX KEBE ASKS Special-case ICMP here? */
1333 if (ipha->ipha_protocol == IPPROTO_ICMP) {
1334 icmph_t *icmph;
1335
1336 icmph = (icmph_t *)((uint8_t *)ipha + IPH_HDR_LENGTH(ipha));
1337 if ((uint8_t *)icmph >= mp->b_wptr) {
1338 freemsg(mp);
1339 return (EMSGSIZE);
1340 }
1341 icmph->icmph_checksum = 0;
1342 icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
1343 }
1344 #endif
1345
1346 vxlnat_fixed_ire_recv_v4(ire, mp, iph_arg, &iras);
1347
1348 return (0);
1349 }
1350
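/*
 * ire_recvfn hook for a 1-1/fixed mapping's IRE.  Rewrites the destination
 * to the mapped private address, handles DF/MTU mismatches with an ICMP
 * "fragmentation needed", and sends the result back out the VXLAN underlay.
 */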
1351 void
1352 vxlnat_fixed_ire_recv_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1353 ip_recv_attr_t *ira)
1354 {
1355 vxlnat_fixed_t *fixed;
1356 vxlnat_vnet_t *vnet;
1357 ipha_t *ipha = (ipha_t *)iph_arg;
1358 int newmtu;
1359
1360 /* Make a note for DAD that this address is in use */
1361 ire->ire_last_used_time = LBOLT_FASTPATH;
1362
1363 /* Only target the IRE_LOCAL with the right zoneid. */
1364 ira->ira_zoneid = ire->ire_zoneid;
1365
1366 /*
1367 * XXX KEBE ASKS, any DTrace probes or other instrumentation that
1368 * perhaps should be set?
1369 */
1370
1371 /*
1372 * Reality check some things.
1373 */
1374 fixed = (vxlnat_fixed_t *)ire->ire_dep_sib_next;
1375 vnet = fixed->vxnf_vnet;
1376
1377 ASSERT3P(ire, ==, fixed->vxnf_ire);
1378
1379 if (IRE_IS_CONDEMNED(ire) || vnet == NULL)
1380 goto detach_ire_and_bail;
1381
1382 /*
1383 * Not a common-case, but a possible one. If our underlay MTU is
1384 * smaller than the external MTU, it is possible that we will have a
1385 * size mismatch and therefore need to either fragment at the VXLAN
1386 * layer (VXLAN UDP packet sent as two or more IP fragments) OR
1387 * if IPH_DF is set, send an ICMP_NEEDS_FRAGMENTATION back to the
1388 * sender. Perform the check here BEFORE we NAT the packet.
1389 */
1390 ASSERT(vxlnat_underlay_ire->ire_ill != NULL);
1391 newmtu = vxlnat_underlay_ire->ire_ill->ill_mtu - sizeof (ipha_t) -
1392 sizeof (udpha_t) - sizeof (vxlan_hdr_t) -
1393 sizeof (struct ether_vlan_header);
1394 if ((ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_DF) &&
1395 ntohs(ipha->ipha_length) > newmtu) {
1396 icmp_frag_needed(mp, newmtu, ira);
1397 /* We're done. Assume icmp_frag_needed() consumed mp. */
1398 return;
1399 }
1400
1401 /*
1402 * So we're here, and since we have a refheld IRE, we have a refheld
1403 * fixed and vnet. Do some of what ip_input_local_v4() does (inbound
1404 * checksum? some ira checks?), but otherwise, swap the destination
1405 * address as mapped in "fixed", recompute any checksums, and send it
 * along its merry way (with a TTL decrement too) to a VXLAN
1407 * destination.
1408 */
1409 mp = vxlnat_fixed_fixv4(mp, fixed, B_TRUE);
1410 if (mp == NULL)
1411 return; /* Assume it's been freed & dtraced already. */
1412
1413 /*
1414 * Otherwise, we're ready to transmit this packet over the vxlan
1415 * socket.
1416 */
1417 fixed->vxnf_remote = vxlnat_xmit_vxlanv4(mp, &fixed->vxnf_addr,
1418 fixed->vxnf_remote, fixed->vxnf_myether, vnet);
1419 if (fixed->vxnf_remote == NULL) {
1420 /* XXX KEBE ASKS, DTrace probe here? Or in-function? */
1421 DTRACE_PROBE2(vxlnat__fixed__xmitdrop,
1422 in6_addr_t *, &fixed->vxnf_addr,
1423 uint32_t, VXLAN_ID_NTOH(vnet->vxnv_vnetid));
1424 }
1425 return;
1426
1427 detach_ire_and_bail:
1428 /* Oh no, something's condemned. Drop the IRE now. */
1429 ire->ire_recvfn = ire_recv_local_v4;
1430 ire->ire_dep_sib_next = NULL;
1431 VXNF_REFRELE(fixed);
1432 /* Pass the packet back... */
1433 ire_recv_local_v4(ire, mp, iph_arg, ira);
1434 return;
1435 }