WIP to help bring up NAT flows
--- old/usr/src/uts/common/inet/vxlnat/vxlnat_nat.c
+++ new/usr/src/uts/common/inet/vxlnat/vxlnat_nat.c
1 1 /*
2 2 * This file and its contents are supplied under the terms of the
3 3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 4 * You may only use this file in accordance with the terms of version
5 5 * 1.0 of the CDDL.
6 6 *
7 7 * A full copy of the text of the CDDL should have accompanied this
8 8 * source. A copy of the CDDL is also available via the Internet at
9 9 * http://www.illumos.org/license/CDDL.
10 10 */
11 11
12 12 /*
13 13 * Copyright 2018 Joyent, Inc.
14 14 */
15 15
16 16 /*
  17  17	 * NAT engine and 1-1 mappings. The rules in vxlnat_rules.c are only consulted
18 18 * if the 1-1 map (kept here) misses or if the outbound lookup (vnetid,
19 19 * protocol, src-IP, dst-IP, src-port, dst-port) misses.
20 20 *
21 21 * The plan is for inbound to hit conn_ts, whose conn_private points to
22 22 * entries here. The conn_recv* functions live here too (for now).
23 23 */
24 24
25 25 #include <sys/types.h>
26 26 #include <sys/socket.h>
27 27 #include <sys/ksynch.h>
28 28 #include <sys/ksocket.h>
29 29 #include <sys/kmem.h>
30 30 #include <sys/stream.h>
31 31 #include <sys/strsubr.h>
32 32 #include <sys/strsun.h>
33 33 #include <sys/sysmacros.h>
34 34 #include <sys/debug.h>
35 35 #include <sys/dtrace.h>
36 36 #include <sys/errno.h>
37 37 #include <sys/tihdr.h>
38 38 #include <netinet/in.h>
39 39 #include <netinet/udp.h>
40 40 #include <inet/ip.h>
41 41 #include <inet/ip6.h>
42 +#include <inet/tcp_impl.h>
42 43 #include <inet/udp_impl.h>
43 44 #include <inet/tcp.h>
44 45
45 46 #include <inet/vxlnat_impl.h>
46 47
47 48 static boolean_t vxlnat_vxlan_input(ksocket_t, mblk_t *, size_t, int, void *);
48 49 static mblk_t *vxlnat_fixed_fixv4(mblk_t *mp, vxlnat_fixed_t *fixed,
49 50 boolean_t to_private);
50 51
51 52 /*
52 53 * Initialized to NULL, read/write protected by vxlnat_mutex.
53 54 * Receive functions shouldn't have to access this directly.
54 55 */
55 56 ksocket_t vxlnat_underlay;
56 57 ire_t *vxlnat_underlay_ire;
57 58
58 59 void
59 60 vxlnat_closesock(void)
60 61 {
61 62 ASSERT(MUTEX_HELD(&vxlnat_mutex));
62 63 if (vxlnat_underlay_ire != NULL) {
63 64 ire_refrele(vxlnat_underlay_ire);
64 65 vxlnat_underlay_ire = NULL;
65 66 }
66 67 if (vxlnat_underlay != NULL) {
67 68 (void) ksocket_close(vxlnat_underlay, zone_kcred());
68 69 vxlnat_underlay = NULL;
69 70 }
70 71 }
71 72
72 73 static int
73 74 vxlnat_opensock(in6_addr_t *underlay_ip)
74 75 {
75 76 int rc, val;
76 77 /* Assume rest is initialized to 0s. */
77 78 struct sockaddr_in6 sin6 = {AF_INET6, BE_16(IPPORT_VXLAN)};
78 79 ip_stack_t *ipst = vxlnat_netstack->netstack_ip;
79 80
80 81 ASSERT(MUTEX_HELD(&vxlnat_mutex));
81 82 /* Open... */
82 83 rc = ksocket_socket(&vxlnat_underlay, AF_INET6, SOCK_DGRAM, 0,
83 84 KSOCKET_SLEEP, zone_kcred());
84 85 if (rc != 0)
85 86 return (rc);
86 87
87 88 /* Bind... */
88 89 sin6.sin6_addr = *underlay_ip;
89 90 rc = ksocket_bind(vxlnat_underlay, (struct sockaddr *)(&sin6),
90 91 sizeof (sin6), zone_kcred());
91 92 if (rc != 0) {
92 93 vxlnat_closesock();
93 94 return (rc);
94 95 }
95 96
96 97 /* Use source-port hashing when sending packets out VXLAN... */
97 98 val = UDP_HASH_VXLAN;
98 99 rc = ksocket_setsockopt(vxlnat_underlay, IPPROTO_UDP,
99 100 UDP_SRCPORT_HASH, &val, sizeof (val), kcred);
100 101 if (rc != 0) {
101 102 vxlnat_closesock();
102 103 return (rc);
103 104 }
104 105
105 106 /*
106 107 * Grab the IRE for underlay address.
107 108 */
108 109 ASSERT3P(vxlnat_underlay_ire, ==, NULL);
109 110 vxlnat_underlay_ire = (IN6_IS_ADDR_V4MAPPED(underlay_ip)) ?
110 111 ire_ftable_lookup_simple_v4(underlay_ip->_S6_un._S6_u32[3],
111 112 0, ipst, NULL) :
112 113 ire_ftable_lookup_simple_v6(underlay_ip, 0, ipst, NULL);
113 114 if (vxlnat_underlay_ire == NULL) {
114 115 DTRACE_PROBE1(vxlnat__opensock__ire__fail, in6_addr_t *,
115 116 underlay_ip);
116 117 vxlnat_closesock();
117 118 return (EADDRNOTAVAIL);
118 119 }
119 120
120 121 /* Once we return from this, start eating data. */
121 122 rc = ksocket_krecv_set(vxlnat_underlay, vxlnat_vxlan_input, NULL);
122 123 if (rc != 0) {
123 124 vxlnat_closesock();
124 125 }
125 126
126 127 return (rc);
127 128 }
128 129
129 130 /*
130 131 * Establish a VXLAN-listening kernel socket.
131 132 * XXX KEBE ASKS ==> Support more than one VXLAN address?
132 133 */
133 134 /* ARGSUSED */
134 135 int
135 136 vxlnat_vxlan_addr(in6_addr_t *underlay_ip)
136 137 {
137 138 int rc;
138 139
139 140 ASSERT(MUTEX_HELD(&vxlnat_mutex));
140 141 /* For now, we make this a one-underlay-address-only solution. */
141 142 vxlnat_closesock();
142 143 rc = vxlnat_opensock(underlay_ip);
143 144 return (rc);
144 145 }
145 146
146 147 /*
147 148 * Free a remote VXLAN destination.
148 149 */
149 150 void
150 151 vxlnat_remote_free(vxlnat_remote_t *remote)
151 152 {
152 153 ASSERT0(remote->vxnrem_refcount);
153 154
154 155 kmem_free(remote, sizeof (*remote));
155 156 }
156 157
157 158 /*
158 159 * Like other unlink functions, assume the appropriate lock is held.
159 160 */
160 161 void
161 162 vxlnat_remote_unlink(vxlnat_remote_t *remote)
162 163 {
163 164 vxlnat_vnet_t *vnet = remote->vxnrem_vnet;
164 165
165 166 ASSERT3P(vnet, !=, NULL);
166 167 ASSERT(MUTEX_HELD(&vnet->vxnv_remote_lock));
167 168
168 169 /* First unlink so nobody else can find me */
169 170 avl_remove(&vnet->vxnv_remotes, remote);
170 171
171 172 /*
172 173 * We still hold a vnet reference, so races shouldn't be a problem.
173 174 * Still, for added safety, NULL it out first.
174 175 */
175 176 remote->vxnrem_vnet = NULL; /* Condemn this entry. */
176 177 VXNV_REFRELE(vnet);
177 178 VXNREM_REFRELE(remote); /* Internment release. */
178 179 }
179 180
180 181 /*
181 182 * Find or create a remote VXLAN destination.
182 183 */
183 184 static vxlnat_remote_t *
184 185 vxlnat_get_remote(vxlnat_vnet_t *vnet, in6_addr_t *remote_addr,
185 186 boolean_t create_on_miss)
186 187 {
187 188 vxlnat_remote_t *remote, searcher;
188 189 avl_index_t where;
189 190
190 191 searcher.vxnrem_addr = *remote_addr;
191 192 mutex_enter(&vnet->vxnv_remote_lock);
192 193 remote = avl_find(&vnet->vxnv_remotes, &searcher, &where);
193 194 if (remote == NULL && create_on_miss) {
194 195 /* Not as critical if we can't allocate here. */
195 196 remote = kmem_zalloc(sizeof (*remote),
196 197 KM_NOSLEEP | KM_NORMALPRI);
197 198 if (remote != NULL) {
198 199 remote->vxnrem_addr = *remote_addr;
199 200 remote->vxnrem_refcount = 1; /* Internment reference. */
200 201 VXNV_REFHOLD(vnet);
201 202 remote->vxnrem_vnet = vnet;
202 203 /* Rest is filled in by caller. */
203 204 avl_insert(&vnet->vxnv_remotes, remote, where);
204 205 }
205 206 }
206 207 if (remote != NULL)
207 208 VXNREM_REFHOLD(remote);
208 209 mutex_exit(&vnet->vxnv_remote_lock);
209 210 return (remote);
210 211 }
211 212
212 213 /*
213 214 * Cache inbound packet information in the vnet's remotes section.
214 215 *
215 216 * NOTE: This function assumes a trustworthy underlay network. If the
216 217 * underlay isn't trustworthy, this function should be renamed, and reduced to
217 218 * a "strip and reality-check the ethernet header" function.
218 219 *
219 220 * Caller has stripped any pre-ethernet data from mp. We return mp
220 221 * stripped down to its IP header.
221 222 */
222 223 static mblk_t *
223 224 vxlnat_cache_remote(mblk_t *mp, struct sockaddr_in6 *underlay_src,
224 225 vxlnat_vnet_t *vnet)
225 226 {
226 227 struct ether_vlan_header *evh;
227 228 struct ether_header *eh;
228 229 vxlnat_remote_t *remote;
229 230 uint16_t vlan, ethertype;
230 231 ether_addr_t remote_ether;
231 232 ipha_t *ipha;
232 233 ip6_t *ip6h;
233 234 in6_addr_t remote_addr;
234 235
235 236 /* Assume (for now) we have at least a VLAN header's worth of data. */
236 237 if (MBLKL(mp) < sizeof (*evh)) {
237 238 /* XXX KEBE ASKS - should we be more forgiving? */
238 239 DTRACE_PROBE1(vxlnat__in__drop__etherhdr, mblk_t *, mp);
239 240 freemsg(mp);
240 241 return (NULL);
241 242 }
242 243
243 244 eh = (struct ether_header *)mp->b_rptr;
244 245 ethertype = ntohs(eh->ether_type);
245 246 ether_copy(&eh->ether_shost, &remote_ether);
246 247 if (ethertype == ETHERTYPE_VLAN) {
247 248 evh = (struct ether_vlan_header *)eh;
248 249 /* Keep it in network order... */
249 250 vlan = evh->ether_tci;
250 251 ethertype = ntohs(evh->ether_type);
251 252 ASSERT(vlan != 0);
252 253 mp->b_rptr += sizeof (*evh);
253 254 } else {
254 255 evh = NULL;
255 256 vlan = 0;
256 257 mp->b_rptr += sizeof (*eh);
257 258 }
258 259 if (ethertype != ETHERTYPE_IP && ethertype != ETHERTYPE_IPV6) {
259 260 /*
260 261 * XXX KEBE SAYS for now, don't handle non-IP packets.
261 262 * This includes ARP.
262 263 */
263 264 DTRACE_PROBE1(vxlnat__in__drop__nonip, mblk_t *, mp);
264 265 freemsg(mp);
265 266 return (NULL);
266 267 }
267 268
268 269 /* Handle case of split ether + IP headers. */
269 270 if (MBLKL(mp) < sizeof (ipha_t)) {
270 271 mblk_t *freemp;
271 272
272 273 if (MBLKL(mp) > 0 || mp->b_cont == NULL) {
273 274 /* The IP header is split ACROSS MBLKS! Bail for now. */
274 275 DTRACE_PROBE1(vxlnat__in__drop__splitip, mblk_t *, mp);
275 276 freemsg(mp);
276 277 return (NULL);
277 278 }
278 279 freemp = mp;
279 280 mp = mp->b_cont;
280 281 freeb(freemp);
281 282 }
282 283 /* LINTED -- alignment... */
283 284 ipha = (ipha_t *)mp->b_rptr;
284 285
285 286 if (IPH_HDR_VERSION(ipha) == IPV4_VERSION) {
286 287 if (ethertype != ETHERTYPE_IP) {
287 288 /* XXX KEBE ASKS - should we be more forgiving? */
288 289 DTRACE_PROBE1(vxlnat__in__drop__etherhdr4,
289 290 mblk_t *, mp);
290 291 freemsg(mp);
291 292 return (NULL);
292 293 }
293 294 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
294 295 &remote_addr);
295 296 } else {
296 297 if (ethertype != ETHERTYPE_IPV6 ||
297 298 IPH_HDR_VERSION(ipha) != IPV6_VERSION ||
298 299 MBLKL(mp) < sizeof (ip6_t)) {
299 300 /* XXX KEBE ASKS - should we be more forgiving? */
300 301 DTRACE_PROBE1(vxlnat__in__drop__etherhdr6,
301 302 mblk_t *, mp);
302 303 freemsg(mp);
303 304 return (NULL);
304 305 }
305 306 ip6h = (ip6_t *)ipha;
306 307 remote_addr = ip6h->ip6_src;
307 308 }
308 309
309 310 /* Find remote and replace OR create new remote. */
310 311 remote = vxlnat_get_remote(vnet, &remote_addr, B_TRUE);
311 312 if (remote != NULL) {
312 313 /*
313 314 * See if this entry needs fixing or filling-in. This might
314 315 * get a bit racy with read-only threads that actually
315 316 * transmit, but it only means dropped-packets in the worst
316 317 * case.
317 318 *
318 319 * It's THIS PART that inspires the warning about trusting the
319 320 * underlay network.
320 321 *
321 322 * XXX KEBE ASKS -- should we just replace things w/o checking?
322 323 */
323 324 /* Replace the ethernet address? */
324 325 if (ether_cmp(&remote->vxnrem_ether, &remote_ether) != 0)
325 326 ether_copy(&remote_ether, &remote->vxnrem_ether);
326 327 /*
327 328 * Replace the underlay? NOTE: Fix if/when underlay becomes
328 329 * IPv6.
329 330 */
330 331 if (!IN6_ARE_ADDR_EQUAL(&remote->vxnrem_uaddr,
331 332 &underlay_src->sin6_addr)) {
332 333 remote->vxnrem_uaddr = underlay_src->sin6_addr;
333 334 }
334 335 /* Replace the vlan ID. Maintain network order... */
335 336 if (remote->vxnrem_vlan != vlan)
336 337 remote->vxnrem_vlan = vlan;
337 338 }
338 339 /*
339 340 * Else just continue and pray for better luck on another packet or
340 341 * on the return flight. It is IP, we can Just Drop It (TM)...
341 342 */
342 343
343 344 /* We're done with the remote entry now. */
344 345 VXNREM_REFRELE(remote);
345 346
346 347 /* Advance rptr to the inner IP header and proceed. */
347 348 mp->b_rptr = (uint8_t *)ipha;
348 349 return (mp);
349 350 }
350 351
351 352 /*
353 + * Extract transport-level information to find a NAT flow.
 354 + * Consume mp and return B_FALSE if there's a problem. Otherwise fill in
 355 + * "ports", "protocol", and "nexthdr_ptr", and return B_TRUE.
356 + */
357 +static boolean_t
358 +vxlnat_grab_transport(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, uint32_t *ports,
359 + uint8_t *protocol, uint8_t **nexthdr_ptr)
360 +{
361 + uint8_t *nexthdr;
362 +
363 + /* Punt on IPv6 for now... */
364 + if (ip6h != NULL) {
365 + freemsg(mp);
366 + return (B_FALSE);
367 + }
368 +
369 + ASSERT(ipha != NULL);
370 + *protocol = ipha->ipha_protocol;
371 + nexthdr = ((uint8_t *)ipha + IPH_HDR_LENGTH(ipha));
372 + *nexthdr_ptr = nexthdr; /* Get this out of the way now. */
373 + if (nexthdr > mp->b_wptr) {
374 + DTRACE_PROBE1(vxlnat__in__drop__trnexthdr, mblk_t *, mp);
375 + freemsg(mp);
376 + return (B_FALSE);
377 + }
378 + switch (*protocol) {
379 + case IPPROTO_TCP: {
380 + tcpha_t *tcph = (tcpha_t *)nexthdr;
381 +
382 + if (nexthdr + sizeof (*tcph) > mp->b_wptr) {
383 + DTRACE_PROBE1(vxlnat__in__drop__tcpnexthdr, mblk_t *,
384 + mp);
385 + freemsg(mp);
386 + return (B_FALSE);
387 + }
388 + *ports = *((uint32_t *)tcph);
389 + /* XXX KEBE SAYS - grab other metadata here NOW? */
390 + break;
391 + }
392 + case IPPROTO_UDP: {
393 + udpha_t *udph = (udpha_t *)nexthdr;
394 +
395 + if (nexthdr + sizeof (*udph) > mp->b_wptr) {
396 + DTRACE_PROBE1(vxlnat__in__drop__udpnexthdr, mblk_t *,
397 + mp);
398 + freemsg(mp);
399 + return (B_FALSE);
400 + }
401 + *ports = *((uint32_t *)udph);
402 + /*
403 + * XXX KEBE SAYS - not as much as TCP, but grab other metadata
404 + * here NOW?
405 + */
406 + break;
407 + }
408 + case IPPROTO_ICMP: {
409 + icmph_t *icmph = (icmph_t *)nexthdr;
410 +
411 + if (nexthdr + sizeof (*icmph) > mp->b_wptr) {
412 + DTRACE_PROBE1(vxlnat__in__drop__icmpnexthdr, mblk_t *,
413 + mp);
414 + freemsg(mp);
415 + return (B_FALSE);
416 + }
417 + /* XXX KEBE SAYS sort out ICMP header... */
418 + switch (icmph->icmph_type) {
419 + case ICMP_ECHO_REQUEST:
420 + case ICMP_TIME_STAMP_REQUEST:
421 + case ICMP_TIME_EXCEEDED:
422 + case ICMP_INFO_REQUEST:
423 + case ICMP_ADDRESS_MASK_REPLY:
424 + /* All ones we can sorta cope with... */
425 + break;
426 + default:
427 + DTRACE_PROBE2(vxlnat__in__drop__icmptype, int,
428 + icmph->icmph_type, mblk_t *, mp);
429 + freemsg(mp);
430 + return (B_FALSE);
431 + }
432 + /* NOTE: as of now, will switch position depending on endian. */
433 + *ports = icmph->icmph_echo_ident;
434 + break;
435 + }
436 + default:
437 + *ports = 0;
438 + break;
439 + }
440 +
441 + return (B_TRUE);
442 +}
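
For TCP and UDP, the "ports" word filled in above is simply the first 32 bits of the transport header: source and destination port back to back, still in wire order (for ICMP it is the echo ident instead). The helper below is hypothetical, not part of vxlnat, and uses invented names; it only illustrates how that packed word unpacks.

/*
 * Illustrative only: split the packed "ports" word produced by
 * vxlnat_grab_transport() back into wire-order source/destination ports.
 * The uint32_t copy preserves byte order, so this works regardless of
 * host endianness.
 */
static inline void
vxlnat_ports_unpack(uint32_t ports, in_port_t *sport, in_port_t *dport)
{
	in_port_t *p = (in_port_t *)&ports;

	*sport = p[0];	/* First 16 bits on the wire: source port. */
	*dport = p[1];	/* Next 16 bits: destination port. */
}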
443 +
444 +/*
445 + * This is the evaluate-packet vs. NAT flow state function.
446 + * This function does NOT alter "mp".
447 + */
448 +static boolean_t
449 +vxlnat_verify_natstate(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
450 + vxlnat_flow_t *flow, uint8_t *nexthdr)
451 +{
452 + /* XXX KEBE SAYS FILL ME IN! */
453 + return (B_FALSE);
454 +}
455 +
456 +/*
352 457 * Inspect the packet and find ports & protos (or ICMP types & codes)
353 458 * and see if we have an established NAT flow.
354 459 *
355 460 * XXX KEBE WONDERS if the transmission path will more closely resemble
356 461 * vxlnat_one_vxlan_fixed() because of ipha_ident issues or not...
357 462 *
358 463 * B_TRUE means the packet was handled, and we shouldn't continue processing
359 464 * (even if "was handled" means droppage).
360 465 */
361 466 static boolean_t
362 467 vxlnat_one_vxlan_flow(vxlnat_vnet_t *vnet, mblk_t *mp, ipha_t *ipha,
363 468 ip6_t *ip6h)
364 469 {
365 - /* XXX KEBE SAYS FILL ME IN. */
366 - /* For now... */
470 + vxlnat_flow_t *flow, searcher;
471 + uint8_t *nexthdr;
472 +
473 + /*
474 + * XXX KEBE WONDERS, should we return vxlnat_flow_t instead if we
475 + * miss? That way, we only need to find the ports/protocol ONCE.
476 + */
477 +
478 + if (ip6h != NULL) {
479 + /* Eventually, grab addresses for "searcher". */
480 + return (B_FALSE); /* Bail on IPv6 for now... */
481 + } else {
482 + ASSERT(ipha != NULL);
483 + searcher.vxnfl_isv4 = B_TRUE; /* Required? */
484 + IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
485 + &searcher.vxnfl_src);
486 + IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_dst),
487 + &searcher.vxnfl_dst);
488 + }
489 +
490 + if (!vxlnat_grab_transport(mp, ipha, ip6h, &searcher.vxnfl_ports,
491 + &searcher.vxnfl_protocol, &nexthdr)) {
 492 +		/* vxlnat_grab_transport() already freed mp on failure. */
 493 +		DTRACE_PROBE1(vxlnat__in__flowgrab, mblk_t *, mp);
494 + return (B_TRUE);
495 + }
496 +
497 +
498 + /*
499 + * XXX KEBE SAYS Eventually put the rw&find in an IPv4-only block,
500 + * because IPv6 (if we NAT it like IPv4) will have its own table/tree.
501 + */
502 + rw_enter(&vnet->vxnv_flowv4_lock, RW_READER);
503 + flow = avl_find(&vnet->vxnv_flows_v4, &searcher, NULL);
504 + if (flow != NULL)
505 + VXNFL_REFHOLD(flow);
506 + rw_exit(&vnet->vxnv_flowv4_lock);
507 +
508 + if (flow == NULL)
509 + return (B_FALSE); /* Let caller handle things. */
510 +
511 + if (!vxlnat_verify_natstate(mp, ipha, ip6h, flow, nexthdr)) {
512 + freemsg(mp); /* XXX KEBE SAYS FOR NOW... */
513 + } else {
514 + /* XXX KEBE SAYS PROCESS... */
515 + }
516 +
517 + VXNFL_REFRELE(flow);
518 + return (B_TRUE);
519 +}
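
The avl_find() above needs a total order over the flow key (vxnfl_src, vxnfl_dst, vxnfl_ports, vxnfl_protocol). The comparator vxlnat actually registers for vxnv_flows_v4 is defined outside this diff; the sketch below, reusing the same vxnfl_* fields as the on-stack "searcher", only illustrates one way such a tuple can be ordered.

/*
 * Sketch of a flow-tuple comparator (illustrative; not the comparator
 * vxlnat registers for vxnv_flows_v4).  AVL comparators must return
 * -1, 0, or 1.
 */
static int
vxlnat_flow_cmp_sketch(const void *a, const void *b)
{
	const vxlnat_flow_t *fa = a, *fb = b;
	int rc;

	rc = memcmp(&fa->vxnfl_src, &fb->vxnfl_src, sizeof (in6_addr_t));
	if (rc == 0)
		rc = memcmp(&fa->vxnfl_dst, &fb->vxnfl_dst,
		    sizeof (in6_addr_t));
	if (rc != 0)
		return ((rc > 0) ? 1 : -1);
	if (fa->vxnfl_ports != fb->vxnfl_ports)
		return ((fa->vxnfl_ports > fb->vxnfl_ports) ? 1 : -1);
	if (fa->vxnfl_protocol != fb->vxnfl_protocol)
		return ((fa->vxnfl_protocol > fb->vxnfl_protocol) ? 1 : -1);
	return (0);
}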
520 +
521 +/*
522 + * We have a new packet that seems to require a new NAT flow. Construct that
523 + * flow now, and intern it as both a conn_t in IP *and* in the vnet's
524 + * appropriate vxnv_flows* tree. Return NULL if we have a problem.
525 + */
526 +static vxlnat_flow_t *
527 +vxlnat_new_flow(vxlnat_rule_t *rule, in6_addr_t *inner_src, in6_addr_t *dst,
528 + uint32_t ports, uint8_t protocol)
529 +{
530 + vxlnat_vnet_t *vnet = rule->vxnr_vnet;
531 + vxlnat_flow_t *flow, *oldflow;
532 + avl_tree_t *flowtree;
533 + krwlock_t *flowlock;
534 + avl_index_t where;
535 +
536 + flow = kmem_alloc(sizeof (*flow), KM_NOSLEEP | KM_NORMALPRI);
537 + if (flow == NULL)
538 + return (NULL);
539 +
540 + flow->vxnfl_dst = *dst;
541 + flow->vxnfl_src = *inner_src;
542 + flow->vxnfl_ports = ports;
543 + flow->vxnfl_protocol = protocol;
544 + flow->vxnfl_refcount = 2; /* One for internment, one for caller. */
545 + /* Assume no mixed-IP-version mappings for now. */
546 + if (IN6_IS_ADDR_V4MAPPED(inner_src)) {
547 + ASSERT(IN6_IS_ADDR_V4MAPPED(dst));
548 + flow->vxnfl_isv4 = B_TRUE;
549 + flowtree = &vnet->vxnv_flows_v4;
550 + flowlock = &vnet->vxnv_flowv4_lock;
551 + } else {
552 + ASSERT(!IN6_IS_ADDR_V4MAPPED(dst));
553 + flow->vxnfl_isv4 = B_FALSE;
554 + /* XXX KEBE SAYS we don't do IPv6 for now. */
555 + DTRACE_PROBE2(vxlnat__flow__newv6, in6_addr_t *, inner_src,
556 + in6_addr_t *, dst);
557 + kmem_free(flow, sizeof (*flow));
558 + return (NULL);
559 + }
560 + VXNR_REFHOLD(rule); /* For the flow itself... */
561 + flow->vxnfl_rule = rule;
562 +
563 + rw_enter(flowlock, RW_WRITER);
564 + oldflow = (vxlnat_flow_t *)avl_find(flowtree, flow, &where);
565 + if (oldflow != NULL) {
566 + /*
567 + * Hmmm, someone put one in while we were dinking around.
568 + * XXX KEBE SAYS return the old one, refheld, for now.
569 + */
570 + VXNR_REFRELE(rule);
571 + kmem_free(flow, sizeof (*flow));
572 + VXNFL_REFHOLD(oldflow);
573 + flow = oldflow;
574 + } else {
575 + avl_insert(flowtree, flow, where);
576 + /*
577 + * Do conn_t magic here, except for the conn_t activation. I
578 + * am aware of holding the rwlock-as-write here. We may need
579 + * to move this outside the rwlock hold, and
580 + * reacquire-on-failure.
581 + */
582 + if (!vxlnat_new_conn(flow)) {
583 + ASSERT(flow->vxnfl_connp == NULL);
584 + avl_remove(flowtree, flow);
585 + VXNR_REFRELE(flow->vxnfl_rule);
586 + kmem_free(flow, sizeof (*flow));
587 + flow = NULL;
588 + }
589 + }
590 + rw_exit(flowlock);
591 +
592 + /* We just created this one, activate it. */
593 + if (oldflow == NULL && flow != NULL)
594 + vxlnat_activate_conn(flow);
595 +
596 + return (flow);
597 +}
598 +
599 +void
600 +vxlnat_flow_free(vxlnat_flow_t *flow)
601 +{
602 + ASSERT(flow->vxnfl_refcount == 0);
603 +
604 + /* XXX KEBE SAYS FILL ME IN?! */
605 + /* XXX KEBE ASKS ipcl_hash_remove()? */
606 +
607 + flow->vxnfl_connp->conn_priv = NULL; /* Sufficient? */
608 + CONN_DEC_REF(flow->vxnfl_connp);
609 + VXNR_REFRELE(flow->vxnfl_rule);
610 + kmem_free(flow, sizeof (*flow));
611 +}
612 +
613 +static boolean_t
614 +vxlnat_verify_initial(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
615 + uint32_t ports, uint8_t protocol, uint8_t *nexthdr)
616 +{
617 + /* XXX KEBE SAYS FILL ME IN! */
618 + freemsg(mp);
367 619 return (B_FALSE);
368 620 }
369 621
370 622 /*
371 623 * If we reach here, we need to find a NAT rule, and see if we can/should
372 624 * CREATE a new NAT flow, or whether or not we should drop, maybe even
373 625 * returning an ICMP message of some sort.
374 626 *
375 627 * B_TRUE means the packet was handled, and we shouldn't continue processing
376 628 * (even if "was handled" means droppage).
377 629 */
378 630 static boolean_t
379 631 vxlnat_one_vxlan_rule(vxlnat_vnet_t *vnet, mblk_t *mp, ipha_t *ipha,
380 632 ip6_t *ip6h)
381 633 {
382 634 vxlnat_rule_t *rule;
635 + vxlnat_flow_t *flow;
636 + in6_addr_t v4m_src, v4m_dst, *inner_src, *dst;
637 + uint32_t ports;
638 + uint8_t protocol;
639 + uint8_t *nexthdr;
383 640
384 - /* XXX handle IPv6 later */
641 + /* XXX handle IPv6 later, assigning inner_src and dst to ip6_t addrs. */
385 642 if (ip6h != NULL)
386 643 return (B_FALSE);
387 644
388 645 ASSERT3P(ipha, !=, NULL);
646 + inner_src = &v4m_src;
647 + dst = &v4m_dst;
648 + IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src), inner_src);
649 + IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_dst), dst);
389 650
390 651 mutex_enter(&vnet->vxnv_rule_lock);
391 652 rule = list_head(&vnet->vxnv_rules);
392 653
393 654 /*
394 655 * search for a match in the nat rules
 395 656 	 * XXX investigate perf issues with respect to list_t size
657 + * XXX KEBE SAYS rewrite when we start doing IPv6 to use "inner_src"
658 + * and "dst".
396 659 */
397 660 while (rule != NULL) {
398 661 ipaddr_t ipaddr;
399 662 uint32_t netmask = 0xffffffff;
400 663 uint8_t prefix = rule->vxnr_prefix - 96;
401 664
402 665 /* calculate the v4 netmask */
403 666 netmask <<= (32 - prefix);
404 667 netmask = htonl(netmask);
405 668
406 669 IN6_V4MAPPED_TO_IPADDR(&rule->vxnr_myaddr, ipaddr);
407 670 /* XXX ASSERT vlanid? */
408 671 if ((ipaddr & netmask) == (ipha->ipha_src & netmask)) {
409 672 VXNR_REFHOLD(rule);
410 673 break;
411 674 }
412 675
413 676 rule = list_next(&vnet->vxnv_rules, rule);
414 677 }
415 678
416 679 mutex_exit(&vnet->vxnv_rule_lock);
417 680
418 681 if (rule == NULL)
419 682 return (B_FALSE);
420 683
421 684 /* process packet */
685 +
422 686 /*
423 - static vxlnat_flow_t *
424 - vxlnat_new_flow(vxlnat_rule_t *rule, in6_addr_t *inner_src, in6_addr_t *dst,
425 - uint32_t ports, uint8_t protocol)
687 + * Grab transport header, and figure out if we can proceed.
688 + *
689 + * NOTE: vxlnat_grab_transport() will free/consume mp if it fails,
690 + * because we want to isolate non-flow-starters without having them
691 + * create new flows. This means we return B_TRUE (consumed mp) on
692 + * failure.
426 693 */
694 + if (!vxlnat_grab_transport(mp, ipha, ip6h, &ports, &protocol, &nexthdr))
695 + return (B_TRUE); /* see above... */
696 + if (!vxlnat_verify_initial(mp, ipha, ip6h, ports, protocol, nexthdr))
697 + return (B_TRUE);
698 +
427 699
700 + flow = vxlnat_new_flow(rule, inner_src, dst, ports, protocol);
701 + if (flow != NULL) {
702 + /*
703 + * Call same function that vxlnat_one_vxlan_flow() uses
704 + * to remap & transmit the packet out the external side.
705 + *
 706 +		 * NOTE: We've already checked the initial-packet
 707 +		 * qualification, so unlike the main datapath, we don't
 708 +		 * need to call vxlnat_verify_natstate().
709 + */
710 +
711 + /* XXX KEBE SAYS PROCESS... */
712 +
713 + VXNFL_REFRELE(flow);
714 + return (B_TRUE);
715 + }
716 +
428 717 return (B_FALSE);
429 718 }
430 719
431 720 /*
432 721 * See if the inbound VXLAN packet hits a 1-1/fixed mapping, and process if it
433 722 * does. B_TRUE means the packet was handled, and we shouldn't continue
434 723 * processing (even if "was handled" means droppage).
435 724 */
436 725 static boolean_t
437 726 vxlnat_one_vxlan_fixed(vxlnat_vnet_t *vnet, mblk_t *mp, ipha_t *ipha,
438 727 ip6_t *ip6h)
439 728 {
440 729 vxlnat_fixed_t *fixed, fsearch;
441 730 mblk_t *newmp;
442 731 ire_t *outbound_ire;
443 732 /* Use C99's initializers for fun & profit. */
444 733 ip_recv_attr_t iras = { IRAF_IS_IPV4 | IRAF_VERIFIED_SRC };
445 734
446 735 if (ipha != NULL) {
447 736 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
448 737 &fsearch.vxnf_addr);
449 738 } else {
450 739 /* vxlnat_cache_remote() did reality checks... */
451 740 ASSERT(ipha == NULL && ip6h != NULL);
452 741 fsearch.vxnf_addr = ip6h->ip6_src;
453 742 }
454 743
455 744 rw_enter(&vnet->vxnv_fixed_lock, RW_READER);
456 745 fixed = avl_find(&vnet->vxnv_fixed_ips, &fsearch, NULL);
457 746 if (fixed != NULL)
458 747 VXNF_REFHOLD(fixed);
459 748 rw_exit(&vnet->vxnv_fixed_lock);
460 749 if (fixed == NULL)
461 750 return (B_FALSE); /* Try another method of processing. */
462 751
463 752 newmp = NULL;
464 753 /*
465 754 * XXX KEBE ASKS --> Do an MTU check NOW?! That way, we have
466 755 * pre-natted data. One gotcha, external dests may have
467 756 * different PathMTUs so see below about EMSGSIZE...
468 757 *
469 758 * For now, let the post-NAT crunch through
470 759 * ire_recv_forward_v4() take care of all of that.
471 760 */
472 761
473 762 if (ipha != NULL)
474 763 newmp = vxlnat_fixed_fixv4(mp, fixed, B_FALSE);
475 764 else {
476 765 freemsg(mp); /* XXX handle ip6h */
477 766 return (B_TRUE);
478 767 }
479 768
480 769 if (newmp == NULL)
481 770 return (B_TRUE); /* mp eaten by vxlnat_fixed_fixv4() */
482 771
483 772
484 773 ASSERT3P(ipha, ==, newmp->b_rptr);
485 774 /* XXX KEBE ASKS, IRR_ALLOCATE okay?!? */
486 775 /* XXX KEBE SAYS XMIT HINT! */
487 776 outbound_ire = ire_route_recursive_dstonly_v4(ipha->ipha_dst,
488 777 IRR_ALLOCATE, 0, vxlnat_netstack->netstack_ip);
489 778 VERIFY3P(outbound_ire, !=, NULL);
490 779 if (outbound_ire->ire_type == IRE_NOROUTE) {
491 780 /* Bail! */
492 781 DTRACE_PROBE2(vxlnat__in__drop__fixedire, ipaddr_t,
493 782 ipha->ipha_dst, mblk_t *, mp);
494 783 VXNF_REFRELE(fixed);
495 784 freemsg(mp);
496 785 return (B_TRUE);
497 786 }
498 787
499 788 iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
500 789 if (iras.ira_ip_hdr_length > sizeof (ipha_t))
501 790 iras.ira_flags |= IRAF_IPV4_OPTIONS;
502 791 iras.ira_xmit_hint = 0; /* XXX KEBE SAYS FIX ME! */
503 792 iras.ira_zoneid = outbound_ire->ire_zoneid;
504 793 iras.ira_pktlen = ntohs(ipha->ipha_length);
505 794 iras.ira_protocol = ipha->ipha_protocol;
506 795 /* XXX KEBE ASKS rifindex & ruifindex ?!? */
507 796 /*
508 797 * NOTE: AT LEAST ira_ill needs ILLF_ROUTER set, as
509 798 * well as the ill for the external NIC (where
510 799 * off-link destinations live). For fixed, ira_ill
511 800 * should be the ill of the external source.
512 801 */
513 802 iras.ira_rill = vxlnat_underlay_ire->ire_ill;
514 803 iras.ira_ill = fixed->vxnf_ire->ire_ill;
515 804 /* XXX KEBE ASKS cred & cpid ? */
516 805 iras.ira_verified_src = ipha->ipha_src;
517 806 /* XXX KEBE SAYS don't sweat IPsec stuff. */
518 807 /* XXX KEBE SAYS ALSO don't sweat l2src & mhip */
519 808
520 809 /* Okay, we're good! Let's pretend we're forwarding. */
521 810 ire_recv_forward_v4(outbound_ire, mp, ipha, &iras);
522 811 ire_refrele(outbound_ire);
523 812
524 813 return (B_TRUE);
525 814 }
526 815
527 816 /*
528 817 * Process exactly one VXLAN packet.
529 818 */
530 819 static void
531 820 vxlnat_one_vxlan(mblk_t *mp, struct sockaddr_in6 *underlay_src)
532 821 {
533 822 vxlan_hdr_t *vxh;
534 823 vxlnat_vnet_t *vnet;
535 824 ipha_t *ipha;
536 825 ip6_t *ip6h;
537 826
538 827 if (MBLKL(mp) < sizeof (*vxh)) {
539 828 /* XXX KEBE ASKS -- should we be more forgiving? */
540 829 DTRACE_PROBE1(vxlnat__in__drop__vxlsize, mblk_t *, mp);
541 830 freemsg(mp);
542 831 return;
543 832 }
544 833 vxh = (vxlan_hdr_t *)mp->b_rptr;
545 834
546 835 /* If we start using more than just the one flag, fix it. */
547 836 if (vxh->vxlan_flags != VXLAN_F_VDI_WIRE) {
548 837 DTRACE_PROBE1(vxlnat__in__drop__VDI, mblk_t *, mp);
549 838 freemsg(mp);
550 839 return;
551 840 }
552 841
553 842 /* Remember, we key off of what's on the wire. */
554 843 vnet = vxlnat_get_vnet(VXLAN_ID_WIRE32(vxh->vxlan_id), B_FALSE);
555 844 if (vnet == NULL) {
556 845 DTRACE_PROBE1(vxlnat__in__drop__vnetid, uint32_t,
557 846 VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)));
558 847 freemsg(mp);
559 848 return;
560 849 }
561 850
562 851 DTRACE_PROBE2(vxlnat__in__vnet, uint32_t,
563 852 VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)),
564 853 vxlnat_vnet_t, vnet);
565 854
566 855 /*
567 - * Off-vxlan processing steps:
856 + * Arrived-from-vxlan processing steps:
568 857 * 1.) Locate the ethernet header and check/update/add-into remotes.
569 858 * 2.) Search 1-1s, process if hit.
570 859 * 3.) Search flows, process if hit.
571 860 * 4.) Search rules, create new flow (or not) if hit.
572 - * 5.) Drop the packets.
861 + * 5.) Drop the packet.
573 862 */
574 863
575 864 /* 1.) Locate the ethernet header and check/update/add-into remotes. */
576 865 mp->b_rptr += sizeof (*vxh);
577 866 while (MBLKL(mp) == 0) {
578 867 mblk_t *oldmp = mp;
579 868
580 869 mp = mp->b_cont;
581 870 freeb(oldmp);
582 871 }
583 872 mp = vxlnat_cache_remote(mp, underlay_src, vnet);
584 873 if (mp == NULL)
585 874 goto bail_no_free;
586 875
587 876 /* Let's cache the IP header here... */
588 877 ipha = (ipha_t *)mp->b_rptr;
589 878 switch (IPH_HDR_VERSION(ipha)) {
590 879 case IPV4_VERSION:
591 880 ip6h = NULL;
592 881 break;
593 882 case IPV6_VERSION:
594 883 ip6h = (ip6_t *)ipha;
595 884 ipha = NULL;
596 885 break;
597 886 default:
598 887 DTRACE_PROBE2(vxlnat__in__drop__ipvers, int,
599 888 IPH_HDR_VERSION(ipha), mblk_t *, mp);
600 889 goto bail_and_free;
601 890 }
602 891
603 892 /* 2.) Search 1-1s, process if hit. */
604 893 if (vxlnat_one_vxlan_fixed(vnet, mp, ipha, ip6h))
605 894 goto bail_no_free; /* Success means mp was consumed. */
606 895
607 896 /* 3.) Search flows, process if hit. */
608 897 if (vxlnat_one_vxlan_flow(vnet, mp, ipha, ip6h))
609 898 goto bail_no_free; /* Success means mp was consumed. */
610 899
611 900 /* 4.) Search rules, create new flow (or not) if hit. */
612 901 if (vxlnat_one_vxlan_rule(vnet, mp, ipha, ip6h))
613 902 goto bail_no_free; /* Success means mp was consumed. */
614 903
615 904 /* 5.) Nothing, drop the packet. */
616 905
617 906 DTRACE_PROBE2(vxlnat__in___drop__nohits, vxlnat_vnet_t *, vnet,
618 907 mblk_t *, mp);
619 908
620 909 bail_and_free:
621 910 freemsg(mp);
622 911 bail_no_free:
623 912 VXNV_REFRELE(vnet);
624 913 }
625 914 /*
626 915 * ONLY return B_FALSE if we get a packet-clogging event.
627 916 */
628 917 /* ARGSUSED */
629 918 static boolean_t
630 919 vxlnat_vxlan_input(ksocket_t insock, mblk_t *chain, size_t msgsize, int oob,
631 920 void *ignored)
632 921 {
633 922 mblk_t *mp, *nextmp;
634 923
635 924 /*
636 925 * XXX KEBE ASKS --> move hold & release outside of loop?
637 926 * If so, hold rwlock here.
638 927 */
639 928
640 929 for (mp = chain; mp != NULL; mp = nextmp) {
641 930 struct T_unitdata_ind *tudi;
642 931 struct sockaddr_in6 *sin6;
643 932
644 933 nextmp = mp->b_next;
645 934 if (DB_TYPE(mp) != M_PROTO || mp->b_cont == NULL) {
646 935 DTRACE_PROBE1(vxlnat__in__drop__mblk, mblk_t *, mp);
647 936 freemsg(mp);
648 937 continue;
649 938 }
650 939
651 940 /* LINTED -- aligned */
652 941 tudi = (struct T_unitdata_ind *)mp->b_rptr;
653 942 if (tudi->PRIM_type != T_UNITDATA_IND) {
654 943 DTRACE_PROBE1(vxlnat__in__drop__TPI, mblk_t *, mp);
655 944 freemsg(mp);
656 945 continue;
657 946 }
658 947 /* LINTED -- aligned */
659 948 sin6 = (struct sockaddr_in6 *)(mp->b_rptr + tudi->SRC_offset);
660 949 VERIFY(sin6->sin6_family == AF_INET6);
661 950 VERIFY(tudi->SRC_length >= sizeof (*sin6));
662 951
663 952 vxlnat_one_vxlan(mp->b_cont, sin6);
664 953 freeb(mp);
665 954 }
666 955
667 956 return (B_TRUE);
668 957 }
669 958
670 959 /*
671 960 * Use RFC 1141's technique (with a check for -0).
672 961 *
 673 962 	 * newsum = oldsum - (new16a - old16a) - (new16b - old16b) - ...;
674 963 *
675 964 * NOTE: "oldsum" is right off the wire in wire-native order.
676 965 * NOTE2: "old" and "new" ALSO point to things in wire-native order.
677 966 * NOTE3: THIS MUST TAKE A MULTIPLE OF 2 BYTES (i.e. uint16_t array).
678 967 * NOTE4: The 32-bit running sum means we can't take len > 64k.
679 968 */
680 969 uint16_t
681 970 vxlnat_cksum_adjust(uint16_t oldsum, uint16_t *old, uint16_t *new, uint_t len)
682 971 {
683 972 uint32_t newsum = ntohs(oldsum);
684 973
685 974 ASSERT((len & 0x1) == 0);
686 975 while (len != 0) {
687 976 newsum -= ntohs(*new);
688 977 newsum += ntohs(*old);
689 978 len -= 2;
690 979 old++;
691 980 new++;
692 981 }
693 982 newsum += (newsum >> 16) & 0xffff;
694 983
695 984 return (newsum == 0xffff ? 0 : htons(newsum));
696 985 }
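
As a sanity check of the subtract-new/add-old arithmetic above, here is a hypothetical stand-alone user-space program (not illumos code; all names invented) that patches a toy two-word "header" incrementally and compares the result with a full one's-complement recompute. It keeps values in host order for brevity; the kernel function operates on wire-order data.

#include <stdio.h>
#include <stdint.h>

/* Full one's-complement checksum over host-order 16-bit words. */
static uint16_t
full_cksum16(const uint16_t *words, int nwords)
{
	uint32_t sum = 0;

	while (nwords-- > 0)
		sum += *words++;
	while (sum > 0xffff)
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)~sum);
}

int
main(void)
{
	uint16_t hdr[2] = { 0x1234, 0x5678 };	/* Toy two-word "header". */
	uint16_t oldsum = full_cksum16(hdr, 2);	/* 0x9753 */
	uint16_t old_word = 0x5678, new_word = 0x9999;
	uint32_t newsum = oldsum;

	/* Same per-word arithmetic as vxlnat_cksum_adjust(). */
	newsum -= new_word;
	newsum += old_word;
	newsum += (newsum >> 16) & 0xffff;
	if (newsum == 0xffff)
		newsum = 0;

	hdr[1] = new_word;
	(void) printf("incremental 0x%04x, full recompute 0x%04x\n",
	    newsum & 0xffff, full_cksum16(hdr, 2));	/* Both 0x5432. */
	return (0);
}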
697 986
698 987 /*
699 988 * Fix inner headers on an ICMP packet.
700 989 *
701 990 * XXX KEBE SAYS FOR NOW, just do addresses for 1-1/fixed. When we do
702 991 * flows, include old_port/new_port as well.
703 992 */
704 993 static mblk_t *
705 994 vxlnat_fix_icmp_inner_v4(mblk_t *mp, icmph_t *icmph, ipaddr_t old_one,
706 995 ipaddr_t new_one, boolean_t to_private)
707 996 {
708 997 mblk_t *newmp;
709 998 ipha_t *inner_ipha;
710 999 ipaddr_t *new_ones_place;
711 1000
712 1001 if ((uint8_t *)(icmph + 1) + sizeof (ipha_t) > mp->b_wptr) {
713 1002 /* Pay the pullup tax. */
714 1003 newmp = msgpullup(mp, -1);
715 1004 freemsg(mp);
716 1005 if (newmp == NULL) {
717 1006 DTRACE_PROBE1(vxlnat__fixicmp__pullupfail, void *,
718 1007 NULL);
719 1008 return (NULL);
720 1009 }
721 1010 if (MBLKL(newmp) < 2 * sizeof (ipha_t) + sizeof (icmph_t)) {
722 1011 /* Wow! Too-tiny ICMP packet. */
723 1012 DTRACE_PROBE1(vxlnat__fixicmp__tootiny, mblk_t *,
724 1013 newmp);
725 1014 freeb(newmp);
726 1015 return (NULL);
727 1016 }
728 1017 mp = newmp;
729 1018 /* Temporarily use inner_ipha for the outer one. */
730 1019 inner_ipha = (ipha_t *)mp->b_rptr;
731 1020 icmph = (icmph_t *)(mp->b_rptr + IPH_HDR_LENGTH(inner_ipha));
732 1021 }
733 1022 inner_ipha = (ipha_t *)(icmph + 1);
734 1023 new_ones_place = to_private ?
735 1024 &inner_ipha->ipha_src : &inner_ipha->ipha_dst;
736 1025 if (*new_ones_place != old_one) {
737 1026 /* Either I'm buggy or the packet is. */
738 1027 DTRACE_PROBE2(vxlnat__fixicmp__badinneraddr, ipaddr_t,
739 1028 old_one, ipaddr_t, *new_ones_place);
740 1029 freeb(mp);
741 1030 return (NULL);
742 1031 }
743 1032 *new_ones_place = new_one;
744 1033
745 1034 /* Adjust ICMP checksum... */
746 1035 icmph->icmph_checksum = vxlnat_cksum_adjust(icmph->icmph_checksum,
747 1036 (uint16_t *)&old_one, (uint16_t *)&new_one, sizeof (ipaddr_t));
748 1037
749 1038 /*
750 1039 * XXX KEBE ASKS, recompute *inner-packet* checksums? Let's not for
751 1040 * now, but consider this Fair Warning (or some other VH album...).
752 1041 */
753 1042 return (mp);
754 1043 }
755 1044
756 1045 /*
757 1046 * Take a 1-1/fixed IPv4 packet and convert it for transmission out the
758 1047 * appropriate end. "to_private" is what it says on the tin.
759 1048 * ALWAYS consumes "mp", regardless of return value.
760 1049 */
761 1050 static mblk_t *
762 1051 vxlnat_fixed_fixv4(mblk_t *mp, vxlnat_fixed_t *fixed, boolean_t to_private)
763 1052 {
764 1053 ipaddr_t new_one, old_one;
765 1054 ipaddr_t *new_ones_place;
766 1055 ipha_t *ipha = (ipha_t *)mp->b_rptr;
767 1056 uint8_t *nexthdr, *end_wptr;
768 1057
769 1058 if (to_private) {
770 1059 IN6_V4MAPPED_TO_IPADDR(&fixed->vxnf_addr, new_one);
771 1060 new_ones_place = &ipha->ipha_dst;
772 1061 } else {
773 1062 IN6_V4MAPPED_TO_IPADDR(&fixed->vxnf_pubaddr, new_one);
774 1063 new_ones_place = &ipha->ipha_src;
775 1064 }
776 1065
777 1066 old_one = *new_ones_place;
778 1067 *new_ones_place = new_one;
779 1068
780 1069 /*
781 1070 * Recompute the IP header checksum, and check for the TCP or UDP
782 1071 * checksum as well, as they'll need recomputing as well.
783 1072 */
784 1073
785 1074 /* First, the IPv4 header itself. */
786 1075 ipha->ipha_hdr_checksum = vxlnat_cksum_adjust(ipha->ipha_hdr_checksum,
787 1076 (uint16_t *)&old_one, (uint16_t *)&new_one, sizeof (ipaddr_t));
788 1077
789 1078 nexthdr = (uint8_t *)ipha + IPH_HDR_LENGTH(ipha);
790 1079 if (nexthdr >= mp->b_wptr) {
791 1080 nexthdr = mp->b_cont->b_rptr +
792 1081 (MBLKL(mp) - IPH_HDR_LENGTH(ipha));
793 1082 end_wptr = mp->b_cont->b_wptr;
794 1083 } else {
795 1084 end_wptr = mp->b_wptr;
796 1085 }
797 1086
798 1087 switch (ipha->ipha_protocol) {
799 1088 case IPPROTO_TCP: {
800 1089 tcpha_t *tcph = (tcpha_t *)nexthdr;
801 1090
802 1091 if (nexthdr + sizeof (*tcph) >= end_wptr) {
803 1092 /* Bail for now. */
804 1093 DTRACE_PROBE1(vxlnat__fix__tcp__mblkspan, mblk_t *,
805 1094 mp);
806 1095 freemsg(mp);
807 1096 return (NULL);
808 1097 }
809 1098 tcph->tha_sum = vxlnat_cksum_adjust(tcph->tha_sum,
810 1099 (uint16_t *)&old_one, (uint16_t *)&new_one,
811 1100 sizeof (ipaddr_t));
812 1101 break; /* Out of switch. */
813 1102 }
814 1103 case IPPROTO_UDP: {
815 1104 udpha_t *udph = (udpha_t *)nexthdr;
816 1105
817 1106 if (nexthdr + sizeof (*udph) >= end_wptr) {
818 1107 /* Bail for now. */
819 1108 DTRACE_PROBE1(vxlnat__fix__udp__mblkspan, mblk_t *,
820 1109 mp);
821 1110 freemsg(mp);
822 1111 return (NULL);
823 1112 }
824 1113 udph->uha_checksum = vxlnat_cksum_adjust(udph->uha_checksum,
825 1114 (uint16_t *)&old_one, (uint16_t *)&new_one,
826 1115 sizeof (ipaddr_t));
827 1116 break; /* Out of switch. */
828 1117 }
829 1118 case IPPROTO_ICMP: {
830 1119 icmph_t *icmph = (icmph_t *)nexthdr;
831 1120
832 1121 /*
833 1122 * We need to check the case of ICMP messages that contain
834 1123 * IP packets. We will need to at least change the addresses,
835 1124 * and *maybe* the checksums too if necessary.
836 1125 *
837 1126 * This may replicate some of icmp_inbound_v4(), alas.
838 1127 */
839 1128 if (nexthdr + sizeof (*icmph) >= end_wptr) {
840 1129 mblk_t *newmp;
841 1130 /*
842 1131 * Unlike the others, we're going to pay the pullup
843 1132 * tax here.
844 1133 */
845 1134 newmp = msgpullup(mp, -1);
846 1135 freemsg(mp);
847 1136 if (newmp == NULL) {
848 1137 DTRACE_PROBE1(vxlnat__icmp__pullupfail, void *,
849 1138 NULL);
850 1139 return (NULL);
851 1140 }
852 1141 mp = newmp;
853 1142 ipha = (ipha_t *)(mp->b_rptr);
854 1143 nexthdr = (uint8_t *)ipha + IPH_HDR_LENGTH(ipha);
855 1144 icmph = (icmph_t *)nexthdr;
856 1145 }
857 1146
858 1147 switch (icmph->icmph_type) {
859 1148 case ICMP_ADDRESS_MASK_REPLY:
860 1149 case ICMP_ADDRESS_MASK_REQUEST:
861 1150 case ICMP_TIME_STAMP_REPLY:
862 1151 case ICMP_TIME_STAMP_REQUEST:
863 1152 case ICMP_ECHO_REQUEST:
864 1153 case ICMP_ECHO_REPLY:
865 1154 /* These merely need to get passed along. */
866 1155 break;
867 1156 case ICMP_ROUTER_ADVERTISEMENT:
868 1157 case ICMP_ROUTER_SOLICITATION:
869 1158 /* These shouldn't be traversing a NAT at all. Drop. */
870 1159 DTRACE_PROBE1(vxlnat__icmp__cantpass, int,
871 1160 icmph->icmph_type);
872 1161 freemsg(mp);
873 1162 return (NULL);
874 1163 case ICMP_PARAM_PROBLEM:
875 1164 case ICMP_TIME_EXCEEDED:
876 1165 case ICMP_DEST_UNREACHABLE:
877 1166 /* These include inner-IP headers we need to adjust. */
878 1167 mp = vxlnat_fix_icmp_inner_v4(mp, icmph, old_one,
879 1168 new_one, to_private);
880 1169 break;
881 1170 default:
882 1171 /* Pass along to receiver, but warn. */
883 1172 DTRACE_PROBE1(vxlnat__icmp__unknown, int,
884 1173 icmph->icmph_type);
885 1174 break;
886 1175 }
887 1176 }
888 1177 /* Otherwise we can't make any other assumptions for now... */
889 1178 default:
890 1179 break;
891 1180 }
892 1181
893 1182 return (mp);
894 1183 }
895 1184
896 1185 vxlnat_remote_t *
897 1186 vxlnat_xmit_vxlanv4(mblk_t *mp, in6_addr_t *overlay_dst,
898 1187 vxlnat_remote_t *remote, uint8_t *myether, vxlnat_vnet_t *vnet)
899 1188 {
900 1189 struct sockaddr_in6 sin6 = {AF_INET6};
901 1190 struct msghdr msghdr = {NULL};
902 1191 mblk_t *vlan_mp;
903 1192 extern uint_t vxlan_alloc_size, vxlan_noalloc_min;
904 1193 vxlan_hdr_t *vxh;
905 1194 struct ether_vlan_header *evh;
906 1195 int rc;
907 1196 cred_t *cred;
908 1197
909 1198 if (remote == NULL || remote->vxnrem_vnet == NULL) {
910 1199 DTRACE_PROBE1(vxlnat__xmit__vxlanv4, vxlnat_remote_t *, remote);
911 1200 /* Release the condemned remote. */
912 1201 if (remote != NULL)
913 1202 VXNREM_REFRELE(remote);
914 1203
915 1204 /* See if we have a remote ready to use... */
916 1205 remote = vxlnat_get_remote(vnet, overlay_dst, B_FALSE);
917 1206
918 1207 if (remote == NULL) {
919 1208 /*
920 1209 * We need to do the moral equivalent of PF_KEY
921 1210 * ACQUIRE or overlay's queue-resolve so that we can
922 1211 * have someone in user-space send me a remote. Until
923 1212 * then, drop the reference if condemned, free the
924 1213 * message, and return NULL.
925 1214 */
926 1215
927 1216 freemsg(mp);
928 1217 return (NULL);
929 1218 }
930 1219 }
931 1220 ASSERT(vnet == remote->vxnrem_vnet);
932 1221
933 1222 if (DB_REF(mp) > 1 || mp->b_rptr - vxlan_noalloc_min < DB_BASE(mp)) {
934 1223 vlan_mp = allocb(vxlan_alloc_size, BPRI_HI);
935 1224 if (vlan_mp == NULL) {
936 1225 DTRACE_PROBE1(vxlnat__xmit__vxlanv4__allocfail,
937 1226 vxlnat_remote_t *, remote);
938 1227 freemsg(mp);
939 1228 /* Just drop the packet, but don't tell caller. */
940 1229 return (remote);
941 1230 }
942 1231 vlan_mp->b_wptr = DB_LIM(vlan_mp);
943 1232 vlan_mp->b_rptr = vlan_mp->b_wptr;
944 1233 vlan_mp->b_cont = mp;
945 1234 } else {
946 1235 vlan_mp = mp;
947 1236 }
948 1237 vlan_mp->b_rptr -= sizeof (*vxh) + sizeof (*evh);
949 1238 vxh = (vxlan_hdr_t *)vlan_mp->b_rptr;
950 1239 vxh->vxlan_flags = VXLAN_F_VDI_WIRE;
951 1240 vxh->vxlan_id = vnet->vxnv_vnetid; /* Already in wire-order. */
952 1241
953 1242 /* Fill in the Ethernet header. */
954 1243 evh = (struct ether_vlan_header *)(vxh + 1);
955 1244 ether_copy(&remote->vxnrem_ether, &evh->ether_dhost);
956 1245 ether_copy(myether, &evh->ether_shost);
957 1246 evh->ether_tpid = htons(ETHERTYPE_VLAN);
958 1247 evh->ether_tci = remote->vxnrem_vlan;
959 1248 evh->ether_type = htons(ETHERTYPE_IP);
960 1249
961 1250 msghdr.msg_name = (struct sockaddr_storage *)&sin6;
962 1251 msghdr.msg_namelen = sizeof (sin6);
963 1252 /* Address family and other zeroing already done up top. */
964 1253 sin6.sin6_port = htons(IPPORT_VXLAN);
965 1254 sin6.sin6_addr = remote->vxnrem_uaddr;
966 1255
967 1256 /*
968 1257 * cred_t dance is because we may be getting this straight from
969 1258 * interrupt context.
970 1259 */
971 1260 cred = zone_get_kcred(netstack_get_zoneid(vxlnat_netstack));
972 1261 if (cred == NULL) {
973 1262 DTRACE_PROBE1(vxlnat__xmit__vxlan4__credfail,
974 1263 vxlnat_remote_t *, remote);
 975 1264		freemsg(vlan_mp);
 1265 +		return (remote);
 976 1266	}
977 1266 /*
978 1267 * Use MSG_DONTWAIT to avoid blocks, esp. if we're getting this
979 1268 * straight from interrupt context.
980 1269 */
981 1270 rc = ksocket_sendmblk(vxlnat_underlay, &msghdr, MSG_DONTWAIT, &vlan_mp,
982 1271 cred);
983 1272 crfree(cred);
984 1273 if (rc != 0) {
985 1274 DTRACE_PROBE2(vxlnat__xmit__vxlan4__sendfail, int, rc,
986 1275 vxlnat_remote_t *, remote);
987 1276 freemsg(vlan_mp);
988 1277 }
989 1278 return (remote);
990 1279 }
991 1280
992 1281 /*
993 1282 * New ire_{recv,send}fn implementations if we're doing 1-1 mappings.
994 1283 */
995 1284 int
996 1285 vxlnat_fixed_ire_send_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
997 1286 ip_xmit_attr_t *ixa, uint32_t *identp)
998 1287 {
999 1288 /* XXX KEBE SAYS FILL ME IN, but for now... */
1000 1289 freemsg(mp);
1001 1290 return (EOPNOTSUPP);
1002 1291 }
1003 1292
1004 1293 void
1005 1294 vxlnat_fixed_ire_recv_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
1006 1295 ip_recv_attr_t *ira)
1007 1296 {
1008 1297 /* XXX KEBE SAYS FILL ME IN, but for now... */
1009 1298 freemsg(mp);
1010 1299 }
1011 1300
1012 1301 /*
1013 1302 * I believe the common case for this will be from self-generated ICMP
1014 1303 * messages. Other same-netstack-originated traffic will also come through
1015 1304 * here (one internal reaching what turns out to be another internal).
1016 1305 */
1017 1306 int
1018 1307 vxlnat_fixed_ire_send_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1019 1308 ip_xmit_attr_t *ixa, uint32_t *identp)
1020 1309 {
1021 1310 ip_recv_attr_t iras; /* NOTE: No bzero because we pay more later */
1022 1311 ipha_t *ipha = (ipha_t *)iph_arg;
1023 1312
1024 1313 /*
1025 1314 * XXX KEBE ASKS, any DTrace probes or other instrumentation that
1026 1315 * perhaps should be set?
1027 1316 */
1028 1317
1029 1318 /* Map ixa to ira. */
1030 1319 iras.ira_pktlen = ixa->ixa_pktlen;
1031 1320 /* XXX KEBE ASKS more?!? */
1032 1321
1033 1322 /*
1034 1323 * In normal TCP/IP processing, this shortcuts the IP header checksum
1035 1324 * AND POSSIBLY THE ULP checksum cases. Since this is likely to head
1036 1325 * back into the internal network, we need to recompute things again.
1037 1326 */
1038 1327 if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
1039 1328 freemsg(mp);
1040 1329 return (EMSGSIZE);
1041 1330 }
1042 1331 #if 0
1043 1332 /* XXX KEBE ASKS Special-case ICMP here? */
1044 1333 if (ipha->ipha_protocol == IPPROTO_ICMP) {
1045 1334 icmph_t *icmph;
1046 1335
1047 1336 icmph = (icmph_t *)((uint8_t *)ipha + IPH_HDR_LENGTH(ipha));
1048 1337 if ((uint8_t *)icmph >= mp->b_wptr) {
1049 1338 freemsg(mp);
1050 1339 return (EMSGSIZE);
1051 1340 }
1052 1341 icmph->icmph_checksum = 0;
1053 1342 icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
1054 1343 }
1055 1344 #endif
1056 1345
1057 1346 vxlnat_fixed_ire_recv_v4(ire, mp, iph_arg, &iras);
1058 1347
1059 1348 return (0);
1060 1349 }
1061 1350
1062 1351 void
1063 1352 vxlnat_fixed_ire_recv_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1064 1353 ip_recv_attr_t *ira)
1065 1354 {
1066 1355 vxlnat_fixed_t *fixed;
1067 1356 vxlnat_vnet_t *vnet;
1068 1357 ipha_t *ipha = (ipha_t *)iph_arg;
1069 1358 int newmtu;
1070 1359
1071 1360 /* Make a note for DAD that this address is in use */
1072 1361 ire->ire_last_used_time = LBOLT_FASTPATH;
1073 1362
1074 1363 /* Only target the IRE_LOCAL with the right zoneid. */
1075 1364 ira->ira_zoneid = ire->ire_zoneid;
1076 1365
1077 1366 /*
1078 1367 * XXX KEBE ASKS, any DTrace probes or other instrumentation that
1079 1368 * perhaps should be set?
1080 1369 */
1081 1370
1082 1371 /*
1083 1372 * Reality check some things.
1084 1373 */
1085 1374 fixed = (vxlnat_fixed_t *)ire->ire_dep_sib_next;
1086 1375 vnet = fixed->vxnf_vnet;
1087 1376
1088 1377 ASSERT3P(ire, ==, fixed->vxnf_ire);
1089 1378
1090 1379 if (IRE_IS_CONDEMNED(ire) || vnet == NULL)
1091 1380 goto detach_ire_and_bail;
1092 1381
1093 1382 /*
1094 1383 * Not a common-case, but a possible one. If our underlay MTU is
1095 1384 * smaller than the external MTU, it is possible that we will have a
1096 1385 * size mismatch and therefore need to either fragment at the VXLAN
1097 1386 * layer (VXLAN UDP packet sent as two or more IP fragments) OR
1098 1387 * if IPH_DF is set, send an ICMP_NEEDS_FRAGMENTATION back to the
1099 1388 * sender. Perform the check here BEFORE we NAT the packet.
1100 1389 */
1101 1390 ASSERT(vxlnat_underlay_ire->ire_ill != NULL);
1102 1391 newmtu = vxlnat_underlay_ire->ire_ill->ill_mtu - sizeof (ipha_t) -
1103 1392 sizeof (udpha_t) - sizeof (vxlan_hdr_t) -
1104 1393 sizeof (struct ether_vlan_header);
1105 1394 if ((ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_DF) &&
1106 1395 ntohs(ipha->ipha_length) > newmtu) {
1107 1396 icmp_frag_needed(mp, newmtu, ira);
1108 1397 /* We're done. Assume icmp_frag_needed() consumed mp. */
1109 1398 return;
1110 1399 }
1111 1400
1112 1401 /*
1113 1402 * So we're here, and since we have a refheld IRE, we have a refheld
1114 1403 * fixed and vnet. Do some of what ip_input_local_v4() does (inbound
1115 1404 * checksum? some ira checks?), but otherwise, swap the destination
1116 1405 * address as mapped in "fixed", recompute any checksums, and send it
 1117 1406	 * along its merry way (with a ttl decrement too) to a VXLAN
1118 1407 * destination.
1119 1408 */
1120 1409 mp = vxlnat_fixed_fixv4(mp, fixed, B_TRUE);
1121 1410 if (mp == NULL)
1122 1411 return; /* Assume it's been freed & dtraced already. */
1123 1412
1124 1413 /*
1125 1414 * Otherwise, we're ready to transmit this packet over the vxlan
1126 1415 * socket.
1127 1416 */
1128 1417 fixed->vxnf_remote = vxlnat_xmit_vxlanv4(mp, &fixed->vxnf_addr,
1129 1418 fixed->vxnf_remote, fixed->vxnf_myether, vnet);
1130 1419 if (fixed->vxnf_remote == NULL) {
1131 1420 /* XXX KEBE ASKS, DTrace probe here? Or in-function? */
1132 1421 DTRACE_PROBE2(vxlnat__fixed__xmitdrop,
1133 1422 in6_addr_t *, &fixed->vxnf_addr,
1134 1423 uint32_t, VXLAN_ID_NTOH(vnet->vxnv_vnetid));
1135 1424 }
1136 1425 return;
1137 1426
1138 1427 detach_ire_and_bail:
1139 1428 /* Oh no, something's condemned. Drop the IRE now. */
1140 1429 ire->ire_recvfn = ire_recv_local_v4;
1141 1430 ire->ire_dep_sib_next = NULL;
1142 1431 VXNF_REFRELE(fixed);
1143 1432 /* Pass the packet back... */
1144 1433 ire_recv_local_v4(ire, mp, iph_arg, ira);
1145 1434 return;
1146 1435 }