Print this page
Factor out fixed/1-1 processing from vxlnat_one_vxlan(), paving the way for
future processing types.
Initial definitions of NAT flows.
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/inet/vxlnat/vxlnat_nat.c
+++ new/usr/src/uts/common/inet/vxlnat/vxlnat_nat.c
1 1 /*
2 2 * This file and its contents are supplied under the terms of the
3 3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 4 * You may only use this file in accordance with the terms of version
5 5 * 1.0 of the CDDL.
6 6 *
7 7 * A full copy of the text of the CDDL should have accompanied this
8 8 * source. A copy of the CDDL is also available via the Internet at
9 9 * http://www.illumos.org/license/CDDL.
10 10 */
11 11
12 12 /*
13 13 * Copyright 2018 Joyent, Inc.
14 14 */
15 15
16 16 /*
17 17 * NAT engine: mappings and 1-1 (fixed) NAT. The rules in vxlnat_rules.c are
18 18 * only consulted if the 1-1 map (kept here) misses or if the outbound lookup
19 19 * (vnetid, protocol, src-IP, dst-IP, src-port, dst-port) misses.
20 20 *
21 21 * The plan is for inbound to hit conn_ts, whose conn_private points to
22 22 * entries here. The conn_recv* functions live here too (for now).
23 23 */
24 24
25 25 #include <sys/types.h>
26 26 #include <sys/socket.h>
27 27 #include <sys/ksynch.h>
28 28 #include <sys/ksocket.h>
29 29 #include <sys/kmem.h>
30 30 #include <sys/stream.h>
31 31 #include <sys/strsubr.h>
32 32 #include <sys/strsun.h>
33 33 #include <sys/sysmacros.h>
34 34 #include <sys/debug.h>
35 35 #include <sys/dtrace.h>
36 36 #include <sys/errno.h>
37 37 #include <sys/tihdr.h>
38 38 #include <netinet/in.h>
39 39 #include <netinet/udp.h>
40 40 #include <inet/ip.h>
41 41 #include <inet/ip6.h>
42 42 #include <inet/udp_impl.h>
43 43 #include <inet/tcp.h>
44 44
45 45 #include <inet/vxlnat_impl.h>
46 46
47 47 static boolean_t vxlnat_vxlan_input(ksocket_t, mblk_t *, size_t, int, void *);
48 48 static mblk_t *vxlnat_fixed_fixv4(mblk_t *mp, vxlnat_fixed_t *fixed,
49 49 boolean_t to_private);
50 50
51 51 /*
52 52 * Initialized to NULL, read/write protected by vxlnat_mutex.
53 53 * Receive functions shouldn't have to access this directly.
54 54 */
55 55 ksocket_t vxlnat_underlay;
56 56 ire_t *vxlnat_underlay_ire;
57 57
58 58 void
59 59 vxlnat_closesock(void)
60 60 {
61 61 ASSERT(MUTEX_HELD(&vxlnat_mutex));
62 62 if (vxlnat_underlay_ire != NULL) {
63 63 ire_refrele(vxlnat_underlay_ire);
64 64 vxlnat_underlay_ire = NULL;
65 65 }
66 66 if (vxlnat_underlay != NULL) {
67 67 (void) ksocket_close(vxlnat_underlay, zone_kcred());
68 68 vxlnat_underlay = NULL;
69 69 }
70 70 }
71 71
72 72 static int
73 73 vxlnat_opensock(in6_addr_t *underlay_ip)
74 74 {
75 75 int rc, val;
76 76 /* Assume rest is initialized to 0s. */
77 77 struct sockaddr_in6 sin6 = {AF_INET6, BE_16(IPPORT_VXLAN)};
78 78 ip_stack_t *ipst = vxlnat_netstack->netstack_ip;
79 79
80 80 ASSERT(MUTEX_HELD(&vxlnat_mutex));
81 81 /* Open... */
82 82 rc = ksocket_socket(&vxlnat_underlay, AF_INET6, SOCK_DGRAM, 0,
83 83 KSOCKET_SLEEP, zone_kcred());
84 84 if (rc != 0)
85 85 return (rc);
86 86
87 87 /* Bind... */
88 88 sin6.sin6_addr = *underlay_ip;
89 89 rc = ksocket_bind(vxlnat_underlay, (struct sockaddr *)(&sin6),
90 90 sizeof (sin6), zone_kcred());
91 91 if (rc != 0) {
92 92 vxlnat_closesock();
93 93 return (rc);
94 94 }
95 95
96 96 /* Use source-port hashing when sending packets out VXLAN... */
97 97 val = UDP_HASH_VXLAN;
98 98 rc = ksocket_setsockopt(vxlnat_underlay, IPPROTO_UDP,
99 99 UDP_SRCPORT_HASH, &val, sizeof (val), kcred);
100 100 if (rc != 0) {
101 101 vxlnat_closesock();
102 102 return (rc);
103 103 }
104 104
105 105 /*
106 106 * Grab the IRE for underlay address.
107 107 */
108 108 ASSERT3P(vxlnat_underlay_ire, ==, NULL);
109 109 vxlnat_underlay_ire = (IN6_IS_ADDR_V4MAPPED(underlay_ip)) ?
110 110 ire_ftable_lookup_simple_v4(underlay_ip->_S6_un._S6_u32[3],
111 111 0, ipst, NULL) :
112 112 ire_ftable_lookup_simple_v6(underlay_ip, 0, ipst, NULL);
113 113 if (vxlnat_underlay_ire == NULL) {
114 114 DTRACE_PROBE1(vxlnat__opensock__ire__fail, in6_addr_t *,
115 115 underlay_ip);
116 116 vxlnat_closesock();
117 117 return (EADDRNOTAVAIL);
118 118 }
119 119
120 120 /* Once we return from this, start eating data. */
121 121 rc = ksocket_krecv_set(vxlnat_underlay, vxlnat_vxlan_input, NULL);
122 122 if (rc != 0) {
123 123 vxlnat_closesock();
124 124 }
125 125
126 126 return (rc);
127 127 }
128 128
129 129 /*
130 130 * Establish a VXLAN-listening kernel socket.
131 131 * XXX KEBE ASKS ==> Support more than one VXLAN address?
132 132 */
133 133 /* ARGSUSED */
134 134 int
135 135 vxlnat_vxlan_addr(in6_addr_t *underlay_ip)
136 136 {
137 137 int rc;
138 138
139 139 ASSERT(MUTEX_HELD(&vxlnat_mutex));
140 140 /* For now, we make this a one-underlay-address-only solution. */
141 141 vxlnat_closesock();
142 142 rc = vxlnat_opensock(underlay_ip);
143 143 return (rc);
144 144 }
145 145
146 146 /*
147 147 * Free a remote VXLAN destination.
148 148 */
149 149 void
150 150 vxlnat_remote_free(vxlnat_remote_t *remote)
151 151 {
152 152 ASSERT0(remote->vxnrem_refcount);
153 153
154 154 kmem_free(remote, sizeof (*remote));
155 155 }
156 156
157 157 /*
158 158 * Like other unlink functions, assume the appropriate lock is held.
159 159 */
160 160 void
161 161 vxlnat_remote_unlink(vxlnat_remote_t *remote)
162 162 {
163 163 vxlnat_vnet_t *vnet = remote->vxnrem_vnet;
164 164
165 165 ASSERT3P(vnet, !=, NULL);
166 166 ASSERT(MUTEX_HELD(&vnet->vxnv_remote_lock));
167 167
168 168 /* First unlink so nobody else can find me */
169 169 avl_remove(&vnet->vxnv_remotes, remote);
170 170
171 171 /*
172 172 * We still hold a vnet reference, so races shouldn't be a problem.
173 173 * Still, for added safety, NULL it out first.
174 174 */
175 175 remote->vxnrem_vnet = NULL; /* Condemn this entry. */
176 176 VXNV_REFRELE(vnet);
177 177 VXNREM_REFRELE(remote); /* Internment release. */
178 178 }
179 179
180 180 /*
181 181 * Find or create a remote VXLAN destination.
182 182 */
183 183 static vxlnat_remote_t *
184 184 vxlnat_get_remote(vxlnat_vnet_t *vnet, in6_addr_t *remote_addr,
185 185 boolean_t create_on_miss)
186 186 {
187 187 vxlnat_remote_t *remote, searcher;
188 188 avl_index_t where;
189 189
190 190 searcher.vxnrem_addr = *remote_addr;
191 191 mutex_enter(&vnet->vxnv_remote_lock);
192 192 remote = avl_find(&vnet->vxnv_remotes, &searcher, &where);
193 193 if (remote == NULL && create_on_miss) {
194 194 /* Not as critical if we can't allocate here. */
195 195 remote = kmem_zalloc(sizeof (*remote),
196 196 KM_NOSLEEP | KM_NORMALPRI);
197 197 if (remote != NULL) {
198 198 remote->vxnrem_addr = *remote_addr;
199 199 remote->vxnrem_refcount = 1; /* Internment reference. */
200 200 VXNV_REFHOLD(vnet);
201 201 remote->vxnrem_vnet = vnet;
202 202 /* Rest is filled in by caller. */
203 203 avl_insert(&vnet->vxnv_remotes, remote, where);
204 204 }
205 205 }
206 206 if (remote != NULL)
207 207 VXNREM_REFHOLD(remote);
208 208 mutex_exit(&vnet->vxnv_remote_lock);
209 209 return (remote);
210 210 }
211 211
212 212 /*
213 213 * Cache inbound packet information in the vnet's remotes section.
214 214 *
215 215 * NOTE: This function assumes a trustworthy underlay network. If the
216 216 * underlay isn't trustworthy, this function should be renamed, and reduced to
217 217 * a "strip and reality-check the ethernet header" function.
218 218 *
219 219 * Caller has stripped any pre-ethernet data from mp. We return mp
220 220 * stripped down to its IP header.
221 221 */
222 222 static mblk_t *
223 223 vxlnat_cache_remote(mblk_t *mp, struct sockaddr_in6 *underlay_src,
224 224 vxlnat_vnet_t *vnet)
225 225 {
226 226 struct ether_vlan_header *evh;
227 227 struct ether_header *eh;
228 228 vxlnat_remote_t *remote;
229 229 uint16_t vlan, ethertype;
230 230 ether_addr_t remote_ether;
231 231 ipha_t *ipha;
232 232 ip6_t *ip6h;
233 233 in6_addr_t remote_addr;
234 234
235 235 /* Assume (for now) we have at least a VLAN header's worth of data. */
236 236 if (MBLKL(mp) < sizeof (*evh)) {
237 237 /* XXX KEBE ASKS - should we be more forgiving? */
238 238 DTRACE_PROBE1(vxlnat__in__drop__etherhdr, mblk_t *, mp);
239 239 freemsg(mp);
240 240 return (NULL);
241 241 }
242 242
243 243 eh = (struct ether_header *)mp->b_rptr;
244 244 ethertype = ntohs(eh->ether_type);
245 245 ether_copy(&eh->ether_shost, &remote_ether);
246 246 if (ethertype == ETHERTYPE_VLAN) {
247 247 evh = (struct ether_vlan_header *)eh;
248 248 /* Keep it in network order... */
249 249 vlan = evh->ether_tci;
250 250 ethertype = ntohs(evh->ether_type);
251 251 ASSERT(vlan != 0);
252 252 mp->b_rptr += sizeof (*evh);
253 253 } else {
254 254 evh = NULL;
255 255 vlan = 0;
256 256 mp->b_rptr += sizeof (*eh);
257 257 }
258 258 if (ethertype != ETHERTYPE_IP && ethertype != ETHERTYPE_IPV6) {
259 259 /*
260 260 * XXX KEBE SAYS for now, don't handle non-IP packets.
261 261 * This includes ARP.
262 262 */
263 263 DTRACE_PROBE1(vxlnat__in__drop__nonip, mblk_t *, mp);
264 264 freemsg(mp);
265 265 return (NULL);
266 266 }
267 267
268 268 /* Handle case of split ether + IP headers. */
269 269 if (MBLKL(mp) < sizeof (ipha_t)) {
270 270 mblk_t *freemp;
271 271
272 272 if (MBLKL(mp) > 0 || mp->b_cont == NULL) {
273 273 /* The IP header is split ACROSS MBLKS! Bail for now. */
274 274 DTRACE_PROBE1(vxlnat__in__drop__splitip, mblk_t *, mp);
275 275 freemsg(mp);
276 276 return (NULL);
277 277 }
278 278 freemp = mp;
279 279 mp = mp->b_cont;
280 280 freeb(freemp);
281 281 }
282 282 /* LINTED -- alignment... */
283 283 ipha = (ipha_t *)mp->b_rptr;
284 284
285 285 if (IPH_HDR_VERSION(ipha) == IPV4_VERSION) {
286 286 if (ethertype != ETHERTYPE_IP) {
287 287 /* XXX KEBE ASKS - should we be more forgiving? */
288 288 DTRACE_PROBE1(vxlnat__in__drop__etherhdr4,
289 289 mblk_t *, mp);
290 290 freemsg(mp);
291 291 return (NULL);
292 292 }
293 293 IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
294 294 &remote_addr);
295 295 } else {
296 296 if (ethertype != ETHERTYPE_IPV6 ||
297 297 IPH_HDR_VERSION(ipha) != IPV6_VERSION ||
298 298 MBLKL(mp) < sizeof (ip6_t)) {
299 299 /* XXX KEBE ASKS - should we be more forgiving? */
300 300 DTRACE_PROBE1(vxlnat__in__drop__etherhdr6,
301 301 mblk_t *, mp);
302 302 freemsg(mp);
303 303 return (NULL);
304 304 }
305 305 ip6h = (ip6_t *)ipha;
306 306 remote_addr = ip6h->ip6_src;
307 307 }
308 308
309 309 /* Find remote and replace OR create new remote. */
310 310 remote = vxlnat_get_remote(vnet, &remote_addr, B_TRUE);
311 311 if (remote != NULL) {
312 312 /*
313 313 * See if this entry needs fixing or filling-in. This might
314 314 * get a bit racy with read-only threads that actually
315 315 * transmit, but it only means dropped-packets in the worst
316 316 * case.
317 317 *
318 318 * It's THIS PART that inspires the warning about trusting the
319 319 * underlay network.
320 320 *
321 321 * XXX KEBE ASKS -- should we just replace things w/o checking?
322 322 */
323 323 /* Replace the ethernet address? */
324 324 if (ether_cmp(&remote->vxnrem_ether, &remote_ether) != 0)
325 325 ether_copy(&remote_ether, &remote->vxnrem_ether);
326 326 /*
327 327 * Replace the underlay? NOTE: Fix if/when underlay becomes
328 328 * IPv6.
329 329 */
330 330 if (!IN6_ARE_ADDR_EQUAL(&remote->vxnrem_uaddr,
331 331 &underlay_src->sin6_addr)) {
332 332 remote->vxnrem_uaddr = underlay_src->sin6_addr;
333 333 }
334 334 /* Replace the vlan ID. Maintain network order... */
335 335 if (remote->vxnrem_vlan != vlan)
336 336 remote->vxnrem_vlan = vlan;
337 337 }
338 338 /*
339 339 * Else just continue and pray for better luck on another packet or
340 340 * on the return flight. It is IP, we can Just Drop It (TM)...
341 341 */
|
↓ open down ↓ |
341 lines elided |
↑ open up ↑ |
342 342
343 343 /* We're done with the remote entry now. */
344 344 VXNREM_REFRELE(remote);
345 345
346 346 /* Advance rptr to the inner IP header and proceed. */
347 347 mp->b_rptr = (uint8_t *)ipha;
348 348 return (mp);
349 349 }
350 350
351 351 /*
352 + * See if the inbound VXLAN packet hits a 1-1/fixed mapping, and process if it
353 + * does. B_TRUE means the packet was handled, and we shouldn't continue
354 + * processing (even if "was handled" means droppage).
355 + */
356 +static boolean_t
357 +vxlnat_one_vxlan_fixed(vxlnat_vnet_t *vnet, mblk_t *mp, ipha_t *ipha,
358 + ip6_t *ip6h)
359 +{
360 + vxlnat_fixed_t *fixed, fsearch;
361 + mblk_t *newmp;
362 + ire_t *outbound_ire;
363 + /* Use C99's initializers for fun & profit. */
364 + ip_recv_attr_t iras = { IRAF_IS_IPV4 | IRAF_VERIFIED_SRC };
365 +
366 + if (ipha != NULL) {
367 + IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
368 + &fsearch.vxnf_addr);
369 + } else {
370 + /* vxlnat_cache_remote() did reality checks... */
371 + ASSERT(ipha == NULL && ip6h != NULL);
372 + fsearch.vxnf_addr = ip6h->ip6_src;
373 + }
374 +
375 + rw_enter(&vnet->vxnv_fixed_lock, RW_READER);
376 + fixed = avl_find(&vnet->vxnv_fixed_ips, &fsearch, NULL);
377 + if (fixed != NULL)
378 + VXNF_REFHOLD(fixed);
379 + rw_exit(&vnet->vxnv_fixed_lock);
380 + if (fixed == NULL)
381 + return (B_FALSE); /* Try another method of processing. */
382 +
383 + newmp = NULL;
384 + /*
385 + * XXX KEBE ASKS --> Do an MTU check NOW?! That way, we have
386 + * pre-natted data. One gotcha, external dests may have
387 + * different PathMTUs so see below about EMSGSIZE...
388 + *
389 + * For now, let the post-NAT crunch through
390 + * ire_recv_forward_v4() take care of all of that.
391 + */
392 +
393 + if (ipha != NULL)
394 + newmp = vxlnat_fixed_fixv4(mp, fixed, B_FALSE);
395 + else {
396 + freemsg(mp); /* XXX handle ip6h */
397 + return (B_TRUE);
398 + }
399 +
400 + if (newmp == NULL)
401 + return (B_TRUE); /* mp eaten by vxlnat_fixed_fixv4() */
402 +
403 +
404 + ASSERT3P(ipha, ==, newmp->b_rptr);
405 + /* XXX KEBE ASKS, IRR_ALLOCATE okay?!? */
406 + /* XXX KEBE SAYS XMIT HINT! */
407 + outbound_ire = ire_route_recursive_dstonly_v4(ipha->ipha_dst,
408 + IRR_ALLOCATE, 0, vxlnat_netstack->netstack_ip);
409 + VERIFY3P(outbound_ire, !=, NULL);
410 + if (outbound_ire->ire_type == IRE_NOROUTE) {
411 + /* Bail! */
412 + DTRACE_PROBE2(vxlnat__in__drop__fixedire, ipaddr_t,
413 + ipha->ipha_dst, mblk_t *, mp);
414 + VXNF_REFRELE(fixed);
415 + freemsg(mp);
416 + return (B_TRUE);
417 + }
418 +
419 + iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
420 + if (iras.ira_ip_hdr_length > sizeof (ipha_t))
421 + iras.ira_flags |= IRAF_IPV4_OPTIONS;
422 + iras.ira_xmit_hint = 0; /* XXX KEBE SAYS FIX ME! */
423 + iras.ira_zoneid = outbound_ire->ire_zoneid;
424 + iras.ira_pktlen = ntohs(ipha->ipha_length);
425 + iras.ira_protocol = ipha->ipha_protocol;
426 + /* XXX KEBE ASKS rifindex & ruifindex ?!? */
427 + /*
428 + * NOTE: AT LEAST ira_ill needs ILLF_ROUTER set, as
429 + * well as the ill for the external NIC (where
430 + * off-link destinations live). For fixed, ira_ill
431 + * should be the ill of the external source.
432 + */
433 + iras.ira_rill = vxlnat_underlay_ire->ire_ill;
434 + iras.ira_ill = fixed->vxnf_ire->ire_ill;
435 + /* XXX KEBE ASKS cred & cpid ? */
436 + iras.ira_verified_src = ipha->ipha_src;
437 + /* XXX KEBE SAYS don't sweat IPsec stuff. */
438 + /* XXX KEBE SAYS ALSO don't sweat l2src & mhip */
439 +
440 + /* Okay, we're good! Let's pretend we're forwarding. */
441 + ire_recv_forward_v4(outbound_ire, mp, ipha, &iras);
442 + ire_refrele(outbound_ire);
443 +
444 + return (B_TRUE);
445 +}
446 +
447 +/*
352 448 * Process exactly one VXLAN packet.
353 449 */
354 450 static void
355 451 vxlnat_one_vxlan(mblk_t *mp, struct sockaddr_in6 *underlay_src)
356 452 {
357 453 vxlan_hdr_t *vxh;
358 454 vxlnat_vnet_t *vnet;
359 455 ipha_t *ipha;
360 456 ip6_t *ip6h;
361 - vxlnat_fixed_t *fixed, fsearch;
362 457
363 458 if (MBLKL(mp) < sizeof (*vxh)) {
364 459 /* XXX KEBE ASKS -- should we be more forgiving? */
365 460 DTRACE_PROBE1(vxlnat__in__drop__vxlsize, mblk_t *, mp);
366 461 freemsg(mp);
367 462 return;
368 463 }
369 464 vxh = (vxlan_hdr_t *)mp->b_rptr;
370 465
371 466 /* If we start using more than just the one flag, fix it. */
372 467 if (vxh->vxlan_flags != VXLAN_F_VDI_WIRE) {
373 468 DTRACE_PROBE1(vxlnat__in__drop__VDI, mblk_t *, mp);
374 469 freemsg(mp);
375 470 return;
376 471 }
377 472
378 473 /* Remember, we key off of what's on the wire. */
379 474 vnet = vxlnat_get_vnet(VXLAN_ID_WIRE32(vxh->vxlan_id), B_FALSE);
380 475 if (vnet == NULL) {
381 476 DTRACE_PROBE1(vxlnat__in__drop__vnetid, uint32_t,
382 477 VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)));
383 478 freemsg(mp);
384 479 return;
385 480 }
386 481
387 482 DTRACE_PROBE2(vxlnat__in__vnet, uint32_t,
388 483 VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)),
389 484 vxlnat_vnet_t, vnet);
390 485
391 486 /*
392 487 * Off-vxlan processing steps:
393 488 * 1.) Locate the ethernet header and check/update/add-into remotes.
394 489 * 2.) Search 1-1s, process if hit.
395 490 * 3.) Search flows, process if hit.
396 491 * 4.) Search rules, create new flow (or not) if hit.
397 492 * 5.) Drop the packets.
398 493 */
|
↓ open down ↓ |
27 lines elided |
↑ open up ↑ |
399 494
400 495 /* 1.) Locate the ethernet header and check/update/add-into remotes. */
401 496 mp->b_rptr += sizeof (*vxh);
402 497 while (MBLKL(mp) == 0) {
403 498 mblk_t *oldmp = mp;
404 499
405 500 mp = mp->b_cont;
406 501 freeb(oldmp);
407 502 }
408 503 mp = vxlnat_cache_remote(mp, underlay_src, vnet);
409 - if (mp == NULL) {
410 - VXNV_REFRELE(vnet);
411 - return;
412 - }
504 + if (mp == NULL)
505 + goto bail_no_free;
413 506
414 - /* 2.) Search 1-1s, process if hit. */
507 + /* Let's cache the IP header here... */
415 508 ipha = (ipha_t *)mp->b_rptr;
416 - if (IPH_HDR_VERSION(ipha) == IPV4_VERSION) {
509 + switch (IPH_HDR_VERSION(ipha)) {
510 + case IPV4_VERSION:
417 511 ip6h = NULL;
418 - IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
419 - &fsearch.vxnf_addr);
420 - } else {
421 - /* vxlnat_cache_remote() did reality checks... */
422 - ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
512 + break;
513 + case IPV6_VERSION:
423 514 ip6h = (ip6_t *)ipha;
424 515 ipha = NULL;
425 - fsearch.vxnf_addr = ip6h->ip6_src;
516 + break;
517 + default:
518 + DTRACE_PROBE2(vxlnat__in__drop__ipvers, int,
519 + IPH_HDR_VERSION(ipha), mblk_t *, mp);
520 + goto bail_and_free;
426 521 }
427 - rw_enter(&vnet->vxnv_fixed_lock, RW_READER);
428 - fixed = avl_find(&vnet->vxnv_fixed_ips, &fsearch, NULL);
429 - if (fixed != NULL)
430 - VXNF_REFHOLD(fixed);
431 - rw_exit(&vnet->vxnv_fixed_lock);
432 - if (fixed != NULL) {
433 - mblk_t *newmp = NULL;
434 522
435 - /*
436 - * XXX KEBE ASKS --> Do MTU check NOW?! That way, we have
437 - * pre-natted data. One gotcha, external dests may have
438 - * different PathMTUs so see below about EMSGSIZE...
439 - */
523 + /* 2.) Search 1-1s, process if hit. */
524 + if (vxlnat_one_vxlan_fixed(vnet, mp, ipha, ip6h))
525 + goto bail_no_free; /* Success means mp was consumed. */
440 526
441 - /* XXX KEBE SAYS -- FILL ME IN... but for now: */
442 - if (ipha != NULL)
443 - newmp = vxlnat_fixed_fixv4(mp, fixed, B_FALSE);
444 - else
445 - freemsg(mp); /* XXX handle ip6h */
527 +#ifdef notyet
528 + /* 3.) Search flows, process if hit. */
529 + if (vxlnat_one_vxlan_flow(vnet, mp, ipha, ip6h))
530 + goto bail_no_free; /* Success means mp was consumed. */
446 531
447 - if (newmp != NULL) {
448 - ire_t *outbound_ire;
449 - /* Use C99's initializers for fun & profit. */
450 - ip_recv_attr_t iras =
451 - { IRAF_IS_IPV4 | IRAF_VERIFIED_SRC };
532 + /* 4.) Search rules, create new flow (or not) if hit. */
533 + if (vxlnat_one_vxlan_rule(vnet, mp, ipha, ip6h))
534 + goto bail_no_free; /* Success means mp was consumed. */
535 +#endif
452 536
453 - ASSERT3P(ipha, !=, NULL);
454 - ASSERT3P(ipha, ==, newmp->b_rptr);
455 - /* XXX KEBE ASKS, IRR_ALLOCATE okay?!? */
456 - outbound_ire = ire_route_recursive_dstonly_v4(
457 - ipha->ipha_dst, IRR_ALLOCATE,
458 - 0 /* XXX KEBE SAYS XMIT HINT! */,
459 - vxlnat_netstack->netstack_ip);
460 - VERIFY3P(outbound_ire, !=, NULL);
461 - if (outbound_ire->ire_type == IRE_NOROUTE) {
462 - /* Bail! */
463 - VXNF_REFRELE(fixed);
464 - VXNV_REFRELE(vnet);
465 - return;
466 - }
537 + /* 5.) Nothing, drop the packet. */
467 538
468 - iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
469 - if (iras.ira_ip_hdr_length > sizeof (ipha_t))
470 - iras.ira_flags |= IRAF_IPV4_OPTIONS;
471 - iras.ira_xmit_hint = 0; /* XXX KEBE SAYS FIX ME! */
472 - iras.ira_zoneid = outbound_ire->ire_zoneid;
473 - iras.ira_pktlen = ntohs(ipha->ipha_length);
474 - iras.ira_protocol = ipha->ipha_protocol;
475 - /* XXX KEBE ASKS rifindex & ruifindex ?!? */
476 - /*
477 - * NOTE: AT LEAST ira_ill needs ILLF_ROUTER set, as
478 - * well as the ill for the external NIC (where
479 - * off-link destinations live). For fixed, ira_ill
480 - * should be the ill of the external source.
481 - */
482 - iras.ira_rill = vxlnat_underlay_ire->ire_ill;
483 - iras.ira_ill = fixed->vxnf_ire->ire_ill;
484 - /* XXX KEBE ASKS cred & cpid ? */
485 - iras.ira_verified_src = ipha->ipha_src;
486 - /* XXX KEBE SAYS don't sweat IPsec stuff. */
487 - /* XXX KEBE SAYS ALSO don't sweat l2src & mhip */
539 + DTRACE_PROBE2(vxlnat__in___drop__nohits, vxlnat_vnet_t *, vnet,
540 + mblk_t *, mp);
488 541
489 - /* Okay, we're good! Let's pretend we're forwarding. */
490 - ire_recv_forward_v4(outbound_ire, mp, ipha, &iras);
491 - ire_refrele(outbound_ire);
492 - }
493 -
494 - /* All done... */
495 - VXNF_REFRELE(fixed);
496 - VXNV_REFRELE(vnet);
497 - return;
498 - }
499 -
500 - /* XXX KEBE SAYS BUILD STEPS 3-4. */
501 -
502 - /* 5.) Nothing, drop the packet. */
503 - /* XXX KEBE ASKS DIAGNOSTIC? */
504 - VXNV_REFRELE(vnet);
542 +bail_and_free:
505 543 freemsg(mp);
544 +bail_no_free:
545 + VXNV_REFRELE(vnet);
506 546 }
507 547 /*
508 548 * ONLY return B_FALSE if we get a packet-clogging event.
509 549 */
510 550 /* ARGSUSED */
511 551 static boolean_t
512 552 vxlnat_vxlan_input(ksocket_t insock, mblk_t *chain, size_t msgsize, int oob,
513 553 void *ignored)
514 554 {
515 555 mblk_t *mp, *nextmp;
516 556
517 557 /*
518 558 * XXX KEBE ASKS --> move hold & release outside of loop?
519 559 * If so, hold rwlock here.
520 560 */
521 561
522 562 for (mp = chain; mp != NULL; mp = nextmp) {
523 563 struct T_unitdata_ind *tudi;
524 564 struct sockaddr_in6 *sin6;
525 565
526 566 nextmp = mp->b_next;
527 567 if (DB_TYPE(mp) != M_PROTO || mp->b_cont == NULL) {
528 568 DTRACE_PROBE1(vxlnat__in__drop__mblk, mblk_t *, mp);
529 569 freemsg(mp);
530 570 continue;
531 571 }
532 572
533 573 /* LINTED -- aligned */
534 574 tudi = (struct T_unitdata_ind *)mp->b_rptr;
535 575 if (tudi->PRIM_type != T_UNITDATA_IND) {
536 576 DTRACE_PROBE1(vxlnat__in__drop__TPI, mblk_t *, mp);
537 577 freemsg(mp);
538 578 continue;
539 579 }
540 580 /* LINTED -- aligned */
541 581 sin6 = (struct sockaddr_in6 *)(mp->b_rptr + tudi->SRC_offset);
542 582 VERIFY(sin6->sin6_family == AF_INET6);
543 583 VERIFY(tudi->SRC_length >= sizeof (*sin6));
544 584
545 585 vxlnat_one_vxlan(mp->b_cont, sin6);
546 586 freeb(mp);
547 587 }
548 588
549 589 return (B_TRUE);
550 590 }
551 591
552 592 /*
553 593 * Use RFC 1141's technique (with a check for -0).
554 594 *
555 595 * newsum = oldsum - (new16a - old16a + new16b - old16b ...);
556 596 *
557 597 * NOTE: "oldsum" is right off the wire in wire-native order.
558 598 * NOTE2: "old" and "new" ALSO point to things in wire-native order.
559 599 * NOTE3: THIS MUST TAKE A MULTIPLE OF 2 BYTES (i.e. uint16_t array).
560 600 * NOTE4: The 32-bit running sum means we can't take len > 64k.
561 601 */
562 602 uint16_t
563 603 vxlnat_cksum_adjust(uint16_t oldsum, uint16_t *old, uint16_t *new, uint_t len)
564 604 {
565 605 uint32_t newsum = ntohs(oldsum);
566 606
567 607 ASSERT((len & 0x1) == 0);
568 608 while (len != 0) {
569 609 newsum -= ntohs(*new);
570 610 newsum += ntohs(*old);
571 611 len -= 2;
572 612 old++;
573 613 new++;
574 614 }
575 615 newsum += (newsum >> 16) & 0xffff;
576 616
577 617 return (newsum == 0xffff ? 0 : htons(newsum));
578 618 }
579 619
580 620 /*
581 621 * Fix inner headers on an ICMP packet.
582 622 *
583 623 * XXX KEBE SAYS FOR NOW, just do addresses for 1-1/fixed. When we do
584 624 * flows, include old_port/new_port as well.
585 625 */
586 626 static mblk_t *
587 627 vxlnat_fix_icmp_inner_v4(mblk_t *mp, icmph_t *icmph, ipaddr_t old_one,
588 628 ipaddr_t new_one, boolean_t to_private)
589 629 {
590 630 mblk_t *newmp;
591 631 ipha_t *inner_ipha;
592 632 ipaddr_t *new_ones_place;
593 633
594 634 if ((uint8_t *)(icmph + 1) + sizeof (ipha_t) > mp->b_wptr) {
595 635 /* Pay the pullup tax. */
596 636 newmp = msgpullup(mp, -1);
597 637 freemsg(mp);
598 638 if (newmp == NULL) {
599 639 DTRACE_PROBE1(vxlnat__fixicmp__pullupfail, void *,
600 640 NULL);
601 641 return (NULL);
602 642 }
603 643 if (MBLKL(newmp) < 2 * sizeof (ipha_t) + sizeof (icmph_t)) {
604 644 /* Wow! Too-tiny ICMP packet. */
605 645 DTRACE_PROBE1(vxlnat__fixicmp__tootiny, mblk_t *,
606 646 newmp);
607 647 freeb(newmp);
608 648 return (NULL);
609 649 }
610 650 mp = newmp;
611 651 /* Temporarily use inner_ipha for the outer one. */
612 652 inner_ipha = (ipha_t *)mp->b_rptr;
613 653 icmph = (icmph_t *)(mp->b_rptr + IPH_HDR_LENGTH(inner_ipha));
614 654 }
615 655 inner_ipha = (ipha_t *)(icmph + 1);
616 656 new_ones_place = to_private ?
617 657 &inner_ipha->ipha_src : &inner_ipha->ipha_dst;
618 658 if (*new_ones_place != old_one) {
619 659 /* Either I'm buggy or the packet is. */
620 660 DTRACE_PROBE2(vxlnat__fixicmp__badinneraddr, ipaddr_t,
621 661 old_one, ipaddr_t, *new_ones_place);
622 662 freeb(mp);
623 663 return (NULL);
624 664 }
625 665 *new_ones_place = new_one;
626 666
627 667 /* Adjust ICMP checksum... */
628 668 icmph->icmph_checksum = vxlnat_cksum_adjust(icmph->icmph_checksum,
629 669 (uint16_t *)&old_one, (uint16_t *)&new_one, sizeof (ipaddr_t));
630 670
|
↓ open down ↓ |
115 lines elided |
↑ open up ↑ |
631 671 /*
632 672 * XXX KEBE ASKS, recompute *inner-packet* checksums? Let's not for
633 673 * now, but consider this Fair Warning (or some other VH album...).
634 674 */
635 675 return (mp);
636 676 }
637 677
638 678 /*
639 679 * Take a 1-1/fixed IPv4 packet and convert it for transmission out the
640 680 * appropriate end. "to_private" is what it says on the tin.
681 + * ALWAYS consumes "mp", regardless of return value.
641 682 */
642 683 static mblk_t *
643 684 vxlnat_fixed_fixv4(mblk_t *mp, vxlnat_fixed_t *fixed, boolean_t to_private)
644 685 {
645 686 ipaddr_t new_one, old_one;
646 687 ipaddr_t *new_ones_place;
647 688 ipha_t *ipha = (ipha_t *)mp->b_rptr;
648 689 uint8_t *nexthdr, *end_wptr;
649 690
650 691 if (to_private) {
651 692 IN6_V4MAPPED_TO_IPADDR(&fixed->vxnf_addr, new_one);
652 693 new_ones_place = &ipha->ipha_dst;
653 694 } else {
654 695 IN6_V4MAPPED_TO_IPADDR(&fixed->vxnf_pubaddr, new_one);
655 696 new_ones_place = &ipha->ipha_src;
656 697 }
657 698
658 699 old_one = *new_ones_place;
659 700 *new_ones_place = new_one;
660 701
661 702 /*
662 703 * Recompute the IP header checksum, and check for the TCP or UDP
663 704 * checksum as well, as they'll need recomputing as well.
664 705 */
665 706
666 707 /* First, the IPv4 header itself. */
667 708 ipha->ipha_hdr_checksum = vxlnat_cksum_adjust(ipha->ipha_hdr_checksum,
668 709 (uint16_t *)&old_one, (uint16_t *)&new_one, sizeof (ipaddr_t));
669 710
670 711 nexthdr = (uint8_t *)ipha + IPH_HDR_LENGTH(ipha);
671 712 if (nexthdr >= mp->b_wptr) {
672 713 nexthdr = mp->b_cont->b_rptr +
673 714 (MBLKL(mp) - IPH_HDR_LENGTH(ipha));
674 715 end_wptr = mp->b_cont->b_wptr;
675 716 } else {
676 717 end_wptr = mp->b_wptr;
677 718 }
678 719
679 720 switch (ipha->ipha_protocol) {
680 721 case IPPROTO_TCP: {
681 722 tcpha_t *tcph = (tcpha_t *)nexthdr;
682 723
683 724 if (nexthdr + sizeof (*tcph) >= end_wptr) {
684 725 /* Bail for now. */
685 726 DTRACE_PROBE1(vxlnat__fix__tcp__mblkspan, mblk_t *,
686 727 mp);
687 728 freemsg(mp);
688 729 return (NULL);
689 730 }
690 731 tcph->tha_sum = vxlnat_cksum_adjust(tcph->tha_sum,
691 732 (uint16_t *)&old_one, (uint16_t *)&new_one,
692 733 sizeof (ipaddr_t));
693 734 break; /* Out of switch. */
694 735 }
695 736 case IPPROTO_UDP: {
696 737 udpha_t *udph = (udpha_t *)nexthdr;
697 738
698 739 if (nexthdr + sizeof (*udph) >= end_wptr) {
699 740 /* Bail for now. */
700 741 DTRACE_PROBE1(vxlnat__fix__udp__mblkspan, mblk_t *,
701 742 mp);
702 743 freemsg(mp);
703 744 return (NULL);
704 745 }
705 746 udph->uha_checksum = vxlnat_cksum_adjust(udph->uha_checksum,
706 747 (uint16_t *)&old_one, (uint16_t *)&new_one,
707 748 sizeof (ipaddr_t));
708 749 break; /* Out of switch. */
709 750 }
710 751 case IPPROTO_ICMP: {
711 752 icmph_t *icmph = (icmph_t *)nexthdr;
712 753
713 754 /*
714 755 * We need to check the case of ICMP messages that contain
715 756 * IP packets. We will need to at least change the addresses,
716 757 * and *maybe* the checksums too if necessary.
717 758 *
718 759 * This may replicate some of icmp_inbound_v4(), alas.
719 760 */
720 761 if (nexthdr + sizeof (*icmph) >= end_wptr) {
721 762 mblk_t *newmp;
722 763 /*
723 764 * Unlike the others, we're going to pay the pullup
724 765 * tax here.
725 766 */
726 767 newmp = msgpullup(mp, -1);
727 768 freemsg(mp);
728 769 if (newmp == NULL) {
729 770 DTRACE_PROBE1(vxlnat__icmp__pullupfail, void *,
730 771 NULL);
731 772 return (NULL);
732 773 }
733 774 mp = newmp;
734 775 ipha = (ipha_t *)(mp->b_rptr);
735 776 nexthdr = (uint8_t *)ipha + IPH_HDR_LENGTH(ipha);
736 777 icmph = (icmph_t *)nexthdr;
737 778 }
738 779
739 780 switch (icmph->icmph_type) {
740 781 case ICMP_ADDRESS_MASK_REPLY:
741 782 case ICMP_ADDRESS_MASK_REQUEST:
742 783 case ICMP_TIME_STAMP_REPLY:
743 784 case ICMP_TIME_STAMP_REQUEST:
744 785 case ICMP_ECHO_REQUEST:
745 786 case ICMP_ECHO_REPLY:
746 787 /* These merely need to get passed along. */
747 788 break;
748 789 case ICMP_ROUTER_ADVERTISEMENT:
749 790 case ICMP_ROUTER_SOLICITATION:
750 791 /* These shouldn't be traversing a NAT at all. Drop. */
751 792 DTRACE_PROBE1(vxlnat__icmp__cantpass, int,
752 793 icmph->icmph_type);
753 794 freemsg(mp);
754 795 return (NULL);
755 796 case ICMP_PARAM_PROBLEM:
756 797 case ICMP_TIME_EXCEEDED:
757 798 case ICMP_DEST_UNREACHABLE:
758 799 /* These include inner-IP headers we need to adjust. */
759 800 mp = vxlnat_fix_icmp_inner_v4(mp, icmph, old_one,
760 801 new_one, to_private);
761 802 break;
762 803 default:
763 804 /* Pass along to receiver, but warn. */
764 805 DTRACE_PROBE1(vxlnat__icmp__unknown, int,
765 806 icmph->icmph_type);
766 807 break;
767 808 }
768 809 }
769 810 /* Otherwise we can't make any other assumptions for now... */
770 811 default:
771 812 break;
772 813 }
773 814
774 815 return (mp);
775 816 }
776 817
777 818 vxlnat_remote_t *
778 819 vxlnat_xmit_vxlanv4(mblk_t *mp, in6_addr_t *overlay_dst,
779 820 vxlnat_remote_t *remote, uint8_t *myether, vxlnat_vnet_t *vnet)
780 821 {
781 822 struct sockaddr_in6 sin6 = {AF_INET6};
782 823 struct msghdr msghdr = {NULL};
783 824 mblk_t *vlan_mp;
784 825 extern uint_t vxlan_alloc_size, vxlan_noalloc_min;
785 826 vxlan_hdr_t *vxh;
786 827 struct ether_vlan_header *evh;
787 828 int rc;
788 829 cred_t *cred;
789 830
790 831 if (remote == NULL || remote->vxnrem_vnet == NULL) {
791 832 DTRACE_PROBE1(vxlnat__xmit__vxlanv4, vxlnat_remote_t *, remote);
792 833 /* Release the condemned remote. */
793 834 if (remote != NULL)
794 835 VXNREM_REFRELE(remote);
795 836
796 837 /* See if we have a remote ready to use... */
797 838 remote = vxlnat_get_remote(vnet, overlay_dst, B_FALSE);
798 839
799 840 if (remote == NULL) {
800 841 /*
801 842 * We need to do the moral equivalent of PF_KEY
802 843 * ACQUIRE or overlay's queue-resolve so that we can
803 844 * have someone in user-space send me a remote. Until
804 845 * then, drop the reference if condemned, free the
805 846 * message, and return NULL.
806 847 */
807 848
808 849 freemsg(mp);
809 850 return (NULL);
810 851 }
811 852 }
812 853 ASSERT(vnet == remote->vxnrem_vnet);
813 854
814 855 if (DB_REF(mp) > 1 || mp->b_rptr - vxlan_noalloc_min < DB_BASE(mp)) {
815 856 vlan_mp = allocb(vxlan_alloc_size, BPRI_HI);
816 857 if (vlan_mp == NULL) {
817 858 DTRACE_PROBE1(vxlnat__xmit__vxlanv4__allocfail,
818 859 vxlnat_remote_t *, remote);
819 860 freemsg(mp);
820 861 /* Just drop the packet, but don't tell caller. */
821 862 return (remote);
822 863 }
823 864 vlan_mp->b_wptr = DB_LIM(vlan_mp);
824 865 vlan_mp->b_rptr = vlan_mp->b_wptr;
825 866 vlan_mp->b_cont = mp;
826 867 } else {
827 868 vlan_mp = mp;
828 869 }
829 870 vlan_mp->b_rptr -= sizeof (*vxh) + sizeof (*evh);
830 871 vxh = (vxlan_hdr_t *)vlan_mp->b_rptr;
831 872 vxh->vxlan_flags = VXLAN_F_VDI_WIRE;
832 873 vxh->vxlan_id = vnet->vxnv_vnetid; /* Already in wire-order. */
833 874
834 875 /* Fill in the Ethernet header. */
835 876 evh = (struct ether_vlan_header *)(vxh + 1);
836 877 ether_copy(&remote->vxnrem_ether, &evh->ether_dhost);
837 878 ether_copy(myether, &evh->ether_shost);
838 879 evh->ether_tpid = htons(ETHERTYPE_VLAN);
839 880 evh->ether_tci = remote->vxnrem_vlan;
840 881 evh->ether_type = htons(ETHERTYPE_IP);
841 882
842 883 msghdr.msg_name = (struct sockaddr_storage *)&sin6;
843 884 msghdr.msg_namelen = sizeof (sin6);
844 885 /* Address family and other zeroing already done up top. */
845 886 sin6.sin6_port = htons(IPPORT_VXLAN);
846 887 sin6.sin6_addr = remote->vxnrem_uaddr;
847 888
848 889 /*
849 890 * cred_t dance is because we may be getting this straight from
850 891 * interrupt context.
851 892 */
852 893 cred = zone_get_kcred(netstack_get_zoneid(vxlnat_netstack));
853 894 if (cred == NULL) {
854 895 DTRACE_PROBE1(vxlnat__xmit__vxlan4__credfail,
855 896 vxlnat_remote_t *, remote);
856 897 freemsg(vlan_mp);
857 898 }
858 899 /*
859 900 * Use MSG_DONTWAIT to avoid blocks, esp. if we're getting this
860 901 * straight from interrupt context.
861 902 */
862 903 rc = ksocket_sendmblk(vxlnat_underlay, &msghdr, MSG_DONTWAIT, &vlan_mp,
863 904 cred);
864 905 crfree(cred);
865 906 if (rc != 0) {
866 907 DTRACE_PROBE2(vxlnat__xmit__vxlan4__sendfail, int, rc,
867 908 vxlnat_remote_t *, remote);
868 909 freemsg(vlan_mp);
869 910 }
870 911 return (remote);
871 912 }
872 913
873 914 /*
874 915 * New ire_{recv,send}fn implementations if we're doing 1-1 mappings.
875 916 */
876 917 int
877 918 vxlnat_fixed_ire_send_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
878 919 ip_xmit_attr_t *ixa, uint32_t *identp)
879 920 {
880 921 /* XXX KEBE SAYS FILL ME IN, but for now... */
881 922 freemsg(mp);
882 923 return (EOPNOTSUPP);
883 924 }
884 925
885 926 void
886 927 vxlnat_fixed_ire_recv_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
887 928 ip_recv_attr_t *ira)
888 929 {
889 930 /* XXX KEBE SAYS FILL ME IN, but for now... */
890 931 freemsg(mp);
891 932 }
892 933
893 934 /*
894 935 * I believe the common case for this will be from self-generated ICMP
895 936 * messages. Other same-netstack-originated traffic will also come through
896 937 * here (one internal reaching what turns out to be another internal).
897 938 */
898 939 int
899 940 vxlnat_fixed_ire_send_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
900 941 ip_xmit_attr_t *ixa, uint32_t *identp)
901 942 {
902 943 ip_recv_attr_t iras; /* NOTE: No bzero because we pay more later */
903 944 ipha_t *ipha = (ipha_t *)iph_arg;
904 945
905 946 /*
906 947 * XXX KEBE ASKS, any DTrace probes or other instrumentation that
907 948 * perhaps should be set?
908 949 */
909 950
910 951 /* Map ixa to ira. */
911 952 iras.ira_pktlen = ixa->ixa_pktlen;
912 953 /* XXX KEBE ASKS more?!? */
913 954
914 955 /*
915 956 * In normal TCP/IP processing, this shortcuts the IP header checksum
916 957 * AND POSSIBLY THE ULP checksum cases. Since this is likely to head
917 958 * back into the internal network, we need to recompute things again.
918 959 */
919 960 if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
920 961 freemsg(mp);
921 962 return (EMSGSIZE);
922 963 }
923 964 #if 0
924 965 /* XXX KEBE ASKS Special-case ICMP here? */
925 966 if (ipha->ipha_protocol == IPPROTO_ICMP) {
926 967 icmph_t *icmph;
927 968
928 969 icmph = (icmph_t *)((uint8_t *)ipha + IPH_HDR_LENGTH(ipha));
929 970 if ((uint8_t *)icmph >= mp->b_wptr) {
930 971 freemsg(mp);
931 972 return (EMSGSIZE);
932 973 }
933 974 icmph->icmph_checksum = 0;
934 975 icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
935 976 }
936 977 #endif
937 978
938 979 vxlnat_fixed_ire_recv_v4(ire, mp, iph_arg, &iras);
939 980
940 981 return (0);
941 982 }
942 983
943 984 void
944 985 vxlnat_fixed_ire_recv_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
945 986 ip_recv_attr_t *ira)
946 987 {
947 988 vxlnat_fixed_t *fixed;
948 989 vxlnat_vnet_t *vnet;
949 990 ipha_t *ipha = (ipha_t *)iph_arg;
950 991 int newmtu;
951 992
952 993 /* Make a note for DAD that this address is in use */
953 994 ire->ire_last_used_time = LBOLT_FASTPATH;
954 995
955 996 /* Only target the IRE_LOCAL with the right zoneid. */
956 997 ira->ira_zoneid = ire->ire_zoneid;
957 998
958 999 /*
959 1000 * XXX KEBE ASKS, any DTrace probes or other instrumentation that
960 1001 * perhaps should be set?
961 1002 */
962 1003
963 1004 /*
964 1005 * Reality check some things.
965 1006 */
966 1007 fixed = (vxlnat_fixed_t *)ire->ire_dep_sib_next;
967 1008 vnet = fixed->vxnf_vnet;
968 1009
969 1010 ASSERT3P(ire, ==, fixed->vxnf_ire);
970 1011
971 1012 if (IRE_IS_CONDEMNED(ire) || vnet == NULL)
972 1013 goto detach_ire_and_bail;
973 1014
974 1015 /*
975 1016 * Not a common-case, but a possible one. If our underlay MTU is
976 1017 * smaller than the external MTU, it is possible that we will have a
977 1018 * size mismatch and therefore need to either fragment at the VXLAN
978 1019 * layer (VXLAN UDP packet sent as two or more IP fragments) OR
979 1020 * if IPH_DF is set, send an ICMP_NEEDS_FRAGMENTATION back to the
980 1021 * sender. Perform the check here BEFORE we NAT the packet.
981 1022 */
982 1023 ASSERT(vxlnat_underlay_ire->ire_ill != NULL);
983 1024 newmtu = vxlnat_underlay_ire->ire_ill->ill_mtu - sizeof (ipha_t) -
984 1025 sizeof (udpha_t) - sizeof (vxlan_hdr_t) -
985 1026 sizeof (struct ether_vlan_header);
986 1027 if ((ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_DF) &&
987 1028 ntohs(ipha->ipha_length) > newmtu) {
988 1029 icmp_frag_needed(mp, newmtu, ira);
989 1030 /* We're done. Assume icmp_frag_needed() consumed mp. */
990 1031 return;
991 1032 }
992 1033
993 1034 /*
994 1035 * So we're here, and since we have a refheld IRE, we have a refheld
995 1036 * fixed and vnet. Do some of what ip_input_local_v4() does (inbound
996 1037 * checksum? some ira checks?), but otherwise, swap the destination
997 1038 * address as mapped in "fixed", recompute any checksums, and send it
998 1039 * along its merry way (with a ttl decement too) to a VXLAN
999 1040 * destination.
1000 1041 */
1001 1042 mp = vxlnat_fixed_fixv4(mp, fixed, B_TRUE);
1002 1043 if (mp == NULL)
1003 1044 return; /* Assume it's been freed & dtraced already. */
1004 1045
1005 1046 /*
1006 1047 * Otherwise, we're ready to transmit this packet over the vxlan
1007 1048 * socket.
1008 1049 */
1009 1050 fixed->vxnf_remote = vxlnat_xmit_vxlanv4(mp, &fixed->vxnf_addr,
1010 1051 fixed->vxnf_remote, fixed->vxnf_myether, vnet);
1011 1052 if (fixed->vxnf_remote == NULL) {
1012 1053 /* XXX KEBE ASKS, DTrace probe here? Or in-function? */
1013 1054 DTRACE_PROBE2(vxlnat__fixed__xmitdrop,
1014 1055 in6_addr_t *, &fixed->vxnf_addr,
1015 1056 uint32_t, VXLAN_ID_NTOH(vnet->vxnv_vnetid));
1016 1057 }
1017 1058 return;
1018 1059
1019 1060 detach_ire_and_bail:
1020 1061 /* Oh no, something's condemned. Drop the IRE now. */
1021 1062 ire->ire_recvfn = ire_recv_local_v4;
1022 1063 ire->ire_dep_sib_next = NULL;
1023 1064 VXNF_REFRELE(fixed);
1024 1065 /* Pass the packet back... */
1025 1066 ire_recv_local_v4(ire, mp, iph_arg, ira);
1026 1067 return;
1027 1068 }
|
↓ open down ↓ |
377 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX