1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2018 Joyent, Inc.
14 */
15
16 /*
17 * Writes (new rules) and reads (rule dump) go here. So do the
18 * ins/outs of reading & writing.
19 */
20
21 #include <sys/ddi.h>
22 #include <sys/dtrace.h>
23 #include <sys/debug.h>
24 #include <inet/vxlnat_impl.h>
25 #include <inet/ip_if.h> /* XXX KEBE SAYS CHEESY HACK */
26
27 /*
28 * These are all initialized to NULL or 0.
29 *
30 * If a VXNM_DUMP is requested, these get allocated/set. vxlnat_read()
31 * calls will consume them, and once delivered the last bytes read will
32 * cause these to be freed and reset to NULL/0. Cheesy, but this is a
33 * one-at-a-time thing. Protected by vxlnat_mutex.
34 */
35 static vxn_msg_t *vxlnat_dumpbuf;
36 static size_t vxlnat_initial; /* non-zero if no read yet. */
37 static size_t vxlnat_dumpcount;
38 static size_t vxlnat_dumpcurrent;
39
40 /*
41 * Store per-vnet-state in AVL tree. We could be handling 1000s or more...
42 * Could split this into a hash table of AVL trees if need be.
43 */
44 static krwlock_t vxlnat_vnet_lock; /* Could be mutex if we use refhold. */
45 static avl_tree_t vxlnat_vnets;
46
47 static void vxlnat_rule_unlink(vxlnat_rule_t *);
48 static void vxlnat_fixed_unlink(vxlnat_fixed_t *);
49 /* In vxlnat_nat.c */
50 extern void vxlnat_remote_unlink(vxlnat_remote_t *);
51
52 /*
53 * Comparison function for vnet AVL tree.
54 */
55 static int
56 vxlnat_vnetid_cmp(const void *first, const void *second)
57 {
58 uint32_t first_vnetid, second_vnetid;
59
60 first_vnetid = ((vxlnat_vnet_t *)first)->vxnv_vnetid;
61 second_vnetid = ((vxlnat_vnet_t *)second)->vxnv_vnetid;
62
63 if (first_vnetid < second_vnetid)
64 return (-1);
65 if (first_vnetid > second_vnetid)
66 return (1);
67 return (0);
68 }
69
70 /*
71 *
72 * NOTE: Many structures start with the form:
73 *
74 * struct foo {
75 * avl_node_t node;
76 * in6_addr_t address_which_is_search_key;
77 * ....
78 *
79 * We will use this same AVL comparison function for many of these structures.
80 */
81 int
82 vxlnat_tree_plus_in6_cmp(const void *first, const void *second)
83 {
84 in6_addr_t *firstaddr, *secondaddr;
85 int ret;
86
87 firstaddr = (in6_addr_t *)(((avl_node_t *)first) + 1);
88 secondaddr = (in6_addr_t *)(((avl_node_t *)second) + 1);
89
90 ret = memcmp(firstaddr, secondaddr, sizeof (in6_addr_t));
91 if (ret > 0)
92 return (1);
93 if (ret < 0)
94 return (-1);
95 return (0);
96 }
97
98 /*
99 * Comparison function for NAT flow.
100 */
101 static int
102 vxlnat_flow_cmp_v4(const void *first, const void *second)
103 {
104 vxlnat_flow_t *first_flow = (vxlnat_flow_t *)first;
105 vxlnat_flow_t *second_flow = (vxlnat_flow_t *)second;
106 uint64_t firstaddrs, secondaddrs, firstportproto, secondportproto;
107
108 firstaddrs = first_flow->vxnfl_src._S6_un._S6_u32[3] |
109 (((uint64_t)first_flow->vxnfl_dst._S6_un._S6_u32[3]) << 32ULL);
110 secondaddrs = second_flow->vxnfl_src._S6_un._S6_u32[3] |
111 (((uint64_t)second_flow->vxnfl_dst._S6_un._S6_u32[3]) << 32ULL);
112 firstportproto = first_flow->vxnfl_ports |
113 (((uint64_t)first_flow->vxnfl_protocol) << 32ULL);
114 secondportproto = second_flow->vxnfl_ports |
115 (((uint64_t)second_flow->vxnfl_protocol) << 32ULL);
116
117 if (firstaddrs > secondaddrs)
118 return (1);
119 else if (firstaddrs < secondaddrs)
120 return (-1);
121 else if (firstportproto > secondportproto)
122 return (1);
123 else if (firstportproto < secondportproto)
124 return (-1);
125
126 return (0);
127 }
128
/*
 * Find-and-reference-hold a vnet.  If none present and create_on_miss is
 * set, create one.  "vnetid" MUST be in wire-order and its one (unused)
 * byte cleared.
 *
 * Returns NULL only on a lookup miss with !create_on_miss (kmem_zalloc
 * with KM_SLEEP cannot fail); otherwise the returned vnet carries a
 * reference for the caller, which the caller must VXNV_REFRELE().
 */
vxlnat_vnet_t *
vxlnat_get_vnet(uint32_t vnetid, boolean_t create_on_miss)
{
	vxlnat_vnet_t *vnet, searcher;
	avl_index_t where;

	/* Cheesy, but we KNOW vxnv_vnetid is the only thing checked. */
	searcher.vxnv_vnetid = vnetid;

	/* Writer lock iff we may insert; reader suffices for pure lookup. */
	rw_enter(&vxlnat_vnet_lock, create_on_miss ? RW_WRITER : RW_READER);
	vnet = (vxlnat_vnet_t *)avl_find(&vxlnat_vnets, &searcher, &where);
	if (vnet == NULL && create_on_miss) {
		vnet = kmem_zalloc(sizeof (*vnet), KM_SLEEP);
		/* KM_SLEEP means non-NULL guaranteed. */
		vnet->vxnv_refcount = 1; /* Internment reference. */
		vnet->vxnv_vnetid = vnetid;
		/* Initialize 1-1 mappings... */
		rw_init(&vnet->vxnv_fixed_lock, NULL, RW_DRIVER, NULL);
		avl_create(&vnet->vxnv_fixed_ips, vxlnat_tree_plus_in6_cmp,
		    sizeof (vxlnat_fixed_t), 0);
		/* Initialize NAT rules. (NAT mutex is zeroed-out.) */
		list_create(&vnet->vxnv_rules, sizeof (vxlnat_rule_t), 0);

		/* Initialize NAT flows... */
		rw_init(&vnet->vxnv_flowv4_lock, NULL, RW_DRIVER, NULL);
		avl_create(&vnet->vxnv_flows_v4, vxlnat_flow_cmp_v4,
		    sizeof (vxlnat_flow_t), 0);

		/*
		 * Initialize remote VXLAN destination cache.
		 * (remotes mutex is zeroed-out.)
		 */
		avl_create(&vnet->vxnv_remotes, vxlnat_tree_plus_in6_cmp,
		    sizeof (vxlnat_remote_t), 0);

		/* "where" from avl_find() above is still valid here. */
		avl_insert(&vxlnat_vnets, vnet, where);
	}
	if (vnet != NULL)
		VXNV_REFHOLD(vnet); /* Caller's reference. */
	rw_exit(&vxlnat_vnet_lock);

	return (vnet);
}
176
/*
 * Final destructor for a vnet.  Called once the last reference is
 * dropped; the refcount must already be zero.
 */
void
vxlnat_vnet_free(vxlnat_vnet_t *vnet)
{
	/* XXX KEBE SAYS FILL ME IN */
	ASSERT0(vnet->vxnv_refcount);
	/* XXX KEBE ASKS -- assert detachment? */

	kmem_free(vnet, sizeof (*vnet));
}
186
/*
 * Remove a vnet from the global tree and detach everything hanging off
 * it (NAT rules, 1-1 mappings, remote peers).  Caller must hold
 * vxlnat_vnet_lock as writer.  Drops the internment reference at the
 * end; any reference the caller holds remains intact.
 */
static void
vxlnat_vnet_unlink_locked(vxlnat_vnet_t *vnet)
{
	ASSERT3U(vnet->vxnv_refcount, >=, 1);

	ASSERT(RW_WRITE_HELD(&vxlnat_vnet_lock));
	avl_remove(&vxlnat_vnets, vnet);
	/* XXX KEBE ASKS --> Mark as condemned? */

	/* Unlink all NAT rules */
	mutex_enter(&vnet->vxnv_rule_lock);
	while (!list_is_empty(&vnet->vxnv_rules)) {
		/* Will decrement vnet's refcount too. */
		vxlnat_rule_unlink(
		    (vxlnat_rule_t *)list_head(&vnet->vxnv_rules));
	}
	mutex_exit(&vnet->vxnv_rule_lock);
	/* XXX KEBE SAYS unlink all 1-1 mappings */
	rw_enter(&vnet->vxnv_fixed_lock, RW_WRITER);
	while (!avl_is_empty(&vnet->vxnv_fixed_ips)) {
		/* Will decrement vnet's refcount too. */
		vxlnat_fixed_unlink(
		    (vxlnat_fixed_t *)avl_first(&vnet->vxnv_fixed_ips));
	}
	rw_exit(&vnet->vxnv_fixed_lock);

	/* Unlink all remotes */
	mutex_enter(&vnet->vxnv_remote_lock);
	while (!avl_is_empty(&vnet->vxnv_remotes)) {
		/* Will decrement vnet's refcount too. */
		vxlnat_remote_unlink(
		    (vxlnat_remote_t *)avl_first(&vnet->vxnv_remotes));
	}
	mutex_exit(&vnet->vxnv_remote_lock);

	/* XXX KEBE SAYS unlink all NAT flows */

	VXNV_REFRELE(vnet); /* Internment reference. */
}
226
/*
 * Unlink a vnet the caller has refheld.  Two references are dropped
 * explicitly: the internment reference (inside the _locked worker) and
 * the caller's, plus any rules/mappings/remotes are torn down.
 */
void
vxlnat_vnet_unlink(vxlnat_vnet_t *vnet)
{
	ASSERT3U(vnet->vxnv_refcount, >=, 2);
	rw_enter(&vxlnat_vnet_lock, RW_WRITER);
	vxlnat_vnet_unlink_locked(vnet);
	rw_exit(&vxlnat_vnet_lock);
	/*
	 * At this point, we've decremented the refcount by one with the
	 * unlink. Drop the caller's now.
	 */
	VXNV_REFRELE(vnet);
}
244
/*
 * Add a (vnetid+prefix => external) NAT rule.
 *
 * On success the new rule consumes the vnet reference obtained here and
 * keeps a shared hold on the public address.  Returns 0, EADDRNOTAVAIL
 * (public IP not available for shared use), or ENOMEM (vnet creation
 * failure).
 */
static int
vxlnat_nat_rule(vxn_msg_t *vxnm)
{
	vxlnat_vnet_t *vnet;
	vxlnat_rule_t *rule;
	uint32_t vnetid;

	ASSERT(MUTEX_HELD(&vxlnat_mutex));

	/* Reserve the requested public IP for shared use. */
	if (!vxlnat_public_hold(&vxnm->vxnm_public, B_FALSE))
		return (EADDRNOTAVAIL);

	vnetid = VXLAN_ID_HTON(vxnm->vxnm_vnetid);
	vnet = vxlnat_get_vnet(vnetid, B_TRUE);
	if (vnet == NULL) {
		/* RARE case of failed allocation or other disaster. */
		vxlnat_public_rele(&vxnm->vxnm_public);
		return (ENOMEM);
	}

	/* Now we have a reference-held vnet, create a rule for it. */
	rule = kmem_alloc(sizeof (*rule), KM_SLEEP);
	/* KM_SLEEP means non-NULL guaranteed. */
	rule->vxnr_vnet = vnet; /* vnet already refheld, remember?. */
	/* XXX KEBE ASKS, check the vxnm more carefully? */
	rule->vxnr_myaddr = vxnm->vxnm_private;
	rule->vxnr_pubaddr = vxnm->vxnm_public;
	rule->vxnr_prefix = vxnm->vxnm_prefix;
	/* For easier packet matching, keep vlanid in network order. */
	rule->vxnr_vlanid = htons(vxnm->vxnm_vlanid);
	bcopy(vxnm->vxnm_ether_addr, rule->vxnr_myether, ETHERADDRL);
	rule->vxnr_refcount = 1; /* Internment reference. */
	list_link_init(&rule->vxnr_link);

	/* Put rule into vnet. */
	mutex_enter(&vnet->vxnv_rule_lock);
	/* XXX KEBE ASKS --> Check for collisions?!? */
	list_insert_tail(&vnet->vxnv_rules, rule);
	mutex_exit(&vnet->vxnv_rule_lock);

	return (0);
}
291
/*
 * Final destructor for a NAT rule.  The rule must already be unlinked
 * (condemned: vxnr_vnet cleared, off the list, refcount zero); the
 * shared hold on the public address is released here.
 */
void
vxlnat_rule_free(vxlnat_rule_t *rule)
{
	ASSERT3P(rule->vxnr_vnet, ==, NULL);
	ASSERT3P(rule->vxnr_link.list_next, ==, NULL);
	ASSERT3P(rule->vxnr_link.list_prev, ==, NULL);
	ASSERT0(rule->vxnr_refcount);
	vxlnat_public_rele(&rule->vxnr_pubaddr);
	kmem_free(rule, sizeof (*rule));
}
302
/*
 * Detach a rule from its vnet.  Caller must hold the vnet's rule lock.
 * Drops the vnet hold the rule carried and the rule's internment
 * reference (which may free the rule).
 */
static void
vxlnat_rule_unlink(vxlnat_rule_t *rule)
{
	vxlnat_vnet_t *vnet = rule->vxnr_vnet;

	ASSERT3P(vnet, !=, NULL);
	ASSERT(MUTEX_HELD(&vnet->vxnv_rule_lock));

	list_remove(&vnet->vxnv_rules, rule);
	VXNV_REFRELE(vnet);
	rule->vxnr_vnet = NULL; /* This condemns this rule. */
	VXNR_REFRELE(rule);
}
316
/*
 * Flush all state: close the VXLAN socket, unlink every vnet, and
 * discard any dump a reader has not fully consumed.  Always returns 0.
 */
static int
vxlnat_flush(void)
{
	vxlnat_closesock();
	/* XXX KEBE SAYS DO OTHER STATE FLUSHING TOO. */

	/* Flush out vnets. */
	rw_enter(&vxlnat_vnet_lock, RW_WRITER);
	while (!avl_is_empty(&vxlnat_vnets))
		vxlnat_vnet_unlink_locked(avl_first(&vxlnat_vnets));
	rw_exit(&vxlnat_vnet_lock);
	/* Reset pending dump state (protected by vxlnat_mutex). */
	if (vxlnat_dumpbuf != NULL) {
		kmem_free(vxlnat_dumpbuf,
		    vxlnat_dumpcount * sizeof (vxn_msg_t));
		vxlnat_dumpbuf = NULL;
		vxlnat_initial = vxlnat_dumpcount = vxlnat_dumpcurrent = 0;
	}
	return (0);
}
336
/*
 * Final destructor for a 1-1 mapping.  Refcount must already be zero;
 * the exclusive hold on the public address is released here.
 */
void
vxlnat_fixed_free(vxlnat_fixed_t *fixed)
{
	ASSERT0(fixed->vxnf_refcount);

	vxlnat_public_rele(&fixed->vxnf_pubaddr);
	kmem_free(fixed, sizeof (*fixed));
}
345
/*
 * Detach a 1-1 mapping from its vnet.  Caller must hold the vnet's
 * fixed lock as writer.  Unhooks the IRE backpointer hack, drops any
 * cached remote, and releases both the vnet's hold on the mapping and
 * the internment reference (which may free the mapping).
 */
static void
vxlnat_fixed_unlink(vxlnat_fixed_t *fixed)
{
	vxlnat_vnet_t *vnet = fixed->vxnf_vnet;
	ire_t *ire = fixed->vxnf_ire;

	ASSERT3P(vnet, !=, NULL);
	ASSERT(RW_WRITE_HELD(&vnet->vxnv_fixed_lock));

	/* Rid ourselves of the IRE now. */
	if (ire != NULL) {
		ASSERT(ire->ire_type == IRE_LOCAL);
		/* ire_dep_sib_next doubles as a backpointer to 'fixed'. */
		ASSERT3P((void *)ire->ire_dep_sib_next, ==, (void *)fixed);

		/* XXX KEBE SAYS CHEESY HACK. */
		if (fixed->vxnf_clear_router)
			ire->ire_ill->ill_flags &= ~ILLF_ROUTER;

		ire->ire_dep_sib_next = NULL;
		VXNF_REFRELE(fixed); /* ire's hold on us. */
		/* Rewire IRE back to normal. */
		ire->ire_recvfn = (ire->ire_ipversion == IPV4_VERSION) ?
		    ire_recv_local_v4 : ire_recv_local_v6;
		ire_refrele(ire);
	}

	/* And the remote, if it's there. */
	if (fixed->vxnf_remote != NULL) {
		VXNREM_REFRELE(fixed->vxnf_remote);
		fixed->vxnf_remote = NULL;
	}

	avl_remove(&vnet->vxnv_fixed_ips, fixed);
	fixed->vxnf_vnet = NULL; /* This condemns this 1-1 mapping. */
	VXNV_REFRELE(vnet);
	VXNF_REFRELE(fixed);
}
383
384 /*
385 * Add a 1-1 (vnetid+IP <==> external) rule.
386 */
387 static int
388 vxlnat_fixed_ip(vxn_msg_t *vxnm)
389 {
390 vxlnat_vnet_t *vnet;
391 vxlnat_fixed_t *fixed;
392 uint32_t vnetid;
393 avl_index_t where;
394 int rc;
395 ire_t *ire;
396 ip_stack_t *ipst;
397
398 /* XXX KEBE SAYS FILL ME IN. */
399 ASSERT(MUTEX_HELD(&vxlnat_mutex));
400
401 /* Reserve the requested public IP for exclusive use. */
402 if (!vxlnat_public_hold(&vxnm->vxnm_public, B_TRUE))
403 return (EADDRNOTAVAIL);
404
405 vnetid = VXLAN_ID_HTON(vxnm->vxnm_vnetid);
406 vnet = vxlnat_get_vnet(vnetid, B_TRUE);
407 if (vnet == NULL) {
408 /* RARE case of failed allocation or other disaster. */
409 rc = ENOMEM;
410 goto fail;
411 }
412
413 fixed = kmem_zalloc(sizeof (*fixed), KM_SLEEP);
414 /* KM_SLEEP means non-NULL guaranteed. */
415 fixed->vxnf_vnet = vnet; /* vnet already refheld, remember? */
416 /* XXX KEBE ASKS, check the vxnm more carefully? */
417 fixed->vxnf_addr = vxnm->vxnm_private;
418 fixed->vxnf_pubaddr = vxnm->vxnm_public;
419 fixed->vxnf_refcount = 1; /* Internment reference. */
420 bcopy(&vxnm->vxnm_ether_addr, &fixed->vxnf_myether, ETHERADDRL);
421 fixed->vxnf_vlanid = htons(vxnm->vxnm_vlanid);
422
423 /*
424 * Find a local-address IRE for the public address.
425 */
426 ipst = vxlnat_netstack->netstack_ip;
427 ire = IN6_IS_ADDR_V4MAPPED(&fixed->vxnf_pubaddr) ?
428 ire_ftable_lookup_simple_v4(fixed->vxnf_pubaddr._S6_un._S6_u32[3],
429 0, ipst, NULL) :
430 ire_ftable_lookup_simple_v6(&fixed->vxnf_pubaddr, 0, ipst, NULL);
431
432 if (ire == NULL) {
433 /*
434 * Can't find a local IRE. For now, return.
435 * XXX KEBE ASKS --> Do we instead put a new entry in
436 * there? Or do we count on zone/netstack configuration
437 * to make sure the requested external address is there?!
438 */
439 kmem_free(fixed, sizeof (*fixed));
440 rc = EADDRNOTAVAIL;
441 goto fail;
442 }
443
444 /*
445 * Check the IRE for appropriate properties.
446 *
447 * This may change as we implement, but for now, we MUST have an ipif
448 * (local address) for the public IP. This can/should be on the
449 * public NIC OR on a my-netstack-only etherstub to enable
450 * instantiating redundant versions of vxlnat on other netstacks on
451 * other {zones,machines} without triggering DAD.
452 */
453 if (ire->ire_type != IRE_LOCAL) {
454 ire_refrele(ire);
455 kmem_free(fixed, sizeof (*fixed));
456 rc = EADDRNOTAVAIL; /* XXX KEBE ASKS different errno? */
457 goto fail;
458 }
459
460 /* Put the 1-1 mapping in place. */
461 rw_enter(&vnet->vxnv_fixed_lock, RW_WRITER);
462 if (avl_find(&vnet->vxnv_fixed_ips, fixed, &where) != NULL) {
463 /* Oh crap, we have an internal IP mapped already. */
464 ire_refrele(ire);
465 kmem_free(fixed, sizeof (*fixed));
466 rc = EEXIST;
467 } else {
468 avl_insert(&vnet->vxnv_fixed_ips, fixed, where);
469 rc = 0;
470 /*
471 * ODD USE OF POINTERS WARNING: I'm going to use
472 * ire_dep_sib_next for this IRE_LOCAL as a backpointer to
473 * this 'fixed'. This'll allow rapid packet processing.
474 * Inspection seems to indicate that IRE_LOCAL ires NEVER use
475 * the ire_dep* pointers, so we'll use one (and independent of
476 * ip_stack_t's ips_ire_dep_lock as well). If I'm wrong,
477 * fix it here and add a new pointer in ip.h for ire_t.
478 */
479 ire->ire_dep_sib_next = (ire_t *)fixed;
480 VXNF_REFHOLD(fixed); /* ire holds us too... */
481 fixed->vxnf_ire = ire;
482 /* and then rewire the ire receive and send functions. */
483 if (ire->ire_ipversion == IPV4_VERSION) {
484 ire->ire_recvfn = vxlnat_fixed_ire_recv_v4;
485 ire->ire_sendfn = vxlnat_fixed_ire_send_v4;
486 } else {
487 ASSERT(ire->ire_ipversion == IPV6_VERSION);
488 ire->ire_recvfn = vxlnat_fixed_ire_recv_v6;
489 ire->ire_sendfn = vxlnat_fixed_ire_send_v6;
490 }
491 #if 1 /* Cheesy hack */
492 /*
493 * XXX KEBE SAYS CHEESY HACK:
494 */
495 if (!(ire->ire_ill->ill_flags & ILLF_ROUTER)) {
496 fixed->vxnf_clear_router = B_TRUE;
497 ire->ire_ill->ill_flags |= ILLF_ROUTER;
498 } else {
499 /* Just so we're clear... */
500 fixed->vxnf_clear_router = B_FALSE;
501 }
502 #endif /* Cheesy hack */
503 }
504 rw_exit(&vnet->vxnv_fixed_lock);
505
506 fail:
507 if (rc != 0)
508 vxlnat_public_rele(&vxnm->vxnm_public);
509
510 return (rc);
511 }
512
513 static void
514 vxlnat_rule_to_msg(vxn_msg_t *msg, vxlnat_rule_t *rule)
515 {
516 msg->vxnm_type = VXNM_RULE;
517 msg->vxnm_vnetid = VXLAN_ID_NTOH(rule->vxnr_vnet->vxnv_vnetid);
518 msg->vxnm_prefix = rule->vxnr_prefix;
519 msg->vxnm_vlanid = ntohs(rule->vxnr_vlanid);
520 bcopy(rule->vxnr_myether, msg->vxnm_ether_addr, ETHERADDRL);
521 msg->vxnm_public = rule->vxnr_pubaddr;
522 msg->vxnm_private = rule->vxnr_myaddr;
523 }
524
525 static void
526 vxlnat_fixed_to_msg(vxn_msg_t *msg, vxlnat_fixed_t *fixed)
527 {
528 msg->vxnm_type = VXNM_FIXEDIP;
529 msg->vxnm_vnetid = VXLAN_ID_NTOH(fixed->vxnf_vnet->vxnv_vnetid);
530 msg->vxnm_prefix = 0;
531 msg->vxnm_vlanid = ntohs(fixed->vxnf_vlanid);
532 bcopy(fixed->vxnf_myether, msg->vxnm_ether_addr, ETHERADDRL);
533 msg->vxnm_public = fixed->vxnf_pubaddr;
534 msg->vxnm_private = fixed->vxnf_addr;
535 }
536
537 static int
538 vxlnat_dump(void)
539 {
540 int rc = 0;
541 size_t entries = 0;
542 vxlnat_vnet_t *vnet;
543 vxlnat_fixed_t *fixed;
544 vxlnat_rule_t *rule;
545 vxn_msg_t *current;
546
547 ASSERT(MUTEX_HELD(&vxlnat_mutex));
548
549 /*
550 * XXX KEBE SAYS setup vxlnat_dump* above.
551 * XXX KEBE SAYS If function fails for reasons that aren't "dump in
552 * progress", make sure it keeps vxlnat_dump* stuff clean
553 *
554 * NOTE: Other commands are excluded at this point, but packet
555 * processing is not. OTOH, packet processing doesn't affect any
556 * entities we dump (at this time). We only dump things that can be
557 * added with commands. (So no remote VXLAN peers and no NAT flows.)
558 */
559
560 /* Lock down things. */
561 rw_enter(&vxlnat_vnet_lock, RW_READER);
562 if (avl_numnodes(&vxlnat_vnets) == 0)
563 goto bail; /* Nothing to see here, move along. */
564
565 /*
566 * This is going to be inefficient, requiring two passes through each
567 * vnet. The first pass locks-down and counts. Then we allocate
568 * based on the count. The second pass copies out and unlocks.
569 */
570 for (vnet = avl_first(&vxlnat_vnets); vnet != NULL;
571 vnet = AVL_NEXT(&vxlnat_vnets, vnet)) {
572 rw_enter(&vnet->vxnv_fixed_lock, RW_READER);
573 entries += avl_numnodes(&vnet->vxnv_fixed_ips);
574 mutex_enter(&vnet->vxnv_rule_lock);
575 /* Let's hope this isn't a big number... */
576 for (rule = list_head(&vnet->vxnv_rules); rule != NULL;
577 rule = list_next(&vnet->vxnv_rules, rule)) {
578 entries++;
579 }
580 /* XXX KEBE ASKS -- other fields?!? */
581 }
582 if (entries == 0)
583 goto bail; /* VNETs but with no rules AND no 1-1s?!? */
584 /* Don't be too agressive in allocating this. */
585 vxlnat_dumpbuf = kmem_alloc(entries * sizeof (vxn_msg_t),
586 KM_NOSLEEP | KM_NORMALPRI);
587 if (vxlnat_dumpbuf == NULL)
588 rc = ENOMEM; /* We still have to unlock everything. */
589 current = vxlnat_dumpbuf;
590
591 /* Second pass. */
592 for (vnet = avl_first(&vxlnat_vnets); vnet != NULL;
593 vnet = AVL_NEXT(&vxlnat_vnets, vnet)) {
594 /* XXX KEBE ASKS -- other fields?!? */
595 for (rule = list_head(&vnet->vxnv_rules); rule != NULL;
596 rule = list_next(&vnet->vxnv_rules, rule)) {
597 if (rc == 0) {
598 vxlnat_rule_to_msg(current, rule);
599 current++;
600 }
601 }
602 mutex_exit(&vnet->vxnv_rule_lock);
603 for (fixed = avl_first(&vnet->vxnv_fixed_ips); fixed != NULL;
604 fixed = AVL_NEXT(&vnet->vxnv_fixed_ips, fixed)) {
605 if (rc == 0) {
606 vxlnat_fixed_to_msg(current, fixed);
607 current++;
608 }
609 }
610 rw_exit(&vnet->vxnv_fixed_lock);
611 }
612 vxlnat_dumpcount = vxlnat_initial = entries;
613 vxlnat_dumpcurrent = 0;
614 ASSERT3P((vxlnat_dumpbuf + entries), ==, current);
615
616 bail:
617 rw_exit(&vxlnat_vnet_lock);
618 return (rc);
619 }
620
621 int
622 vxlnat_command(vxn_msg_t *vxnm)
623 {
624 int rc;
625
626 switch (vxnm->vxnm_type) {
627 case VXNM_VXLAN_ADDR:
628 rc = vxlnat_vxlan_addr(&vxnm->vxnm_private);
629 break;
630 case VXNM_RULE:
631 rc = vxlnat_nat_rule(vxnm);
632 break;
633 case VXNM_FIXEDIP:
634 rc = vxlnat_fixed_ip(vxnm);
635 break;
636 case VXNM_FLUSH:
637 rc = vxlnat_flush();
638 break;
639 case VXNM_DUMP:
640 rc = vxlnat_dump();
641 break;
642 default:
643 rc = EINVAL;
644 break;
645 }
646
647 return (rc);
648 }
649
/*
 * One-time global state setup: the vnet tree, its lock, and the public
 * address bookkeeping.  Caller must hold vxlnat_mutex.
 */
void
vxlnat_state_init(void)
{
	ASSERT(MUTEX_HELD(&vxlnat_mutex));
	rw_init(&vxlnat_vnet_lock, NULL, RW_DRIVER, NULL);
	avl_create(&vxlnat_vnets, vxlnat_vnetid_cmp, sizeof (vxlnat_vnet_t), 0);
	vxlnat_public_init();
	/* XXX KEBE SAYS -- more here. */
}
659
660 void
661 vxlnat_state_fini(void)
662 {
663 ASSERT(MUTEX_HELD(&vxlnat_mutex));
664 (void) vxlnat_flush(); /* If we fail, we're in bigger trouble anyway. */
665 vxlnat_public_init();
666 avl_destroy(&vxlnat_vnets);
667 rw_destroy(&vxlnat_vnet_lock);
668 }
669
/*
 * Deliver dump data to a reader.  Protocol: the first read after a
 * successful VXNM_DUMP returns an 8-byte total-entry count; subsequent
 * reads return whole vxn_msg_t records until the dump is exhausted, at
 * which point the dump buffer is freed and reset.  Partial records are
 * never delivered.  Returns 0 or a uiomove() error.
 */
int
vxlnat_read_dump(struct uio *uiop)
{
	int rc = 0;
	size_t dumpprogress = 0;	/* Records delivered by THIS call. */

	mutex_enter(&vxlnat_mutex);

	/*
	 * Initial-case ==> dumpbuf with none delivered yet.
	 * Utter an 8-byte count.
	 */
	if (vxlnat_initial != 0 && uiop->uio_resid >= sizeof (uint64_t)) {
		uint64_t total = vxlnat_dumpcount;

		ASSERT(vxlnat_dumpbuf != NULL && vxlnat_dumpcurrent == 0);
		rc = uiomove(&total, sizeof (uint64_t), UIO_READ, uiop);
		if (rc != 0)
			goto bail;
		vxlnat_initial = 0;	/* Count is delivered exactly once. */
	}

	/* XXX KEBE THINKS -- if no dump buffer, just return w/o data. */
	while (rc == 0 && vxlnat_dumpbuf != NULL &&
	    uiop->uio_resid >= sizeof (vxn_msg_t)) {
		rc = uiomove(vxlnat_dumpbuf + vxlnat_dumpcurrent,
		    sizeof (vxn_msg_t), UIO_READ, uiop);
		if (rc != 0) {
			/*
			 * XXX KEBE ASKS, destroy or preserve dumpstate?
			 * Fill in answer here.
			 */
			break;
		}
		vxlnat_dumpcurrent++;
		dumpprogress++;
		/* Last record consumed ==> tear down the dump state. */
		if (vxlnat_dumpcurrent == vxlnat_dumpcount) {
			kmem_free(vxlnat_dumpbuf,
			    vxlnat_dumpcount * sizeof (vxn_msg_t));
			vxlnat_dumpbuf = NULL;
			vxlnat_dumpcount = vxlnat_dumpcurrent = 0;
		}
	}

bail:
	/*
	 * If there's room at the end, just ignore that space for now. Handy
	 * DTrace probe below notes amount of extra bytes..
	 */
	DTRACE_PROBE1(vxlnat__read__extrabytes, ssize_t, uiop->uio_resid);
	/* Note progress of dump with DTrace probes. */
	DTRACE_PROBE3(vxlnat__read__dumpprogress, size_t, dumpprogress, size_t,
	    vxlnat_dumpcurrent, size_t, vxlnat_dumpcount);

	mutex_exit(&vxlnat_mutex);
	return (rc);
}