1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2018 Joyent, Inc.
  14  */
  15 
  16 /*
  17  * Writes (new rules) and reads (rule dump) go here.  So do the
  18  * ins/outs of reading & writing.
  19  */
  20 
  21 #include <sys/ddi.h>
  22 #include <sys/dtrace.h>
  23 #include <sys/debug.h>
  24 #include <inet/vxlnat_impl.h>
  25 #include <inet/ip_if.h>   /* XXX KEBE SAYS CHEESY HACK */
  26 
  27 /*
  28  * These are all initialized to NULL or 0.
  29  *
  30  * If a VXNM_DUMP is requested, these get allocated/set.  vxlnat_read()
  31  * calls will consume them, and once delivered the last bytes read will
  32  * cause these to be freed and reset to NULL/0.  Cheesy, but this is a
  33  * one-at-a-time thing.  Protected by vxlnat_mutex.
  34  */
  35 static vxn_msg_t *vxlnat_dumpbuf;
  36 static size_t vxlnat_initial;   /* non-zero if no read yet. */
  37 static size_t vxlnat_dumpcount;
  38 static size_t vxlnat_dumpcurrent;
  39 
  40 /*
  41  * Store per-vnet-state in AVL tree.  We could be handling 1000s or more...
  42  * Could split this into a hash table of AVL trees if need be.
  43  */
  44 static krwlock_t vxlnat_vnet_lock;      /* Could be mutex if we use refhold. */
  45 static avl_tree_t vxlnat_vnets;
  46 
  47 static void vxlnat_rule_unlink(vxlnat_rule_t *);
  48 static void vxlnat_fixed_unlink(vxlnat_fixed_t *);
  49 /* In vxlnat_nat.c */
  50 extern void vxlnat_remote_unlink(vxlnat_remote_t *);
  51 
  52 /*
  53  * Comparison function for vnet AVL tree.
  54  */
  55 static int
  56 vxlnat_vnetid_cmp(const void *first, const void *second)
  57 {
  58         uint32_t first_vnetid, second_vnetid;
  59 
  60         first_vnetid = ((vxlnat_vnet_t *)first)->vxnv_vnetid;
  61         second_vnetid = ((vxlnat_vnet_t *)second)->vxnv_vnetid;
  62 
  63         if (first_vnetid < second_vnetid)
  64                 return (-1);
  65         if (first_vnetid > second_vnetid)
  66                 return (1);
  67         return (0);
  68 }
  69 
  70 /*
  71  *
  72  * NOTE:  Many structures start with the form:
  73  *
  74  *      struct foo {
  75  *              avl_node_t node;
  76  *              in6_addr_t address_which_is_search_key;
  77  *              ....
  78  *
  79  * We will use this same AVL comparison function for many of these structures.
  80  */
  81 int
  82 vxlnat_tree_plus_in6_cmp(const void *first, const void *second)
  83 {
  84         in6_addr_t *firstaddr, *secondaddr;
  85         int ret;
  86 
  87         firstaddr = (in6_addr_t *)(((avl_node_t *)first) + 1);
  88         secondaddr = (in6_addr_t *)(((avl_node_t *)second) + 1);
  89 
  90         ret = memcmp(firstaddr, secondaddr, sizeof (in6_addr_t));
  91         if (ret > 0)
  92                 return (1);
  93         if (ret < 0)
  94                 return (-1);
  95         return (0);
  96 }
  97 
  98 /*
  99  * Find-and-reference-hold a vnet.  If none present, create one.
 100  * "vnetid" MUST be in wire-order and its one byte cleared.
 101  */
 102 vxlnat_vnet_t *
 103 vxlnat_get_vnet(uint32_t vnetid, boolean_t create_on_miss)
 104 {
 105         vxlnat_vnet_t *vnet, searcher;
 106         avl_index_t where;
 107 
 108         /* Cheesy, but we KNOW vxnv_vnetid is the only thing checked. */
 109         searcher.vxnv_vnetid = vnetid;
 110 
 111         rw_enter(&vxlnat_vnet_lock, create_on_miss ? RW_WRITER : RW_READER);
 112         vnet = (vxlnat_vnet_t *)avl_find(&vxlnat_vnets, &searcher, &where);
 113         if (vnet == NULL && create_on_miss) {
 114                 vnet = kmem_zalloc(sizeof (*vnet), KM_SLEEP);
 115                 /* KM_SLEEP means non-NULL guaranteed. */
 116                 vnet->vxnv_refcount = 1; /* Internment reference. */
 117                 vnet->vxnv_vnetid = vnetid;
 118                 /* Initialize 1-1 mappings... */
 119                 rw_init(&vnet->vxnv_fixed_lock, NULL, RW_DRIVER, NULL);
 120                 avl_create(&vnet->vxnv_fixed_ips, vxlnat_tree_plus_in6_cmp,
 121                     sizeof (vxlnat_fixed_t), 0);
 122                 /* Initialize NAT rules.  (NAT mutex is zeroed-out.) */
 123                 list_create(&vnet->vxnv_rules, sizeof (vxlnat_rule_t), 0);
 124 #ifdef notyet
 125                 /* XXX KEBE SAYS INITIALIZE NAT flows... */
 126 #endif /* notyet */
 127                 /*
 128                  * Initialize remote VXLAN destination cache.
 129                  * (remotes mutex is zeroed-out.)
 130                  */
 131                 avl_create(&vnet->vxnv_remotes, vxlnat_tree_plus_in6_cmp,
 132                     sizeof (vxlnat_remote_t), 0);
 133 
 134                 avl_insert(&vxlnat_vnets, vnet, where);
 135         }
 136         if (vnet != NULL)
 137                 VXNV_REFHOLD(vnet);     /* Caller's reference. */
 138         rw_exit(&vxlnat_vnet_lock);
 139 
 140         return (vnet);
 141 }
 142 
/*
 * Final destructor for a vnet, reached once its refcount hits zero
 * (presumably via VXNV_REFRELE — confirm against vxlnat_impl.h).
 * The vnet must already be unlinked from vxlnat_vnets.
 */
void
vxlnat_vnet_free(vxlnat_vnet_t *vnet)
{
        /* XXX KEBE SAYS FILL ME IN */
        ASSERT0(vnet->vxnv_refcount);
        /* XXX KEBE ASKS -- assert detachment? */

        kmem_free(vnet, sizeof (*vnet));
}
 152 
/*
 * Remove one vnet from the global tree and dismantle its contents:
 * every NAT rule, 1-1 mapping, and remote is unlinked (each unlink
 * drops one of the vnet's references), then the internment reference
 * is released.  Caller must hold vxlnat_vnet_lock as writer.
 */
static void
vxlnat_vnet_unlink_locked(vxlnat_vnet_t *vnet)
{
        ASSERT3U(vnet->vxnv_refcount, >=, 1);

        ASSERT(RW_WRITE_HELD(&vxlnat_vnet_lock));
        avl_remove(&vxlnat_vnets, vnet);
        /* XXX KEBE ASKS --> Mark as condemned? */

        /* Unlink all NAT rules */
        mutex_enter(&vnet->vxnv_rule_lock);
        while (!list_is_empty(&vnet->vxnv_rules)) {
                /* Will decrement vnet's refcount too. */
                vxlnat_rule_unlink(
                    (vxlnat_rule_t *)list_head(&vnet->vxnv_rules));
        }
        mutex_exit(&vnet->vxnv_rule_lock);
        /* XXX KEBE SAYS unlink all 1-1 mappings */
        rw_enter(&vnet->vxnv_fixed_lock, RW_WRITER);
        while (!avl_is_empty(&vnet->vxnv_fixed_ips)) {
                /* Will decrement vnet's refcount too. */
                vxlnat_fixed_unlink(
                    (vxlnat_fixed_t *)avl_first(&vnet->vxnv_fixed_ips));
        }
        rw_exit(&vnet->vxnv_fixed_lock);

        /* Unlink all remotes */
        mutex_enter(&vnet->vxnv_remote_lock);
        while (!avl_is_empty(&vnet->vxnv_remotes)) {
                /* Will decrement vnet's refcount too. */
                vxlnat_remote_unlink(
                    (vxlnat_remote_t *)avl_first(&vnet->vxnv_remotes));
        }
        mutex_exit(&vnet->vxnv_remote_lock);

        /* XXX KEBE SAYS unlink all NAT flows */

        VXNV_REFRELE(vnet);     /* Internment reference. */
}
 192 
 193 /*
 194  * Assume it's refheld by the caller, so we will drop two references
 195  * explicitly (caller's and internment), plus free any rules.
 196  */
 197 void
 198 vxlnat_vnet_unlink(vxlnat_vnet_t *vnet)
 199 {
 200         ASSERT3U(vnet->vxnv_refcount, >=, 2);
 201         rw_enter(&vxlnat_vnet_lock, RW_WRITER);
 202         vxlnat_vnet_unlink_locked(vnet);
 203         rw_exit(&vxlnat_vnet_lock);
 204         /*
 205          * At this point, we've decremented the refcount by one with the
 206          * unlink. Drop the caller's now.
 207          */
 208         VXNV_REFRELE(vnet);
 209 }
 210 
 211 /*
 212  * Add a (vnetid+prefix => external) rule.
 213  */
 214 static int
 215 vxlnat_nat_rule(vxn_msg_t *vxnm)
 216 {
 217         vxlnat_vnet_t *vnet;
 218         vxlnat_rule_t *rule;
 219         uint32_t vnetid;
 220 
 221         ASSERT(MUTEX_HELD(&vxlnat_mutex));
 222 
 223         /* Reserve the requested public IP for shared use. */
 224         if (!vxlnat_public_hold(&vxnm->vxnm_public, B_FALSE))
 225                 return (EADDRNOTAVAIL);
 226 
 227         vnetid = VXLAN_ID_HTON(vxnm->vxnm_vnetid);
 228         vnet = vxlnat_get_vnet(vnetid, B_TRUE);
 229         if (vnet == NULL) {
 230                 /* RARE case of failed allocation or other disaster. */
 231                 vxlnat_public_rele(&vxnm->vxnm_public);
 232                 return (ENOMEM);
 233         }
 234 
 235         /* Now we have a reference-held vnet, create a rule for it. */
 236         rule = kmem_alloc(sizeof (*rule), KM_SLEEP);
 237         /* KM_SLEEP means non-NULL guaranteed. */
 238         rule->vxnr_vnet = vnet;      /* vnet already refheld, remember?. */
 239         /* XXX KEBE ASKS, check the vxnm more carefully? */
 240         rule->vxnr_myaddr = vxnm->vxnm_private;
 241         rule->vxnr_pubaddr = vxnm->vxnm_public;
 242         rule->vxnr_prefix = vxnm->vxnm_prefix;
 243         /* For easier packet matching, keep vlanid in network order. */
 244         rule->vxnr_vlanid = htons(vxnm->vxnm_vlanid);
 245         bcopy(vxnm->vxnm_ether_addr, rule->vxnr_myether, ETHERADDRL);
 246         rule->vxnr_refcount = 1;     /* Internment reference. */
 247         list_link_init(&rule->vxnr_link);
 248 
 249         /* Put rule into vnet. */
 250         mutex_enter(&vnet->vxnv_rule_lock);
 251         /* XXX KEBE ASKS --> Check for collisions?!? */
 252         list_insert_tail(&vnet->vxnv_rules, rule);
 253         mutex_exit(&vnet->vxnv_rule_lock);
 254 
 255         return (0);
 256 }
 257 
/*
 * Final destructor for a rule: refcount must be zero and the rule
 * must already be unlinked/condemned (NULL vnet backpointer, list
 * links cleared).  Releases the shared hold on the public address.
 */
void
vxlnat_rule_free(vxlnat_rule_t *rule)
{
        ASSERT3P(rule->vxnr_vnet, ==, NULL);
        ASSERT3P(rule->vxnr_link.list_next, ==, NULL);
        ASSERT3P(rule->vxnr_link.list_prev, ==, NULL);
        ASSERT0(rule->vxnr_refcount);
        vxlnat_public_rele(&rule->vxnr_pubaddr);
        kmem_free(rule, sizeof (*rule));
}
 268 
/*
 * Detach a rule from its vnet.  Caller must hold the vnet's rule
 * lock.  Drops the rule's hold on the vnet and the rule's internment
 * reference; the rule is freed once any remaining holders release it.
 */
static void
vxlnat_rule_unlink(vxlnat_rule_t *rule)
{
        vxlnat_vnet_t *vnet = rule->vxnr_vnet;

        ASSERT3P(vnet, !=, NULL);
        ASSERT(MUTEX_HELD(&vnet->vxnv_rule_lock));

        list_remove(&vnet->vxnv_rules, rule);
        VXNV_REFRELE(vnet);
        rule->vxnr_vnet = NULL;      /* This condemns this rule. */
        VXNR_REFRELE(rule);
}
 282 
/*
 * Drop all global state: close the VXLAN socket, unlink every vnet,
 * and discard any in-progress dump buffer.  Always returns 0.
 * NOTE(review): touches vxlnat_dumpbuf, which elsewhere is described
 * as protected by vxlnat_mutex — presumably callers hold it; confirm.
 */
static int
vxlnat_flush(void)
{
        vxlnat_closesock();
        /* XXX KEBE SAYS DO OTHER STATE FLUSHING TOO. */

        /* Flush out vnets. */
        rw_enter(&vxlnat_vnet_lock, RW_WRITER);
        while (!avl_is_empty(&vxlnat_vnets))
                vxlnat_vnet_unlink_locked(avl_first(&vxlnat_vnets));
        rw_exit(&vxlnat_vnet_lock);
        if (vxlnat_dumpbuf != NULL) {
                kmem_free(vxlnat_dumpbuf,
                    vxlnat_dumpcount * sizeof (vxn_msg_t));
                vxlnat_dumpbuf = NULL;
                vxlnat_initial = vxlnat_dumpcount = vxlnat_dumpcurrent = 0;
        }
        return (0);
}
 302 
/*
 * Final destructor for a 1-1 mapping: refcount must be zero.
 * Releases the exclusive hold on the public address.
 */
void
vxlnat_fixed_free(vxlnat_fixed_t *fixed)
{
        ASSERT0(fixed->vxnf_refcount);

        vxlnat_public_rele(&fixed->vxnf_pubaddr);
        kmem_free(fixed, sizeof (*fixed));
}
 311 
/*
 * Detach a 1-1 mapping from its vnet.  Caller must hold the vnet's
 * fixed lock as writer.  Unwinds the IRE hijack set up by
 * vxlnat_fixed_ip() (backpointer, receive function, ILLF_ROUTER),
 * drops the remote's hold if any, then releases the mapping's hold
 * on the vnet and its internment reference.
 */
static void
vxlnat_fixed_unlink(vxlnat_fixed_t *fixed)
{
        vxlnat_vnet_t *vnet = fixed->vxnf_vnet;
        ire_t *ire = fixed->vxnf_ire;

        ASSERT3P(vnet, !=, NULL);
        ASSERT(RW_WRITE_HELD(&vnet->vxnv_fixed_lock));

        /* Rid ourselves of the IRE now. */
        if (ire != NULL) {
                ASSERT(ire->ire_type == IRE_LOCAL);
                /* ire_dep_sib_next doubles as a backpointer to us. */
                ASSERT3P((void *)ire->ire_dep_sib_next, ==, (void *)fixed);

                /* XXX KEBE SAYS CHEESY HACK. */
                if (fixed->vxnf_clear_router)
                        ire->ire_ill->ill_flags &= ~ILLF_ROUTER;

                ire->ire_dep_sib_next = NULL;
                VXNF_REFRELE(fixed);    /* ire's hold on us. */
                /* Rewire IRE back to normal. */
                ire->ire_recvfn = (ire->ire_ipversion == IPV4_VERSION) ?
                    ire_recv_local_v4 : ire_recv_local_v6;
                ire_refrele(ire);
        }

        /* And the remote, if it's there. */
        if (fixed->vxnf_remote != NULL) {
                VXNREM_REFRELE(fixed->vxnf_remote);
                fixed->vxnf_remote = NULL;
        }

        avl_remove(&vnet->vxnv_fixed_ips, fixed);
        fixed->vxnf_vnet = NULL; /* This condemns this 1-1 mapping. */
        VXNV_REFRELE(vnet);
        VXNF_REFRELE(fixed);
}
 349 
 350 /*
 351  * Add a 1-1 (vnetid+IP <==> external) rule.
 352  */
 353 static int
 354 vxlnat_fixed_ip(vxn_msg_t *vxnm)
 355 {
 356         vxlnat_vnet_t *vnet;
 357         vxlnat_fixed_t *fixed;
 358         uint32_t vnetid;
 359         avl_index_t where;
 360         int rc;
 361         ire_t *ire;
 362         ip_stack_t *ipst;
 363 
 364         /* XXX KEBE SAYS FILL ME IN. */
 365         ASSERT(MUTEX_HELD(&vxlnat_mutex));
 366 
 367         /* Reserve the requested public IP for exclusive use. */
 368         if (!vxlnat_public_hold(&vxnm->vxnm_public, B_TRUE))
 369                 return (EADDRNOTAVAIL);
 370 
 371         vnetid = VXLAN_ID_HTON(vxnm->vxnm_vnetid);
 372         vnet = vxlnat_get_vnet(vnetid, B_TRUE);
 373         if (vnet == NULL) {
 374                 /* RARE case of failed allocation or other disaster. */
 375                 rc = ENOMEM;
 376                 goto fail;
 377         }
 378 
 379         fixed = kmem_zalloc(sizeof (*fixed), KM_SLEEP);
 380         /* KM_SLEEP means non-NULL guaranteed. */
 381         fixed->vxnf_vnet = vnet; /* vnet already refheld, remember? */
 382         /* XXX KEBE ASKS, check the vxnm more carefully? */
 383         fixed->vxnf_addr = vxnm->vxnm_private;
 384         fixed->vxnf_pubaddr = vxnm->vxnm_public;
 385         fixed->vxnf_refcount = 1;    /* Internment reference. */
 386         bcopy(&vxnm->vxnm_ether_addr, &fixed->vxnf_myether, ETHERADDRL);
 387         fixed->vxnf_vlanid = htons(vxnm->vxnm_vlanid);
 388 
 389         /*
 390          * Find a local-address IRE for the public address.
 391          */
 392         ipst = vxlnat_netstack->netstack_ip;
 393         ire = IN6_IS_ADDR_V4MAPPED(&fixed->vxnf_pubaddr) ?
 394             ire_ftable_lookup_simple_v4(fixed->vxnf_pubaddr._S6_un._S6_u32[3],
 395             0, ipst, NULL) :
 396             ire_ftable_lookup_simple_v6(&fixed->vxnf_pubaddr, 0, ipst, NULL);
 397 
 398         if (ire == NULL) {
 399                 /*
 400                  * Can't find a local IRE. For now, return.
 401                  * XXX KEBE ASKS --> Do we instead put a new entry in
 402                  * there?  Or do we count on zone/netstack configuration
 403                  * to make sure the requested external address is there?!
 404                  */
 405                 kmem_free(fixed, sizeof (*fixed));
 406                 rc = EADDRNOTAVAIL;
 407                 goto fail;
 408         }
 409 
 410         /*
 411          * Check the IRE for appropriate properties.
 412          *
 413          * This may change as we implement, but for now, we MUST have an ipif
 414          * (local address) for the public IP.  This can/should be on the
 415          * public NIC OR on a my-netstack-only etherstub to enable
 416          * instantiating redundant versions of vxlnat on other netstacks on
 417          * other {zones,machines} without triggering DAD.
 418          */
 419         if (ire->ire_type != IRE_LOCAL) {
 420                 ire_refrele(ire);
 421                 kmem_free(fixed, sizeof (*fixed));
 422                 rc = EADDRNOTAVAIL;     /* XXX KEBE ASKS different errno? */
 423                 goto fail;
 424         }
 425 
 426         /* Put the 1-1 mapping in place. */
 427         rw_enter(&vnet->vxnv_fixed_lock, RW_WRITER);
 428         if (avl_find(&vnet->vxnv_fixed_ips, fixed, &where) != NULL) {
 429                 /* Oh crap, we have an internal IP mapped already. */
 430                 ire_refrele(ire);
 431                 kmem_free(fixed, sizeof (*fixed));
 432                 rc = EEXIST;
 433         } else {
 434                 avl_insert(&vnet->vxnv_fixed_ips, fixed, where);
 435                 rc = 0;
 436                 /*
 437                  * CHEESY USE OF POINTERS WARNING: I'm going to use
 438                  * ire_dep_children for this IRE_LOCAL as a backpointer to
 439                  * this 'fixed'.  This'll allow rapid packet processing.
 440                  * Inspection seems to indicate that IRE_LOCAL ires NEVER use
 441                  * the ire_dep* pointers, so we'll use one (and independent of
 442                  * ip_stack_t's ips_ire_dep_lock as well).  If I'm wrong,
 443                  * fix it here and add a new pointer in ip.h for ire_t.
 444                  */
 445                 ire->ire_dep_sib_next = (ire_t *)fixed;
 446                 /* and then rewire the ire receive and send functions. */
 447                 if (ire->ire_ipversion == IPV4_VERSION) {
 448                         ire->ire_recvfn = vxlnat_fixed_ire_recv_v4;
 449                         ire->ire_sendfn = vxlnat_fixed_ire_send_v4;
 450                 } else {
 451                         ASSERT(ire->ire_ipversion == IPV6_VERSION);
 452                         ire->ire_recvfn = vxlnat_fixed_ire_recv_v6;
 453                         ire->ire_sendfn = vxlnat_fixed_ire_send_v6;
 454                 }
 455                 VXNF_REFHOLD(fixed);    /* ire holds us too... */
 456                 fixed->vxnf_ire = ire;
 457                 /*
 458                  * XXX KEBE SAYS CHEESY HACK:
 459                  */
 460                 if (!(ire->ire_ill->ill_flags & ILLF_ROUTER)) {
 461                         fixed->vxnf_clear_router = B_TRUE;
 462                         ire->ire_ill->ill_flags |= ILLF_ROUTER;
 463                 } else {
 464                         /* Just so we're clear... */
 465                         fixed->vxnf_clear_router = B_FALSE;
 466                 }
 467         }
 468         rw_exit(&vnet->vxnv_fixed_lock);
 469 
 470 fail:
 471         if (rc != 0)
 472                 vxlnat_public_rele(&vxnm->vxnm_public);
 473 
 474         return (rc);
 475 }
 476 
 477 static void
 478 vxlnat_rule_to_msg(vxn_msg_t *msg, vxlnat_rule_t *rule)
 479 {
 480         msg->vxnm_type = VXNM_RULE;
 481         msg->vxnm_vnetid = VXLAN_ID_NTOH(rule->vxnr_vnet->vxnv_vnetid);
 482         msg->vxnm_prefix = rule->vxnr_prefix;
 483         msg->vxnm_vlanid = ntohs(rule->vxnr_vlanid);
 484         bcopy(rule->vxnr_myether, msg->vxnm_ether_addr, ETHERADDRL);
 485         msg->vxnm_public = rule->vxnr_pubaddr;
 486         msg->vxnm_private = rule->vxnr_myaddr;
 487 }
 488 
 489 static void
 490 vxlnat_fixed_to_msg(vxn_msg_t *msg, vxlnat_fixed_t *fixed)
 491 {
 492         msg->vxnm_type = VXNM_FIXEDIP;
 493         msg->vxnm_vnetid = VXLAN_ID_NTOH(fixed->vxnf_vnet->vxnv_vnetid);
 494         msg->vxnm_prefix = 0;
 495         msg->vxnm_vlanid = ntohs(fixed->vxnf_vlanid);
 496         bcopy(fixed->vxnf_myether, msg->vxnm_ether_addr, ETHERADDRL);
 497         msg->vxnm_public = fixed->vxnf_pubaddr;
 498         msg->vxnm_private = fixed->vxnf_addr;
 499 }
 500 
 501 static int
 502 vxlnat_dump(void)
 503 {
 504         int rc = 0;
 505         size_t entries = 0;
 506         vxlnat_vnet_t *vnet;
 507         vxlnat_fixed_t *fixed;
 508         vxlnat_rule_t *rule;
 509         vxn_msg_t *current;
 510 
 511         ASSERT(MUTEX_HELD(&vxlnat_mutex));
 512 
 513         /*
 514          * XXX KEBE SAYS setup vxlnat_dump* above.
 515          * XXX KEBE SAYS If function fails for reasons that aren't "dump in
 516          * progress", make sure it keeps vxlnat_dump* stuff clean
 517          *
 518          * NOTE: Other commands are excluded at this point, but packet
 519          * processing is not.  OTOH, packet processing doesn't affect any
 520          * entities we dump (at this time).  We only dump things that can be
 521          * added with commands.  (So no remote VXLAN peers and no NAT flows.)
 522          */
 523 
 524         /* Lock down things. */
 525         rw_enter(&vxlnat_vnet_lock, RW_READER);
 526         if (avl_numnodes(&vxlnat_vnets) == 0)
 527                 goto bail;      /* Nothing to see here, move along. */
 528 
 529         /*
 530          * This is going to be inefficient, requiring two passes through each
 531          * vnet.  The first pass locks-down and counts.  Then we allocate
 532          * based on the count.  The second pass copies out and unlocks.
 533          */
 534         for (vnet = avl_first(&vxlnat_vnets); vnet != NULL;
 535             vnet = AVL_NEXT(&vxlnat_vnets, vnet)) {
 536                 rw_enter(&vnet->vxnv_fixed_lock, RW_READER);
 537                 entries += avl_numnodes(&vnet->vxnv_fixed_ips);
 538                 mutex_enter(&vnet->vxnv_rule_lock);
 539                 /* Let's hope this isn't a big number... */
 540                 for (rule = list_head(&vnet->vxnv_rules); rule != NULL;
 541                     rule = list_next(&vnet->vxnv_rules, rule)) {
 542                         entries++;
 543                 }
 544                 /* XXX KEBE ASKS -- other fields?!? */
 545         }
 546         if (entries == 0)
 547                 goto bail;      /* VNETs but with no rules AND no 1-1s?!? */
 548         /* Don't be too agressive in allocating this. */
 549         vxlnat_dumpbuf = kmem_alloc(entries * sizeof (vxn_msg_t),
 550             KM_NOSLEEP | KM_NORMALPRI);
 551         if (vxlnat_dumpbuf == NULL)
 552                 rc = ENOMEM;    /* We still have to unlock everything. */
 553         current = vxlnat_dumpbuf;
 554 
 555         /* Second pass. */
 556         for (vnet = avl_first(&vxlnat_vnets); vnet != NULL;
 557             vnet = AVL_NEXT(&vxlnat_vnets, vnet)) {
 558                 /* XXX KEBE ASKS -- other fields?!? */
 559                 for (rule = list_head(&vnet->vxnv_rules); rule != NULL;
 560                     rule = list_next(&vnet->vxnv_rules, rule)) {
 561                         if (rc == 0) {
 562                                 vxlnat_rule_to_msg(current, rule);
 563                                 current++;
 564                         }
 565                 }
 566                 mutex_exit(&vnet->vxnv_rule_lock);
 567                 for (fixed = avl_first(&vnet->vxnv_fixed_ips); fixed != NULL;
 568                     fixed = AVL_NEXT(&vnet->vxnv_fixed_ips, fixed)) {
 569                         if (rc == 0) {
 570                                 vxlnat_fixed_to_msg(current, fixed);
 571                                 current++;
 572                         }
 573                 }
 574                 rw_exit(&vnet->vxnv_fixed_lock);
 575         }
 576         vxlnat_dumpcount = vxlnat_initial = entries;
 577         vxlnat_dumpcurrent = 0;
 578         ASSERT3P((vxlnat_dumpbuf + entries), ==, current);
 579 
 580 bail:
 581         rw_exit(&vxlnat_vnet_lock);
 582         return (rc);
 583 }
 584 
 585 int
 586 vxlnat_command(vxn_msg_t *vxnm)
 587 {
 588         int rc;
 589 
 590         switch (vxnm->vxnm_type) {
 591         case VXNM_VXLAN_ADDR:
 592                 rc = vxlnat_vxlan_addr(&vxnm->vxnm_private);
 593                 break;
 594         case VXNM_RULE:
 595                 rc = vxlnat_nat_rule(vxnm);
 596                 break;
 597         case VXNM_FIXEDIP:
 598                 rc = vxlnat_fixed_ip(vxnm);
 599                 break;
 600         case VXNM_FLUSH:
 601                 rc = vxlnat_flush();
 602                 break;
 603         case VXNM_DUMP:
 604                 rc = vxlnat_dump();
 605                 break;
 606         default:
 607                 rc = EINVAL;
 608                 break;
 609         }
 610 
 611         return (rc);
 612 }
 613 
/*
 * One-time module state setup: the global vnet lock and tree, plus
 * public-address bookkeeping.  Caller must hold vxlnat_mutex.
 */
void
vxlnat_state_init(void)
{
        ASSERT(MUTEX_HELD(&vxlnat_mutex));
        rw_init(&vxlnat_vnet_lock, NULL, RW_DRIVER, NULL);
        avl_create(&vxlnat_vnets, vxlnat_vnetid_cmp, sizeof (vxlnat_vnet_t), 0);
        vxlnat_public_init();
        /* XXX KEBE SAYS -- more here. */
}
 623 
 624 void
 625 vxlnat_state_fini(void)
 626 {
 627         ASSERT(MUTEX_HELD(&vxlnat_mutex));
 628         (void) vxlnat_flush(); /* If we fail, we're in bigger trouble anyway. */
 629         vxlnat_public_init();
 630         avl_destroy(&vxlnat_vnets);
 631         rw_destroy(&vxlnat_vnet_lock);
 632 }
 633 
/*
 * Deliver the dump buffer to a reader, a vxn_msg_t at a time.
 *
 * The very first read after a VXNM_DUMP utters an 8-byte entry count,
 * then subsequent (or the same) read(2) calls stream whole messages
 * until the buffer is exhausted, at which point the dump state is
 * freed and reset.  A read with no dump pending returns 0 bytes.
 * Partial messages are never delivered: any uio残... residual smaller
 * than one vxn_msg_t is left unused (see the DTrace probe).
 */
int
vxlnat_read_dump(struct uio *uiop)
{
        int rc = 0;
        size_t dumpprogress = 0;

        mutex_enter(&vxlnat_mutex);

        /*
         * Initial-case ==> dumpbuf with none delivered yet.
         * Utter an 8-byte count.
         */
        if (vxlnat_initial != 0 && uiop->uio_resid >= sizeof (uint64_t)) {
                uint64_t total = vxlnat_dumpcount;

                ASSERT(vxlnat_dumpbuf != NULL && vxlnat_dumpcurrent == 0);
                rc = uiomove(&total, sizeof (uint64_t), UIO_READ, uiop);
                if (rc != 0)
                        goto bail;
                vxlnat_initial = 0;
        }

        /* XXX KEBE THINKS -- if no dump buffer, just return w/o data. */
        while (rc == 0 && vxlnat_dumpbuf != NULL &&
            uiop->uio_resid >= sizeof (vxn_msg_t)) {
                rc = uiomove(vxlnat_dumpbuf + vxlnat_dumpcurrent,
                    sizeof (vxn_msg_t), UIO_READ, uiop);
                if (rc != 0) {
                        /*
                         * XXX KEBE ASKS, destroy or preserve dumpstate?
                         * Fill in answer here.
                         */
                        break;
                }
                vxlnat_dumpcurrent++;
                dumpprogress++;
                /* Last message consumed ==> free and reset dump state. */
                if (vxlnat_dumpcurrent == vxlnat_dumpcount) {
                        kmem_free(vxlnat_dumpbuf,
                            vxlnat_dumpcount * sizeof (vxn_msg_t));
                        vxlnat_dumpbuf = NULL;
                        vxlnat_dumpcount = vxlnat_dumpcurrent = 0;
                }
        }

bail:
        /*
         * If there's room at the end, just ignore that space for now.  Handy
         * DTrace probe below notes amount of extra bytes..
         */
        DTRACE_PROBE1(vxlnat__read__extrabytes, ssize_t, uiop->uio_resid);
        /* Note progress of dump with DTrace probes. */
        DTRACE_PROBE3(vxlnat__read__dumpprogress, size_t, dumpprogress, size_t,
            vxlnat_dumpcurrent, size_t, vxlnat_dumpcount);

        mutex_exit(&vxlnat_mutex);
        return (rc);
}