/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2018 Joyent, Inc.
 */

/*
 * Writes (new rules) and reads (rule dump) go here, along with the rest
 * of the read/write plumbing.
 */

#include <sys/ddi.h>
#include <sys/dtrace.h>
#include <sys/debug.h>
#include <inet/vxlnat_impl.h>
#include <inet/ip_if.h>   /* XXX KEBE SAYS CHEESY HACK */

/*
 * These are all initialized to NULL or 0.
 *
 * If a VXNM_DUMP is requested, these get allocated/set.  vxlnat_read()
 * calls consume them; once the last bytes have been read, these are
 * freed and reset to NULL/0.  Cheesy, but this is a one-at-a-time
 * thing.  Protected by vxlnat_mutex.
 */
static vxn_msg_t *vxlnat_dumpbuf;
static size_t vxlnat_initial;   /* Non-zero until the 8-byte count is read. */
static size_t vxlnat_dumpcount;
static size_t vxlnat_dumpcurrent;

/*
 * Store per-vnet state in an AVL tree.  We could be handling 1000s or more...
 * Could split this into a hash table of AVL trees if need be.
 */
static krwlock_t vxlnat_vnet_lock;      /* Could be mutex if we use refhold. */
static avl_tree_t vxlnat_vnets;

static void vxlnat_rule_unlink(vxlnat_rule_t *);
static void vxlnat_fixed_unlink(vxlnat_fixed_t *);
/* In vxlnat_nat.c */
extern void vxlnat_remote_unlink(vxlnat_remote_t *);

/*
 * Comparison function for vnet AVL tree.
 */
static int
vxlnat_vnetid_cmp(const void *first, const void *second)
{
        uint32_t first_vnetid, second_vnetid;

        first_vnetid = ((vxlnat_vnet_t *)first)->vxnv_vnetid;
        second_vnetid = ((vxlnat_vnet_t *)second)->vxnv_vnetid;

        if (first_vnetid < second_vnetid)
                return (-1);
        if (first_vnetid > second_vnetid)
                return (1);
        return (0);
}

/*
 * NOTE:  Many structures start with the form:
 *
 *      struct foo {
 *              avl_node_t node;
 *              in6_addr_t address_which_is_search_key;
 *              ....
 *
 * We will use this same AVL comparison function for many of these structures.
 * It relies on the in6_addr_t key immediately following the avl_node_t, with
 * no padding in between.
 */
int
vxlnat_tree_plus_in6_cmp(const void *first, const void *second)
{
        in6_addr_t *firstaddr, *secondaddr;
        int ret;

        firstaddr = (in6_addr_t *)(((avl_node_t *)first) + 1);
        secondaddr = (in6_addr_t *)(((avl_node_t *)second) + 1);

        ret = memcmp(firstaddr, secondaddr, sizeof (in6_addr_t));
        if (ret > 0)
                return (1);
        if (ret < 0)
                return (-1);
        return (0);
}

/*
 * Comparison function for NAT flows.
 */
static int
vxlnat_flow_cmp_v4(const void *first, const void *second)
{
        vxlnat_flow_t *first_flow = (vxlnat_flow_t *)first;
        vxlnat_flow_t *second_flow = (vxlnat_flow_t *)second;
        uint64_t firstaddrs, secondaddrs, firstportproto, secondportproto;

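        /*
         * Pack each flow's IPv4-mapped source and destination (the low 32
         * bits of each in6_addr_t) into one 64-bit value, and its ports and
         * protocol into another, so ordering takes at most two comparisons.
         */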
        firstaddrs = first_flow->vxnfl_src._S6_un._S6_u32[3] |
            (((uint64_t)first_flow->vxnfl_dst._S6_un._S6_u32[3]) << 32ULL);
        secondaddrs = second_flow->vxnfl_src._S6_un._S6_u32[3] |
            (((uint64_t)second_flow->vxnfl_dst._S6_un._S6_u32[3]) << 32ULL);
        firstportproto = first_flow->vxnfl_ports |
            (((uint64_t)first_flow->vxnfl_protocol) << 32ULL);
        secondportproto = second_flow->vxnfl_ports |
            (((uint64_t)second_flow->vxnfl_protocol) << 32ULL);

        if (firstaddrs > secondaddrs)
                return (1);
        else if (firstaddrs < secondaddrs)
                return (-1);
        else if (firstportproto > secondportproto)
                return (1);
        else if (firstportproto < secondportproto)
                return (-1);

        return (0);
}

/*
 * Find-and-reference-hold a vnet.  If none present, create one.
 * "vnetid" MUST be in wire-order, with its unused byte cleared.
 */
vxlnat_vnet_t *
vxlnat_get_vnet(uint32_t vnetid, boolean_t create_on_miss)
{
        vxlnat_vnet_t *vnet, searcher;
        avl_index_t where;

        /* Cheesy, but we KNOW vxnv_vnetid is the only thing checked. */
        searcher.vxnv_vnetid = vnetid;

        rw_enter(&vxlnat_vnet_lock, create_on_miss ? RW_WRITER : RW_READER);
        vnet = (vxlnat_vnet_t *)avl_find(&vxlnat_vnets, &searcher, &where);
        if (vnet == NULL && create_on_miss) {
                vnet = kmem_zalloc(sizeof (*vnet), KM_SLEEP);
                /* KM_SLEEP means non-NULL guaranteed. */
                vnet->vxnv_refcount = 1; /* Internment reference. */
                vnet->vxnv_vnetid = vnetid;
                /* Initialize 1-1 mappings... */
                rw_init(&vnet->vxnv_fixed_lock, NULL, RW_DRIVER, NULL);
                avl_create(&vnet->vxnv_fixed_ips, vxlnat_tree_plus_in6_cmp,
                    sizeof (vxlnat_fixed_t), 0);
                /* Initialize NAT rules.  (NAT mutex is zeroed-out.) */
                list_create(&vnet->vxnv_rules, sizeof (vxlnat_rule_t), 0);

                /* Initialize NAT flows... */
                rw_init(&vnet->vxnv_flowv4_lock, NULL, RW_DRIVER, NULL);
                avl_create(&vnet->vxnv_flows_v4, vxlnat_flow_cmp_v4,
                    sizeof (vxlnat_flow_t), 0);

                /*
                 * Initialize remote VXLAN destination cache.
                 * (remotes mutex is zeroed-out.)
                 */
                avl_create(&vnet->vxnv_remotes, vxlnat_tree_plus_in6_cmp,
                    sizeof (vxlnat_remote_t), 0);

                avl_insert(&vxlnat_vnets, vnet, where);
        }
        if (vnet != NULL)
                VXNV_REFHOLD(vnet);     /* Caller's reference. */
        rw_exit(&vxlnat_vnet_lock);

        return (vnet);
}

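/*
 * Free a vnet whose last reference has been dropped.
 */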
void
vxlnat_vnet_free(vxlnat_vnet_t *vnet)
{
        /* XXX KEBE SAYS FILL ME IN */
        ASSERT0(vnet->vxnv_refcount);
        /* XXX KEBE ASKS -- assert detachment? */

        kmem_free(vnet, sizeof (*vnet));
}

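/*
 * Remove a vnet from the global tree and tear down its rules, 1-1 mappings,
 * and remote cache.  Caller must hold vxlnat_vnet_lock as writer.
 */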
static void
vxlnat_vnet_unlink_locked(vxlnat_vnet_t *vnet)
{
        ASSERT3U(vnet->vxnv_refcount, >=, 1);

        ASSERT(RW_WRITE_HELD(&vxlnat_vnet_lock));
        avl_remove(&vxlnat_vnets, vnet);
        /* XXX KEBE ASKS --> Mark as condemned? */

        /* Unlink all NAT rules */
        mutex_enter(&vnet->vxnv_rule_lock);
        while (!list_is_empty(&vnet->vxnv_rules)) {
                /* Will decrement vnet's refcount too. */
                vxlnat_rule_unlink(
                    (vxlnat_rule_t *)list_head(&vnet->vxnv_rules));
        }
        mutex_exit(&vnet->vxnv_rule_lock);
        /* XXX KEBE SAYS unlink all 1-1 mappings */
        rw_enter(&vnet->vxnv_fixed_lock, RW_WRITER);
        while (!avl_is_empty(&vnet->vxnv_fixed_ips)) {
                /* Will decrement vnet's refcount too. */
                vxlnat_fixed_unlink(
                    (vxlnat_fixed_t *)avl_first(&vnet->vxnv_fixed_ips));
        }
        rw_exit(&vnet->vxnv_fixed_lock);

        /* Unlink all remotes */
        mutex_enter(&vnet->vxnv_remote_lock);
        while (!avl_is_empty(&vnet->vxnv_remotes)) {
                /* Will decrement vnet's refcount too. */
                vxlnat_remote_unlink(
                    (vxlnat_remote_t *)avl_first(&vnet->vxnv_remotes));
        }
        mutex_exit(&vnet->vxnv_remote_lock);

        /* XXX KEBE SAYS unlink all NAT flows */

        VXNV_REFRELE(vnet);     /* Internment reference. */
}

/*
 * Assume the vnet is refheld by the caller; we explicitly drop two
 * references here (the caller's and the internment one), unlinking any
 * rules along the way.
 */
void
vxlnat_vnet_unlink(vxlnat_vnet_t *vnet)
{
        ASSERT3U(vnet->vxnv_refcount, >=, 2);
        rw_enter(&vxlnat_vnet_lock, RW_WRITER);
        vxlnat_vnet_unlink_locked(vnet);
        rw_exit(&vxlnat_vnet_lock);
        /*
         * At this point, we've decremented the refcount by one with the
         * unlink. Drop the caller's now.
         */
        VXNV_REFRELE(vnet);
}

/*
 * Add a (vnetid+prefix => external) rule.
 */
static int
vxlnat_nat_rule(vxn_msg_t *vxnm)
{
        vxlnat_vnet_t *vnet;
        vxlnat_rule_t *rule;
        uint32_t vnetid;

        ASSERT(MUTEX_HELD(&vxlnat_mutex));

        /* Reserve the requested public IP for shared use. */
        if (!vxlnat_public_hold(&vxnm->vxnm_public, B_FALSE))
                return (EADDRNOTAVAIL);

        vnetid = VXLAN_ID_HTON(vxnm->vxnm_vnetid);
        vnet = vxlnat_get_vnet(vnetid, B_TRUE);
        if (vnet == NULL) {
                /* RARE case of failed allocation or other disaster. */
                vxlnat_public_rele(&vxnm->vxnm_public);
                return (ENOMEM);
        }

        /* Now we have a reference-held vnet, create a rule for it. */
        rule = kmem_alloc(sizeof (*rule), KM_SLEEP);
        /* KM_SLEEP means non-NULL guaranteed. */
        rule->vxnr_vnet = vnet;      /* vnet already refheld, remember? */
        /* XXX KEBE ASKS, check the vxnm more carefully? */
        rule->vxnr_myaddr = vxnm->vxnm_private;
        rule->vxnr_pubaddr = vxnm->vxnm_public;
        rule->vxnr_prefix = vxnm->vxnm_prefix;
        /* For easier packet matching, keep vlanid in network order. */
        rule->vxnr_vlanid = htons(vxnm->vxnm_vlanid);
        bcopy(vxnm->vxnm_ether_addr, rule->vxnr_myether, ETHERADDRL);
        rule->vxnr_refcount = 1;     /* Internment reference. */
        list_link_init(&rule->vxnr_link);

        /* Put rule into vnet. */
        mutex_enter(&vnet->vxnv_rule_lock);
        /* XXX KEBE ASKS --> Check for collisions?!? */
        list_insert_tail(&vnet->vxnv_rules, rule);
        mutex_exit(&vnet->vxnv_rule_lock);

        return (0);
}

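/*
 * Free a condemned rule once its last reference is gone, releasing its hold
 * on the public address.
 */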
void
vxlnat_rule_free(vxlnat_rule_t *rule)
{
        ASSERT3P(rule->vxnr_vnet, ==, NULL);
        ASSERT3P(rule->vxnr_link.list_next, ==, NULL);
        ASSERT3P(rule->vxnr_link.list_prev, ==, NULL);
        ASSERT0(rule->vxnr_refcount);
        vxlnat_public_rele(&rule->vxnr_pubaddr);
        kmem_free(rule, sizeof (*rule));
}

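/*
 * Remove a rule from its vnet's list, dropping the vnet's reference and the
 * rule's internment reference.  Caller must hold the vnet's rule lock.
 */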
static void
vxlnat_rule_unlink(vxlnat_rule_t *rule)
{
        vxlnat_vnet_t *vnet = rule->vxnr_vnet;

        ASSERT3P(vnet, !=, NULL);
        ASSERT(MUTEX_HELD(&vnet->vxnv_rule_lock));

        list_remove(&vnet->vxnv_rules, rule);
        VXNV_REFRELE(vnet);
        rule->vxnr_vnet = NULL;      /* This condemns this rule. */
        VXNR_REFRELE(rule);
}

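/*
 * Tear down everything:  the upstream socket, every vnet, and any pending
 * dump state.
 */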
static int
vxlnat_flush(void)
{
        vxlnat_closesock();
        /* XXX KEBE SAYS DO OTHER STATE FLUSHING TOO. */

        /* Flush out vnets. */
        rw_enter(&vxlnat_vnet_lock, RW_WRITER);
        while (!avl_is_empty(&vxlnat_vnets))
                vxlnat_vnet_unlink_locked(avl_first(&vxlnat_vnets));
        rw_exit(&vxlnat_vnet_lock);
        if (vxlnat_dumpbuf != NULL) {
                kmem_free(vxlnat_dumpbuf,
                    vxlnat_dumpcount * sizeof (vxn_msg_t));
                vxlnat_dumpbuf = NULL;
                vxlnat_initial = vxlnat_dumpcount = vxlnat_dumpcurrent = 0;
        }
        return (0);
}

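/*
 * Free a 1-1 mapping whose last reference has been dropped, releasing its
 * hold on the public address.
 */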
void
vxlnat_fixed_free(vxlnat_fixed_t *fixed)
{
        ASSERT0(fixed->vxnf_refcount);

        vxlnat_public_rele(&fixed->vxnf_pubaddr);
        kmem_free(fixed, sizeof (*fixed));
}

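/*
 * Detach a 1-1 mapping from its vnet, restoring its IRE along the way.
 * Caller must hold the vnet's fixed-IP lock as writer.
 */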
static void
vxlnat_fixed_unlink(vxlnat_fixed_t *fixed)
{
        vxlnat_vnet_t *vnet = fixed->vxnf_vnet;
        ire_t *ire = fixed->vxnf_ire;

        ASSERT3P(vnet, !=, NULL);
        ASSERT(RW_WRITE_HELD(&vnet->vxnv_fixed_lock));

        /* Rid ourselves of the IRE now. */
        if (ire != NULL) {
                ASSERT(ire->ire_type == IRE_LOCAL);
                ASSERT3P((void *)ire->ire_dep_sib_next, ==, (void *)fixed);

                /* XXX KEBE SAYS CHEESY HACK. */
                if (fixed->vxnf_clear_router)
                        ire->ire_ill->ill_flags &= ~ILLF_ROUTER;

                ire->ire_dep_sib_next = NULL;
                VXNF_REFRELE(fixed);    /* ire's hold on us. */
                /* Rewire IRE back to normal, undoing vxlnat_fixed_ip(). */
                ire->ire_recvfn = (ire->ire_ipversion == IPV4_VERSION) ?
                    ire_recv_local_v4 : ire_recv_local_v6;
                ire->ire_sendfn = (ire->ire_ipversion == IPV4_VERSION) ?
                    ire_send_local_v4 : ire_send_local_v6;
                ire_refrele(ire);
        }

        /* And the remote, if it's there. */
        if (fixed->vxnf_remote != NULL) {
                VXNREM_REFRELE(fixed->vxnf_remote);
                fixed->vxnf_remote = NULL;
        }

        avl_remove(&vnet->vxnv_fixed_ips, fixed);
        fixed->vxnf_vnet = NULL; /* This condemns this 1-1 mapping. */
        VXNV_REFRELE(vnet);
        VXNF_REFRELE(fixed);
}

/*
 * Add a 1-1 (vnetid+IP <==> external) rule.
 */
static int
vxlnat_fixed_ip(vxn_msg_t *vxnm)
{
        vxlnat_vnet_t *vnet;
        vxlnat_fixed_t *fixed;
        uint32_t vnetid;
        avl_index_t where;
        int rc;
        ire_t *ire;
        ip_stack_t *ipst;

        /* XXX KEBE SAYS FILL ME IN. */
        ASSERT(MUTEX_HELD(&vxlnat_mutex));

        /* Reserve the requested public IP for exclusive use. */
        if (!vxlnat_public_hold(&vxnm->vxnm_public, B_TRUE))
                return (EADDRNOTAVAIL);

        vnetid = VXLAN_ID_HTON(vxnm->vxnm_vnetid);
        vnet = vxlnat_get_vnet(vnetid, B_TRUE);
        if (vnet == NULL) {
                /* RARE case of failed allocation or other disaster. */
                rc = ENOMEM;
                goto fail;
        }

        fixed = kmem_zalloc(sizeof (*fixed), KM_SLEEP);
        /* KM_SLEEP means non-NULL guaranteed. */
        fixed->vxnf_vnet = vnet; /* vnet already refheld, remember? */
        /* XXX KEBE ASKS, check the vxnm more carefully? */
        fixed->vxnf_addr = vxnm->vxnm_private;
        fixed->vxnf_pubaddr = vxnm->vxnm_public;
        fixed->vxnf_refcount = 1;    /* Internment reference. */
        bcopy(&vxnm->vxnm_ether_addr, &fixed->vxnf_myether, ETHERADDRL);
        fixed->vxnf_vlanid = htons(vxnm->vxnm_vlanid);

        /*
         * Find a local-address IRE for the public address.
         */
        ipst = vxlnat_netstack->netstack_ip;
        ire = IN6_IS_ADDR_V4MAPPED(&fixed->vxnf_pubaddr) ?
            ire_ftable_lookup_simple_v4(fixed->vxnf_pubaddr._S6_un._S6_u32[3],
            0, ipst, NULL) :
            ire_ftable_lookup_simple_v6(&fixed->vxnf_pubaddr, 0, ipst, NULL);

        if (ire == NULL) {
                /*
                 * Can't find a local IRE. For now, return.
                 * XXX KEBE ASKS --> Do we instead put a new entry in
                 * there?  Or do we count on zone/netstack configuration
                 * to make sure the requested external address is there?!
                 */
                kmem_free(fixed, sizeof (*fixed));
                rc = EADDRNOTAVAIL;
                goto fail;
        }

        /*
         * Check the IRE for appropriate properties.
         *
         * This may change as we implement, but for now, we MUST have an ipif
         * (local address) for the public IP.  This can/should be on the
         * public NIC OR on a my-netstack-only etherstub to enable
         * instantiating redundant versions of vxlnat on other netstacks on
         * other {zones,machines} without triggering DAD.
         */
        if (ire->ire_type != IRE_LOCAL) {
                ire_refrele(ire);
                kmem_free(fixed, sizeof (*fixed));
                rc = EADDRNOTAVAIL;     /* XXX KEBE ASKS different errno? */
                goto fail;
        }

        /* Put the 1-1 mapping in place. */
        rw_enter(&vnet->vxnv_fixed_lock, RW_WRITER);
        if (avl_find(&vnet->vxnv_fixed_ips, fixed, &where) != NULL) {
                /* Oh crap, we have an internal IP mapped already. */
                ire_refrele(ire);
                kmem_free(fixed, sizeof (*fixed));
                rc = EEXIST;
        } else {
                avl_insert(&vnet->vxnv_fixed_ips, fixed, where);
                rc = 0;
                /*
                 * ODD USE OF POINTERS WARNING: I'm going to use
                 * ire_dep_sib_next for this IRE_LOCAL as a backpointer to
                 * this 'fixed'.  This'll allow rapid packet processing.
                 * Inspection seems to indicate that IRE_LOCAL ires NEVER use
                 * the ire_dep* pointers, so we'll use one (and independent of
                 * ip_stack_t's ips_ire_dep_lock as well).  If I'm wrong,
                 * fix it here and add a new pointer in ip.h for ire_t.
                 */
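                /*
                 * (Presumably the rewired receive/send functions below
                 * recover this mapping via ire_dep_sib_next per packet.)
                 */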
                ire->ire_dep_sib_next = (ire_t *)fixed;
                VXNF_REFHOLD(fixed);    /* ire holds us too... */
                fixed->vxnf_ire = ire;
                /* and then rewire the ire receive and send functions. */
                if (ire->ire_ipversion == IPV4_VERSION) {
                        ire->ire_recvfn = vxlnat_fixed_ire_recv_v4;
                        ire->ire_sendfn = vxlnat_fixed_ire_send_v4;
                } else {
                        ASSERT(ire->ire_ipversion == IPV6_VERSION);
                        ire->ire_recvfn = vxlnat_fixed_ire_recv_v6;
                        ire->ire_sendfn = vxlnat_fixed_ire_send_v6;
                }
#if 1   /* Cheesy hack */
                /*
                 * XXX KEBE SAYS CHEESY HACK:
                 */
                if (!(ire->ire_ill->ill_flags & ILLF_ROUTER)) {
                        fixed->vxnf_clear_router = B_TRUE;
                        ire->ire_ill->ill_flags |= ILLF_ROUTER;
                } else {
                        /* Just so we're clear... */
                        fixed->vxnf_clear_router = B_FALSE;
                }
#endif  /* Cheesy hack */
        }
        rw_exit(&vnet->vxnv_fixed_lock);

fail:
        if (rc != 0) {
                /* The mapping never took; drop the caller's vnet hold too. */
                if (vnet != NULL)
                        VXNV_REFRELE(vnet);
                vxlnat_public_rele(&vxnm->vxnm_public);
        }

        return (rc);
}

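/*
 * Serialize a NAT rule into a vxn_msg_t for the dump buffer.
 */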
static void
vxlnat_rule_to_msg(vxn_msg_t *msg, vxlnat_rule_t *rule)
{
        msg->vxnm_type = VXNM_RULE;
        msg->vxnm_vnetid = VXLAN_ID_NTOH(rule->vxnr_vnet->vxnv_vnetid);
        msg->vxnm_prefix = rule->vxnr_prefix;
        msg->vxnm_vlanid = ntohs(rule->vxnr_vlanid);
        bcopy(rule->vxnr_myether, msg->vxnm_ether_addr, ETHERADDRL);
        msg->vxnm_public = rule->vxnr_pubaddr;
        msg->vxnm_private = rule->vxnr_myaddr;
}

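/*
 * Serialize a 1-1 mapping into a vxn_msg_t for the dump buffer.
 */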
static void
vxlnat_fixed_to_msg(vxn_msg_t *msg, vxlnat_fixed_t *fixed)
{
        msg->vxnm_type = VXNM_FIXEDIP;
        msg->vxnm_vnetid = VXLAN_ID_NTOH(fixed->vxnf_vnet->vxnv_vnetid);
        msg->vxnm_prefix = 0;
        msg->vxnm_vlanid = ntohs(fixed->vxnf_vlanid);
        bcopy(fixed->vxnf_myether, msg->vxnm_ether_addr, ETHERADDRL);
        msg->vxnm_public = fixed->vxnf_pubaddr;
        msg->vxnm_private = fixed->vxnf_addr;
}

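/*
 * Snapshot every rule and 1-1 mapping into vxlnat_dumpbuf for later
 * consumption by vxlnat_read_dump().
 */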
static int
vxlnat_dump(void)
{
        int rc = 0;
        size_t entries = 0;
        vxlnat_vnet_t *vnet;
        vxlnat_fixed_t *fixed;
        vxlnat_rule_t *rule;
        vxn_msg_t *current;

        ASSERT(MUTEX_HELD(&vxlnat_mutex));

        /*
         * XXX KEBE SAYS setup vxlnat_dump* above.
         * XXX KEBE SAYS If function fails for reasons that aren't "dump in
         * progress", make sure it keeps vxlnat_dump* stuff clean
         *
         * NOTE: Other commands are excluded at this point, but packet
         * processing is not.  OTOH, packet processing doesn't affect any
         * entities we dump (at this time).  We only dump things that can be
         * added with commands.  (So no remote VXLAN peers and no NAT flows.)
         */

        /* Lock down things. */
        rw_enter(&vxlnat_vnet_lock, RW_READER);
        if (avl_numnodes(&vxlnat_vnets) == 0)
                goto bail;      /* Nothing to see here, move along. */

        /*
         * This is going to be inefficient, requiring two passes through each
         * vnet.  The first pass locks down and counts.  Then we allocate
         * based on the count.  The second pass copies out and unlocks.
         */
        for (vnet = avl_first(&vxlnat_vnets); vnet != NULL;
            vnet = AVL_NEXT(&vxlnat_vnets, vnet)) {
                rw_enter(&vnet->vxnv_fixed_lock, RW_READER);
                entries += avl_numnodes(&vnet->vxnv_fixed_ips);
                mutex_enter(&vnet->vxnv_rule_lock);
                /* Let's hope this isn't a big number... */
                for (rule = list_head(&vnet->vxnv_rules); rule != NULL;
                    rule = list_next(&vnet->vxnv_rules, rule)) {
                        entries++;
                }
                /* XXX KEBE ASKS -- other fields?!? */
        }
        if (entries == 0) {
                /*
                 * VNETs but with no rules AND no 1-1s?!?  Drop the per-vnet
                 * locks taken in the first pass before bailing.
                 */
                for (vnet = avl_first(&vxlnat_vnets); vnet != NULL;
                    vnet = AVL_NEXT(&vxlnat_vnets, vnet)) {
                        mutex_exit(&vnet->vxnv_rule_lock);
                        rw_exit(&vnet->vxnv_fixed_lock);
                }
                goto bail;
        }
        /* Don't be too aggressive in allocating this. */
        vxlnat_dumpbuf = kmem_alloc(entries * sizeof (vxn_msg_t),
            KM_NOSLEEP | KM_NORMALPRI);
        if (vxlnat_dumpbuf == NULL)
                rc = ENOMEM;    /* We still have to unlock everything. */
        current = vxlnat_dumpbuf;

        /* Second pass. */
        for (vnet = avl_first(&vxlnat_vnets); vnet != NULL;
            vnet = AVL_NEXT(&vxlnat_vnets, vnet)) {
                /* XXX KEBE ASKS -- other fields?!? */
                for (rule = list_head(&vnet->vxnv_rules); rule != NULL;
                    rule = list_next(&vnet->vxnv_rules, rule)) {
                        if (rc == 0) {
                                vxlnat_rule_to_msg(current, rule);
                                current++;
                        }
                }
                mutex_exit(&vnet->vxnv_rule_lock);
                for (fixed = avl_first(&vnet->vxnv_fixed_ips); fixed != NULL;
                    fixed = AVL_NEXT(&vnet->vxnv_fixed_ips, fixed)) {
                        if (rc == 0) {
                                vxlnat_fixed_to_msg(current, fixed);
                                current++;
                        }
                }
                rw_exit(&vnet->vxnv_fixed_lock);
        }
        if (rc == 0) {
                vxlnat_dumpcount = vxlnat_initial = entries;
                vxlnat_dumpcurrent = 0;
                ASSERT3P((vxlnat_dumpbuf + entries), ==, current);
        }

bail:
        rw_exit(&vxlnat_vnet_lock);
        return (rc);
}

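/*
 * Dispatch a single control-plane message from userland.
 */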
int
vxlnat_command(vxn_msg_t *vxnm)
{
        int rc;

        switch (vxnm->vxnm_type) {
        case VXNM_VXLAN_ADDR:
                rc = vxlnat_vxlan_addr(&vxnm->vxnm_private);
                break;
        case VXNM_RULE:
                rc = vxlnat_nat_rule(vxnm);
                break;
        case VXNM_FIXEDIP:
                rc = vxlnat_fixed_ip(vxnm);
                break;
        case VXNM_FLUSH:
                rc = vxlnat_flush();
                break;
        case VXNM_DUMP:
                rc = vxlnat_dump();
                break;
        default:
                rc = EINVAL;
                break;
        }

        return (rc);
}

void
vxlnat_state_init(void)
{
        ASSERT(MUTEX_HELD(&vxlnat_mutex));
        rw_init(&vxlnat_vnet_lock, NULL, RW_DRIVER, NULL);
        avl_create(&vxlnat_vnets, vxlnat_vnetid_cmp, sizeof (vxlnat_vnet_t), 0);
        vxlnat_public_init();
        /* XXX KEBE SAYS -- more here. */
}

void
vxlnat_state_fini(void)
{
        ASSERT(MUTEX_HELD(&vxlnat_mutex));
        (void) vxlnat_flush(); /* If we fail, we're in bigger trouble anyway. */
        /* Assumes a vxlnat_public_fini() counterpart to vxlnat_public_init(). */
        vxlnat_public_fini();
        avl_destroy(&vxlnat_vnets);
        rw_destroy(&vxlnat_vnet_lock);
}

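/*
 * Deliver dump data to a reader:  an 8-byte total count first, then one
 * vxn_msg_t at a time until the buffer is exhausted and freed.
 */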
int
vxlnat_read_dump(struct uio *uiop)
{
        int rc = 0;
        size_t dumpprogress = 0;

        mutex_enter(&vxlnat_mutex);

        /*
         * Initial-case ==> dumpbuf with none delivered yet.
         * Utter an 8-byte count.
         */
        if (vxlnat_initial != 0 && uiop->uio_resid >= sizeof (uint64_t)) {
                uint64_t total = vxlnat_dumpcount;

                ASSERT(vxlnat_dumpbuf != NULL && vxlnat_dumpcurrent == 0);
                rc = uiomove(&total, sizeof (uint64_t), UIO_READ, uiop);
                if (rc != 0)
                        goto bail;
                vxlnat_initial = 0;
        }

        /* XXX KEBE THINKS -- if no dump buffer, just return w/o data. */
        while (rc == 0 && vxlnat_dumpbuf != NULL &&
            uiop->uio_resid >= sizeof (vxn_msg_t)) {
                rc = uiomove(vxlnat_dumpbuf + vxlnat_dumpcurrent,
                    sizeof (vxn_msg_t), UIO_READ, uiop);
                if (rc != 0) {
                        /*
                         * XXX KEBE ASKS, destroy or preserve dumpstate?
                         * Fill in answer here.
                         */
                        break;
                }
                vxlnat_dumpcurrent++;
                dumpprogress++;
                if (vxlnat_dumpcurrent == vxlnat_dumpcount) {
                        kmem_free(vxlnat_dumpbuf,
                            vxlnat_dumpcount * sizeof (vxn_msg_t));
                        vxlnat_dumpbuf = NULL;
                        vxlnat_dumpcount = vxlnat_dumpcurrent = 0;
                }
        }

bail:
        /*
         * If there's room at the end, just ignore that space for now.  The
         * handy DTrace probe below notes the number of leftover bytes.
         */
        DTRACE_PROBE1(vxlnat__read__extrabytes, ssize_t, uiop->uio_resid);
        /* Note progress of dump with DTrace probes. */
        DTRACE_PROBE3(vxlnat__read__dumpprogress, size_t, dumpprogress, size_t,
            vxlnat_dumpcurrent, size_t, vxlnat_dumpcount);

        mutex_exit(&vxlnat_mutex);
        return (rc);
}