Print this page
WIP to help bringup NAT flows

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/inet/vxlnat/vxlnat_nat.c
          +++ new/usr/src/uts/common/inet/vxlnat/vxlnat_nat.c
↓ open down ↓ 31 lines elided ↑ open up ↑
  32   32  #include <sys/strsun.h>
  33   33  #include <sys/sysmacros.h>
  34   34  #include <sys/debug.h>
  35   35  #include <sys/dtrace.h>
  36   36  #include <sys/errno.h>
  37   37  #include <sys/tihdr.h>
  38   38  #include <netinet/in.h>
  39   39  #include <netinet/udp.h>
  40   40  #include <inet/ip.h>
  41   41  #include <inet/ip6.h>
       42 +#include <inet/tcp_impl.h>
  42   43  #include <inet/udp_impl.h>
  43   44  #include <inet/tcp.h>
  44   45  
  45   46  #include <inet/vxlnat_impl.h>
  46   47  
  47   48  static boolean_t vxlnat_vxlan_input(ksocket_t, mblk_t *, size_t, int, void *);
  48   49  static mblk_t *vxlnat_fixed_fixv4(mblk_t *mp, vxlnat_fixed_t *fixed,
  49   50      boolean_t to_private);
  50   51  
  51   52  /*
↓ open down ↓ 290 lines elided ↑ open up ↑
 342  343  
 343  344          /* We're done with the remote entry now. */
 344  345          VXNREM_REFRELE(remote);
 345  346  
 346  347          /* Advance rptr to the inner IP header and proceed. */
 347  348          mp->b_rptr = (uint8_t *)ipha;
 348  349          return (mp);
 349  350  }
 350  351  
 351  352  /*
      353 + * Extract the transport-level lookup key ("ports", "protocol") for a NAT flow
      354 + * and set *nexthdr_ptr to the transport header.  Frees mp and returns B_FALSE
      355 + * on failure (IPv6, short header, unsupported ICMP type); B_TRUE otherwise.
      356 + */
      357 +static boolean_t
      358 +vxlnat_grab_transport(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, uint32_t *ports,
      359 +    uint8_t *protocol, uint8_t **nexthdr_ptr)
      360 +{
      361 +        uint8_t *nexthdr;
      362 +
      363 +        /* Punt on IPv6 for now: mp is freed, *nexthdr_ptr is not set. */
      364 +        if (ip6h != NULL) {
      365 +                freemsg(mp);
      366 +                return (B_FALSE);
      367 +        }
      368 +
      369 +        ASSERT(ipha != NULL);
      370 +        *protocol = ipha->ipha_protocol;
      371 +        nexthdr = ((uint8_t *)ipha + IPH_HDR_LENGTH(ipha));
      372 +        *nexthdr_ptr = nexthdr; /* Get this out of the way now. */
      373 +        if (nexthdr > mp->b_wptr) {
      374 +                DTRACE_PROBE1(vxlnat__in__drop__trnexthdr, mblk_t *, mp);
      375 +                freemsg(mp);
      376 +                return (B_FALSE);
      377 +        }
      378 +        switch (*protocol) {
      379 +        case IPPROTO_TCP: {
      380 +                tcpha_t *tcph = (tcpha_t *)nexthdr;
      381 +
      382 +                if (nexthdr + sizeof (*tcph) > mp->b_wptr) {
      383 +                        DTRACE_PROBE1(vxlnat__in__drop__tcpnexthdr, mblk_t *,
      384 +                            mp);
      385 +                        freemsg(mp);
      386 +                        return (B_FALSE);
      387 +                }
      388 +                *ports = *((uint32_t *)tcph);
      389 +                /* XXX KEBE SAYS - grab other metadata here NOW? */
      390 +                break;
      391 +        }
      392 +        case IPPROTO_UDP: {
      393 +                udpha_t *udph = (udpha_t *)nexthdr;
      394 +
      395 +                if (nexthdr + sizeof (*udph) > mp->b_wptr) {
      396 +                        DTRACE_PROBE1(vxlnat__in__drop__udpnexthdr, mblk_t *,
      397 +                            mp);
      398 +                        freemsg(mp);
      399 +                        return (B_FALSE);
      400 +                }
      401 +                *ports = *((uint32_t *)udph);
      402 +                /*
      403 +                 * XXX KEBE SAYS - not as much as TCP, but grab other metadata
      404 +                 * here NOW?
      405 +                 */
      406 +                break;
      407 +        }
      408 +        case IPPROTO_ICMP: {
      409 +                icmph_t *icmph = (icmph_t *)nexthdr;
      410 +
      411 +                if (nexthdr + sizeof (*icmph) > mp->b_wptr) {
      412 +                        DTRACE_PROBE1(vxlnat__in__drop__icmpnexthdr, mblk_t *,
      413 +                            mp);
      414 +                        freemsg(mp);
      415 +                        return (B_FALSE);
      416 +                }
      417 +                /* XXX KEBE SAYS sort out ICMP header... */
      418 +                switch (icmph->icmph_type) {
      419 +                case ICMP_ECHO_REQUEST:
      420 +                case ICMP_TIME_STAMP_REQUEST:
      421 +                case ICMP_TIME_EXCEEDED:
      422 +                case ICMP_INFO_REQUEST:
      423 +                case ICMP_ADDRESS_MASK_REPLY:
      424 +                        /* All ones we can sorta cope with... */
      425 +                        break;
      426 +                default:
      427 +                        DTRACE_PROBE2(vxlnat__in__drop__icmptype, int,
      428 +                            icmph->icmph_type, mblk_t *, mp);
      429 +                        freemsg(mp);
      430 +                        return (B_FALSE);
      431 +                }
      432 +                /* NOTE: as of now, will switch position depending on endian. */
      433 +                *ports = icmph->icmph_echo_ident;
      434 +                break;
      435 +        }
      436 +        default:
      437 +                *ports = 0;
      438 +                break;
      439 +        }
      440 +
      441 +        return (B_TRUE);
      442 +}
      443 +
      444 +/*
      445 + * Evaluate a packet against the state of its NAT flow.  Currently a stub that
      446 + * always returns B_FALSE.  This function does NOT alter "mp".
      447 + */
      448 +static boolean_t
      449 +vxlnat_verify_natstate(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
      450 +    vxlnat_flow_t *flow, uint8_t *nexthdr)
      451 +{
      452 +        /* XXX KEBE SAYS FILL ME IN! */
      453 +        return (B_FALSE);
      454 +}
      455 +
      456 +/*
 352  457   * Inspect the packet and find ports & protos (or ICMP types & codes)
 353  458   * and see if we have an established NAT flow.
 354  459   *
 355  460   * XXX KEBE WONDERS if the transmission path will more closely resemble
 356  461   * vxlnat_one_vxlan_fixed() because of ipha_ident issues or not...
 357  462   *
 358  463   * B_TRUE means the packet was handled, and we shouldn't continue processing
 359  464   * (even if "was handled" means droppage).
 360  465   */
 361  466  static boolean_t
 362  467  vxlnat_one_vxlan_flow(vxlnat_vnet_t *vnet, mblk_t *mp, ipha_t *ipha,
 363  468      ip6_t *ip6h)
 364  469  {
 365      -        /* XXX KEBE SAYS FILL ME IN. */
 366      -        /* For now... */
      470 +        vxlnat_flow_t *flow, searcher;
      471 +        uint8_t *nexthdr;
      472 +
      473 +        /*
      474 +         * XXX KEBE WONDERS, should we return vxlnat_flow_t instead if we
      475 +         * miss?  That way, we only need to find the ports/protocol ONCE.
      476 +         */
      477 +
      478 +        if (ip6h != NULL) {
      479 +                /* Eventually, grab addresses for "searcher". */
      480 +                return (B_FALSE);       /* Bail on IPv6 for now... */
      481 +        } else {
      482 +                ASSERT(ipha != NULL);
      483 +                searcher.vxnfl_isv4 = B_TRUE;   /* Required? */
      484 +                IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
      485 +                    &searcher.vxnfl_src);
      486 +                IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_dst),
      487 +                    &searcher.vxnfl_dst);
      488 +        }
      489 +
      490 +        if (!vxlnat_grab_transport(mp, ipha, ip6h, &searcher.vxnfl_ports,
      491 +            &searcher.vxnfl_protocol, &nexthdr)) {
      492 +                /* Callee already freed mp: don't free again (arg is stale). */
      493 +                DTRACE_PROBE1(vxlnat__in__flowgrab, mblk_t *, mp);
      494 +                return (B_TRUE);
      495 +        }
      496 +
      497 +
      498 +        /*
      499 +         * XXX KEBE SAYS Eventually put the rw&find in an IPv4-only block,
      500 +         * because IPv6 (if we NAT it like IPv4) will have its own table/tree.
      501 +         */
      502 +        rw_enter(&vnet->vxnv_flowv4_lock, RW_READER);
      503 +        flow = avl_find(&vnet->vxnv_flows_v4, &searcher, NULL);
      504 +        if (flow != NULL)
      505 +                VXNFL_REFHOLD(flow);
      506 +        rw_exit(&vnet->vxnv_flowv4_lock);
      507 +
      508 +        if (flow == NULL)
      509 +                return (B_FALSE);       /* Let caller handle things. */
      510 +
      511 +        if (!vxlnat_verify_natstate(mp, ipha, ip6h, flow, nexthdr)) {
      512 +                freemsg(mp);    /* XXX KEBE SAYS FOR NOW... */
      513 +        } else {
      514 +                /* XXX KEBE SAYS PROCESS... */
      515 +        }
      516 +
      517 +        VXNFL_REFRELE(flow);
      518 +        return (B_TRUE);
      519 +}
      520 +
      521 +/*
      522 + * Construct a NAT flow for a new packet and intern it in the vnet's flow tree
      523 + * (plus a conn_t via vxlnat_new_conn()).  Returns the flow refheld -- or an
      524 + * already-interned match, also refheld -- or NULL on failure or IPv6.
      525 + */
      526 +static vxlnat_flow_t *
      527 +vxlnat_new_flow(vxlnat_rule_t *rule, in6_addr_t *inner_src, in6_addr_t *dst,
      528 +    uint32_t ports, uint8_t protocol)
      529 +{
      530 +        vxlnat_vnet_t *vnet = rule->vxnr_vnet;
      531 +        vxlnat_flow_t *flow, *oldflow;
      532 +        avl_tree_t *flowtree;
      533 +        krwlock_t *flowlock;
      534 +        avl_index_t where;
      535 +
      536 +        flow = kmem_alloc(sizeof (*flow), KM_NOSLEEP | KM_NORMALPRI);
      537 +        if (flow == NULL)
      538 +                return (NULL);
      539 +
      540 +        flow->vxnfl_dst = *dst;
      541 +        flow->vxnfl_src = *inner_src;
      542 +        flow->vxnfl_ports = ports;
      543 +        flow->vxnfl_protocol = protocol;
      544 +        flow->vxnfl_refcount = 2; /* One for internment, one for caller. */
      545 +        /* Assume no mixed-IP-version mappings for now. */
      546 +        if (IN6_IS_ADDR_V4MAPPED(inner_src)) {
      547 +                ASSERT(IN6_IS_ADDR_V4MAPPED(dst));
      548 +                flow->vxnfl_isv4 = B_TRUE;
      549 +                flowtree = &vnet->vxnv_flows_v4;
      550 +                flowlock = &vnet->vxnv_flowv4_lock;
      551 +        } else {
      552 +                ASSERT(!IN6_IS_ADDR_V4MAPPED(dst));
      553 +                flow->vxnfl_isv4 = B_FALSE;
      554 +                /* XXX KEBE SAYS we don't do IPv6 for now. */
      555 +                DTRACE_PROBE2(vxlnat__flow__newv6, in6_addr_t *, inner_src,
      556 +                    in6_addr_t *, dst);
      557 +                kmem_free(flow, sizeof (*flow));
      558 +                return (NULL);
      559 +        }
      560 +        VXNR_REFHOLD(rule);     /* For the flow itself... */
      561 +        flow->vxnfl_rule = rule;
      562 +
      563 +        rw_enter(flowlock, RW_WRITER);
      564 +        oldflow = (vxlnat_flow_t *)avl_find(flowtree, flow, &where);
      565 +        if (oldflow != NULL) {
      566 +                /*
      567 +                 * Hmmm, someone put one in while we were dinking around.
      568 +                 * XXX KEBE SAYS return the old one, refheld, for now.
      569 +                 */
      570 +                VXNR_REFRELE(rule);
      571 +                kmem_free(flow, sizeof (*flow));
      572 +                VXNFL_REFHOLD(oldflow);
      573 +                flow = oldflow;
      574 +        } else {
      575 +                avl_insert(flowtree, flow, where);
      576 +                /*
      577 +                 * Do conn_t magic here, except for the conn_t activation.  We
      578 +                 * hold the rwlock as writer, so this may need to move outside
      579 +                 * it (reacquire-on-failure).  NOTE(review): flow is from
      580 +                 * kmem_alloc(); the ASSERT needs new_conn to set vxnfl_connp.
      581 +                 */
      582 +                if (!vxlnat_new_conn(flow)) {
      583 +                        ASSERT(flow->vxnfl_connp == NULL);
      584 +                        avl_remove(flowtree, flow);
      585 +                        VXNR_REFRELE(flow->vxnfl_rule);
      586 +                        kmem_free(flow, sizeof (*flow));
      587 +                        flow = NULL;
      588 +                }
      589 +        }
      590 +        rw_exit(flowlock);
      591 +        
      592 +        /* We just created this one, activate it. */
      593 +        if (oldflow == NULL && flow != NULL)
      594 +                vxlnat_activate_conn(flow);
      595 +
      596 +        return (flow);
      597 +}
      598 +
      599 +void
      600 +vxlnat_flow_free(vxlnat_flow_t *flow)
      601 +{
      602 +        ASSERT(flow->vxnfl_refcount == 0);
      603 +
      604 +        /* Tear down: detach from the conn_t, drop conn and rule refs, free. */
      605 +        /* XXX KEBE SAYS FILL ME IN?!  XXX KEBE ASKS ipcl_hash_remove()? */
      606 +
      607 +        flow->vxnfl_connp->conn_priv = NULL; /* Sufficient? */
      608 +        CONN_DEC_REF(flow->vxnfl_connp);
      609 +        VXNR_REFRELE(flow->vxnfl_rule);
      610 +        kmem_free(flow, sizeof (*flow));
      611 +}
      612 +
      613 +static boolean_t
      614 +vxlnat_verify_initial(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
      615 +    uint32_t ports, uint8_t protocol, uint8_t *nexthdr)
      616 +{
      617 +        /* XXX KEBE SAYS FILL ME IN!  Stub: frees mp and rejects everything. */
      618 +        freemsg(mp);
 367  619          return (B_FALSE);
 368  620  }
 369  621  
 370  622  /*
 371  623   * If we reach here, we need to find a NAT rule, and see if we can/should
 372  624   * CREATE a new NAT flow, or whether or not we should drop, maybe even
 373  625   * returning an ICMP message of some sort.
 374  626   *
 375  627   * B_TRUE means the packet was handled, and we shouldn't continue processing
 376  628   * (even if "was handled" means droppage).
 377  629   */
 378  630  static boolean_t
 379  631  vxlnat_one_vxlan_rule(vxlnat_vnet_t *vnet, mblk_t *mp, ipha_t *ipha,
 380  632      ip6_t *ip6h)
 381  633  {
 382  634          vxlnat_rule_t *rule;
      635 +        vxlnat_flow_t *flow;
      636 +        in6_addr_t v4m_src, v4m_dst, *inner_src, *dst;
      637 +        uint32_t ports;
      638 +        uint8_t protocol;
      639 +        uint8_t *nexthdr;
 383  640  
 384      -        /* XXX handle IPv6 later */
      641 +        /* XXX handle IPv6 later, assigning inner_src and dst to ip6_t addrs. */
 385  642          if (ip6h != NULL)
 386  643                  return (B_FALSE);
 387  644  
 388  645          ASSERT3P(ipha, !=, NULL);
      646 +        inner_src = &v4m_src;
      647 +        dst = &v4m_dst;
      648 +        IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src), inner_src);
      649 +        IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_dst), dst);
 389  650  
 390  651          mutex_enter(&vnet->vxnv_rule_lock);
 391  652          rule = list_head(&vnet->vxnv_rules);
 392  653  
 393  654          /*
 394  655           * search for a match in the nat rules
 395  656           * XXX investigate perf issues with respect to list_t size
      657 +         * XXX KEBE SAYS rewrite when we start doing IPv6 to use "inner_src"
      658 +         * and "dst".
 396  659           */
 397  660          while (rule != NULL) {
 398  661                  ipaddr_t ipaddr;
 399  662                  uint32_t netmask = 0xffffffff;
 400  663                  uint8_t prefix = rule->vxnr_prefix - 96;
 401  664  
 402  665                  /* v4 netmask.  NOTE(review): prefix == 0 would shift by 32. */
 403  666                  netmask <<= (32 - prefix);
 404  667                  netmask = htonl(netmask);
 405  668  
↓ open down ↓ 6 lines elided ↑ open up ↑
 412  675  
 413  676                  rule = list_next(&vnet->vxnv_rules, rule);
 414  677          }
 415  678  
 416  679          mutex_exit(&vnet->vxnv_rule_lock);
 417  680  
 418  681          if (rule == NULL)
 419  682                  return (B_FALSE);
 420  683  
 421  684          /* process packet (NOTE(review): confirm rule is refheld past unlock) */
      685 +
 422  686          /*
 423      -        static vxlnat_flow_t *
 424      -        vxlnat_new_flow(vxlnat_rule_t *rule, in6_addr_t *inner_src, in6_addr_t *dst,
 425      -            uint32_t ports, uint8_t protocol)
      687 +         * Grab transport header, and figure out if we can proceed.
      688 +         *
      689 +         * NOTE: vxlnat_grab_transport() will free/consume mp if it fails,
      690 +         * because we want to isolate non-flow-starters without having them
      691 +         * create new flows.  This means we return B_TRUE (consumed mp) on
      692 +         * failure.
 426  693           */
      694 +        if (!vxlnat_grab_transport(mp, ipha, ip6h, &ports, &protocol, &nexthdr))
      695 +                return (B_TRUE); /* see above... */
      696 +        if (!vxlnat_verify_initial(mp, ipha, ip6h, ports, protocol, nexthdr))
      697 +                return (B_TRUE);
      698 +        
 427  699  
      700 +        flow = vxlnat_new_flow(rule, inner_src, dst, ports, protocol);
      701 +        if (flow != NULL) {
      702 +                /*
      703 +                 * Call same function that vxlnat_one_vxlan_flow() uses
      704 +                 * to remap & transmit the packet out the external side.
      705 +                 *
      706 +                 * NOTE:  We've already checked the initial-packet-
      707 +                 * qualification, so unlike the main datapath, we don't
      708 +                 * need to call vxlnat_verify_natstate()
      709 +                 */
      710 +
      711 +                 /* XXX KEBE SAYS PROCESS... */
      712 +                
      713 +                VXNFL_REFRELE(flow);
      714 +                return (B_TRUE);
      715 +        }
      716 +
 428  717          return (B_FALSE);
 429  718  }
 430  719  
 431  720  /*
 432  721   * See if the inbound VXLAN packet hits a 1-1/fixed mapping, and process if it
 433  722   * does.  B_TRUE means the packet was handled, and we shouldn't continue
 434  723   * processing (even if "was handled" means droppage).
 435  724   */
 436  725  static boolean_t
 437  726  vxlnat_one_vxlan_fixed(vxlnat_vnet_t *vnet, mblk_t *mp, ipha_t *ipha,
↓ open down ↓ 119 lines elided ↑ open up ↑
 557  846                      VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)));
 558  847                  freemsg(mp);
 559  848                  return;
 560  849          }
 561  850  
 562  851          DTRACE_PROBE2(vxlnat__in__vnet, uint32_t,
 563  852              VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)),
 564  853              vxlnat_vnet_t, vnet);
 565  854  
 566  855          /*
 567      -         * Off-vxlan processing steps:
      856 +         * Arrived-from-vxlan processing steps:
 568  857           * 1.) Locate the ethernet header and check/update/add-into remotes.
 569  858           * 2.) Search 1-1s, process if hit.
 570  859           * 3.) Search flows, process if hit.
 571  860           * 4.) Search rules, create new flow (or not) if hit.
 572      -         * 5.) Drop the packets.
      861 +         * 5.) Drop the packet.
 573  862           */
 574  863  
 575  864          /* 1.) Locate the ethernet header and check/update/add-into remotes. */
 576  865          mp->b_rptr += sizeof (*vxh);
 577  866          while (MBLKL(mp) == 0) {
 578  867                  mblk_t *oldmp = mp;
 579  868  
 580  869                  mp = mp->b_cont;
 581  870                  freeb(oldmp);
 582  871          }
↓ open down ↓ 564 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX