Print this page
WIP to help bringup NAT flows
@@ -37,10 +37,11 @@
#include <sys/tihdr.h>
#include <netinet/in.h>
#include <netinet/udp.h>
#include <inet/ip.h>
#include <inet/ip6.h>
+#include <inet/tcp_impl.h>
#include <inet/udp_impl.h>
#include <inet/tcp.h>
#include <inet/vxlnat_impl.h>
@@ -347,10 +348,114 @@
mp->b_rptr = (uint8_t *)ipha;
return (mp);
}
/*
+ * Pull the transport-level lookup key for a NAT flow out of a packet.
+ *
+ * On success, return B_TRUE with:
+ *	"*protocol"	set to the IPv4 header's protocol field,
+ *	"*ports"	set to the first 32 bits of the TCP or UDP header, the
+ *			echo identifier for the accepted ICMP types, or 0 for
+ *			any other protocol,
+ *	"*nexthdr_ptr"	pointing at the first byte past the IPv4 header.
+ *
+ * On any problem, mp is consumed (freed) and B_FALSE is returned; the
+ * caller must not touch mp afterwards and must not rely on the outputs.
+ */
+static boolean_t
+vxlnat_grab_transport(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, uint32_t *ports,
+    uint8_t *protocol, uint8_t **nexthdr_ptr)
+{
+	uint8_t *xport;
+
+	/* Punt on IPv6 for now... */
+	if (ip6h != NULL)
+		goto drop;
+
+	ASSERT(ipha != NULL);
+	xport = (uint8_t *)ipha + IPH_HDR_LENGTH(ipha);
+	*protocol = ipha->ipha_protocol;
+	*nexthdr_ptr = xport;	/* Hand this back before any further checks. */
+
+	/* The IPv4 header must not claim to extend past the data we hold. */
+	if (xport > mp->b_wptr) {
+		DTRACE_PROBE1(vxlnat__in__drop__trnexthdr, mblk_t *, mp);
+		goto drop;
+	}
+
+	switch (*protocol) {
+	case IPPROTO_TCP:
+		if (xport + sizeof (tcpha_t) > mp->b_wptr) {
+			DTRACE_PROBE1(vxlnat__in__drop__tcpnexthdr, mblk_t *,
+			    mp);
+			goto drop;
+		}
+		/* Both 16-bit ports lead the header; take them as one word. */
+		*ports = *((uint32_t *)xport);
+		/* XXX KEBE SAYS - grab other metadata here NOW? */
+		break;
+
+	case IPPROTO_UDP:
+		if (xport + sizeof (udpha_t) > mp->b_wptr) {
+			DTRACE_PROBE1(vxlnat__in__drop__udpnexthdr, mblk_t *,
+			    mp);
+			goto drop;
+		}
+		/* Same layout trick as TCP: ports are the first 32 bits. */
+		*ports = *((uint32_t *)xport);
+		/*
+		 * XXX KEBE SAYS - not as much as TCP, but grab other metadata
+		 * here NOW?
+		 */
+		break;
+
+	case IPPROTO_ICMP: {
+		icmph_t *icmp = (icmph_t *)xport;
+
+		if (xport + sizeof (*icmp) > mp->b_wptr) {
+			DTRACE_PROBE1(vxlnat__in__drop__icmpnexthdr, mblk_t *,
+			    mp);
+			goto drop;
+		}
+		/* XXX KEBE SAYS sort out ICMP header... */
+		switch (icmp->icmph_type) {
+		case ICMP_ECHO_REQUEST:
+		case ICMP_TIME_STAMP_REQUEST:
+		case ICMP_TIME_EXCEEDED:
+		case ICMP_INFO_REQUEST:
+		case ICMP_ADDRESS_MASK_REPLY:
+			/* All ones we can sorta cope with... */
+			break;
+		default:
+			DTRACE_PROBE2(vxlnat__in__drop__icmptype, int,
+			    icmp->icmph_type, mblk_t *, mp);
+			goto drop;
+		}
+		/* NOTE: as of now, will switch position depending on endian. */
+		*ports = icmp->icmph_echo_ident;
+		break;
+	}
+
+	default:
+		/* No port-like selector for other protocols. */
+		*ports = 0;
+		break;
+	}
+
+	return (B_TRUE);
+
+drop:
+	/* Single exit for every failure: consume the packet. */
+	freemsg(mp);
+	return (B_FALSE);
+}
+
+/*
+ * Evaluate a packet against the state of an existing NAT flow.
+ * This function does NOT alter "mp".
+ *
+ * XXX Currently a stub: it ignores all of its arguments and always returns
+ * B_FALSE.
+ */
+static boolean_t
+vxlnat_verify_natstate(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
+ vxlnat_flow_t *flow, uint8_t *nexthdr)
+{
+ /* XXX KEBE SAYS FILL ME IN! */
+ return (B_FALSE);
+}
+
+/*
* Inspect the packet and find ports & protos (or ICMP types & codes)
* and see if we have an established NAT flow.
*
* XXX KEBE WONDERS if the transmission path will more closely resemble
* vxlnat_one_vxlan_fixed() because of ipha_ident issues or not...
@@ -360,12 +465,159 @@
*/
static boolean_t
vxlnat_one_vxlan_flow(vxlnat_vnet_t *vnet, mblk_t *mp, ipha_t *ipha,
ip6_t *ip6h)
{
- /* XXX KEBE SAYS FILL ME IN. */
- /* For now... */
+	vxlnat_flow_t *flow, searcher;
+	uint8_t *nexthdr;
+
+	/*
+	 * Return contract: B_TRUE means mp was consumed here (processed or
+	 * freed); B_FALSE means no flow matched and the caller still owns mp.
+	 *
+	 * XXX KEBE WONDERS, should we return vxlnat_flow_t instead if we
+	 * miss?  That way, we only need to find the ports/protocol ONCE.
+	 */
+
+	if (ip6h != NULL) {
+		/* Eventually, grab addresses for "searcher". */
+		return (B_FALSE);	/* Bail on IPv6 for now... */
+	} else {
+		ASSERT(ipha != NULL);
+		searcher.vxnfl_isv4 = B_TRUE;	/* Required? */
+		/* Flow keys hold addresses in IPv4-mapped IPv6 form. */
+		IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src),
+		    &searcher.vxnfl_src);
+		IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_dst),
+		    &searcher.vxnfl_dst);
+	}
+
+	if (!vxlnat_grab_transport(mp, ipha, ip6h, &searcher.vxnfl_ports,
+	    &searcher.vxnfl_protocol, &nexthdr)) {
+		/*
+		 * vxlnat_grab_transport() has already freed mp on failure.
+		 * Do NOT free it again (that would be a double free) and do
+		 * not dereference it; the probe argument below is only the
+		 * stale pointer value.  Report the packet as consumed.
+		 */
+		DTRACE_PROBE1(vxlnat__in__flowgrab, mblk_t *, mp);
+		return (B_TRUE);
+	}
+
+	/*
+	 * XXX KEBE SAYS Eventually put the rw&find in an IPv4-only block,
+	 * because IPv6 (if we NAT it like IPv4) will have its own table/tree.
+	 */
+	rw_enter(&vnet->vxnv_flowv4_lock, RW_READER);
+	flow = avl_find(&vnet->vxnv_flows_v4, &searcher, NULL);
+	if (flow != NULL)
+		VXNFL_REFHOLD(flow);	/* Keep it alive past the lock drop. */
+	rw_exit(&vnet->vxnv_flowv4_lock);
+
+	if (flow == NULL)
+		return (B_FALSE);	/* Let caller handle things. */
+
+	if (!vxlnat_verify_natstate(mp, ipha, ip6h, flow, nexthdr)) {
+		freemsg(mp);	/* XXX KEBE SAYS FOR NOW... */
+	} else {
+		/* XXX KEBE SAYS PROCESS... */
+	}
+
+	VXNFL_REFRELE(flow);
+	return (B_TRUE);
+}
+
+/*
+ * We have a new packet that seems to require a new NAT flow.  Construct that
+ * flow now, and intern it as both a conn_t in IP *and* in the vnet's
+ * appropriate vxnv_flows* tree.
+ *
+ * Returns the flow with a reference held for the caller.  If an equal flow
+ * was interned by someone else in the meantime, that existing flow is
+ * returned (refheld) instead of a new one.  Returns NULL on allocation
+ * failure, for IPv6 addresses (not supported yet), or if vxlnat_new_conn()
+ * fails.
+ */
+static vxlnat_flow_t *
+vxlnat_new_flow(vxlnat_rule_t *rule, in6_addr_t *inner_src, in6_addr_t *dst,
+    uint32_t ports, uint8_t protocol)
+{
+	vxlnat_vnet_t *vnet = rule->vxnr_vnet;
+	vxlnat_flow_t *flow, *oldflow;
+	avl_tree_t *flowtree;
+	krwlock_t *flowlock;
+	avl_index_t where;
+
+	/*
+	 * Zero the allocation so that every field not explicitly assigned
+	 * below (notably vxnfl_connp, which is asserted NULL further down)
+	 * starts out as NULL/0 instead of heap garbage.
+	 */
+	flow = kmem_zalloc(sizeof (*flow), KM_NOSLEEP | KM_NORMALPRI);
+	if (flow == NULL)
+		return (NULL);
+
+	flow->vxnfl_dst = *dst;
+	flow->vxnfl_src = *inner_src;
+	flow->vxnfl_ports = ports;
+	flow->vxnfl_protocol = protocol;
+	flow->vxnfl_refcount = 2; /* One for internment, one for caller. */
+	/* Assume no mixed-IP-version mappings for now. */
+	if (IN6_IS_ADDR_V4MAPPED(inner_src)) {
+		ASSERT(IN6_IS_ADDR_V4MAPPED(dst));
+		flow->vxnfl_isv4 = B_TRUE;
+		flowtree = &vnet->vxnv_flows_v4;
+		flowlock = &vnet->vxnv_flowv4_lock;
+	} else {
+		ASSERT(!IN6_IS_ADDR_V4MAPPED(dst));
+		flow->vxnfl_isv4 = B_FALSE;
+		/* XXX KEBE SAYS we don't do IPv6 for now. */
+		DTRACE_PROBE2(vxlnat__flow__newv6, in6_addr_t *, inner_src,
+		    in6_addr_t *, dst);
+		kmem_free(flow, sizeof (*flow));
+		return (NULL);
+	}
+	VXNR_REFHOLD(rule);	/* For the flow itself... */
+	flow->vxnfl_rule = rule;
+
+	rw_enter(flowlock, RW_WRITER);
+	oldflow = (vxlnat_flow_t *)avl_find(flowtree, flow, &where);
+	if (oldflow != NULL) {
+		/*
+		 * Hmmm, someone put one in while we were dinking around.
+		 * XXX KEBE SAYS return the old one, refheld, for now.
+		 * Undo the rule hold taken for our discarded flow.
+		 */
+		VXNR_REFRELE(rule);
+		kmem_free(flow, sizeof (*flow));
+		VXNFL_REFHOLD(oldflow);
+		flow = oldflow;
+	} else {
+		avl_insert(flowtree, flow, where);
+		/*
+		 * Do conn_t magic here, except for the conn_t activation. I
+		 * am aware of holding the rwlock-as-write here.  We may need
+		 * to move this outside the rwlock hold, and
+		 * reacquire-on-failure.
+		 */
+		if (!vxlnat_new_conn(flow)) {
+			/* Back out the internment and the rule hold. */
+			ASSERT(flow->vxnfl_connp == NULL);
+			avl_remove(flowtree, flow);
+			VXNR_REFRELE(flow->vxnfl_rule);
+			kmem_free(flow, sizeof (*flow));
+			flow = NULL;
+		}
+	}
+	rw_exit(flowlock);
+
+	/* We just created this one, activate it. */
+	if (oldflow == NULL && flow != NULL)
+		vxlnat_activate_conn(flow);
+
+	return (flow);
+}
+
+/*
+ * Tear down a flow whose reference count has reached zero: clear the
+ * conn_t's conn_priv, drop the flow's conn_t and rule references, and free
+ * the flow itself.
+ */
+void
+vxlnat_flow_free(vxlnat_flow_t *flow)
+{
+ ASSERT(flow->vxnfl_refcount == 0);
+
+ /* XXX KEBE SAYS FILL ME IN?! */
+ /* XXX KEBE ASKS ipcl_hash_remove()? */
+
+ /*
+  * NOTE(review): vxnfl_connp is dereferenced unconditionally here; this
+  * assumes every flow reaching this point has a conn_t -- confirm.
+  */
+ flow->vxnfl_connp->conn_priv = NULL; /* Sufficient? */
+ CONN_DEC_REF(flow->vxnfl_connp);
+ VXNR_REFRELE(flow->vxnfl_rule);
+ kmem_free(flow, sizeof (*flow));
+}
+
+/*
+ * Presumably meant to decide whether a packet qualifies to start a new NAT
+ * flow (TODO confirm once implemented).
+ *
+ * XXX Currently a stub: it frees (consumes) mp and returns B_FALSE
+ * unconditionally, ignoring every other argument.  Callers must not touch
+ * mp after a B_FALSE return.
+ */
+static boolean_t
+vxlnat_verify_initial(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
+ uint32_t ports, uint8_t protocol, uint8_t *nexthdr)
+{
+ /* XXX KEBE SAYS FILL ME IN! */
+ freemsg(mp);
return (B_FALSE);
}
/*
* If we reach here, we need to find a NAT rule, and see if we can/should
@@ -378,23 +630,34 @@
static boolean_t
vxlnat_one_vxlan_rule(vxlnat_vnet_t *vnet, mblk_t *mp, ipha_t *ipha,
ip6_t *ip6h)
{
vxlnat_rule_t *rule;
+ vxlnat_flow_t *flow;
+ in6_addr_t v4m_src, v4m_dst, *inner_src, *dst;
+ uint32_t ports;
+ uint8_t protocol;
+ uint8_t *nexthdr;
- /* XXX handle IPv6 later */
+ /* XXX handle IPv6 later, assigning inner_src and dst to ip6_t addrs. */
if (ip6h != NULL)
return (B_FALSE);
ASSERT3P(ipha, !=, NULL);
+ inner_src = &v4m_src;
+ dst = &v4m_dst;
+ IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_src), inner_src);
+ IN6_INADDR_TO_V4MAPPED((struct in_addr *)(&ipha->ipha_dst), dst);
mutex_enter(&vnet->vxnv_rule_lock);
rule = list_head(&vnet->vxnv_rules);
/*
* search for a match in the nat rules
* XXX investigate perf issues with respect to list_t size
+ * XXX KEBE SAYS rewrite when we start doing IPv6 to use "inner_src"
+ * and "dst".
*/
while (rule != NULL) {
ipaddr_t ipaddr;
uint32_t netmask = 0xffffffff;
uint8_t prefix = rule->vxnr_prefix - 96;
@@ -417,16 +680,42 @@
if (rule == NULL)
return (B_FALSE);
/* process packet */
+
/*
- static vxlnat_flow_t *
- vxlnat_new_flow(vxlnat_rule_t *rule, in6_addr_t *inner_src, in6_addr_t *dst,
- uint32_t ports, uint8_t protocol)
+ * Grab transport header, and figure out if we can proceed.
+ *
+ * NOTE: vxlnat_grab_transport() will free/consume mp if it fails,
+ * because we want to isolate non-flow-starters without having them
+ * create new flows. This means we return B_TRUE (consumed mp) on
+ * failure.
*/
+ if (!vxlnat_grab_transport(mp, ipha, ip6h, &ports, &protocol, &nexthdr))
+ return (B_TRUE); /* see above... */
+ if (!vxlnat_verify_initial(mp, ipha, ip6h, ports, protocol, nexthdr))
+ return (B_TRUE);
+
+ flow = vxlnat_new_flow(rule, inner_src, dst, ports, protocol);
+ if (flow != NULL) {
+ /*
+ * Call same function that vxlnat_one_vxlan_flow() uses
+ * to remap & transmit the packet out the external side.
+ *
+ * NOTE: We've already checked the initial-packet-
+ * qualification, so unlike the main datapath, we don't
+ * need to call vxlnat_verify_natstate()
+ */
+
+ /* XXX KEBE SAYS PROCESS... */
+
+ VXNFL_REFRELE(flow);
+ return (B_TRUE);
+ }
+
return (B_FALSE);
}
/*
* See if the inbound VXLAN packet hits a 1-1/fixed mapping, and process if it
@@ -562,16 +851,16 @@
DTRACE_PROBE2(vxlnat__in__vnet, uint32_t,
VXLAN_ID_HTON(VXLAN_ID_WIRE32(vxh->vxlan_id)),
vxlnat_vnet_t, vnet);
/*
- * Off-vxlan processing steps:
+ * Arrived-from-vxlan processing steps:
* 1.) Locate the ethernet header and check/update/add-into remotes.
* 2.) Search 1-1s, process if hit.
* 3.) Search flows, process if hit.
* 4.) Search rules, create new flow (or not) if hit.
- * 5.) Drop the packets.
+ * 5.) Drop the packet.
*/
/* 1.) Locate the ethernet header and check/update/add-into remotes. */
mp->b_rptr += sizeof (*vxh);
while (MBLKL(mp) == 0) {