1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2018, Joyent, Inc.
  14  */
  15 
  16 #ifndef _INET_VXLNAT_IMPL_H
  17 #define _INET_VXLNAT_IMPL_H
  18 
  19 #include <inet/vxlnat.h>
  20 #include <inet/ip.h>
  21 #include <inet/ip6.h>
  22 #include <inet/ip_ire.h>
  23 #include <sys/clock_impl.h>
  24 #include <sys/avl.h>
  25 #include <sys/uio.h>
  26 #include <sys/list.h>
  27 #include <sys/byteorder.h>
  28 #include <sys/vxlan.h>
  29 
  30 /*
  31  * XXX KEBE ASKS --> do we assume port IPPORT_VXLAN all the time?
  32  * IF NOT, then we need to add ports to various things here that deal
  33  * with the underlay network.
  34  *
  35  * NOTE:  All reference counts *include* table/tree/list/whatever internment.
  36  * Once an entry is removed, *_REFRELE() must be invoked, and it may or may
  37  * not free something.
  38  */
  39 
  40 #ifdef __cplusplus
  41 extern "C" {
  42 #endif
  43 
  44 /*
  45  * NAT RULES.  Instantiated per-vnet, write-once/read-only entries,
  46  * linkage/entries protected by "rule lock" outside this structure.
  47  */
  48 typedef struct vxlnat_rule_s {
  49         list_node_t vxnr_link;
  50         /* refheld link, or if NULL, this rule is "condemned" and no good. */
  51         struct vxlnat_vnet_s *vxnr_vnet;
  52         in6_addr_t vxnr_myaddr;
  53         in6_addr_t vxnr_pubaddr;
  54         uint8_t vxnr_myether[ETHERADDRL];
  55         uint16_t vxnr_vlanid;   /* Fabrics use this too. */
  56         uint32_t vxnr_refcount;
  57         uint8_t vxnr_prefix;
  58 } vxlnat_rule_t;
  59 #define VXNR_REFHOLD(vxnr) {                    \
  60         atomic_inc_32(&(vxnr)->vxnr_refcount);   \
  61         ASSERT((vxnr)->vxnr_refcount > 0);        \
  62 }
  63 #define VXNR_REFRELE(vxnr) {                                    \
  64         ASSERT((vxnr)->vxnr_refcount > 0);                        \
  65         membar_exit();                                          \
  66         if (atomic_dec_32_nv(&(vxnr)->vxnr_refcount) == 0)       \
  67                 vxlnat_rule_free(vxnr);                         \
  68 }
  69 extern void vxlnat_rule_free(vxlnat_rule_t *);
  70 
  71 /*
  72  * NAT FLOWS.  These are per-vnet, and keyed/searched by:
  73  * <inner-IP-source,IP-dest,inner-source-port,dest-port,protocol>.
 * They will be tied-to/part-of a conn_t (see vxnfl_connp).
  75  */
  76 typedef struct vxlnat_flow_s {
  77         avl_node_t vxnfl_treenode;
  78         /*
  79          * I'm guessing that dst varies more than src.  Also
  80          * the plan is for the comparitor function to bcmp() both
  81          * of these as one call for IPv6 (if we ever get to that..).
  82          */
  83         in6_addr_t vxnfl_dst;
  84         in6_addr_t vxnfl_src;   /* INNER source address. */
  85         uint32_t vxnfl_ports;
  86         uint8_t vxnfl_protocol;
  87         uint8_t vxnfl_isv4 : 1, /* Will save us 12 bytes of compares... */
  88                 vxlfl_reserved1 : 7;
  89         conn_t *vxnfl_connp;    /* Question - embed instead? */
  90         vxlnat_rule_t *vxnfl_rule; /* Refhold to rule that generated me. */
  91 } vxlnat_flow_t;
  92 /* Exploit endianisms, maintain network order... */
  93 #ifdef _BIG_ENDIAN
  94 #define VXNFL_SPORT(ports) (uint16_t)((ports) >> 16) /* Unsigned all around. */
  95 #define VXNFL_DPORT(ports) ((ports) & 0xFFFF)
  96 #else
  97 #define VXNFL_SPORT(ports) ((ports) & 0xFFFF)
  98 #define VXNFL_DPORT(ports) (uint16_t)((ports) >> 16) /* Unsigned all around. */
  99 #endif
 100 
 101 /*
 102  * 1-1 IP mapping.
 103  */
 104 typedef struct vxlnat_fixed_s {
 105         avl_node_t vxnf_treenode;
 106         in6_addr_t vxnf_addr;   /* For now it needn't match to a rule. */
 107         in6_addr_t vxnf_pubaddr; /* External IP. */
 108         struct vxlnat_vnet_s *vxnf_vnet;
 109         ire_t *vxnf_ire;        /* Should be an IRE_LOCAL from the ftable. */
 110         struct vxlnat_remote_s *vxnf_remote;
 111         uint8_t vxnf_myether[ETHERADDRL];
 112         uint16_t vxnf_vlanid;   /* Stored in network order for quick xmit. */
 113         uint32_t vxnf_refcount;
 114         boolean_t vxnf_clear_router;    /* XXX KEBE SAYS CHEESY HACK */
 115 } vxlnat_fixed_t;
 116 #define VXNF_REFHOLD(vxnf) {                    \
 117         atomic_inc_32(&(vxnf)->vxnf_refcount);   \
 118         ASSERT((vxnf)->vxnf_refcount > 0);        \
 119 }
 120 #define VXNF_REFRELE(vxnf) {                                    \
 121         ASSERT((vxnf)->vxnf_refcount > 0);                        \
 122         membar_exit();                                          \
 123         if (atomic_dec_32_nv(&(vxnf)->vxnf_refcount) == 0)       \
 124                 vxlnat_fixed_free(vxnf);                        \
 125 }
 126 extern void vxlnat_fixed_free(vxlnat_fixed_t *);
 127 
 128 /*
 129  * REMOTE VXLAN destinations.
 130  */
 131 typedef struct vxlnat_remote_s {
 132         avl_node_t vxnrem_treenode;
 133         in6_addr_t vxnrem_addr; /* Same prefix as one in rule, or fixed addr. */
 134         in6_addr_t vxnrem_uaddr; /* Underlay VXLAN destination. */
 135         struct vxlnat_vnet_s *vxnrem_vnet;      /* Reference-held. */
 136         uint32_t vxnrem_refcount;
 137         uint8_t vxnrem_ether[ETHERADDRL];
 138         uint16_t vxnrem_vlan;
 139         /*
 140          * XXX KEBE SAYS put some lifetime/usetime/etc. here
 141          * so we don't keep too many of these.  Either that, or maybe
 142          * convert to a qqcache or (patents expiring) ARC.
 143          */
 144 } vxlnat_remote_t;
 145 #define VXNREM_REFHOLD(vxnrem) {                        \
 146         atomic_inc_32(&(vxnrem)->vxnrem_refcount);       \
 147         ASSERT((vxnrem)->vxnrem_refcount > 0);            \
 148 }
 149 #define VXNREM_REFRELE(vxnrem) {                                \
 150         ASSERT((vxnrem)->vxnrem_refcount > 0);                    \
 151         membar_exit();                                          \
 152         if (atomic_dec_32_nv(&(vxnrem)->vxnrem_refcount) == 0)   \
 153                 vxlnat_remote_free(vxnrem);                     \
 154 }
 155 extern void vxlnat_remote_free(vxlnat_remote_t *);
 156 
 157 /*
 158  * per-vnetid overarching structure.  AVL tree keyed by vnetid.
 159  * NOTE:  Could be split into vnetid-hashed buckets to split any
 160  * locks.
 161  */
 162 typedef struct vxlnat_vnet_s {
 163         avl_node_t vxnv_treenode;
 164         /*
 165          * 1-1 IP mappings. (1st lookup for an in-to-out packet.)
 166          * Will map to an IRE_LOCAL in IP.
 167          */
 168         krwlock_t vxnv_fixed_lock;
 169         avl_tree_t vxnv_fixed_ips;
 170 
 171         /*
 172          * NAT flows. (2nd lookup for an in-to-out packet.)
 173          * These are also conn_ts with outer-packet fields for out-to-in
 174          * matches against a conn_t.
 175          *
 176          * NOTE: We're going to keep a separate tree for inner IPv6 NAT, if
 177          * we ever need it.
 178          */
 179         krwlock_t vxnv_flowv4_lock;
 180         avl_tree_t vxnv_flows_v4;
 181 
 182         /* NAT rules. (3rd lookup for an in-to-out packet.) */
 183         kmutex_t vxnv_rule_lock;
 184         list_t vxnv_rules;
 185 
 186         /*
 187          * Internal-network remote-nodes. (only lookup for out-to-in packet.)
 188          * Entries here are also refheld by 1-1s or NAT flows.
 189          */
 190         kmutex_t vxnv_remote_lock;
 191         avl_tree_t vxnv_remotes;
 192 
 193         uint32_t vxnv_refcount;
 194         uint32_t vxnv_vnetid;   /* Wire byteorder for less swapping on LE */
 195 } vxlnat_vnet_t;
 196 #define VXNV_REFHOLD(vxnv) {                    \
 197         atomic_inc_32(&(vxnv)->vxnv_refcount);   \
 198         ASSERT((vxnv)->vxnv_refcount > 0);        \
 199 }
 200 #define VXNV_REFRELE(vxnv) {                                    \
 201         ASSERT((vxnv)->vxnv_refcount > 0);                        \
 202         membar_exit();                                          \
 203         if (atomic_dec_32_nv(&(vxnv)->vxnv_refcount) == 0)       \
 204                 vxlnat_vnet_free(vxnv);                         \
 205 }
 206 extern void vxlnat_vnet_free(vxlnat_vnet_t *);
 207 
 208 /*
 209  * Endian-independent macros for rapid off-wire header reading. i.e. avoid
 210  * [nh]to[hn]*()
 211  *
 212  * VXLAN_ID_WIRE32(id) ==> Zero-out "reserved" bits, preserve wire-order
 213  * and position of vnetid.
 * VXLAN_FLAGS_WIRE32(flags) ==> Zero-out reserved bits, preserve wire-order
 * and position of flags.
 216  * VXLAN_F_VDI_WIRE ==> VXLAN_F_VDI, but w/o needing to swap.
 217  *
 218  * ALSO:  HTON/NTOH for kernel-makes-right interactions with userland, which
 219  * means shifting actual ID to/from low-24-bits of 32-bit word.
 220  * VXLAN_ID_HTON(id)
 221  * VXLAN_ID_NTOH(id)
 222  *
 223  * XXX KEBE ASKS ==> If not confusing to folks, move into sys/vxlan.h and
 224  * have overlay's VXLAN encap adopt them?
 225  */
 226 #ifdef _BIG_ENDIAN
 227 #define VXLAN_ID_WIRE32(id) ((id) & 0xFFFFFF00)
 228 #define VXLAN_F_VDI_WIRE VXLAN_F_VDI
 229 /* XXX KEBE ASKS, do masking here? */
 230 #define VXLAN_ID_HTON(id) ((id) << VXLAN_ID_SHIFT)
 231 #define VXLAN_ID_NTOH(id) ((id) >> VXLAN_ID_SHIFT)
 232 #else   /* i.e. _LITTLE_ENDIAN */
 233 #define VXLAN_ID_WIRE32(id) ((id) & 0xFFFFFF)
 234 #define VXLAN_F_VDI_WIRE 0x08
 235 #define VXLAN_ID_HTON(id) htonl((id) << VXLAN_ID_SHIFT)
 236 #define VXLAN_ID_NTOH(id) (ntohl(id) >> VXLAN_ID_SHIFT)
 237 #endif  /* _BIG_ENDIAN */
 238 #define VXLAN_FLAGS_WIRE32(flags) ((flags) & VXLAN_F_VDI_WIRE)
 239 
 240 extern kmutex_t vxlnat_mutex;
 241 extern netstack_t *vxlnat_netstack;
 242 extern int vxlnat_command(vxn_msg_t *);
 243 extern int vxlnat_read_dump(struct uio *);
 244 extern int vxlnat_vxlan_addr(in6_addr_t *);
 245 extern void vxlnat_closesock(void);
 246 extern void vxlnat_state_init(void);
 247 extern void vxlnat_state_fini(void);
 248 
 249 extern void vxlnat_public_init(void);
 250 extern void vxlnat_public_fini(void);
 251 extern boolean_t vxlnat_public_hold(in6_addr_t *, boolean_t);
 252 extern void vxlnat_public_rele(in6_addr_t *);
 253 
 254 extern int vxlnat_tree_plus_in6_cmp(const void *, const void *);
 255 
 256 /* ire_recvfn & ire_sendfn functions for 1-1/fixed maps. */
 257 extern void vxlnat_fixed_ire_recv_v4(ire_t *, mblk_t *, void *,
 258     ip_recv_attr_t *);
 259 extern void vxlnat_fixed_ire_recv_v6(ire_t *, mblk_t *, void *,
 260     ip_recv_attr_t *);
 261 extern int vxlnat_fixed_ire_send_v4(ire_t *, mblk_t *, void *,
 262     ip_xmit_attr_t *, uint32_t *);
 263 extern int vxlnat_fixed_ire_send_v6(ire_t *, mblk_t *, void *,
 264     ip_xmit_attr_t *, uint32_t *);
 265 
 266 
 267 extern vxlnat_vnet_t *vxlnat_get_vnet(uint32_t, boolean_t);
 268 
 269 #ifdef __cplusplus
 270 }
 271 #endif
 272 
 273 #endif /* _INET_VXLNAT_IMPL_H */