1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2018, Joyent, Inc.
14 */
15
16 #ifndef _INET_VXLNAT_IMPL_H
17 #define _INET_VXLNAT_IMPL_H
18
19 #include <inet/vxlnat.h>
20 #include <inet/ip.h>
21 #include <inet/ip6.h>
22 #include <inet/ip_ire.h>
23 #include <sys/clock_impl.h>
24 #include <sys/avl.h>
25 #include <sys/uio.h>
26 #include <sys/list.h>
27 #include <sys/byteorder.h>
28 #include <sys/vxlan.h>
29
30 /*
31 * XXX KEBE ASKS --> do we assume port IPPORT_VXLAN all the time?
32 * IF NOT, then we need to add ports to various things here that deal
33 * with the underlay network.
34 *
35 * NOTE: All reference counts *include* table/tree/list/whatever internment.
36 * Once an entry is removed, *_REFRELE() must be invoked, and it may or may
37 * not free something.
38 */
39
40 #ifdef __cplusplus
41 extern "C" {
42 #endif
43
44 /*
45 * NAT RULES. Instantiated per-vnet, write-once/read-only entries,
46 * linkage/entries protected by "rule lock" outside this structure.
47 */
48 typedef struct vxlnat_rule_s {
49 list_node_t vxnr_link;
50 /* refheld link, or if NULL, this rule is "condemned" and no good. */
51 struct vxlnat_vnet_s *vxnr_vnet;
52 in6_addr_t vxnr_myaddr;
53 in6_addr_t vxnr_pubaddr;
54 /* XXX KEBE ASKS, ire? */
55 uint8_t vxnr_myether[ETHERADDRL];
56 uint16_t vxnr_vlanid; /* Fabrics use this too. */
57 uint32_t vxnr_refcount;
58 uint8_t vxnr_prefix;
59 } vxlnat_rule_t;
/*
 * Reference-count helpers for vxlnat_rule_t.
 *
 * Each macro expands to exactly one statement (do { } while (0)), so it can
 * be used as the unbraced body of an if/else, and each evaluates its argument
 * exactly once, so an argument with side effects is safe.
 *
 * VXNR_REFHOLD(vxnr): atomically take one more reference.
 * VXNR_REFRELE(vxnr): atomically drop one reference; when the count reaches
 *	zero, vxlnat_rule_free() is called on the rule.
 */
#define	VXNR_REFHOLD(vxnr) do {						\
	vxlnat_rule_t *vxnr_hold_ = (vxnr);				\
	atomic_inc_32(&vxnr_hold_->vxnr_refcount);			\
	/* A zero count after incrementing means we wrapped. */		\
	ASSERT(vxnr_hold_->vxnr_refcount > 0);				\
} while (0)
#define	VXNR_REFRELE(vxnr) do {						\
	vxlnat_rule_t *vxnr_rele_ = (vxnr);				\
	ASSERT(vxnr_rele_->vxnr_refcount > 0);				\
	/* Barrier is issued before the reference is given up. */	\
	membar_exit();							\
	if (atomic_dec_32_nv(&vxnr_rele_->vxnr_refcount) == 0)		\
		vxlnat_rule_free(vxnr_rele_);				\
} while (0)
70 extern void vxlnat_rule_free(vxlnat_rule_t *);
71
72 /*
73 * NAT FLOWS. These are per-vnet, and keyed/searched by:
74 * <inner-IP-source,IP-dest,inner-source-port,dest-port,protocol>.
75 * They will be tied-to/part-of a conn_t.
76 */
77 typedef struct vxlnat_flow_s {
78 avl_node_t vxnfl_treenode;
79 /*
80 * I'm guessing that dst varies more than src. Also
81 * the plan is for the comparitor function to bcmp() both
82 * of these as one call for IPv6 (if we ever get to that..).
83 */
84 in6_addr_t vxnfl_dst;
85 in6_addr_t vxnfl_src; /* INNER source address. */
86 uint32_t vxnfl_ports;
87 uint8_t vxnfl_protocol;
88 uint8_t vxnfl_isv4 : 1, /* Will save us 12 bytes of compares... */
89 vxlfl_reserved1 : 7;
90 /* Theoretically 16 bits lies where this comment is. */
91 uint32_t vxnfl_refcount;
92 conn_t *vxnfl_connp; /* Question - embed instead? */
93 vxlnat_rule_t *vxnfl_rule; /* Refhold to rule that generated me. */
94 /*
95 * XXX KEBE SAYS Other NAT-state belongs here too. Like time-values
96 * for timeouts, and more!
97 */
98 } vxlnat_flow_t;
/*
 * Split a 32-bit "ports" word into its source and destination halves by
 * exploiting host endianness, so that each 16-bit result keeps the byte order
 * the word was stored in (network order, per the flow's convention) without
 * any swapping.
 *
 * Both macros yield a uint16_t on every platform, are fully parenthesized,
 * and operate on an unsigned 32-bit view of the argument so the shift is
 * always well-defined.
 */
#ifdef _BIG_ENDIAN
#define	VXNFL_SPORT(ports)	((uint16_t)((uint32_t)(ports) >> 16))
#define	VXNFL_DPORT(ports)	((uint16_t)((uint32_t)(ports) & 0xFFFF))
#else
#define	VXNFL_SPORT(ports)	((uint16_t)((uint32_t)(ports) & 0xFFFF))
#define	VXNFL_DPORT(ports)	((uint16_t)((uint32_t)(ports) >> 16))
#endif
/*
 * Reference-count helpers for vxlnat_flow_t.
 *
 * Each macro expands to exactly one statement (do { } while (0)), so it can
 * be used as the unbraced body of an if/else, and each evaluates its argument
 * exactly once, so an argument with side effects is safe.
 *
 * VXNFL_REFHOLD(vxnfl): atomically take one more reference.
 * VXNFL_REFRELE(vxnfl): atomically drop one reference; when the count reaches
 *	zero, vxlnat_flow_free() is called on the flow.
 */
#define	VXNFL_REFHOLD(vxnfl) do {					\
	vxlnat_flow_t *vxnfl_hold_ = (vxnfl);				\
	atomic_inc_32(&vxnfl_hold_->vxnfl_refcount);			\
	/* A zero count after incrementing means we wrapped. */		\
	ASSERT(vxnfl_hold_->vxnfl_refcount > 0);			\
} while (0)
#define	VXNFL_REFRELE(vxnfl) do {					\
	vxlnat_flow_t *vxnfl_rele_ = (vxnfl);				\
	ASSERT(vxnfl_rele_->vxnfl_refcount > 0);			\
	/* Barrier is issued before the reference is given up. */	\
	membar_exit();							\
	if (atomic_dec_32_nv(&vxnfl_rele_->vxnfl_refcount) == 0)	\
		vxlnat_flow_free(vxnfl_rele_);				\
} while (0)
117 extern void vxlnat_flow_free(vxlnat_flow_t *);
118
119 /*
120 * 1-1 IP mapping.
121 */
122 typedef struct vxlnat_fixed_s {
123 avl_node_t vxnf_treenode;
124 in6_addr_t vxnf_addr; /* For now it needn't match to a rule. */
125 in6_addr_t vxnf_pubaddr; /* External IP. */
126 struct vxlnat_vnet_s *vxnf_vnet;
127 ire_t *vxnf_ire; /* Should be an IRE_LOCAL from the ftable. */
128 struct vxlnat_remote_s *vxnf_remote;
129 uint8_t vxnf_myether[ETHERADDRL];
130 uint16_t vxnf_vlanid; /* Stored in network order for quick xmit. */
131 uint32_t vxnf_refcount;
132 boolean_t vxnf_clear_router; /* XXX KEBE SAYS CHEESY HACK */
133 } vxlnat_fixed_t;
/*
 * Reference-count helpers for vxlnat_fixed_t.
 *
 * Each macro expands to exactly one statement (do { } while (0)), so it can
 * be used as the unbraced body of an if/else, and each evaluates its argument
 * exactly once, so an argument with side effects is safe.
 *
 * VXNF_REFHOLD(vxnf): atomically take one more reference.
 * VXNF_REFRELE(vxnf): atomically drop one reference; when the count reaches
 *	zero, vxlnat_fixed_free() is called on the mapping.
 */
#define	VXNF_REFHOLD(vxnf) do {						\
	vxlnat_fixed_t *vxnf_hold_ = (vxnf);				\
	atomic_inc_32(&vxnf_hold_->vxnf_refcount);			\
	/* A zero count after incrementing means we wrapped. */		\
	ASSERT(vxnf_hold_->vxnf_refcount > 0);				\
} while (0)
#define	VXNF_REFRELE(vxnf) do {						\
	vxlnat_fixed_t *vxnf_rele_ = (vxnf);				\
	ASSERT(vxnf_rele_->vxnf_refcount > 0);				\
	/* Barrier is issued before the reference is given up. */	\
	membar_exit();							\
	if (atomic_dec_32_nv(&vxnf_rele_->vxnf_refcount) == 0)		\
		vxlnat_fixed_free(vxnf_rele_);				\
} while (0)
144 extern void vxlnat_fixed_free(vxlnat_fixed_t *);
145
146 /*
147 * REMOTE VXLAN destinations.
148 */
149 typedef struct vxlnat_remote_s {
150 avl_node_t vxnrem_treenode;
151 in6_addr_t vxnrem_addr; /* Same prefix as one in rule, or fixed addr. */
152 in6_addr_t vxnrem_uaddr; /* Underlay VXLAN destination. */
153 struct vxlnat_vnet_s *vxnrem_vnet; /* Reference-held. */
154 uint32_t vxnrem_refcount;
155 uint8_t vxnrem_ether[ETHERADDRL];
156 uint16_t vxnrem_vlan;
157 /*
158 * XXX KEBE SAYS put some lifetime/usetime/etc. here
159 * so we don't keep too many of these. Either that, or maybe
160 * convert to a qqcache or (patents expiring) ARC.
161 */
162 } vxlnat_remote_t;
/*
 * Reference-count helpers for vxlnat_remote_t.
 *
 * Each macro expands to exactly one statement (do { } while (0)), so it can
 * be used as the unbraced body of an if/else, and each evaluates its argument
 * exactly once, so an argument with side effects is safe.
 *
 * VXNREM_REFHOLD(vxnrem): atomically take one more reference.
 * VXNREM_REFRELE(vxnrem): atomically drop one reference; when the count
 *	reaches zero, vxlnat_remote_free() is called on the remote.
 */
#define	VXNREM_REFHOLD(vxnrem) do {					\
	vxlnat_remote_t *vxnrem_hold_ = (vxnrem);			\
	atomic_inc_32(&vxnrem_hold_->vxnrem_refcount);			\
	/* A zero count after incrementing means we wrapped. */		\
	ASSERT(vxnrem_hold_->vxnrem_refcount > 0);			\
} while (0)
#define	VXNREM_REFRELE(vxnrem) do {					\
	vxlnat_remote_t *vxnrem_rele_ = (vxnrem);			\
	ASSERT(vxnrem_rele_->vxnrem_refcount > 0);			\
	/* Barrier is issued before the reference is given up. */	\
	membar_exit();							\
	if (atomic_dec_32_nv(&vxnrem_rele_->vxnrem_refcount) == 0)	\
		vxlnat_remote_free(vxnrem_rele_);			\
} while (0)
173 extern void vxlnat_remote_free(vxlnat_remote_t *);
174
175 /*
176 * per-vnetid overarching structure. AVL tree keyed by vnetid.
177 * NOTE: Could be split into vnetid-hashed buckets to split any
178 * locks.
179 */
180 typedef struct vxlnat_vnet_s {
181 avl_node_t vxnv_treenode;
182 /*
183 * 1-1 IP mappings. (1st lookup for an in-to-out packet.)
184 * Will map to an IRE_LOCAL in IP.
185 */
186 krwlock_t vxnv_fixed_lock;
187 avl_tree_t vxnv_fixed_ips;
188
189 /*
190 * NAT flows. (2nd lookup for an in-to-out packet.)
191 * These are also conn_ts with outer-packet fields for out-to-in
192 * matches against a conn_t.
193 *
194 * NOTE: We're going to keep a separate tree for inner IPv6 NAT, if
195 * we ever need it.
196 */
197 krwlock_t vxnv_flowv4_lock;
198 avl_tree_t vxnv_flows_v4;
199
200 /* NAT rules. (3rd lookup for an in-to-out packet.) */
201 kmutex_t vxnv_rule_lock;
202 list_t vxnv_rules;
203
204 /*
205 * Internal-network remote-nodes. (only lookup for out-to-in packet.)
206 * Entries here are also refheld by 1-1s or NAT flows.
207 */
208 kmutex_t vxnv_remote_lock;
209 avl_tree_t vxnv_remotes;
210
211 uint32_t vxnv_refcount;
212 uint32_t vxnv_vnetid; /* Wire byteorder for less swapping on LE */
213 } vxlnat_vnet_t;
/*
 * Reference-count helpers for vxlnat_vnet_t.
 *
 * Each macro expands to exactly one statement (do { } while (0)), so it can
 * be used as the unbraced body of an if/else, and each evaluates its argument
 * exactly once, so an argument with side effects is safe.
 *
 * VXNV_REFHOLD(vxnv): atomically take one more reference.
 * VXNV_REFRELE(vxnv): atomically drop one reference; when the count reaches
 *	zero, vxlnat_vnet_free() is called on the vnet.
 */
#define	VXNV_REFHOLD(vxnv) do {						\
	vxlnat_vnet_t *vxnv_hold_ = (vxnv);				\
	atomic_inc_32(&vxnv_hold_->vxnv_refcount);			\
	/* A zero count after incrementing means we wrapped. */		\
	ASSERT(vxnv_hold_->vxnv_refcount > 0);				\
} while (0)
#define	VXNV_REFRELE(vxnv) do {						\
	vxlnat_vnet_t *vxnv_rele_ = (vxnv);				\
	ASSERT(vxnv_rele_->vxnv_refcount > 0);				\
	/* Barrier is issued before the reference is given up. */	\
	membar_exit();							\
	if (atomic_dec_32_nv(&vxnv_rele_->vxnv_refcount) == 0)		\
		vxlnat_vnet_free(vxnv_rele_);				\
} while (0)
224 extern void vxlnat_vnet_free(vxlnat_vnet_t *);
225
226 /*
227 * Endian-independent macros for rapid off-wire header reading. i.e. avoid
228 * [nh]to[hn]*()
229 *
230 * VXLAN_ID_WIRE32(id) ==> Zero-out "reserved" bits, preserve wire-order
231 * and position of vnetid.
 * VXLAN_FLAGS_WIRE32(flags) ==> Zero-out reserved bits, preserve wire-order
233 * and position of flags.
234 * VXLAN_F_VDI_WIRE ==> VXLAN_F_VDI, but w/o needing to swap.
235 *
236 * ALSO: HTON/NTOH for kernel-makes-right interactions with userland, which
237 * means shifting actual ID to/from low-24-bits of 32-bit word.
238 * VXLAN_ID_HTON(id)
239 * VXLAN_ID_NTOH(id)
240 *
241 * XXX KEBE ASKS ==> If not confusing to folks, move into sys/vxlan.h and
242 * have overlay's VXLAN encap adopt them?
243 */
/*
 * The *_WIRE32 macros operate on 32-bit words exactly as loaded from the
 * packet, so their masks differ by host endianness.  HTON/NTOH move a vnetid
 * between the low 24 bits of a host-order word and its on-wire position.
 *
 * Every shift is performed on a uint32_t view of the argument: shifting a
 * signed int left into the sign bit is undefined, and shifting a negative
 * value right may sign-extend, so the conversion is done before shifting.
 */
#ifdef _BIG_ENDIAN
#define	VXLAN_ID_WIRE32(id)	((id) & 0xFFFFFF00)
#define	VXLAN_F_VDI_WIRE	VXLAN_F_VDI
/* XXX KEBE ASKS, do masking here? */
#define	VXLAN_ID_HTON(id)	((uint32_t)(id) << VXLAN_ID_SHIFT)
#define	VXLAN_ID_NTOH(id)	((uint32_t)(id) >> VXLAN_ID_SHIFT)
#else	/* i.e. _LITTLE_ENDIAN */
#define	VXLAN_ID_WIRE32(id)	((id) & 0xFFFFFF)
/* The flags byte is first on the wire, hence lowest in an LE word. */
#define	VXLAN_F_VDI_WIRE	0x08
#define	VXLAN_ID_HTON(id)	htonl((uint32_t)(id) << VXLAN_ID_SHIFT)
#define	VXLAN_ID_NTOH(id)	(ntohl(id) >> VXLAN_ID_SHIFT)
#endif	/* _BIG_ENDIAN */
#define	VXLAN_FLAGS_WIRE32(flags)	((flags) & VXLAN_F_VDI_WIRE)
257
258 extern kmutex_t vxlnat_mutex;
259 extern netstack_t *vxlnat_netstack;
260 extern int vxlnat_command(vxn_msg_t *);
261 extern int vxlnat_read_dump(struct uio *);
262 extern int vxlnat_vxlan_addr(in6_addr_t *);
263 extern void vxlnat_closesock(void);
264 extern void vxlnat_state_init(void);
265 extern void vxlnat_state_fini(void);
266
267 extern void vxlnat_public_init(void);
268 extern void vxlnat_public_fini(void);
269 extern boolean_t vxlnat_public_hold(in6_addr_t *, boolean_t);
270 extern void vxlnat_public_rele(in6_addr_t *);
271
272 extern int vxlnat_tree_plus_in6_cmp(const void *, const void *);
273
274 /* ire_recvfn & ire_sendfn functions for 1-1/fixed maps. */
275 extern void vxlnat_fixed_ire_recv_v4(ire_t *, mblk_t *, void *,
276 ip_recv_attr_t *);
277 extern void vxlnat_fixed_ire_recv_v6(ire_t *, mblk_t *, void *,
278 ip_recv_attr_t *);
279 extern int vxlnat_fixed_ire_send_v4(ire_t *, mblk_t *, void *,
280 ip_xmit_attr_t *, uint32_t *);
281 extern int vxlnat_fixed_ire_send_v6(ire_t *, mblk_t *, void *,
282 ip_xmit_attr_t *, uint32_t *);
283
284 extern boolean_t vxlnat_new_conn(vxlnat_flow_t *);
285 extern void vxlnat_activate_conn(vxlnat_flow_t *);
286 #ifdef notyet
287 extern void vxlnat_deactivate_conn(vxlnat_flow_t *);
288 #endif
289
290 extern vxlnat_vnet_t *vxlnat_get_vnet(uint32_t, boolean_t);
291
292 #ifdef __cplusplus
293 }
294 #endif
295
296 #endif /* _INET_VXLNAT_IMPL_H */