1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2018, Joyent, Inc.
14 */
15
16 #include <sys/types.h>
17 #include <sys/ethernet.h>
18 #include <sys/mac_client.h>
19 #include <sys/vlan.h>
20 #include <sys/ddi.h>
21 #include <sys/strsun.h>
22 #include <inet/ip.h>
23 #include <inet/ip6.h>
24 #include <netinet/in.h>
25
26 #include <sys/overlay_impl.h>
27
28 extern kmem_cache_t *overlay_entry_cache;
29 extern int overlay_ent_size;
30
31 int
32 overlay_fabric_avl(const void *a, const void *b)
33 {
34 const overlay_fabric_t *l;
35 const overlay_fabric_t *r;
36 int i;
37
38 l = &((const overlay_fabric_entry_t *)a)->ofe_fabric;
39 r = &((const overlay_fabric_entry_t *)b)->ofe_fabric;
40
41 if (l->ofb_dcid < r->ofb_dcid)
42 return (-1);
43 if (l->ofb_dcid > r->ofb_dcid)
44 return (1);
45 if (l->ofb_vid < r->ofb_vid)
46 return (-1);
47 if (l->ofb_vid > r->ofb_vid)
48 return (1);
49 if (l->ofb_vlan < r->ofb_vlan)
50 return (-1);
51 if (l->ofb_vlan > r->ofb_vlan)
52 return (1);
53 for (i = 0; i < 4; i++) {
54 if (l->ofb_addr.s6_addr32[i] < r->ofb_addr.s6_addr32[i])
55 return (-1);
56 if (l->ofb_addr.s6_addr32[i] > r->ofb_addr.s6_addr32[i])
57 return (1);
58 }
59 return (0);
60 }
61
62 overlay_fabric_entry_t *
63 overlay_fabric_entry_new(uint64_t vid, uint32_t dcid, uint16_t vlan,
64 const struct in6_addr *addr, uint8_t prefixlen, const uint8_t *mac)
65 {
66 overlay_fabric_entry_t *ofe = NULL;
67
68 ofe = kmem_zalloc(sizeof (*ofe), KM_SLEEP);
69
70 ofe->ofe_fabric.ofb_vid = vid;
71 ofe->ofe_fabric.ofb_dcid = dcid;
72 ofe->ofe_fabric.ofb_vlan = vlan;
73 ofe->ofe_fabric.ofb_prefixlen = prefixlen;
74 bcopy(addr, &ofe->ofe_fabric.ofb_addr, sizeof (*addr));
75 bcopy(mac, ofe->ofe_fabric.ofb_mac, ETHERADDRL);
76 return (ofe);
77 }
78
79 void
80 overlay_fabric_entry_free(overlay_fabric_entry_t *ofe)
81 {
82 if (ofe == NULL)
83 return;
84
85 VERIFY3U(ofe->ofe_refcnt, ==, 0);
86 kmem_free(ofe, sizeof (*ofe));
87 }
88
89 /*
90 * Using the source and destination IP address, locate the target fabric, or
91 * create larval entries and queue for missing information.
92 */
93 static overlay_fabric_entry_t *
94 overlay_route_find(overlay_dev_t *odd, const mac_header_info_t *mhi,
95 const struct in6_addr *dst)
96 {
97 overlay_target_t *ott = odd->odd_target;
98 overlay_router_t *otr;
99 overlay_fabric_entry_t *ofb_src, *ofb_dst;
100 overlay_target_entry_t *src_entry;
101 overlay_route_table_t *orte;
102 uint16_t vlan = VLAN_ID(mhi->mhi_tci);
103 boolean_t dst_found = B_FALSE;
104
105 /* First attempt to find the overlay_fabric_t for the source */
106 otr = odd->odd_router;
107
108 mutex_enter(&ott->ott_lock);
109 if ((src_entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
110 mhi->mhi_saddr)) != NULL)
111 OVERLAY_TARG_ENTRY_REFHOLD(src_entry);
112 mutex_exit(&ott->ott_lock);
113
114 if (src_entry == NULL)
115 return (NULL);
116
117 mutex_enter(&src_entry->ote_lock);
118 if (src_entry->ote_fab == NULL) {
119 mutex_exit(&src_entry->ote_lock);
120 OVERLAY_TARG_ENTRY_REFRELE(src_entry);
121 return (NULL);
122 }
123 if ((ofb_src = src_entry->ote_fab) != NULL)
124 OVERLAY_FAB_ENTRY_REFHOLD(ofb_src);
125 mutex_exit(&src_entry->ote_lock);
126
127 OVERLAY_TARG_ENTRY_REFRELE(src_entry);
128 src_entry = NULL;
129
130 if (ofb_src == NULL || ofb_src->ofe_route_table == NULL) {
131 mutex_exit(&otr->otr_lock);
132 return (NULL);
133 }
134
135 /* Go through the fabric route table and try to find a destination */
136 orte = ofb_src->ofe_route_table;
137 for (size_t i = 0; orte->ort_dest[i] != NULL; i++) {
138 ofb_dst = orte->ort_dest[i];
139 if (IN6_ARE_PREFIXEDADDR_EQUAL(dst,
140 &ofb_dst->ofe_fabric.ofb_addr,
141 ofb_dst->ofe_fabric.ofb_prefixlen)) {
142 dst_found = B_TRUE;
143 break;
144 }
145 }
146
147 if (dst_found)
148 OVERLAY_FAB_ENTRY_REFHOLD(ofb_dst);
149
150 mutex_exit(&otr->otr_lock);
151
152 return (dst_found ? ofb_dst : NULL);
153 }
154
155 /*
156 * Adjust packet fields as necessary for delivery of an encapsulated packet
157 * to a remote target (e.g. cross-DC).
158 */
159 static int
160 overlay_route(overlay_dev_t *odd, mblk_t *mp, const overlay_fabric_entry_t *ofe,
161 const overlay_target_entry_t *ote)
162 {
163 if (MBLKL(mp) >= sizeof (struct ether_vlan_header)) {
164 /* The easy way */
165 struct ether_vlan_header *evh;
166
167 evh = (struct ether_vlan_header *)mp->b_rptr;
168
169 /*
170 * Today, we require all encapsulated frames to be vlan tagged.
171 * In the future, we might relax this and insert the vlan tag if
172 * the destination is tagged and the source is not.
173 */
174 if (evh->ether_tpid != ETHERTYPE_VLAN)
175 return (OVERLAY_TARGET_DROP);
176
177 evh->ether_tci = evh->ether_tci & ~(VLAN_ID_MASK);
178 evh->ether_tci |= ofe->ofe_fabric.ofb_vlan;
179 bcopy(ote->ote_addr, &evh->ether_dhost, ETHERADDRL);
180 bcopy(ofe->ofe_fabric.ofb_mac, &evh->ether_shost, ETHERADDRL);
181 return (OVERLAY_TARGET_OK);
182 }
183
184 /* The painful, long, tedious way */
185 unsigned char *p = mp->b_rptr;
186 unsigned char *end = mp->b_wptr;
187 unsigned char *vlanp = NULL;
188 uint16_t sap = 0;
189 uint16_t vlan = 0;
190 size_t i;
191
192 /*
193 * XXX: This method seems so absolutely awful, I wonder if there
194 * shouldn't be a counter and/or dtrace probe to allow us to find
195 * out what upstack causes this path to be taken.
196 */
197 for (i = 0; i < ETHERADDRL; i++) {
198 if (p == end) {
199 mp = mp->b_cont;
200 if (mp == NULL)
201 return (OVERLAY_TARGET_DROP);
202 p = mp->b_rptr;
203 end = mp->b_wptr;
204 }
205 *p++ = ote->ote_addr[i];
206 }
207
208 for (i = 0; i < ETHERADDRL; i++) {
209 if (p == end) {
210 mp = mp->b_cont;
211 if (mp == NULL)
212 return (OVERLAY_TARGET_DROP);
213 p = mp->b_rptr;
214 end = mp->b_wptr;
215 }
216 *p++ = ofe->ofe_fabric.ofb_mac[i];
217 }
218
219 for (i = 0; i < 2; i++) {
220 if (p == end) {
221 mp = mp->b_cont;
222 if (mp == NULL)
223 return (OVERLAY_TARGET_DROP);
224 p = mp->b_rptr;
225 end = mp->b_wptr;
226 }
227 sap |= *p++ << (8 - i*8);
228 }
229
230 if (sap != ETHERTYPE_VLAN)
231 return (OVERLAY_TARGET_DROP);
232
233 if (p == end) {
234 mp = mp->b_cont;
235 if (mp == NULL)
236 return (OVERLAY_TARGET_DROP);
237 p = mp->b_rptr;
238 end = mp->b_wptr;
239 }
240 vlanp = p;
241
242 for (i = 0; i < 2; i++) {
243 if (p == end) {
244 mp = mp->b_cont;
245 if (mp == NULL)
246 return (OVERLAY_TARGET_DROP);
247 p = mp->b_rptr;
248 end = mp->b_wptr;
249 }
250 vlan |= *p++ << (8 - i*8);
251 }
252
253 vlan &= ~(VLAN_ID_MASK);
254 vlan |= ofe->ofe_fabric.ofb_vlan;
255
256 for (p = vlanp, i = 0; i < 2; i++) {
257 if (p == end) {
258 mp = mp->b_cont;
259 if (mp == NULL)
260 return (OVERLAY_TARGET_DROP);
261 p = mp->b_rptr;
262 end = mp->b_wptr;
263 }
264 *p++ = vlan >> (8 - i*8);
265 }
266
267 return (OVERLAY_TARGET_OK);
268 }
269
270 int
271 overlay_route_lookup(overlay_dev_t *odd, mblk_t *mp,
272 const mac_header_info_t *mhi, struct sockaddr *sock, socklen_t *slenp,
273 uint64_t *vidp)
274 {
275 overlay_target_t *ott = odd->odd_target;
276 overlay_router_t *otr = odd->odd_router;
277 overlay_fabric_entry_t *dst_fab;
278 overlay_target_entry_t *entry;
279 overlay_target_entry_t search = { 0 };
280 struct sockaddr_in6 *v6;
281 struct in6_addr src, dst;
282 int ret;
283
284 ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC);
285
286 if (odd->odd_router == NULL)
287 return (OVERLAY_TARGET_DROP);
288
289 if ((ret = overlay_mblk_vl3ip(mp, &src, &dst)) != 0)
290 return (OVERLAY_TARGET_DROP);
291
292 if ((dst_fab = overlay_route_find(odd, mhi, &dst)) == NULL)
293 return (OVERLAY_TARGET_DROP);
294
295 v6 = (struct sockaddr_in6 *)sock;
296 bzero(v6, sizeof (struct sockaddr_in6));
297 v6->sin6_family = AF_INET6;
298
299 bcopy(&dst, &search.ote_ip, sizeof (dst));
300 search.ote_fab = dst_fab;
301
302 mutex_enter(&ott->ott_lock);
303 if ((entry = refhash_lookup(ott->ott_u.ott_dyn.ott_l3dhash,
304 &search)) == NULL) {
305 if ((entry = kmem_cache_alloc(overlay_entry_cache,
306 KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
307 mutex_exit(&ott->ott_lock);
308 return (OVERLAY_TARGET_DROP);
309 }
310
311 bcopy(&dst, &entry->ote_ip, sizeof (dst));
312 entry->ote_chead = entry->ote_ctail = mp;
313 entry->ote_mbsize = msgsize(mp);
314 entry->ote_flags |= OVERLAY_ENTRY_F_VL3_PENDING;
315 entry->ote_ott = ott;
316 entry->ote_odd = odd;
317 OVERLAY_TARG_ENTRY_REFHOLD(entry);
318 refhash_insert(ott->ott_u.ott_dyn.ott_l3dhash, entry);
319 mutex_exit(&ott->ott_lock);
320 overlay_target_queue(entry);
321 OVERLAY_FAB_ENTRY_REFRELE(dst_fab);
322 return (OVERLAY_TARGET_ASYNC);
323 }
324 OVERLAY_TARG_ENTRY_REFHOLD(entry);
325 mutex_exit(&ott->ott_lock);
326
327 mutex_enter(&entry->ote_lock);
328 /* There's no point in sending packets to a remote fabric's router IP */
329 if (entry->ote_flags & (OVERLAY_ENTRY_F_DROP|OVERLAY_ENTRY_F_ROUTER)) {
330 ret = OVERLAY_TARGET_DROP;
331 } else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
332 *vidp = dst_fab->ofe_fabric.ofb_vid;
333
334 bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr,
335 sizeof (struct in6_addr));
336 v6->sin6_port = htons(entry->ote_dest.otp_port);
337 *slenp = sizeof (struct sockaddr_in6);
338
339 ret = overlay_route(odd, mp, dst_fab, entry);
340 } else {
341 size_t mlen = msgsize(mp);
342
343 if (mlen + entry->ote_mbsize > overlay_ent_size) {
344 ret = OVERLAY_TARGET_DROP;
345 } else {
346 if (entry->ote_ctail != NULL) {
347 ASSERT(entry->ote_ctail->b_next == NULL);
348 entry->ote_ctail->b_next = mp;
349 entry->ote_ctail = mp;
350 } else {
351 entry->ote_chead = mp;
352 entry->ote_ctail = mp;
353 }
354 entry->ote_mbsize += mlen;
355 if ((entry->ote_flags & OVERLAY_ENTRY_F_PENDING) == 0) {
356 entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
357 overlay_target_queue(entry);
358 }
359 ret = OVERLAY_TARGET_ASYNC;
360 }
361 }
362 mutex_exit(&entry->ote_lock);
363
364 OVERLAY_TARG_ENTRY_REFRELE(entry);
365 OVERLAY_FAB_ENTRY_REFRELE(dst_fab);
366
367 return (ret);
368 }
369
370 /*
371 * Obtain the source and/or destination VL3 IPs of a packet. If this is
372 * IPV4 packet, the addresses are returned as IPV6 mapped addresses.
373 *
374 * XXX: The name seems a bit ugly, anyone have better suggestions?
375 */
376 int
377 overlay_mblk_vl3ip(mblk_t *mp, struct in6_addr *src, struct in6_addr *dst)
378 {
379 struct ether_header *ehp;
380 ipha_t *iphp;
381 ip6_t *ip6hp;
382 size_t skip_len;
383 uint16_t sap;
384
385 /*
386 * This is only used for outbound packets, so we use a similar
387 * approach as inet_pkt_hash().
388 */
389 ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
390 ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
391
392 ehp = (struct ether_header *)mp->b_rptr;
393 sap = ntohs(ehp->ether_type);
394 if (sap == ETHERTYPE_VLAN) {
395 struct ether_vlan_header *evhp;
396 mblk_t *newmp = NULL;
397
398 skip_len = sizeof (struct ether_vlan_header);
399 if (MBLKL(mp) < skip_len) {
400 /* the vlan tag is the payload, pull up first */
401 newmp = msgpullup(mp, -1);
402 if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
403 freemsg(newmp);
404 return (ENOMEM);
405 }
406 evhp = (struct ether_vlan_header *)newmp->b_rptr;
407 } else {
408 evhp = (struct ether_vlan_header *)mp->b_rptr;
409 }
410 sap = ntohs(evhp->ether_type);
411 freemsg(newmp);
412 } else {
413 skip_len = sizeof (struct ether_header);
414 }
415
416 /* if the ethernet header is in its own mblk, skip it */
417 if (MBLKL(mp) <= skip_len) {
418 skip_len -= MBLKL(mp);
419 mp = mp->b_cont;
420 if (mp == NULL)
421 return (EINVAL);
422 }
423
424 switch (sap) {
425 case ETHERTYPE_IP:
426 iphp = (ipha_t *)(mp->b_rptr + skip_len);
427 if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
428 !OK_32PTR((char *)iphp))
429 return (EINVAL); /* XXX: better error code? */
430
431 IN6_IPADDR_TO_V4MAPPED(iphp->ipha_src, src);
432 IN6_IPADDR_TO_V4MAPPED(iphp->ipha_dst, dst);
433 return (0);
434 case ETHERTYPE_IPV6:
435 ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
436 if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
437 !OK_32PTR((char *)ip6hp))
438 return (EINVAL);
439
440 bcopy(&ip6hp->ip6_src, src, sizeof (*src));
441 bcopy(&ip6hp->ip6_dst, dst, sizeof (*dst));
442 return (0);
443 default:
444 return (EINVAL);
445 }
446
447 return (0);
448 }
449
450 /* ARGSUSED */
451 int
452 overlay_router_ioctl(dev_t dev, int cmt, intptr_t arg, int mode, cred_t *credp,
453 int *rvalp)
454 {
455 /* TODO */
456 return (ENOTTY);
457 }