/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 */

/*
 * Overlay device ksocket multiplexer.
 *
 * For more information, see the big theory statement in
 * uts/common/io/overlay/overlay.c
 */

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/ksynch.h>
#include <sys/ksocket.h>
#include <sys/avl.h>
#include <sys/list.h>
#include <sys/pattr.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/tihdr.h>

#include <sys/overlay_impl.h>

#include <sys/sdt.h>

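/*
 * OVERLAY_FREEMSG() fires an SDT probe each time the mux drops a message; the
 * reason string is available as arg1. Assuming the usual double-underscore to
 * dash translation of SDT probe names, drops can be watched with something
 * like:
 *
 *	dtrace -n 'sdt:::overlay-fremsg { trace(stringof(arg1)); }'
 */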
#define OVERLAY_FREEMSG(mp, reason) \
        DTRACE_PROBE2(overlay__fremsg, mblk_t *, mp, char *, reason)

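/*
 * Global list of all muxes in the system and the lock that protects it. Each
 * mux additionally has its own omux_lock, which protects its reference count
 * and its AVL tree of devices.
 */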
static list_t overlay_mux_list;
static kmutex_t overlay_mux_lock;

void
overlay_mux_init(void)
{
        list_create(&overlay_mux_list, sizeof (overlay_mux_t),
            offsetof(overlay_mux_t, omux_lnode));
        mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL);
}

void
overlay_mux_fini(void)
{
        mutex_destroy(&overlay_mux_lock);
        list_destroy(&overlay_mux_list);
}

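/*
 * AVL comparator for the devices hanging off of a mux. Devices are ordered by
 * their virtual network id so that the receive path can find the right device
 * for a decapsulated packet.
 */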
static int
overlay_mux_comparator(const void *a, const void *b)
{
        const overlay_dev_t *odl, *odr;
        odl = a;
        odr = b;
        if (odl->odd_vid > odr->odd_vid)
                return (1);
        else if (odl->odd_vid < odr->odd_vid)
                return (-1);
        else
                return (0);
}
73
74 /*
75 * This is the central receive data path. We need to decode the packet, if we
76 * can, and then deliver it to the appropriate overlay.
77 */
78 /* ARGSUSED */
79 static boolean_t
80 overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
81 void *arg)
82 {
83 mblk_t *mp, *nmp, *fmp;
84 overlay_mux_t *mux = arg;
85
        /*
         * We may have received a chain of messages. Each message in the
         * chain will likely have a T_unitdata_ind attached to it as an
         * M_PROTO. If we aren't getting that, drop the message for the
         * moment.
         */
        for (mp = mpchain; mp != NULL; mp = nmp) {
                struct T_unitdata_ind *tudi;
                ovep_encap_info_t infop;
                overlay_dev_t od, *odd;
                int ret;

                nmp = mp->b_next;
                mp->b_next = NULL;

                if (DB_TYPE(mp) != M_PROTO) {
                        OVERLAY_FREEMSG(mp, "first one isn't M_PROTO");
                        freemsg(mp);
                        continue;
                }

                if (mp->b_cont == NULL) {
                        OVERLAY_FREEMSG(mp, "missing a b_cont");
                        freemsg(mp);
                        continue;
                }

                tudi = (struct T_unitdata_ind *)mp->b_rptr;
                if (tudi->PRIM_type != T_UNITDATA_IND) {
                        OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *");
                        freemsg(mp);
                        continue;
                }

                /*
                 * In the future, we'll care about the source information so
                 * that we can tell varpd about out-of-band invalidation. For
                 * now, just drop that block.
                 */
                fmp = mp;
                mp = fmp->b_cont;
                freeb(fmp);

                /*
                 * Until we have VXLAN-or-other-decap HW acceleration support
                 * (e.g. we support NICs that reach into VXLAN-encapsulated
                 * packets and check the inside-VXLAN IP packets' checksums,
                 * or do LSO with VXLAN), we should clear any HW-accelerated-
                 * performed bits.
                 */
                DB_CKSUMFLAGS(mp) = 0;

                /*
                 * Decap and deliver.
                 */
                bzero(&infop, sizeof (ovep_encap_info_t));
                ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop);
                if (ret != 0) {
                        OVERLAY_FREEMSG(mp, "decap failed");
                        freemsg(mp);
                        continue;
                }
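                /*
                 * Strip the encapsulation header that was just decoded. In
                 * the common case it fits within the first mblk; otherwise
                 * walk the chain, freeing any blocks that the header consumes
                 * entirely.
                 */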
                if (MBLKL(mp) > infop.ovdi_hdr_size) {
                        mp->b_rptr += infop.ovdi_hdr_size;
                } else {
                        while (infop.ovdi_hdr_size != 0) {
                                size_t rem, blkl;

                                if (mp == NULL)
                                        break;

                                blkl = MBLKL(mp);
                                rem = MIN(infop.ovdi_hdr_size, blkl);
                                infop.ovdi_hdr_size -= rem;
                                mp->b_rptr += rem;
                                if (rem == blkl) {
                                        fmp = mp;
                                        mp = fmp->b_cont;
                                        fmp->b_cont = NULL;
                                        OVERLAY_FREEMSG(mp,
                                            "freed a fmp block");
                                        freemsg(fmp);
                                }
                        }
                        if (mp == NULL) {
                                OVERLAY_FREEMSG(mp, "freed it all...");
                                continue;
                        }
                }

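                /*
                 * Find the overlay device for this virtual network id and
                 * make sure it is still willing to take traffic before
                 * handing the packet up to MAC.
                 */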
                od.odd_vid = infop.ovdi_id;
                mutex_enter(&mux->omux_lock);
                odd = avl_find(&mux->omux_devices, &od, NULL);
                if (odd == NULL) {
                        mutex_exit(&mux->omux_lock);
                        OVERLAY_FREEMSG(mp, "no matching vid");
                        freemsg(mp);
                        continue;
                }
                mutex_enter(&odd->odd_lock);
                if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
                    !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
                        mutex_exit(&odd->odd_lock);
                        mutex_exit(&mux->omux_lock);
                        OVERLAY_FREEMSG(mp, "dev dropped");
                        freemsg(mp);
                        continue;
                }
                overlay_io_start(odd, OVERLAY_F_IN_RX);
                mutex_exit(&odd->odd_lock);
                mutex_exit(&mux->omux_lock);

                mac_rx(odd->odd_mh, NULL, mp);

                mutex_enter(&odd->odd_lock);
                overlay_io_done(odd, OVERLAY_F_IN_RX);
                mutex_exit(&odd->odd_lock);
        }

        return (B_TRUE);
}

/*
 * Kernel socket callback to indicate the socket itself is able to send
 * data again. Check for devices on this mux that were send-blocked,
 * and clear them.
 */
/* ARGSUSED */
static void
overlay_mux_cansend_now(ksocket_t ksock, ksocket_callback_event_t event,
    void *arg, uintptr_t ignore_me)
{
        overlay_mux_t *mux = (overlay_mux_t *)arg;
        overlay_dev_t *odd;
        mac_handle_t *mhs_to_update, *current_mh;
        size_t allocsize;

        ASSERT3P(ksock, ==, mux->omux_ksock);
        ASSERT3U(event, ==, KSOCKET_EV_CANSEND);

        /* Traverse omux_devices and check for ones marked as send-blocked. */
        mutex_enter(&mux->omux_lock);
        if (mux->omux_count == 0) {
                /* Nothing to wake up. */
                mutex_exit(&mux->omux_lock);
                return;
        }
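        /*
         * Collect the handles of any blocked devices into an array sized for
         * every device on the mux; calling mac_tx_update() is deferred until
         * after omux_lock has been dropped.
         */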
        allocsize = sizeof (mac_handle_t) * mux->omux_count;
        mhs_to_update = kmem_zalloc(allocsize, KM_NOSLEEP);
        VERIFY(mhs_to_update != NULL);  /* Failure should be rare. */
        current_mh = mhs_to_update;

        for (odd = avl_first(&mux->omux_devices); odd != NULL;
            odd = AVL_NEXT(&mux->omux_devices, odd)) {
                mac_handle_t odd_mh = NULL;

                mutex_enter(&odd->odd_lock);
                if ((odd->odd_flags & OVERLAY_F_TXSTOPPED) != 0) {
                        /* Get ready to tell MAC it can transmit again. */
                        odd->odd_flags &= ~OVERLAY_F_TXSTOPPED;
                        odd_mh = odd->odd_mh;
                }
                mutex_exit(&odd->odd_lock);
                if (odd_mh != NULL) {
                        *current_mh = odd_mh;
                        current_mh++;
                }
        }
        mutex_exit(&mux->omux_lock);

        /*
         * Yes, I'm using the value-then-decrement. "current_mh" is
         * guaranteed to be at least one ahead of mhs_to_update if there are
         * any mac handles that need updating. I also have to do this outside
         * the omux lock because the tx_update may trigger immediate or
         * concurrent packet transmission.
         */
        while (current_mh-- != mhs_to_update)
                mac_tx_update(*current_mh);

        kmem_free(mhs_to_update, allocsize);
}

/*
 * Register a given device with a socket backend. If no such device socket
 * exists, create a new one.
 */
overlay_mux_t *
overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol,
    struct sockaddr *addr, socklen_t len, int *errp)
{
        int err;
        overlay_mux_t *mux;
        ksocket_t ksock;
        ksocket_callbacks_t ks_cb = { 0 };

        if (errp == NULL)
                errp = &err;

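        /*
         * See whether a mux matching this plugin, protocol, and address
         * already exists; if so, all we need to do is take another hold on
         * it.
         */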
        mutex_enter(&overlay_mux_lock);
        for (mux = list_head(&overlay_mux_list); mux != NULL;
            mux = list_next(&overlay_mux_list, mux)) {
                if (domain == mux->omux_domain &&
                    family == mux->omux_family &&
                    protocol == mux->omux_protocol &&
                    len == mux->omux_alen &&
                    bcmp(addr, mux->omux_addr, len) == 0) {

                        if (opp != mux->omux_plugin) {
                                mutex_exit(&overlay_mux_lock);
                                *errp = EEXIST;
                                return (NULL);
                        }

                        mutex_enter(&mux->omux_lock);
                        mux->omux_count++;
                        mutex_exit(&mux->omux_lock);
                        mutex_exit(&overlay_mux_lock);
                        *errp = 0;
                        return (mux);
                }
        }

        /*
         * Today we aren't zone-aware and only exist in the global zone. When
         * we allow for things to exist in the non-global zone, we'll want to
         * use a credential that's actually specific to the zone.
         */
        *errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP,
            kcred);
        if (*errp != 0) {
                mutex_exit(&overlay_mux_lock);
                return (NULL);
        }

        *errp = ksocket_bind(ksock, addr, len, kcred);
        if (*errp != 0) {
                mutex_exit(&overlay_mux_lock);
                ksocket_close(ksock, kcred);
                return (NULL);
        }

        /*
         * Ask our lower layer to optionally toggle anything they need on this
         * socket. Because a socket is owned by a single type of plugin, we
         * can then ask it to perform any additional socket set up it'd like
         * to do.
         */
        if (opp->ovp_ops->ovpo_sockopt != NULL &&
            (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) {
                mutex_exit(&overlay_mux_lock);
                ksocket_close(ksock, kcred);
                return (NULL);
        }

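        /*
         * No existing mux matched, so construct a new one around the socket
         * we just created and bound.
         */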
        mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP);
        list_link_init(&mux->omux_lnode);
        mux->omux_ksock = ksock;
        mux->omux_plugin = opp;
        mux->omux_domain = domain;
        mux->omux_family = family;
        mux->omux_protocol = protocol;
        mux->omux_addr = kmem_alloc(len, KM_SLEEP);
        bcopy(addr, mux->omux_addr, len);
        mux->omux_alen = len;
        mux->omux_count = 1;
        avl_create(&mux->omux_devices, overlay_mux_comparator,
            sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode));
        mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL);

#if defined(OVERLAY_PINCH) || defined(OVERLAY_FC_TEST)
        /* Set the xmit buf to a REALLY SMALL value, say 12k (1-3 packets). */
        int bufsize = 12 * 1024;

        if ((*errp = ksocket_setsockopt(ksock, SOL_SOCKET, SO_SNDBUF,
            (const void *)&bufsize, sizeof (bufsize), CRED())) != 0) {
                mutex_exit(&overlay_mux_lock);
                ksocket_close(ksock, kcred);
                mutex_destroy(&mux->omux_lock);
                avl_destroy(&mux->omux_devices);
                kmem_free(mux->omux_addr, len);
                kmem_free(mux, sizeof (overlay_mux_t));
                return (NULL);
        }
#endif
        /*
         * Set a callback in case we hit socket flow control and need to know
         * when it's ready to send again. See the aforementioned
         * ksocket_socket() comments about the use of kcred vs. being
         * zone-aware.
         */
        ks_cb.ksock_cb_flags = KSOCKET_CB_CANSEND;
        ks_cb.ksock_cb_cansend = overlay_mux_cansend_now;
        if ((*errp = ksocket_setcallbacks(ksock, &ks_cb, mux, kcred)) != 0) {
                mutex_exit(&overlay_mux_lock);
                ksocket_close(ksock, kcred);
                mutex_destroy(&mux->omux_lock);
                avl_destroy(&mux->omux_devices);
                kmem_free(mux->omux_addr, len);
                kmem_free(mux, sizeof (overlay_mux_t));
                return (NULL);
        }

        /* Once this is called, we need to expect to rx data */
        *errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux);
        if (*errp != 0) {
                mutex_exit(&overlay_mux_lock);
                ksocket_close(ksock, kcred);
                mutex_destroy(&mux->omux_lock);
                avl_destroy(&mux->omux_devices);
                kmem_free(mux->omux_addr, len);
                kmem_free(mux, sizeof (overlay_mux_t));
                return (NULL);
        }

        list_insert_tail(&overlay_mux_list, mux);
        mutex_exit(&overlay_mux_lock);

        *errp = 0;
        return (mux);
}

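/*
 * Release a device's hold on a mux. When the last hold goes away, tear down
 * the underlying socket and free the mux.
 */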
void
overlay_mux_close(overlay_mux_t *mux)
{
        mutex_enter(&overlay_mux_lock);
        mutex_enter(&mux->omux_lock);
        mux->omux_count--;
        if (mux->omux_count != 0) {
                mutex_exit(&mux->omux_lock);
                mutex_exit(&overlay_mux_lock);
                return;
        }
        list_remove(&overlay_mux_list, mux);
        mutex_exit(&mux->omux_lock);
        mutex_exit(&overlay_mux_lock);

        ksocket_close(mux->omux_ksock, kcred);
        avl_destroy(&mux->omux_devices);
        mutex_destroy(&mux->omux_lock);
        kmem_free(mux->omux_addr, mux->omux_alen);
        kmem_free(mux, sizeof (overlay_mux_t));
}

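/*
 * Associate a device with a mux so that the receive path can deliver
 * decapsulated packets for its virtual network id to it.
 */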
void
overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd)
{
        mutex_enter(&mux->omux_lock);
        avl_add(&mux->omux_devices, odd);
        mutex_exit(&mux->omux_lock);
}

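/*
 * Remove a device from a mux; once removed, the receive path will no longer
 * deliver packets to it.
 */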
void
overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd)
{
        mutex_enter(&mux->omux_lock);
        avl_remove(&mux->omux_devices, odd);
        mutex_exit(&mux->omux_lock);
}

int
overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp)
{
        int ret;

        /*
         * It'd be nice to be able to use MSG_MBLK_QUICKRELE; unfortunately,
         * that isn't actually supported by UDP at this time.
         *
         * Send with MSG_DONTWAIT to indicate clogged UDP sockets upstack.
         */
        ret = ksocket_sendmblk(mux->omux_ksock, hdr, MSG_DONTWAIT, &mp, kcred);
        /*
         * NOTE: ksocket_sendmblk() may send partial packets downstack,
         * returning what's not sent in &mp (i.e. mp pre-call might be a
         * b_cont of mp post-call). We can't hold up this message (it's a
         * datagram), so we drop, and let the caller cope.
         */
        if (ret != 0)
                freemsg(mp);

        return (ret);
}