1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2019 Joyent, Inc.
14 */
15
16 /*
17 * Overlay device ksocket multiplexer.
18 *
19 * For more information, see the big theory statement in
20 * uts/common/io/overlay/overlay.c
21 */
22
23 #include <sys/types.h>
24 #include <sys/socket.h>
25 #include <sys/ksynch.h>
26 #include <sys/ksocket.h>
27 #include <sys/avl.h>
28 #include <sys/list.h>
29 #include <sys/pattr.h>
30 #include <sys/sysmacros.h>
31 #include <sys/strsubr.h>
32 #include <sys/strsun.h>
33 #include <sys/tihdr.h>
34
35 #include <sys/overlay_impl.h>
36
37 #include <sys/sdt.h>
38
39 #define OVERLAY_FREEMSG(mp, reason) \
40 DTRACE_PROBE2(overlay__fremsg, mblk_t *, mp, char *, reason)
41
42 static list_t overlay_mux_list;
43 static kmutex_t overlay_mux_lock;
44
45 void
46 overlay_mux_init(void)
47 {
48 list_create(&overlay_mux_list, sizeof (overlay_mux_t),
49 offsetof(overlay_mux_t, omux_lnode));
50 mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL);
51 }
52
53 void
54 overlay_mux_fini(void)
55 {
56 mutex_destroy(&overlay_mux_lock);
57 list_destroy(&overlay_mux_list);
58 }
59
60 static int
61 overlay_mux_comparator(const void *a, const void *b)
62 {
63 const overlay_dev_t *odl, *odr;
64 odl = a;
65 odr = b;
66 if (odl->odd_vid > odr->odd_vid)
67 return (1);
68 else if (odl->odd_vid < odr->odd_vid)
69 return (-1);
70 else
71 return (0);
72 }
73
74 /*
75 * This is the central receive data path. We need to decode the packet, if we
76 * can, and then deliver it to the appropriate overlay.
77 */
78 /* ARGSUSED */
79 static boolean_t
80 overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
81 void *arg)
82 {
83 mblk_t *mp, *nmp, *fmp;
84 overlay_mux_t *mux = arg;
85
86 /*
87 * We may have a received a chain of messages. Each messsage in the
88 * chain will likely have a T_unitdata_ind attached to it as an M_PROTO.
89 * If we aren't getting that, we should probably drop that for the
90 * moment.
91 */
92 for (mp = mpchain; mp != NULL; mp = nmp) {
93 struct T_unitdata_ind *tudi;
94 ovep_encap_info_t infop;
95 overlay_dev_t od, *odd;
96 int ret;
97
98 nmp = mp->b_next;
99 mp->b_next = NULL;
100
101 if (DB_TYPE(mp) != M_PROTO) {
102 OVERLAY_FREEMSG(mp, "first one isn't M_PROTO");
103 freemsg(mp);
104 continue;
105 }
106
107 if (mp->b_cont == NULL) {
108 OVERLAY_FREEMSG(mp, "missing a b_cont");
109 freemsg(mp);
110 continue;
111 }
112
113 tudi = (struct T_unitdata_ind *)mp->b_rptr;
114 if (tudi->PRIM_type != T_UNITDATA_IND) {
115 OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *");
116 freemsg(mp);
117 continue;
118 }
119
120 /*
121 * In the future, we'll care about the source information
122 * for purposes of telling varpd for oob invalidation. But for
123 * now, just drop that block.
124 */
125 fmp = mp;
126 mp = fmp->b_cont;
127 freeb(fmp);
128
129 /*
130 * Until we have VXLAN-or-other-decap HW acceleration support
131 * (e.g. we support NICs that reach into VXLAN-encapsulated
132 * packets and check the inside-VXLAN IP packets' checksums,
133 * or do LSO with VXLAN), we should clear any HW-accelerated-
134 * performed bits.
135 */
136 DB_CKSUMFLAGS(mp) = 0;
137
138 /*
139 * Decap and deliver.
140 */
141 bzero(&infop, sizeof (ovep_encap_info_t));
142 ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop);
143 if (ret != 0) {
144 OVERLAY_FREEMSG(mp, "decap failed");
145 freemsg(mp);
146 continue;
147 }
148 if (MBLKL(mp) > infop.ovdi_hdr_size) {
149 mp->b_rptr += infop.ovdi_hdr_size;
150 } else {
151 while (infop.ovdi_hdr_size != 0) {
152 size_t rem, blkl;
153
154 if (mp == NULL)
155 break;
156
157 blkl = MBLKL(mp);
158 rem = MIN(infop.ovdi_hdr_size, blkl);
159 infop.ovdi_hdr_size -= rem;
160 mp->b_rptr += rem;
161 if (rem == blkl) {
162 fmp = mp;
163 mp = fmp->b_cont;
164 fmp->b_cont = NULL;
165 OVERLAY_FREEMSG(mp,
166 "freed a fmp block");
167 freemsg(fmp);
168 }
169 }
170 if (mp == NULL) {
171 OVERLAY_FREEMSG(mp, "freed it all...");
172 continue;
173 }
174 }
175
176
177 od.odd_vid = infop.ovdi_id;
178 mutex_enter(&mux->omux_lock);
179 odd = avl_find(&mux->omux_devices, &od, NULL);
180 if (odd == NULL) {
181 mutex_exit(&mux->omux_lock);
182 OVERLAY_FREEMSG(mp, "no matching vid");
183 freemsg(mp);
184 continue;
185 }
186 mutex_enter(&odd->odd_lock);
187 if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
188 !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
189 mutex_exit(&odd->odd_lock);
190 mutex_exit(&mux->omux_lock);
191 OVERLAY_FREEMSG(mp, "dev dropped");
192 freemsg(mp);
193 continue;
194 }
195 overlay_io_start(odd, OVERLAY_F_IN_RX);
196 mutex_exit(&odd->odd_lock);
197 mutex_exit(&mux->omux_lock);
198
199 mac_rx(odd->odd_mh, NULL, mp);
200
201 mutex_enter(&odd->odd_lock);
202 overlay_io_done(odd, OVERLAY_F_IN_RX);
203 mutex_exit(&odd->odd_lock);
204 }
205
206 return (B_TRUE);
207 }
208
209 /*
210 * Register a given device with a socket backend. If no such device socket
211 * exists, create a new one.
212 */
213 overlay_mux_t *
214 overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol,
215 struct sockaddr *addr, socklen_t len, int *errp)
216 {
217 int err;
218 overlay_mux_t *mux;
219 ksocket_t ksock;
220
221 if (errp == NULL)
222 errp = &err;
223
224 mutex_enter(&overlay_mux_lock);
225 for (mux = list_head(&overlay_mux_list); mux != NULL;
226 mux = list_next(&overlay_mux_list, mux)) {
227 if (domain == mux->omux_domain &&
228 family == mux->omux_family &&
229 protocol == mux->omux_protocol &&
230 len == mux->omux_alen &&
231 bcmp(addr, mux->omux_addr, len) == 0) {
232
233 if (opp != mux->omux_plugin) {
234 *errp = EEXIST;
235 return (NULL);
236 }
237
238 mutex_enter(&mux->omux_lock);
239 mux->omux_count++;
240 mutex_exit(&mux->omux_lock);
241 mutex_exit(&overlay_mux_lock);
242 *errp = 0;
243 return (mux);
244 }
245 }
246
247 /*
248 * Today we aren't zone-aware and only exist in the global zone. When we
249 * allow for things to exist in the non-global zone, we'll want to use a
250 * credential that's actually specific to the zone.
251 */
252 *errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP,
253 kcred);
254 if (*errp != 0) {
255 mutex_exit(&overlay_mux_lock);
256 return (NULL);
257 }
258
259 *errp = ksocket_bind(ksock, addr, len, kcred);
260 if (*errp != 0) {
261 mutex_exit(&overlay_mux_lock);
262 ksocket_close(ksock, kcred);
263 return (NULL);
264 }
265
266 /*
267 * Ask our lower layer to optionally toggle anything they need on this
268 * socket. Because a socket is owned by a single type of plugin, we can
269 * then ask it to perform any additional socket set up it'd like to do.
270 */
271 if (opp->ovp_ops->ovpo_sockopt != NULL &&
272 (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) {
273 mutex_exit(&overlay_mux_lock);
274 ksocket_close(ksock, kcred);
275 return (NULL);
276 }
277
278 mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP);
279 list_link_init(&mux->omux_lnode);
280 mux->omux_ksock = ksock;
281 mux->omux_plugin = opp;
282 mux->omux_domain = domain;
283 mux->omux_family = family;
284 mux->omux_protocol = protocol;
285 mux->omux_addr = kmem_alloc(len, KM_SLEEP);
286 bcopy(addr, mux->omux_addr, len);
287 mux->omux_alen = len;
288 mux->omux_count = 1;
289 avl_create(&mux->omux_devices, overlay_mux_comparator,
290 sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode));
291 mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL);
292
293
294 /* Once this is called, we need to expect to rx data */
295 *errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux);
296 if (*errp != 0) {
297 ksocket_close(ksock, kcred);
298 mutex_destroy(&mux->omux_lock);
299 avl_destroy(&mux->omux_devices);
300 kmem_free(mux->omux_addr, len);
301 kmem_free(mux, sizeof (overlay_mux_t));
302 return (NULL);
303 }
304
305 list_insert_tail(&overlay_mux_list, mux);
306 mutex_exit(&overlay_mux_lock);
307
308 *errp = 0;
309 return (mux);
310 }
311
312 void
313 overlay_mux_close(overlay_mux_t *mux)
314 {
315 mutex_enter(&overlay_mux_lock);
316 mutex_enter(&mux->omux_lock);
317 mux->omux_count--;
318 if (mux->omux_count != 0) {
319 mutex_exit(&mux->omux_lock);
320 mutex_exit(&overlay_mux_lock);
321 return;
322 }
323 list_remove(&overlay_mux_list, mux);
324 mutex_exit(&mux->omux_lock);
325 mutex_exit(&overlay_mux_lock);
326
327 ksocket_close(mux->omux_ksock, kcred);
328 avl_destroy(&mux->omux_devices);
329 kmem_free(mux->omux_addr, mux->omux_alen);
330 kmem_free(mux, sizeof (overlay_mux_t));
331 }
332
333 void
334 overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd)
335 {
336 mutex_enter(&mux->omux_lock);
337 avl_add(&mux->omux_devices, odd);
338 mutex_exit(&mux->omux_lock);
339 }
340
341 void
342 overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd)
343 {
344 mutex_enter(&mux->omux_lock);
345 avl_remove(&mux->omux_devices, odd);
346 mutex_exit(&mux->omux_lock);
347 }
348
349 int
350 overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp)
351 {
352 int ret;
353
354 /*
355 * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately,
356 * that isn't actually supported by UDP at this time.
357 */
358 ret = ksocket_sendmblk(mux->omux_ksock, hdr, 0, &mp, kcred);
359 if (ret != 0)
360 freemsg(mp);
361
362 return (ret);
363 }