1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2015 Joyent, Inc.
14 */
15
16 /*
17 * Overlay device ksocket multiplexer.
18 *
19 * For more information, see the big theory statement in
20 * uts/common/io/overlay/overlay.c
21 */
22
23 #include <sys/types.h>
24 #include <sys/socket.h>
25 #include <sys/ksynch.h>
26 #include <sys/ksocket.h>
27 #include <sys/avl.h>
28 #include <sys/list.h>
29 #include <sys/sysmacros.h>
30 #include <sys/strsubr.h>
31 #include <sys/strsun.h>
32 #include <sys/tihdr.h>
33
34 #include <sys/overlay_impl.h>
35
36 #include <sys/sdt.h>
37
/*
 * Fire an SDT probe noting that the message 'msg' is being dropped, together
 * with a short human-readable string saying why. This only fires the probe;
 * callers remain responsible for actually freeing the message.
 */
#define	OVERLAY_FREEMSG(msg, why)	\
	DTRACE_PROBE2(overlay__fremsg, mblk_t *, msg, char *, why)
40
41 static list_t overlay_mux_list;
42 static kmutex_t overlay_mux_lock;
43
44 void
45 overlay_mux_init(void)
46 {
47 list_create(&overlay_mux_list, sizeof (overlay_mux_t),
48 offsetof(overlay_mux_t, omux_lnode));
49 mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL);
50 }
51
52 void
53 overlay_mux_fini(void)
54 {
55 mutex_destroy(&overlay_mux_lock);
56 list_destroy(&overlay_mux_list);
57 }
58
59 static int
60 overlay_mux_comparator(const void *a, const void *b)
61 {
62 const overlay_dev_t *odl, *odr;
63 odl = a;
64 odr = b;
65 if (odl->odd_vid > odr->odd_vid)
66 return (1);
67 else if (odl->odd_vid < odr->odd_vid)
68 return (-1);
69 else
70 return (0);
71 }
72
73 /*
74 * This is the central receive data path. We need to decode the packet, if we
75 * can, and then deliver it to the appropriate overlay.
76 */
77 /* ARGSUSED */
78 static boolean_t
79 overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
80 void *arg)
81 {
82 mblk_t *mp, *nmp, *fmp;
83 overlay_mux_t *mux = arg;
84
85 /*
86 * We may have a received a chain of messages. Each messsage in the
87 * chain will likely have a T_unitdata_ind attached to it as an M_PROTO.
88 * If we aren't getting that, we should probably drop that for the
89 * moment.
90 */
91 for (mp = mpchain; mp != NULL; mp = nmp) {
92 struct T_unitdata_ind *tudi;
93 ovep_encap_info_t infop;
94 overlay_dev_t od, *odd;
95 int ret;
96
97 nmp = mp->b_next;
98 mp->b_next = NULL;
99
100 if (DB_TYPE(mp) != M_PROTO) {
101 OVERLAY_FREEMSG(mp, "first one isn't M_PROTO");
102 freemsg(mp);
103 continue;
104 }
105
106 if (mp->b_cont == NULL) {
107 OVERLAY_FREEMSG(mp, "missing a b_cont");
108 freemsg(mp);
109 continue;
110 }
111
112 tudi = (struct T_unitdata_ind *)mp->b_rptr;
113 if (tudi->PRIM_type != T_UNITDATA_IND) {
114 OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *");
115 freemsg(mp);
116 continue;
117 }
118
119 /*
120 * In the future, we'll care about the source information
121 * for purposes of telling varpd for oob invalidation. But for
122 * now, just drop that block.
123 */
124 fmp = mp;
125 mp = fmp->b_cont;
126 fmp->b_cont = NULL;
127 freemsg(fmp);
128
129 /*
130 * Decap and deliver.
131 */
132 bzero(&infop, sizeof (ovep_encap_info_t));
133 ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop);
134 if (ret != 0) {
135 OVERLAY_FREEMSG(mp, "decap failed");
136 freemsg(mp);
137 continue;
138 }
139 if (MBLKL(mp) > infop.ovdi_hdr_size) {
140 mp->b_rptr += infop.ovdi_hdr_size;
141 } else {
142 while (infop.ovdi_hdr_size != 0) {
143 size_t rem, blkl;
144
145 if (mp == NULL)
146 break;
147
148 blkl = MBLKL(mp);
149 rem = MIN(infop.ovdi_hdr_size, blkl);
150 infop.ovdi_hdr_size -= rem;
151 mp->b_rptr += rem;
152 if (rem == blkl) {
153 fmp = mp;
154 mp = fmp->b_cont;
155 fmp->b_cont = NULL;
156 OVERLAY_FREEMSG(mp,
157 "freed a fmp block");
158 freemsg(fmp);
159 }
160 }
161 if (mp == NULL) {
162 OVERLAY_FREEMSG(mp, "freed it all...");
163 continue;
164 }
165 }
166
167
168 od.odd_vid = infop.ovdi_id;
169 mutex_enter(&mux->omux_lock);
170 odd = avl_find(&mux->omux_devices, &od, NULL);
171 if (odd == NULL) {
172 mutex_exit(&mux->omux_lock);
173 OVERLAY_FREEMSG(mp, "no matching vid");
174 freemsg(mp);
175 continue;
176 }
177 mutex_enter(&odd->odd_lock);
178 if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
179 !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
180 mutex_exit(&odd->odd_lock);
181 mutex_exit(&mux->omux_lock);
182 OVERLAY_FREEMSG(mp, "dev dropped");
183 freemsg(mp);
184 continue;
185 }
186 overlay_io_start(odd, OVERLAY_F_IN_RX);
187 mutex_exit(&odd->odd_lock);
188 mutex_exit(&mux->omux_lock);
189
190 mac_rx(odd->odd_mh, NULL, mp);
191
192 mutex_enter(&odd->odd_lock);
193 overlay_io_done(odd, OVERLAY_F_IN_RX);
194 mutex_exit(&odd->odd_lock);
195 }
196
197 return (B_TRUE);
198 }
199
200 /*
201 * Register a given device with a socket backend. If no such device socket
202 * exists, create a new one.
203 */
204 overlay_mux_t *
205 overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol,
206 struct sockaddr *addr, socklen_t len, int *errp)
207 {
208 int err;
209 overlay_mux_t *mux;
210 ksocket_t ksock;
211
212 if (errp == NULL)
213 errp = &err;
214
215 mutex_enter(&overlay_mux_lock);
216 for (mux = list_head(&overlay_mux_list); mux != NULL;
217 mux = list_next(&overlay_mux_list, mux)) {
218 if (domain == mux->omux_domain &&
219 family == mux->omux_family &&
220 protocol == mux->omux_protocol &&
221 len == mux->omux_alen &&
222 bcmp(addr, mux->omux_addr, len) == 0) {
223
224 if (opp != mux->omux_plugin) {
225 *errp = EEXIST;
226 return (NULL);
227 }
228
229 mutex_enter(&mux->omux_lock);
230 mux->omux_count++;
231 mutex_exit(&mux->omux_lock);
232 mutex_exit(&overlay_mux_lock);
233 *errp = 0;
234 return (mux);
235 }
236 }
237
238 /*
239 * Today we aren't zone-aware and only exist in the global zone. When we
240 * allow for things to exist in the non-global zone, we'll want to use a
241 * credential that's actually specific to the zone.
242 */
243 *errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP,
244 kcred);
245 if (*errp != 0) {
246 mutex_exit(&overlay_mux_lock);
247 return (NULL);
248 }
249
250 *errp = ksocket_bind(ksock, addr, len, kcred);
251 if (*errp != 0) {
252 mutex_exit(&overlay_mux_lock);
253 ksocket_close(ksock, kcred);
254 return (NULL);
255 }
256
257 /*
258 * Ask our lower layer to optionally toggle anything they need on this
259 * socket. Because a socket is owned by a single type of plugin, we can
260 * then ask it to perform any additional socket set up it'd like to do.
261 */
262 if (opp->ovp_ops->ovpo_sockopt != NULL &&
263 (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) {
264 mutex_exit(&overlay_mux_lock);
265 ksocket_close(ksock, kcred);
266 return (NULL);
267 }
268
269 mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP);
270 list_link_init(&mux->omux_lnode);
271 mux->omux_ksock = ksock;
272 mux->omux_plugin = opp;
273 mux->omux_domain = domain;
274 mux->omux_family = family;
275 mux->omux_protocol = protocol;
276 mux->omux_addr = kmem_alloc(len, KM_SLEEP);
277 bcopy(addr, mux->omux_addr, len);
278 mux->omux_alen = len;
279 mux->omux_count = 1;
280 avl_create(&mux->omux_devices, overlay_mux_comparator,
281 sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode));
282 mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL);
283
284
285 /* Once this is called, we need to expect to rx data */
286 *errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux);
287 if (*errp != 0) {
288 ksocket_close(ksock, kcred);
289 mutex_destroy(&mux->omux_lock);
290 avl_destroy(&mux->omux_devices);
291 kmem_free(mux->omux_addr, len);
292 kmem_free(mux, sizeof (overlay_mux_t));
293 return (NULL);
294 }
295
296 list_insert_tail(&overlay_mux_list, mux);
297 mutex_exit(&overlay_mux_lock);
298
299 *errp = 0;
300 return (mux);
301 }
302
303 void
304 overlay_mux_close(overlay_mux_t *mux)
305 {
306 mutex_enter(&overlay_mux_lock);
307 mutex_enter(&mux->omux_lock);
308 mux->omux_count--;
309 if (mux->omux_count != 0) {
310 mutex_exit(&mux->omux_lock);
311 mutex_exit(&overlay_mux_lock);
312 return;
313 }
314 list_remove(&overlay_mux_list, mux);
315 mutex_exit(&mux->omux_lock);
316 mutex_exit(&overlay_mux_lock);
317
318 ksocket_close(mux->omux_ksock, kcred);
319 avl_destroy(&mux->omux_devices);
320 kmem_free(mux->omux_addr, mux->omux_alen);
321 kmem_free(mux, sizeof (overlay_mux_t));
322 }
323
324 void
325 overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd)
326 {
327 mutex_enter(&mux->omux_lock);
328 avl_add(&mux->omux_devices, odd);
329 mutex_exit(&mux->omux_lock);
330 }
331
332 void
333 overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd)
334 {
335 mutex_enter(&mux->omux_lock);
336 avl_remove(&mux->omux_devices, odd);
337 mutex_exit(&mux->omux_lock);
338 }
339
340 int
341 overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp)
342 {
343 int ret;
344
345 /*
346 * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately,
347 * that isn't actually supported by UDP at this time.
348 */
349 ret = ksocket_sendmblk(mux->omux_ksock, hdr, 0, &mp, kcred);
350 if (ret != 0)
351 freemsg(mp);
352
353 return (ret);
354 }