Print this page
OS-7088 cyclics corked on overlay socket with full queue
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/io/overlay/overlay_mux.c
+++ new/usr/src/uts/common/io/overlay/overlay_mux.c
1 1 /*
2 2 * This file and its contents are supplied under the terms of the
3 3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 4 * You may only use this file in accordance with the terms of version
5 5 * 1.0 of the CDDL.
6 6 *
7 7 * A full copy of the text of the CDDL should have accompanied this
8 8 * source. A copy of the CDDL is also available via the Internet at
9 9 * http://www.illumos.org/license/CDDL.
10 10 */
11 11
12 12 /*
13 13 * Copyright 2019 Joyent, Inc.
14 14 */
15 15
16 16 /*
17 17 * Overlay device ksocket multiplexer.
18 18 *
19 19 * For more information, see the big theory statement in
20 20 * uts/common/io/overlay/overlay.c
21 21 */
22 22
23 23 #include <sys/types.h>
24 24 #include <sys/socket.h>
25 25 #include <sys/ksynch.h>
26 26 #include <sys/ksocket.h>
27 27 #include <sys/avl.h>
28 28 #include <sys/list.h>
29 29 #include <sys/pattr.h>
30 30 #include <sys/sysmacros.h>
31 31 #include <sys/strsubr.h>
32 32 #include <sys/strsun.h>
33 33 #include <sys/tihdr.h>
34 34
35 35 #include <sys/overlay_impl.h>
36 36
37 37 #include <sys/sdt.h>
38 38
39 39 #define OVERLAY_FREEMSG(mp, reason) \
40 40 DTRACE_PROBE2(overlay__fremsg, mblk_t *, mp, char *, reason)
41 41
42 42 static list_t overlay_mux_list;
43 43 static kmutex_t overlay_mux_lock;
44 44
45 45 void
46 46 overlay_mux_init(void)
47 47 {
48 48 list_create(&overlay_mux_list, sizeof (overlay_mux_t),
49 49 offsetof(overlay_mux_t, omux_lnode));
50 50 mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL);
51 51 }
52 52
53 53 void
54 54 overlay_mux_fini(void)
55 55 {
56 56 mutex_destroy(&overlay_mux_lock);
57 57 list_destroy(&overlay_mux_list);
58 58 }
59 59
60 60 static int
61 61 overlay_mux_comparator(const void *a, const void *b)
62 62 {
63 63 const overlay_dev_t *odl, *odr;
64 64 odl = a;
65 65 odr = b;
66 66 if (odl->odd_vid > odr->odd_vid)
67 67 return (1);
68 68 else if (odl->odd_vid < odr->odd_vid)
69 69 return (-1);
70 70 else
71 71 return (0);
72 72 }
73 73
74 74 /*
75 75 * This is the central receive data path. We need to decode the packet, if we
76 76 * can, and then deliver it to the appropriate overlay.
77 77 */
78 78 /* ARGSUSED */
79 79 static boolean_t
80 80 overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
81 81 void *arg)
82 82 {
83 83 mblk_t *mp, *nmp, *fmp;
84 84 overlay_mux_t *mux = arg;
85 85
86 86 /*
87 87 * We may have a received a chain of messages. Each messsage in the
88 88 * chain will likely have a T_unitdata_ind attached to it as an M_PROTO.
89 89 * If we aren't getting that, we should probably drop that for the
90 90 * moment.
91 91 */
92 92 for (mp = mpchain; mp != NULL; mp = nmp) {
93 93 struct T_unitdata_ind *tudi;
94 94 ovep_encap_info_t infop;
95 95 overlay_dev_t od, *odd;
96 96 int ret;
97 97
98 98 nmp = mp->b_next;
99 99 mp->b_next = NULL;
100 100
101 101 if (DB_TYPE(mp) != M_PROTO) {
102 102 OVERLAY_FREEMSG(mp, "first one isn't M_PROTO");
103 103 freemsg(mp);
104 104 continue;
105 105 }
106 106
107 107 if (mp->b_cont == NULL) {
108 108 OVERLAY_FREEMSG(mp, "missing a b_cont");
109 109 freemsg(mp);
110 110 continue;
111 111 }
112 112
113 113 tudi = (struct T_unitdata_ind *)mp->b_rptr;
114 114 if (tudi->PRIM_type != T_UNITDATA_IND) {
115 115 OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *");
116 116 freemsg(mp);
117 117 continue;
118 118 }
119 119
120 120 /*
121 121 * In the future, we'll care about the source information
122 122 * for purposes of telling varpd for oob invalidation. But for
123 123 * now, just drop that block.
124 124 */
125 125 fmp = mp;
126 126 mp = fmp->b_cont;
127 127 freeb(fmp);
128 128
129 129 /*
130 130 * Until we have VXLAN-or-other-decap HW acceleration support
131 131 * (e.g. we support NICs that reach into VXLAN-encapsulated
132 132 * packets and check the inside-VXLAN IP packets' checksums,
133 133 * or do LSO with VXLAN), we should clear any HW-accelerated-
134 134 * performed bits.
135 135 */
136 136 DB_CKSUMFLAGS(mp) = 0;
137 137
138 138 /*
139 139 * Decap and deliver.
140 140 */
141 141 bzero(&infop, sizeof (ovep_encap_info_t));
142 142 ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop);
143 143 if (ret != 0) {
144 144 OVERLAY_FREEMSG(mp, "decap failed");
145 145 freemsg(mp);
146 146 continue;
147 147 }
148 148 if (MBLKL(mp) > infop.ovdi_hdr_size) {
149 149 mp->b_rptr += infop.ovdi_hdr_size;
150 150 } else {
151 151 while (infop.ovdi_hdr_size != 0) {
152 152 size_t rem, blkl;
153 153
154 154 if (mp == NULL)
155 155 break;
156 156
157 157 blkl = MBLKL(mp);
158 158 rem = MIN(infop.ovdi_hdr_size, blkl);
159 159 infop.ovdi_hdr_size -= rem;
160 160 mp->b_rptr += rem;
161 161 if (rem == blkl) {
162 162 fmp = mp;
163 163 mp = fmp->b_cont;
164 164 fmp->b_cont = NULL;
165 165 OVERLAY_FREEMSG(mp,
166 166 "freed a fmp block");
167 167 freemsg(fmp);
168 168 }
169 169 }
170 170 if (mp == NULL) {
171 171 OVERLAY_FREEMSG(mp, "freed it all...");
172 172 continue;
173 173 }
174 174 }
175 175
176 176
177 177 od.odd_vid = infop.ovdi_id;
178 178 mutex_enter(&mux->omux_lock);
179 179 odd = avl_find(&mux->omux_devices, &od, NULL);
180 180 if (odd == NULL) {
181 181 mutex_exit(&mux->omux_lock);
182 182 OVERLAY_FREEMSG(mp, "no matching vid");
183 183 freemsg(mp);
184 184 continue;
185 185 }
186 186 mutex_enter(&odd->odd_lock);
187 187 if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
188 188 !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
189 189 mutex_exit(&odd->odd_lock);
190 190 mutex_exit(&mux->omux_lock);
191 191 OVERLAY_FREEMSG(mp, "dev dropped");
192 192 freemsg(mp);
193 193 continue;
194 194 }
195 195 overlay_io_start(odd, OVERLAY_F_IN_RX);
196 196 mutex_exit(&odd->odd_lock);
197 197 mutex_exit(&mux->omux_lock);
198 198
199 199 mac_rx(odd->odd_mh, NULL, mp);
|
↓ open down ↓ |
199 lines elided |
↑ open up ↑ |
200 200
201 201 mutex_enter(&odd->odd_lock);
202 202 overlay_io_done(odd, OVERLAY_F_IN_RX);
203 203 mutex_exit(&odd->odd_lock);
204 204 }
205 205
206 206 return (B_TRUE);
207 207 }
208 208
209 209 /*
210 + * Kernel socket callback to indicate the socket itself is able to send
211 + * data again. Check for devices on this mux that were send-blocked,
212 + * and clear them.
213 + */
214 +/* ARGSUSED */
215 +static void
216 +overlay_mux_cansend_now(ksocket_t ksock, ksocket_callback_event_t event,
217 + void *arg, uintptr_t ignore_me)
218 +{
219 + overlay_mux_t *mux = (overlay_mux_t *)arg;
220 + overlay_dev_t *odd;
221 + mac_handle_t *mhs_to_update, *current_mh;
222 + size_t allocsize;
223 +
224 + ASSERT3P(ksock, ==, mux->omux_ksock);
225 + ASSERT3U(event, ==, KSOCKET_EV_CANSEND);
226 +
227 + /* Traverse omux_devices and check for ones marked as send-blocked. */
228 + mutex_enter(&mux->omux_lock);
229 + if (mux->omux_count == 0) {
230 + /* Nothing to wake up. */
231 + mutex_exit(&mux->omux_lock);
232 + return;
233 + }
234 + allocsize = sizeof (mac_handle_t) * mux->omux_count;
235 + mhs_to_update = kmem_zalloc(allocsize, KM_NOSLEEP);
236 + VERIFY(mhs_to_update != NULL); /* Failure should be rare. */
237 + current_mh = mhs_to_update;
238 +
239 + for (odd = avl_first(&mux->omux_devices); odd != NULL;
240 + odd = AVL_NEXT(&mux->omux_devices, odd)) {
241 + mac_handle_t odd_mh = NULL;
242 +
243 + mutex_enter(&odd->odd_lock);
244 + if ((odd->odd_flags & OVERLAY_F_TXSTOPPED) != 0) {
245 + /* Get ready to tell MAC it can transmit again. */
246 + odd->odd_flags &= ~OVERLAY_F_TXSTOPPED;
247 + odd_mh = odd->odd_mh;
248 + }
249 + mutex_exit(&odd->odd_lock);
250 + if (odd_mh != NULL) {
251 + *current_mh = odd_mh;
252 + current_mh++;
253 + }
254 + }
255 + mutex_exit(&mux->omux_lock);
256 +
257 + /*
258 + * Yes, I'm using the value-then-decrement. "current_mh" is
259 + * guaranteed to be at least one ahead of mhs_to_update if there are
260 + * any mac handles that need updating. I also have to do this outside
261 + * the omux lock because the tx_update may trigger immediate or
262 + * concurrent packet transmission.
263 + */
264 + while (current_mh-- != mhs_to_update)
265 + mac_tx_update(*current_mh);
266 +
267 + kmem_free(mhs_to_update, allocsize);
268 +}
269 +
270 +/*
210 271 * Register a given device with a socket backend. If no such device socket
211 272 * exists, create a new one.
212 273 */
213 274 overlay_mux_t *
214 275 overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol,
215 276 struct sockaddr *addr, socklen_t len, int *errp)
216 277 {
217 278 int err;
218 279 overlay_mux_t *mux;
219 280 ksocket_t ksock;
281 + ksocket_callbacks_t ks_cb = { 0 };
220 282
221 283 if (errp == NULL)
222 284 errp = &err;
223 285
224 286 mutex_enter(&overlay_mux_lock);
225 287 for (mux = list_head(&overlay_mux_list); mux != NULL;
226 288 mux = list_next(&overlay_mux_list, mux)) {
227 289 if (domain == mux->omux_domain &&
228 290 family == mux->omux_family &&
229 291 protocol == mux->omux_protocol &&
230 292 len == mux->omux_alen &&
231 293 bcmp(addr, mux->omux_addr, len) == 0) {
232 294
233 295 if (opp != mux->omux_plugin) {
234 296 *errp = EEXIST;
235 297 return (NULL);
236 298 }
237 299
238 300 mutex_enter(&mux->omux_lock);
239 301 mux->omux_count++;
240 302 mutex_exit(&mux->omux_lock);
241 303 mutex_exit(&overlay_mux_lock);
242 304 *errp = 0;
243 305 return (mux);
244 306 }
245 307 }
246 308
247 309 /*
248 310 * Today we aren't zone-aware and only exist in the global zone. When we
249 311 * allow for things to exist in the non-global zone, we'll want to use a
250 312 * credential that's actually specific to the zone.
251 313 */
252 314 *errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP,
253 315 kcred);
254 316 if (*errp != 0) {
255 317 mutex_exit(&overlay_mux_lock);
256 318 return (NULL);
257 319 }
258 320
259 321 *errp = ksocket_bind(ksock, addr, len, kcred);
260 322 if (*errp != 0) {
261 323 mutex_exit(&overlay_mux_lock);
262 324 ksocket_close(ksock, kcred);
263 325 return (NULL);
264 326 }
265 327
266 328 /*
267 329 * Ask our lower layer to optionally toggle anything they need on this
268 330 * socket. Because a socket is owned by a single type of plugin, we can
269 331 * then ask it to perform any additional socket set up it'd like to do.
270 332 */
271 333 if (opp->ovp_ops->ovpo_sockopt != NULL &&
272 334 (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) {
273 335 mutex_exit(&overlay_mux_lock);
274 336 ksocket_close(ksock, kcred);
275 337 return (NULL);
276 338 }
277 339
278 340 mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP);
279 341 list_link_init(&mux->omux_lnode);
280 342 mux->omux_ksock = ksock;
281 343 mux->omux_plugin = opp;
282 344 mux->omux_domain = domain;
|
↓ open down ↓ |
53 lines elided |
↑ open up ↑ |
283 345 mux->omux_family = family;
284 346 mux->omux_protocol = protocol;
285 347 mux->omux_addr = kmem_alloc(len, KM_SLEEP);
286 348 bcopy(addr, mux->omux_addr, len);
287 349 mux->omux_alen = len;
288 350 mux->omux_count = 1;
289 351 avl_create(&mux->omux_devices, overlay_mux_comparator,
290 352 sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode));
291 353 mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL);
292 354
355 +#if defined(OVERLAY_PINCH) || defined(OVERLAY_FC_TEST)
356 + /* Set the xmit buf to a REALLY SMALL value, say 12k (1-3 packets) */
357 + int bufsize = 12 * 1024;
293 358
359 + if (ksocket_setsockopt(ksock, SOL_SOCKET, SO_SNDBUF,
360 + (const void *)&bufsize, sizeof (bufsize), CRED()) != 0) {
361 + ksocket_close(ksock, kcred);
362 + mutex_destroy(&mux->omux_lock);
363 + avl_destroy(&mux->omux_devices);
364 + kmem_free(mux->omux_addr, len);
365 + kmem_free(mux, sizeof (overlay_mux_t));
366 + return (NULL);
367 + }
368 +#endif
369 + /*
370 + * Set a callback in case we hit socket flow control and need to know
371 + * when it's ready to send again. See the aforementioned
372 + * ksocket_socket() comments about the use of kcred vs. being
373 + * zone-aware.
374 + */
375 + ks_cb.ksock_cb_cansend = overlay_mux_cansend_now;
376 + if (ksocket_setcallbacks(ksock, &ks_cb, mux, kcred) != 0) {
377 + ksocket_close(ksock, kcred);
378 + mutex_destroy(&mux->omux_lock);
379 + avl_destroy(&mux->omux_devices);
380 + kmem_free(mux->omux_addr, len);
381 + kmem_free(mux, sizeof (overlay_mux_t));
382 + return (NULL);
383 + }
384 +
294 385 /* Once this is called, we need to expect to rx data */
295 386 *errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux);
296 387 if (*errp != 0) {
297 388 ksocket_close(ksock, kcred);
298 389 mutex_destroy(&mux->omux_lock);
299 390 avl_destroy(&mux->omux_devices);
300 391 kmem_free(mux->omux_addr, len);
301 392 kmem_free(mux, sizeof (overlay_mux_t));
302 393 return (NULL);
303 394 }
304 395
305 396 list_insert_tail(&overlay_mux_list, mux);
306 397 mutex_exit(&overlay_mux_lock);
307 398
308 399 *errp = 0;
309 400 return (mux);
310 401 }
311 402
312 403 void
313 404 overlay_mux_close(overlay_mux_t *mux)
314 405 {
315 406 mutex_enter(&overlay_mux_lock);
316 407 mutex_enter(&mux->omux_lock);
317 408 mux->omux_count--;
318 409 if (mux->omux_count != 0) {
319 410 mutex_exit(&mux->omux_lock);
320 411 mutex_exit(&overlay_mux_lock);
321 412 return;
322 413 }
323 414 list_remove(&overlay_mux_list, mux);
324 415 mutex_exit(&mux->omux_lock);
325 416 mutex_exit(&overlay_mux_lock);
326 417
327 418 ksocket_close(mux->omux_ksock, kcred);
328 419 avl_destroy(&mux->omux_devices);
329 420 kmem_free(mux->omux_addr, mux->omux_alen);
330 421 kmem_free(mux, sizeof (overlay_mux_t));
331 422 }
332 423
333 424 void
334 425 overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd)
335 426 {
336 427 mutex_enter(&mux->omux_lock);
337 428 avl_add(&mux->omux_devices, odd);
338 429 mutex_exit(&mux->omux_lock);
339 430 }
340 431
341 432 void
342 433 overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd)
343 434 {
344 435 mutex_enter(&mux->omux_lock);
345 436 avl_remove(&mux->omux_devices, odd);
346 437 mutex_exit(&mux->omux_lock);
|
↓ open down ↓ |
43 lines elided |
↑ open up ↑ |
347 438 }
348 439
349 440 int
350 441 overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp)
351 442 {
352 443 int ret;
353 444
354 445 /*
355 446 * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately,
356 447 * that isn't actually supported by UDP at this time.
448 + *
449 + * Send with MSG_DONTWAIT to indicate clogged UDP sockets upstack.
357 450 */
358 - ret = ksocket_sendmblk(mux->omux_ksock, hdr, 0, &mp, kcred);
451 + ret = ksocket_sendmblk(mux->omux_ksock, hdr, MSG_DONTWAIT, &mp, kcred);
452 + /*
453 + * NOTE: ksocket_sendmblk() may send partial packets downstack,
454 + * returning what's not sent in &mp (i.e. mp pre-call might be a
455 + * b_cont of mp post-call). We can't hold up this message (it's a
456 + * datagram), so we drop, and let the caller cope.
457 + */
359 458 if (ret != 0)
360 459 freemsg(mp);
361 460
362 461 return (ret);
363 462 }
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX