16884 viona TSO should better handle csum offloads
--- old/usr/src/uts/intel/io/viona/viona_tx.c
+++ new/usr/src/uts/intel/io/viona/viona_tx.c
1 1 /*
2 2 * Copyright (c) 2013 Chris Torek <torek @ torek net>
3 3 * All rights reserved.
4 4 *
5 5 * Redistribution and use in source and binary forms, with or without
6 6 * modification, are permitted provided that the following conditions
7 7 * are met:
8 8 * 1. Redistributions of source code must retain the above copyright
9 9 * notice, this list of conditions and the following disclaimer.
10 10 * 2. Redistributions in binary form must reproduce the above copyright
11 11 * notice, this list of conditions and the following disclaimer in the
12 12 * documentation and/or other materials provided with the distribution.
13 13 *
14 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 24 * SUCH DAMAGE.
25 25 */
26 26 /*
27 27 * This file and its contents are supplied under the terms of the
28 28 * Common Development and Distribution License ("CDDL"), version 1.0.
29 29 * You may only use this file in accordance with the terms of version
30 30 * 1.0 of the CDDL.
31 31 *
32 32 * A full copy of the text of the CDDL should have accompanied this
33 33 * source. A copy of the CDDL is also available via the Internet at
34 34 * http://www.illumos.org/license/CDDL.
35 35 *
36 36 * Copyright 2015 Pluribus Networks Inc.
37 37 * Copyright 2019 Joyent, Inc.
38 38 * Copyright 2024 Oxide Computer Company
39 + * Copyright 2024 MNX Cloud, Inc.
39 40 */
40 41
41 42
42 43 #include <sys/types.h>
43 44 #include <sys/smt.h>
44 45 #include <sys/strsubr.h>
45 46
46 47 #include <sys/pattr.h>
47 48 #include <sys/dlpi.h>
48 49 #include <inet/ip.h>
49 50 #include <inet/ip_impl.h>
50 51
51 52 #include "viona_impl.h"
52 53
53 54 #define BNXE_NIC_DRIVER "bnxe"
54 55
55 56 /*
56 57 * Tunable controls tx copy by default on or off
57 58 */
58 59 boolean_t viona_default_tx_copy = B_TRUE;
59 60
60 61 /*
61 62 * copy tx mbufs from virtio ring to avoid necessitating a wait for packet
62 63 * transmission to free resources.
63 64 */
64 65 kmutex_t viona_force_copy_lock;
65 66 static enum viona_force_copy {
66 67 VFC_UNINITALIZED = 0,
67 68 VFC_COPY_UNEEDED = 1,
68 69 VFC_COPY_REQUIRED = 2,
69 70 } viona_force_copy_state = VFC_UNINITALIZED;
70 71
71 72 struct viona_desb {
72 73 frtn_t d_frtn;
73 74 viona_vring_t *d_ring;
74 75 uint_t d_ref;
75 76 uint32_t d_len;
76 77 uint16_t d_cookie;
77 78 uchar_t *d_headers;
78 79 vmm_page_t *d_pages;
79 80 };
80 81
81 82 static void viona_tx(viona_link_t *, viona_vring_t *);
82 83 static void viona_desb_release(viona_desb_t *);
83 84
84 85
85 86 static void
86 87 viona_tx_wait_outstanding(viona_vring_t *ring)
87 88 {
88 89 ASSERT(MUTEX_HELD(&ring->vr_lock));
89 90
90 91 while (ring->vr_xfer_outstanding != 0) {
91 92 /*
92 93 * Paying heed to signals is counterproductive here. This is a
93 94 * very tight loop if pending transfers take an extended amount
94 95 * of time to be reclaimed while the host process is exiting.
95 96 */
96 97 cv_wait(&ring->vr_cv, &ring->vr_lock);
97 98 }
98 99 }
99 100
100 101 /*
101 102 * Check if full TX packet copying is needed. This should not be called from
102 103 * viona attach()/detach() context.
103 104 */
104 105 static boolean_t
105 106 viona_tx_copy_needed(void)
106 107 {
107 108 boolean_t result;
108 109
109 110 if (viona_default_tx_copy) {
110 111 return (B_TRUE);
111 112 }
112 113
113 114 mutex_enter(&viona_force_copy_lock);
114 115 if (viona_force_copy_state == VFC_UNINITALIZED) {
115 116 major_t bnxe_major;
116 117
117 118 /*
118 119 * The original code for viona featured an explicit check for
119 120 * the bnxe driver which, when found present, necessitated that
120 121 * all transmissions be copied into their own mblks instead of
121 122 * passing guest memory to the underlying device.
122 123 *
123 124 * The motivations for this are unclear, but until it can be
124 125 * proven unnecessary, the check lives on.
125 126 */
126 127 viona_force_copy_state = VFC_COPY_UNEEDED;
127 128 if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER))
128 129 != DDI_MAJOR_T_NONE) {
129 130 if (ddi_hold_installed_driver(bnxe_major) != NULL) {
130 131 viona_force_copy_state = VFC_COPY_REQUIRED;
131 132 ddi_rele_driver(bnxe_major);
132 133 }
133 134 }
134 135 }
135 136 result = (viona_force_copy_state == VFC_COPY_REQUIRED);
136 137 mutex_exit(&viona_force_copy_lock);
137 138
138 139 return (result);
139 140 }
140 141
141 142 void
142 143 viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz)
143 144 {
144 145 /* Allocate desb handles for TX ring if packet copying is disabled */
145 146 if (!viona_tx_copy_needed()) {
146 147 viona_desb_t *dp;
147 148
148 149 dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP);
149 150 ring->vr_txdesb = dp;
150 151 for (uint_t i = 0; i < qsz; i++, dp++) {
151 152 dp->d_frtn.free_func = viona_desb_release;
152 153 dp->d_frtn.free_arg = (void *)dp;
153 154 dp->d_ring = ring;
154 155 dp->d_headers = kmem_zalloc(VIONA_MAX_HDRS_LEN,
155 156 KM_SLEEP);
156 157 }
157 158 }
158 159
159 160 /* Allocate ring-sized iovec buffers for TX */
160 161 ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP);
161 162 }
162 163
163 164 void
164 165 viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz)
165 166 {
166 167 if (ring->vr_txdesb != NULL) {
167 168 viona_desb_t *dp = ring->vr_txdesb;
168 169
169 170 for (uint_t i = 0; i < qsz; i++, dp++) {
170 171 kmem_free(dp->d_headers, VIONA_MAX_HDRS_LEN);
171 172 }
172 173 kmem_free(ring->vr_txdesb, sizeof (viona_desb_t) * qsz);
173 174 ring->vr_txdesb = NULL;
174 175 }
175 176
176 177 if (ring->vr_txiov != NULL) {
177 178 kmem_free(ring->vr_txiov, sizeof (struct iovec) * qsz);
178 179 ring->vr_txiov = NULL;
179 180 }
180 181 }
181 182
182 183 static void
183 184 viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie)
184 185 {
185 186 vq_pushchain(ring, len, cookie);
186 187
187 188 membar_enter();
188 189 viona_intr_ring(ring, B_FALSE);
189 190 }
190 191
191 192 #define TX_BURST_THRESH 32
192 193
193 194 void
194 195 viona_worker_tx(viona_vring_t *ring, viona_link_t *link)
195 196 {
196 197 (void) thread_vsetname(curthread, "viona_tx_%p", ring);
197 198
198 199 ASSERT(MUTEX_HELD(&ring->vr_lock));
199 200 ASSERT3U(ring->vr_state, ==, VRS_RUN);
200 201
201 202 mutex_exit(&ring->vr_lock);
202 203
203 204 for (;;) {
204 205 uint_t ntx = 0, burst = 0;
205 206
206 207 viona_ring_disable_notify(ring);
207 208 while (viona_ring_num_avail(ring) != 0) {
208 209 viona_tx(link, ring);
209 210 ntx++;
210 211 burst++;
211 212
212 213 /*
213 214 * It is advantageous for throughput to keep this
214 215 * transmission loop tight, but periodic breaks to
215 216 * check for other events are of value too.
216 217 */
217 218 if (burst >= TX_BURST_THRESH) {
218 219 mutex_enter(&ring->vr_lock);
219 220 const bool need_bail = vring_need_bail(ring);
220 221 mutex_exit(&ring->vr_lock);
221 222
222 223 if (need_bail) {
223 224 break;
224 225 }
225 226 burst = 0;
226 227 }
227 228 }
228 229
229 230 VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx);
230 231
231 232 /*
232 233 * Check for available descriptors on the ring once more in
233 234 * case a late addition raced with the NO_NOTIFY flag toggle.
234 235 *
235 236 * The barrier ensures that visibility of the no-notify
236 237 * store does not cross the viona_ring_num_avail() check below.
237 238 */
238 239 viona_ring_enable_notify(ring);
239 240 membar_enter();
240 241
241 242 if (viona_ring_num_avail(ring) == 0 &&
242 243 (link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) {
243 244 /*
244 245 * The NOTIFY_ON_EMPTY interrupt should not pay heed to
245 246 * the presence of AVAIL_NO_INTERRUPT.
246 247 */
247 248 viona_intr_ring(ring, B_TRUE);
248 249 }
249 250
250 251 mutex_enter(&ring->vr_lock);
251 252 for (;;) {
252 253 if (vring_need_bail(ring)) {
253 254 ring->vr_state = VRS_STOP;
254 255 viona_tx_wait_outstanding(ring);
255 256 return;
256 257 }
257 258
258 259 if (vmm_drv_lease_expired(ring->vr_lease)) {
259 260 ring->vr_state_flags |= VRSF_RENEW;
260 261 /*
261 262 * When renewing the lease for the ring, no TX
262 263 * frames may be outstanding, as they contain
263 264 * references to guest memory.
264 265 */
265 266 viona_tx_wait_outstanding(ring);
266 267
267 268 const boolean_t renewed =
268 269 viona_ring_lease_renew(ring);
269 270 ring->vr_state_flags &= ~VRSF_RENEW;
270 271
271 272 if (!renewed) {
272 273 /* stop ring on failed renewal */
273 274 ring->vr_state = VRS_STOP;
274 275 return;
275 276 }
276 277 }
277 278
278 279 if (viona_ring_num_avail(ring) != 0) {
279 280 break;
280 281 }
281 282
282 283 /* Wait for further activity on the ring */
283 284 (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
284 285 }
285 286 mutex_exit(&ring->vr_lock);
286 287 }
287 288 /* UNREACHABLE */
288 289 }
289 290
290 291 static void
291 292 viona_desb_release(viona_desb_t *dp)
292 293 {
293 294 viona_vring_t *ring = dp->d_ring;
294 295 uint_t ref;
295 296 uint32_t len;
296 297 uint16_t cookie;
297 298
298 299 ref = atomic_dec_uint_nv(&dp->d_ref);
299 300 if (ref > 1) {
300 301 return;
301 302 }
302 303
303 304 /*
304 305 * The desb corresponding to this index must be ready for reuse before
305 306 * the descriptor is returned to the guest via the 'used' ring.
306 307 */
307 308 len = dp->d_len;
308 309 cookie = dp->d_cookie;
309 310 dp->d_len = 0;
310 311 dp->d_cookie = 0;
311 312 vmm_drv_page_release_chain(dp->d_pages);
312 313 dp->d_pages = NULL;
313 314
314 315 /*
315 316 * Ensure all other changes to the desb are visible prior to zeroing its
316 317 * refcount, signifying its readiness for reuse.
317 318 */
318 319 membar_exit();
319 320 dp->d_ref = 0;
320 321
321 322 viona_tx_done(ring, len, cookie);
322 323
323 324 mutex_enter(&ring->vr_lock);
324 325 if ((--ring->vr_xfer_outstanding) == 0) {
325 326 cv_broadcast(&ring->vr_cv);
326 327 }
327 328 mutex_exit(&ring->vr_lock);
328 329 }
329 330
330 331 static boolean_t
331 332 viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr,
332 333 mblk_t *mp, uint32_t len)
333 334 {
334 335 viona_link_t *link = ring->vr_link;
335 336 const struct ether_header *eth;
336 337 uint_t eth_len = sizeof (struct ether_header);
337 338 ushort_t ftype;
338 339 ipha_t *ipha = NULL;
339 340 uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */
340 341 uint16_t flags = 0;
341 342 const uint_t csum_start = hdr->vrh_csum_start;
342 343 const uint_t csum_stuff = hdr->vrh_csum_offset + csum_start;
343 344
344 345 /*
345 346 * Validate that the checksum offsets provided by the guest are within
346 347 * the bounds of the packet. Additionally, ensure that the checksum
347 348 * contents field is within the headers mblk copied by viona_tx().
348 349 */
349 350 if (csum_start >= len || csum_start < eth_len || csum_stuff >= len ||
350 351 (csum_stuff + sizeof (uint16_t)) > MBLKL(mp)) {
351 352 VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
352 353 VIONA_RING_STAT_INCR(ring, fail_hcksum);
353 354 return (B_FALSE);
354 355 }
355 356
356 357 /*
357 358 * This is guaranteed to be safe thanks to the header copying
358 359 * done in viona_tx().
359 360 */
360 361 eth = (const struct ether_header *)mp->b_rptr;
361 362 ftype = ntohs(eth->ether_type);
362 363
363 364 if (ftype == ETHERTYPE_VLAN) {
364 365 const struct ether_vlan_header *veth;
365 366
366 367 /* punt on QinQ for now */
367 368 eth_len = sizeof (struct ether_vlan_header);
368 369 veth = (const struct ether_vlan_header *)eth;
369 370 ftype = ntohs(veth->ether_type);
370 371 }
371 372
372 373 if (ftype == ETHERTYPE_IP) {
373 374 ipha = (ipha_t *)(mp->b_rptr + eth_len);
374 375
375 376 ipproto = ipha->ipha_protocol;
376 377 } else if (ftype == ETHERTYPE_IPV6) {
377 378 ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len);
378 379
379 380 ipproto = ip6h->ip6_nxt;
380 381 }
381 382
382 383 /*
383 384 * We ignore hdr_len because the spec says it can't be
384 385 * trusted. Besides, our own stack will determine the header
385 386 * boundary.
386 387 */
387 - if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
388 - (hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 &&
388 + if ((hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 &&
389 389 ftype == ETHERTYPE_IP) {
390 - uint16_t *cksump;
391 - uint32_t cksum;
392 - ipaddr_t src = ipha->ipha_src;
393 - ipaddr_t dst = ipha->ipha_dst;
390 + if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0) {
391 + uint16_t *cksump;
392 + uint32_t cksum;
393 + ipaddr_t src = ipha->ipha_src;
394 + ipaddr_t dst = ipha->ipha_dst;
394 395
395 - /*
396 - * Our native IP stack doesn't set the L4 length field
397 - * of the pseudo header when LSO is in play. Other IP
398 - * stacks, e.g. Linux, do include the length field.
399 - * This is a problem because the hardware expects that
400 - * the length field is not set. When it is set it will
401 - * cause an incorrect TCP checksum to be generated.
402 - * The reason this works in Linux is because Linux
403 - * corrects the pseudo-header checksum in the driver
404 - * code. In order to get the correct HW checksum we
405 - * need to assume the guest's IP stack gave us a bogus
406 - * TCP partial checksum and calculate it ourselves.
407 - */
408 - cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha));
409 - cksum = IP_TCP_CSUM_COMP;
410 - cksum += (dst >> 16) + (dst & 0xFFFF) +
411 - (src >> 16) + (src & 0xFFFF);
412 - cksum = (cksum & 0xFFFF) + (cksum >> 16);
413 - *(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
396 + /*
397 + * Our native IP stack doesn't set the L4 length field
398 + * of the pseudo header when LSO is in play. Other IP
399 + * stacks, e.g. Linux, do include the length field.
400 + * This is a problem because the hardware expects that
401 + * the length field is not set. When it is set it will
402 + * cause an incorrect TCP checksum to be generated.
403 + * The reason this works in Linux is because Linux
404 + * corrects the pseudo-header checksum in the driver
405 + * code. In order to get the correct HW checksum we
406 + * need to assume the guest's IP stack gave us a bogus
407 + * TCP partial checksum and calculate it ourselves.
408 + */
409 + cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha));
410 + cksum = IP_TCP_CSUM_COMP;
411 + cksum += (dst >> 16) + (dst & 0xFFFF) +
412 + (src >> 16) + (src & 0xFFFF);
413 + cksum = (cksum & 0xFFFF) + (cksum >> 16);
414 + *(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
415 + }
414 416
415 417 /*
416 418 * Since viona is a "legacy device", the data stored
417 419 * by the driver will be in the guest's native endian
418 420 * format (see sections 2.4.3 and 5.1.6.1 of the
419 421 * VIRTIO 1.0 spec for more info). At this time the
420 422 * only guests using viona are x86 and we can assume
421 423 * little-endian.
422 424 */
423 425 lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO);
424 426
425 427 /*
426 428 * Hardware, like ixgbe, expects the client to request
427 429 * IP header checksum offload if it's sending LSO (see
428 430 * ixgbe_get_context()). Unfortunately, virtio makes
429 431 * no allowances for negotiating IP header checksum
430 432 * and HW offload, only TCP checksum. We add the flag
431 433 * and zero-out the checksum field. This mirrors the
432 434 * behavior of our native IP stack (which does this in
433 435 * the interest of HW that expects the field to be
434 436 * zero).
435 437 */
436 438 flags |= HCK_IPV4_HDRCKSUM;
437 439 ipha->ipha_hdr_checksum = 0;
438 440 }
439 441
440 442 /*
441 443 * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure
442 444 * HW_LSO, if present, is not lost.
443 445 */
444 446 flags |= DB_CKSUMFLAGS(mp);
445 447
446 448 /*
447 449 * Partial checksum support from the NIC is ideal, since it most
448 450 * closely maps to the interface defined by virtio.
449 451 */
450 452 if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
451 453 (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
452 454 /*
453 455 * MAC expects these offsets to be relative to the
454 456 * start of the L3 header rather than the L2 frame.
455 457 */
456 458 flags |= HCK_PARTIALCKSUM;
457 459 mac_hcksum_set(mp, csum_start - eth_len, csum_stuff - eth_len,
458 460 len - eth_len, 0, flags);
459 461 return (B_TRUE);
460 462 }
461 463
462 464 /*
463 465 * Without partial checksum support, look to the L3/L4 protocol
464 466 * information to see if the NIC can handle it. If not, the
465 467 * checksum will need to calculated inline.
466 468 */
467 469 if (ftype == ETHERTYPE_IP) {
468 470 if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 &&
469 471 (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
470 472 uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
471 473 *csump = 0;
472 474 flags |= HCK_FULLCKSUM;
473 475 mac_hcksum_set(mp, 0, 0, 0, 0, flags);
474 476 return (B_TRUE);
475 477 }
476 478
477 479 /* XXX: Implement manual fallback checksumming? */
478 480 VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
479 481 VIONA_RING_STAT_INCR(ring, fail_hcksum);
480 482 return (B_FALSE);
481 483 } else if (ftype == ETHERTYPE_IPV6) {
482 484 if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 &&
483 485 (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
484 486 uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
485 487 *csump = 0;
486 488 flags |= HCK_FULLCKSUM;
487 489 mac_hcksum_set(mp, 0, 0, 0, 0, flags);
488 490 return (B_TRUE);
489 491 }
490 492
491 493 /* XXX: Implement manual fallback checksumming? */
492 494 VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp);
493 495 VIONA_RING_STAT_INCR(ring, fail_hcksum6);
494 496 return (B_FALSE);
495 497 }
496 498
497 499 /* Cannot even emulate hcksum for unrecognized protocols */
498 500 VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp);
499 501 VIONA_RING_STAT_INCR(ring, fail_hcksum_proto);
500 502 return (B_FALSE);
501 503 }
502 504
503 505 static void
504 506 viona_tx(viona_link_t *link, viona_vring_t *ring)
505 507 {
506 508 struct iovec *iov = ring->vr_txiov;
507 509 const uint_t max_segs = ring->vr_size;
508 510 uint16_t cookie;
509 511 int i, n;
510 512 uint32_t len, base_off = 0;
511 513 uint32_t min_copy = VIONA_MAX_HDRS_LEN;
512 514 mblk_t *mp_head, *mp_tail, *mp;
513 515 viona_desb_t *dp = NULL;
514 516 mac_client_handle_t link_mch = link->l_mch;
515 517 const struct virtio_net_hdr *hdr;
516 518 vmm_page_t *pages = NULL;
517 519
518 520 mp_head = mp_tail = NULL;
519 521
520 522 ASSERT(iov != NULL);
521 523
522 524 n = vq_popchain(ring, iov, max_segs, &cookie, &pages);
523 525 if (n == 0) {
524 526 VIONA_PROBE1(tx_absent, viona_vring_t *, ring);
525 527 VIONA_RING_STAT_INCR(ring, tx_absent);
526 528 return;
527 529 } else if (n < 0) {
528 530 /*
529 531 * Any error encountered in vq_popchain has already resulted in
530 532 * specific probe and statistic handling. Further action here
531 533 * is unnecessary.
532 534 */
533 535 return;
534 536 }
535 537
536 538 /* Grab the header and ensure it is of adequate length */
537 539 hdr = (const struct virtio_net_hdr *)iov[0].iov_base;
538 540 len = iov[0].iov_len;
539 541 if (len < sizeof (struct virtio_net_hdr)) {
540 542 goto drop_fail;
541 543 }
542 544
543 545 /* Make sure the packet headers are always in the first mblk. */
544 546 if (ring->vr_txdesb != NULL) {
545 547 dp = &ring->vr_txdesb[cookie];
546 548
547 549 /*
548 550 * If the guest driver is operating properly, each desb slot
549 551 * should be available for use when processing a TX descriptor
550 552 * from the 'avail' ring. In the case of drivers that reuse a
551 553 * descriptor before it has been posted to the 'used' ring, the
552 554 * data is simply dropped.
553 555 */
554 556 if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) {
555 557 dp = NULL;
556 558 goto drop_fail;
557 559 }
558 560
559 561 dp->d_cookie = cookie;
560 562 mp_head = desballoc(dp->d_headers, VIONA_MAX_HDRS_LEN, 0,
561 563 &dp->d_frtn);
562 564
563 565 /* Account for the successful desballoc. */
564 566 if (mp_head != NULL)
565 567 dp->d_ref++;
566 568 } else {
567 569 mp_head = allocb(VIONA_MAX_HDRS_LEN, 0);
568 570 }
569 571
570 572 if (mp_head == NULL)
571 573 goto drop_fail;
572 574
573 575 mp_tail = mp_head;
574 576
575 577 /*
576 578 * We always copy enough of the guest data to cover the
577 579 * headers. This protects us from TOCTOU attacks and allows
578 580 * message block length assumptions to be made in subsequent
579 581 * code. In many cases, this means copying more data than
580 582 * strictly necessary. That's okay, as it is the larger packets
581 583 * (such as LSO) that really benefit from desballoc().
582 584 */
583 585 for (i = 1; i < n; i++) {
584 586 const uint32_t to_copy = MIN(min_copy, iov[i].iov_len);
585 587
586 588 bcopy(iov[i].iov_base, mp_head->b_wptr, to_copy);
587 589 mp_head->b_wptr += to_copy;
588 590 len += to_copy;
589 591 min_copy -= to_copy;
590 592
591 593 /*
592 594 * We've met the minimum copy requirement. The rest of
593 595 * the guest data can be referenced.
594 596 */
595 597 if (min_copy == 0) {
596 598 /*
597 599 * If we copied all contents of this
598 600 * descriptor then move onto the next one.
599 601 * Otherwise, record how far we are into the
600 602 * current descriptor.
601 603 */
602 604 if (iov[i].iov_len == to_copy)
603 605 i++;
604 606 else
605 607 base_off = to_copy;
606 608
607 609 break;
608 610 }
609 611 }
610 612
611 613 ASSERT3P(mp_head, !=, NULL);
612 614 ASSERT3P(mp_tail, !=, NULL);
613 615
614 616 for (; i < n; i++) {
615 617 uintptr_t base = (uintptr_t)iov[i].iov_base + base_off;
616 618 uint32_t chunk = iov[i].iov_len - base_off;
617 619
618 620 ASSERT3U(base_off, <, iov[i].iov_len);
619 621 ASSERT3U(chunk, >, 0);
620 622
621 623 if (dp != NULL) {
622 624 mp = desballoc((uchar_t *)base, chunk, 0, &dp->d_frtn);
623 625 if (mp == NULL) {
624 626 goto drop_fail;
625 627 }
626 628 dp->d_ref++;
627 629 } else {
628 630 mp = allocb(chunk, BPRI_MED);
629 631 if (mp == NULL) {
630 632 goto drop_fail;
631 633 }
632 634 bcopy((uchar_t *)base, mp->b_wptr, chunk);
633 635 }
634 636
635 637 base_off = 0;
636 638 len += chunk;
637 639 mp->b_wptr += chunk;
638 640 mp_tail->b_cont = mp;
639 641 mp_tail = mp;
640 642 }
641 643
642 644 if (VNETHOOK_INTERESTED_OUT(link->l_neti)) {
643 645 /*
644 646 * The hook consumer may elect to free the mblk_t and set
645 647 * our mblk_t ** to NULL. When using a viona_desb_t
646 648 * (dp != NULL), we do not want the corresponding cleanup to
647 649 * occur during the viona_hook() call. We instead want to
648 650 * reset and recycle dp for future use. To prevent cleanup
649 651 * during the viona_hook() call, we take a ref on dp (if being
650 652 * used), and release it on success. On failure, the
651 653 * freemsgchain() call will release all the refs taken earlier
652 654 * in viona_tx() (aside from the initial ref and the one we
653 655 * take), and drop_hook will reset dp for reuse.
654 656 */
655 657 if (dp != NULL)
656 658 dp->d_ref++;
657 659
658 660 /*
659 661 * Pass &mp instead of &mp_head so we don't lose track of
660 662 * mp_head if the hook consumer (i.e. ipf) elects to free mp
661 663 * and set mp to NULL.
662 664 */
663 665 mp = mp_head;
664 666 if (viona_hook(link, ring, &mp, B_TRUE) != 0) {
665 667 if (mp != NULL)
666 668 freemsgchain(mp);
667 669 goto drop_hook;
668 670 }
669 671
670 672 if (dp != NULL) {
671 673 dp->d_ref--;
672 674
673 675 /*
674 676 * It is possible that the hook(s) accepted the packet,
675 677 * but as part of its processing, it issued a pull-up
676 678 * which released all references to the desb. In that
677 679 * case, go back to acting like the packet is entirely
678 680 * copied (which it is).
679 681 */
680 682 if (dp->d_ref == 1) {
681 683 dp->d_cookie = 0;
682 684 dp->d_ref = 0;
683 685 dp = NULL;
684 686 }
685 687 }
686 688 }
687 689
688 690 /*
689 691 * Request hardware checksumming, if necessary. If the guest
690 692 * sent an LSO packet then it must have also negotiated and
691 693 * requested partial checksum; therefore the LSO logic is
692 694 * contained within viona_tx_csum().
693 695 */
694 696 if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 &&
695 697 (hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) {
696 698 if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) {
697 699 goto drop_fail;
698 700 }
699 701 }
700 702
701 703 if (dp != NULL) {
702 704 dp->d_len = len;
703 705 dp->d_pages = pages;
704 706 mutex_enter(&ring->vr_lock);
705 707 ring->vr_xfer_outstanding++;
706 708 mutex_exit(&ring->vr_lock);
707 709 } else {
708 710 /*
709 711 * If the data was cloned out of the ring, the descriptors can
710 712 * be marked as 'used' now, rather than deferring that action
711 713 * until after successful packet transmission.
712 714 */
713 715 vmm_drv_page_release_chain(pages);
714 716 viona_tx_done(ring, len, cookie);
715 717 }
716 718
717 719 /*
718 720 * From viona's point of view, this is a successful transmit, even if
719 721 * something downstream decides to drop the packet.
720 722 */
721 723 viona_ring_stat_accept(ring, len);
722 724
723 725 /*
724 726 * We're potentially going deep into the networking layer; make sure the
725 727 * guest can't run concurrently.
726 728 */
727 729 smt_begin_unsafe();
728 730 /*
729 731 * Ignore, for now, any signal from MAC about whether the outgoing
730 732 * packet was dropped or not.
731 733 */
732 734 (void) mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL);
733 735 smt_end_unsafe();
734 736 return;
735 737
736 738 drop_fail:
737 739 /*
738 740 * On the off chance that memory is not available via the desballoc or
739 741 * allocb calls, there are few options left besides to fail and drop
740 742 * the frame on the floor.
741 743 *
742 744 * First account for it in the error stats.
743 745 */
744 746 viona_ring_stat_error(ring);
745 747
746 748 if (dp != NULL) {
747 749 /*
748 750 * Take an additional reference on the desb handle (if present)
749 751 * so any desballoc-sourced mblks can release their hold on it
750 752 * without the handle reaching its final state and executing
751 753 * its clean-up logic.
752 754 */
753 755 dp->d_ref++;
754 756 }
755 757
756 758 /*
757 759 * Free any already-allocated blocks and sum up the total length of the
758 760 * dropped data to be released to the used ring.
759 761 */
760 762 freemsgchain(mp_head);
761 763
762 764 drop_hook:
763 765 len = 0;
764 766 for (uint_t i = 0; i < n; i++) {
765 767 len += iov[i].iov_len;
766 768 }
767 769
768 770 if (dp != NULL) {
769 771 VERIFY(dp->d_ref == 2);
770 772
771 773 /* Clean up the desb handle, releasing the extra hold. */
772 774 dp->d_len = 0;
773 775 dp->d_cookie = 0;
774 776 dp->d_ref = 0;
775 777 }
776 778
777 779 /* Count in the stats as a drop, rather than an error */
778 780 viona_ring_stat_drop(ring);
779 781
780 782 VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, len,
781 783 uint16_t, cookie);
782 784 vmm_drv_page_release_chain(pages);
783 785 viona_tx_done(ring, len, cookie);
784 786 }