/*
 * Copyright (c) 2013 Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2024 Oxide Computer Company
 * Copyright 2024 MNX Cloud, Inc.
 */


#include <sys/types.h>
#include <sys/smt.h>
#include <sys/strsubr.h>

#include <sys/pattr.h>
#include <sys/dlpi.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>

#include "viona_impl.h"

#define BNXE_NIC_DRIVER "bnxe"

/*
 * Tunable controlling whether TX packet copying is enabled by default.
 */
boolean_t viona_default_tx_copy = B_TRUE;

/*
 * Copy TX mblks out of the virtio ring so that ring resources can be
 * reclaimed without waiting for packet transmission to complete.
 */
kmutex_t viona_force_copy_lock;
static enum viona_force_copy {
        VFC_UNINITIALIZED = 0,
        VFC_COPY_UNNEEDED = 1,
        VFC_COPY_REQUIRED = 2,
} viona_force_copy_state = VFC_UNINITIALIZED;

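/*
 * Per-slot bookkeeping for zero-copy (desballoc-based) TX.  Each handle tracks
 * the free routine handed to desballoc(), the count of outstanding mblk
 * references to loaned guest memory, the copied header buffer, the length and
 * 'used' ring cookie to post on completion, and the chain of held guest pages.
 */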
struct viona_desb {
        frtn_t          d_frtn;
        viona_vring_t   *d_ring;
        uint_t          d_ref;
        uint32_t        d_len;
        uint16_t        d_cookie;
        uchar_t         *d_headers;
        vmm_page_t      *d_pages;
};

static void viona_tx(viona_link_t *, viona_vring_t *);
static void viona_desb_release(viona_desb_t *);


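/*
 * Wait until all outstanding TX frames which reference loaned guest memory
 * have been released, so the ring may be safely stopped or its lease renewed.
 */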
static void
viona_tx_wait_outstanding(viona_vring_t *ring)
{
        ASSERT(MUTEX_HELD(&ring->vr_lock));

        while (ring->vr_xfer_outstanding != 0) {
                /*
                 * Paying heed to signals is counterproductive here.  This is a
                 * very tight loop if pending transfers take an extended amount
                 * of time to be reclaimed while the host process is exiting.
                 */
                cv_wait(&ring->vr_cv, &ring->vr_lock);
        }
}

/*
 * Check if full TX packet copying is needed.  This should not be called from
 * viona attach()/detach() context.
 */
static boolean_t
viona_tx_copy_needed(void)
{
        boolean_t result;

        if (viona_default_tx_copy) {
                return (B_TRUE);
        }

        mutex_enter(&viona_force_copy_lock);
        if (viona_force_copy_state == VFC_UNINITIALIZED) {
                major_t bnxe_major;

                /*
                 * The original code for viona featured an explicit check for
                 * the bnxe driver which, when found present, necessitated that
                 * all transmissions be copied into their own mblks instead of
                 * passing guest memory to the underlying device.
                 *
                 * The motivations for this are unclear, but until it can be
                 * proven unnecessary, the check lives on.
                 */
                viona_force_copy_state = VFC_COPY_UNNEEDED;
                if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER))
                    != DDI_MAJOR_T_NONE) {
                        if (ddi_hold_installed_driver(bnxe_major) != NULL) {
                                viona_force_copy_state = VFC_COPY_REQUIRED;
                                ddi_rele_driver(bnxe_major);
                        }
                }
        }
        result = (viona_force_copy_state == VFC_COPY_REQUIRED);
        mutex_exit(&viona_force_copy_lock);

        return (result);
}

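/*
 * Allocate TX-side resources for a ring: per-slot desb handles (only when full
 * packet copying is not required) and the iovec array used when popping
 * descriptor chains off the ring.
 */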
void
viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz)
{
        /* Allocate desb handles for TX ring if packet copying is disabled */
        if (!viona_tx_copy_needed()) {
                viona_desb_t *dp;

                dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP);
                ring->vr_txdesb = dp;
                for (uint_t i = 0; i < qsz; i++, dp++) {
                        dp->d_frtn.free_func = viona_desb_release;
                        dp->d_frtn.free_arg = (void *)dp;
                        dp->d_ring = ring;
                        dp->d_headers = kmem_zalloc(VIONA_MAX_HDRS_LEN,
                            KM_SLEEP);
                }
        }

        /* Allocate ring-sized iovec buffers for TX */
        ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP);
}

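/*
 * Free the TX-side resources allocated by viona_tx_ring_alloc().
 */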
void
viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz)
{
        if (ring->vr_txdesb != NULL) {
                viona_desb_t *dp = ring->vr_txdesb;

                for (uint_t i = 0; i < qsz; i++, dp++) {
                        kmem_free(dp->d_headers, VIONA_MAX_HDRS_LEN);
                }
                kmem_free(ring->vr_txdesb, sizeof (viona_desb_t) * qsz);
                ring->vr_txdesb = NULL;
        }

        if (ring->vr_txiov != NULL) {
                kmem_free(ring->vr_txiov, sizeof (struct iovec) * qsz);
                ring->vr_txiov = NULL;
        }
}

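/*
 * Post a completed TX descriptor chain to the 'used' ring and deliver a ring
 * interrupt to the guest (subject to its interrupt-suppression settings).
 */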
static void
viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie)
{
        vq_pushchain(ring, len, cookie);

        membar_enter();
        viona_intr_ring(ring, B_FALSE);
}

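/*
 * Number of packets to transmit in one burst before briefly dropping out of
 * the TX loop to check whether the ring has been asked to stop.
 */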
#define TX_BURST_THRESH 32

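/*
 * Worker loop for a TX ring: drain available descriptors, re-enable guest
 * notifications, and sleep until the ring is kicked again.  The loop bails out
 * (after waiting for outstanding loaned frames) when the ring is asked to stop
 * or its vmm lease cannot be renewed.
 */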
void
viona_worker_tx(viona_vring_t *ring, viona_link_t *link)
{
        (void) thread_vsetname(curthread, "viona_tx_%p", ring);

        ASSERT(MUTEX_HELD(&ring->vr_lock));
        ASSERT3U(ring->vr_state, ==, VRS_RUN);

        mutex_exit(&ring->vr_lock);

        for (;;) {
                uint_t ntx = 0, burst = 0;

                viona_ring_disable_notify(ring);
                while (viona_ring_num_avail(ring) != 0) {
                        viona_tx(link, ring);
                        ntx++;
                        burst++;

                        /*
                         * It is advantageous for throughput to keep this
                         * transmission loop tight, but periodic breaks to
                         * check for other events are of value too.
                         */
                        if (burst >= TX_BURST_THRESH) {
                                mutex_enter(&ring->vr_lock);
                                const bool need_bail = vring_need_bail(ring);
                                mutex_exit(&ring->vr_lock);

                                if (need_bail) {
                                        break;
                                }
                                burst = 0;
                        }
                }

                VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx);

                /*
                 * Check for available descriptors on the ring once more in
                 * case a late addition raced with the NO_NOTIFY flag toggle.
                 *
                 * The barrier ensures that visibility of the no-notify
                 * store does not cross the viona_ring_num_avail() check below.
                 */
                viona_ring_enable_notify(ring);
                membar_enter();

                if (viona_ring_num_avail(ring) == 0 &&
                    (link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) {
                        /*
                         * The NOTIFY_ON_EMPTY interrupt should not pay heed to
                         * the presence of AVAIL_NO_INTERRUPT.
                         */
                        viona_intr_ring(ring, B_TRUE);
                }

                mutex_enter(&ring->vr_lock);
                for (;;) {
                        if (vring_need_bail(ring)) {
                                ring->vr_state = VRS_STOP;
                                viona_tx_wait_outstanding(ring);
                                return;
                        }

                        if (vmm_drv_lease_expired(ring->vr_lease)) {
                                ring->vr_state_flags |= VRSF_RENEW;
                                /*
                                 * When renewing the lease for the ring, no TX
                                 * frames may be outstanding, as they contain
                                 * references to guest memory.
                                 */
                                viona_tx_wait_outstanding(ring);

                                const boolean_t renewed =
                                    viona_ring_lease_renew(ring);
                                ring->vr_state_flags &= ~VRSF_RENEW;

                                if (!renewed) {
                                        /* stop ring on failed renewal */
                                        ring->vr_state = VRS_STOP;
                                        return;
                                }
                        }

                        if (viona_ring_num_avail(ring) != 0) {
                                break;
                        }

                        /* Wait for further activity on the ring */
                        (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
                }
                mutex_exit(&ring->vr_lock);
        }
        /* UNREACHABLE */
}

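/*
 * Free routine (frtn_t) for mblks built around loaned guest memory.  When the
 * last reference on a desb handle is dropped, the held guest pages are
 * released, the descriptor chain is posted to the 'used' ring, and any thread
 * blocked in viona_tx_wait_outstanding() is woken.
 */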
static void
viona_desb_release(viona_desb_t *dp)
{
        viona_vring_t *ring = dp->d_ring;
        uint_t ref;
        uint32_t len;
        uint16_t cookie;

        ref = atomic_dec_uint_nv(&dp->d_ref);
        if (ref > 1) {
                return;
        }

        /*
         * The desb corresponding to this index must be ready for reuse before
         * the descriptor is returned to the guest via the 'used' ring.
         */
        len = dp->d_len;
        cookie = dp->d_cookie;
        dp->d_len = 0;
        dp->d_cookie = 0;
        vmm_drv_page_release_chain(dp->d_pages);
        dp->d_pages = NULL;

        /*
         * Ensure all other changes to the desb are visible prior to zeroing
         * its refcount, signifying its readiness for reuse.
         */
        membar_exit();
        dp->d_ref = 0;

        viona_tx_done(ring, len, cookie);

        mutex_enter(&ring->vr_lock);
        if ((--ring->vr_xfer_outstanding) == 0) {
                cv_broadcast(&ring->vr_cv);
        }
        mutex_exit(&ring->vr_lock);
}

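/*
 * Translate the guest's virtio checksum (and LSO) request into the offload
 * metadata expected by MAC, after validating the guest-supplied offsets
 * against the packet.  Returns B_FALSE if the request cannot be honored, in
 * which case the caller drops the packet.
 */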
static boolean_t
viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr,
    mblk_t *mp, uint32_t len)
{
        viona_link_t *link = ring->vr_link;
        const struct ether_header *eth;
        uint_t eth_len = sizeof (struct ether_header);
        ushort_t ftype;
        ipha_t *ipha = NULL;
        uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */
        uint16_t flags = 0;
        const uint_t csum_start = hdr->vrh_csum_start;
        const uint_t csum_stuff = hdr->vrh_csum_offset + csum_start;

        /*
         * Validate that the checksum offsets provided by the guest are within
         * the bounds of the packet.  Additionally, ensure that the checksum
         * contents field is within the headers mblk copied by viona_tx().
         */
        if (csum_start >= len || csum_start < eth_len || csum_stuff >= len ||
            (csum_stuff + sizeof (uint16_t)) > MBLKL(mp)) {
                VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
                VIONA_RING_STAT_INCR(ring, fail_hcksum);
                return (B_FALSE);
        }

        /*
         * This is guaranteed to be safe thanks to the header copying
         * done in viona_tx().
         */
        eth = (const struct ether_header *)mp->b_rptr;
        ftype = ntohs(eth->ether_type);

        if (ftype == ETHERTYPE_VLAN) {
                const struct ether_vlan_header *veth;

                /* punt on QinQ for now */
                eth_len = sizeof (struct ether_vlan_header);
                veth = (const struct ether_vlan_header *)eth;
                ftype = ntohs(veth->ether_type);
        }

        if (ftype == ETHERTYPE_IP) {
                ipha = (ipha_t *)(mp->b_rptr + eth_len);

                ipproto = ipha->ipha_protocol;
        } else if (ftype == ETHERTYPE_IPV6) {
                ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len);

                ipproto = ip6h->ip6_nxt;
        }

        /*
         * We ignore hdr_len because the spec says it can't be
         * trusted.  Besides, our own stack will determine the header
         * boundary.
         */
        if ((hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 &&
            ftype == ETHERTYPE_IP) {
                if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0) {
                        uint16_t *cksump;
                        uint32_t cksum;
                        ipaddr_t src = ipha->ipha_src;
                        ipaddr_t dst = ipha->ipha_dst;

                        /*
                         * Our native IP stack doesn't set the L4 length field
                         * of the pseudo header when LSO is in play.  Other IP
                         * stacks, e.g. Linux, do include the length field.
                         * This is a problem because the hardware expects that
                         * the length field is not set.  When it is set, it
                         * will cause an incorrect TCP checksum to be
                         * generated.  The reason this works in Linux is
                         * because Linux corrects the pseudo-header checksum
                         * in the driver code.  In order to get the correct HW
                         * checksum we need to assume the guest's IP stack
                         * gave us a bogus TCP partial checksum and calculate
                         * it ourselves.
                         */
                        cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha));
                        cksum = IP_TCP_CSUM_COMP;
                        cksum += (dst >> 16) + (dst & 0xFFFF) +
                            (src >> 16) + (src & 0xFFFF);
                        cksum = (cksum & 0xFFFF) + (cksum >> 16);
                        *(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
                }

                /*
                 * Since viona is a "legacy device", the data stored
                 * by the driver will be in the guest's native endian
                 * format (see sections 2.4.3 and 5.1.6.1 of the
                 * VIRTIO 1.0 spec for more info).  At this time the
                 * only guests using viona are x86 and we can assume
                 * little-endian.
                 */
                lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO);

                /*
                 * Hardware, like ixgbe, expects the client to request
                 * IP header checksum offload if it's sending LSO (see
                 * ixgbe_get_context()).  Unfortunately, virtio makes
                 * no allowances for negotiating IP header checksum
                 * and HW offload, only TCP checksum.  We add the flag
                 * and zero-out the checksum field.  This mirrors the
                 * behavior of our native IP stack (which does this in
                 * the interest of HW that expects the field to be
                 * zero).
                 */
                flags |= HCK_IPV4_HDRCKSUM;
                ipha->ipha_hdr_checksum = 0;
        }

        /*
         * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure
         * HW_LSO, if present, is not lost.
         */
        flags |= DB_CKSUMFLAGS(mp);

        /*
         * Partial checksum support from the NIC is ideal, since it most
         * closely maps to the interface defined by virtio.
         */
        if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
            (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
                /*
                 * MAC expects these offsets to be relative to the
                 * start of the L3 header rather than the L2 frame.
                 */
                flags |= HCK_PARTIALCKSUM;
                mac_hcksum_set(mp, csum_start - eth_len, csum_stuff - eth_len,
                    len - eth_len, 0, flags);
                return (B_TRUE);
        }

        /*
         * Without partial checksum support, look to the L3/L4 protocol
         * information to see if the NIC can handle it.  If not, the
         * checksum will need to be calculated inline.
         */
        if (ftype == ETHERTYPE_IP) {
                if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 &&
                    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
                        uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
                        *csump = 0;
                        flags |= HCK_FULLCKSUM;
                        mac_hcksum_set(mp, 0, 0, 0, 0, flags);
                        return (B_TRUE);
                }

                /* XXX: Implement manual fallback checksumming? */
                VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
                VIONA_RING_STAT_INCR(ring, fail_hcksum);
                return (B_FALSE);
        } else if (ftype == ETHERTYPE_IPV6) {
                if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 &&
                    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
                        uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
                        *csump = 0;
                        flags |= HCK_FULLCKSUM;
                        mac_hcksum_set(mp, 0, 0, 0, 0, flags);
                        return (B_TRUE);
                }

                /* XXX: Implement manual fallback checksumming? */
                VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp);
                VIONA_RING_STAT_INCR(ring, fail_hcksum6);
                return (B_FALSE);
        }

        /* Cannot even emulate hcksum for unrecognized protocols */
        VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp);
        VIONA_RING_STAT_INCR(ring, fail_hcksum_proto);
        return (B_FALSE);
}

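/*
 * Transmit a single packet from the ring: pop a descriptor chain, copy at
 * least the headers (and, when desb handles are absent, the entire payload)
 * into mblks, run any registered hooks, apply checksum/LSO offload state, and
 * hand the result to MAC via mac_tx().
 */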
static void
viona_tx(viona_link_t *link, viona_vring_t *ring)
{
        struct iovec *iov = ring->vr_txiov;
        const uint_t max_segs = ring->vr_size;
        uint16_t cookie;
        int i, n;
        uint32_t len, base_off = 0;
        uint32_t min_copy = VIONA_MAX_HDRS_LEN;
        mblk_t *mp_head, *mp_tail, *mp;
        viona_desb_t *dp = NULL;
        mac_client_handle_t link_mch = link->l_mch;
        const struct virtio_net_hdr *hdr;
        vmm_page_t *pages = NULL;

        mp_head = mp_tail = NULL;

        ASSERT(iov != NULL);

        n = vq_popchain(ring, iov, max_segs, &cookie, &pages);
        if (n == 0) {
                VIONA_PROBE1(tx_absent, viona_vring_t *, ring);
                VIONA_RING_STAT_INCR(ring, tx_absent);
                return;
        } else if (n < 0) {
                /*
                 * Any error encountered in vq_popchain has already resulted in
                 * specific probe and statistic handling.  Further action here
                 * is unnecessary.
                 */
                return;
        }

        /* Grab the header and ensure it is of adequate length */
        hdr = (const struct virtio_net_hdr *)iov[0].iov_base;
        len = iov[0].iov_len;
        if (len < sizeof (struct virtio_net_hdr)) {
                goto drop_fail;
        }

        /* Make sure the packet headers are always in the first mblk. */
        if (ring->vr_txdesb != NULL) {
                dp = &ring->vr_txdesb[cookie];

                /*
                 * If the guest driver is operating properly, each desb slot
                 * should be available for use when processing a TX descriptor
                 * from the 'avail' ring.  In the case of drivers that reuse a
                 * descriptor before it has been posted to the 'used' ring, the
                 * data is simply dropped.
                 */
                if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) {
                        dp = NULL;
                        goto drop_fail;
                }

                dp->d_cookie = cookie;
                mp_head = desballoc(dp->d_headers, VIONA_MAX_HDRS_LEN, 0,
                    &dp->d_frtn);

                /* Account for the successful desballoc. */
                if (mp_head != NULL)
                        dp->d_ref++;
        } else {
                mp_head = allocb(VIONA_MAX_HDRS_LEN, 0);
        }

        if (mp_head == NULL)
                goto drop_fail;

        mp_tail = mp_head;

        /*
         * We always copy enough of the guest data to cover the
         * headers.  This protects us from TOCTOU attacks and allows
         * message block length assumptions to be made in subsequent
         * code.  In many cases, this means copying more data than
         * strictly necessary.  That's okay, as it is the larger packets
         * (such as LSO) that really benefit from desballoc().
         */
        for (i = 1; i < n; i++) {
                const uint32_t to_copy = MIN(min_copy, iov[i].iov_len);

                bcopy(iov[i].iov_base, mp_head->b_wptr, to_copy);
                mp_head->b_wptr += to_copy;
                len += to_copy;
                min_copy -= to_copy;

                /*
                 * We've met the minimum copy requirement.  The rest of
                 * the guest data can be referenced.
                 */
                if (min_copy == 0) {
                        /*
                         * If we copied all contents of this
                         * descriptor then move onto the next one.
                         * Otherwise, record how far we are into the
                         * current descriptor.
                         */
                        if (iov[i].iov_len == to_copy)
                                i++;
                        else
                                base_off = to_copy;

                        break;
                }
        }

        ASSERT3P(mp_head, !=, NULL);
        ASSERT3P(mp_tail, !=, NULL);

        for (; i < n; i++) {
                uintptr_t base = (uintptr_t)iov[i].iov_base + base_off;
                uint32_t chunk = iov[i].iov_len - base_off;

                ASSERT3U(base_off, <, iov[i].iov_len);
                ASSERT3U(chunk, >, 0);

                if (dp != NULL) {
                        mp = desballoc((uchar_t *)base, chunk, 0, &dp->d_frtn);
                        if (mp == NULL) {
                                goto drop_fail;
                        }
                        dp->d_ref++;
                } else {
                        mp = allocb(chunk, BPRI_MED);
                        if (mp == NULL) {
                                goto drop_fail;
                        }
                        bcopy((uchar_t *)base, mp->b_wptr, chunk);
                }

                base_off = 0;
                len += chunk;
                mp->b_wptr += chunk;
                mp_tail->b_cont = mp;
                mp_tail = mp;
        }

        if (VNETHOOK_INTERESTED_OUT(link->l_neti)) {
                /*
                 * The hook consumer may elect to free the mblk_t and set
                 * our mblk_t ** to NULL.  When using a viona_desb_t
                 * (dp != NULL), we do not want the corresponding cleanup to
                 * occur during the viona_hook() call.  We instead want to
                 * reset and recycle dp for future use.  To prevent cleanup
                 * during the viona_hook() call, we take a ref on dp (if being
                 * used), and release it on success.  On failure, the
                 * freemsgchain() call will release all the refs taken earlier
                 * in viona_tx() (aside from the initial ref and the one we
                 * take), and drop_hook will reset dp for reuse.
                 */
                if (dp != NULL)
                        dp->d_ref++;

                /*
                 * Pass &mp instead of &mp_head so we don't lose track of
                 * mp_head if the hook consumer (i.e. ipf) elects to free mp
                 * and set mp to NULL.
                 */
                mp = mp_head;
                if (viona_hook(link, ring, &mp, B_TRUE) != 0) {
                        if (mp != NULL)
                                freemsgchain(mp);
                        goto drop_hook;
                }

                if (dp != NULL) {
                        dp->d_ref--;

                        /*
                         * It is possible that the hook(s) accepted the packet,
                         * but as part of its processing, it issued a pull-up
                         * which released all references to the desb.  In that
                         * case, go back to acting like the packet is entirely
                         * copied (which it is).
                         */
                        if (dp->d_ref == 1) {
                                dp->d_cookie = 0;
                                dp->d_ref = 0;
                                dp = NULL;
                        }
                }
        }

        /*
         * Request hardware checksumming, if necessary.  If the guest
         * sent an LSO packet then it must have also negotiated and
         * requested partial checksum; therefore the LSO logic is
         * contained within viona_tx_csum().
         */
        if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 &&
            (hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) {
                if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) {
                        goto drop_fail;
                }
        }

        if (dp != NULL) {
                dp->d_len = len;
                dp->d_pages = pages;
                mutex_enter(&ring->vr_lock);
                ring->vr_xfer_outstanding++;
                mutex_exit(&ring->vr_lock);
        } else {
                /*
                 * If the data was cloned out of the ring, the descriptors can
                 * be marked as 'used' now, rather than deferring that action
                 * until after successful packet transmission.
                 */
                vmm_drv_page_release_chain(pages);
                viona_tx_done(ring, len, cookie);
        }

        /*
         * From viona's point of view, this is a successful transmit, even if
         * something downstream decides to drop the packet.
         */
        viona_ring_stat_accept(ring, len);

        /*
         * We're potentially going deep into the networking layer; make sure
         * the guest can't run concurrently.
         */
        smt_begin_unsafe();
        /*
         * Ignore, for now, any signal from MAC about whether the outgoing
         * packet was dropped or not.
         */
        (void) mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL);
        smt_end_unsafe();
        return;

drop_fail:
        /*
         * On the off chance that memory is not available via the desballoc or
         * allocb calls, there are few options left besides failing and
         * dropping the frame on the floor.
         *
         * First account for it in the error stats.
         */
        viona_ring_stat_error(ring);

        if (dp != NULL) {
                /*
                 * Take an additional reference on the desb handle (if present)
                 * so any desballoc-sourced mblks can release their hold on it
                 * without the handle reaching its final state and executing
                 * its clean-up logic.
                 */
                dp->d_ref++;
        }

        /*
         * Free any already-allocated blocks and sum up the total length of the
         * dropped data to be released to the used ring.
         */
        freemsgchain(mp_head);

drop_hook:
        len = 0;
        for (uint_t i = 0; i < n; i++) {
                len += iov[i].iov_len;
        }

        if (dp != NULL) {
                VERIFY(dp->d_ref == 2);

                /* Clean up the desb handle, releasing the extra hold. */
                dp->d_len = 0;
                dp->d_cookie = 0;
                dp->d_ref = 0;
        }

        /* Count in the stats as a drop, rather than an error */
        viona_ring_stat_drop(ring);

        VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, len,
            uint16_t, cookie);
        vmm_drv_page_release_chain(pages);
        viona_tx_done(ring, len, cookie);
}