/*
 * Copyright (c) 2013  Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2024 Oxide Computer Company
 * Copyright 2024 MNX Cloud, Inc.
 */


#include <sys/types.h>
#include <sys/smt.h>
#include <sys/strsubr.h>

#include <sys/pattr.h>
#include <sys/dlpi.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>

#include "viona_impl.h"

#define BNXE_NIC_DRIVER         "bnxe"

/*
 * Tunable which controls whether TX packet copying is enabled by default.
 */
boolean_t viona_default_tx_copy = B_TRUE;

/*
 * Force-copy state: copy TX mbufs out of the virtio ring so that freeing ring
 * resources does not require waiting for packet transmission to complete.
 */
kmutex_t viona_force_copy_lock;
static enum viona_force_copy {
        VFC_UNINITALIZED        = 0,
        VFC_COPY_UNEEDED        = 1,
        VFC_COPY_REQUIRED       = 2,
} viona_force_copy_state = VFC_UNINITALIZED;

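/*
 * Per-descriptor bookkeeping used when TX data is loaned (rather than copied)
 * to the MAC layer: d_ref counts outstanding desballoc'd mblks, d_headers
 * holds the copied packet headers, and d_pages holds the guest memory pages
 * backing the loaned data until every mblk referencing them has been freed.
 */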
struct viona_desb {
        frtn_t                  d_frtn;
        viona_vring_t           *d_ring;
        uint_t                  d_ref;
        uint32_t                d_len;
        uint16_t                d_cookie;
        uchar_t                 *d_headers;
        vmm_page_t              *d_pages;
};

static void viona_tx(viona_link_t *, viona_vring_t *);
static void viona_desb_release(viona_desb_t *);


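/*
 * Wait until all loaned TX buffers for this ring have been released back to
 * viona (vr_xfer_outstanding drops to zero).
 */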
static void
viona_tx_wait_outstanding(viona_vring_t *ring)
{
        ASSERT(MUTEX_HELD(&ring->vr_lock));

        while (ring->vr_xfer_outstanding != 0) {
                /*
                 * Paying heed to signals is counterproductive here.  This is a
                 * very tight loop if pending transfers take an extended amount
                 * of time to be reclaimed while the host process is exiting.
                 */
                cv_wait(&ring->vr_cv, &ring->vr_lock);
        }
}

/*
 * Check if full TX packet copying is needed.  This should not be called from
 * viona attach()/detach() context.
 */
static boolean_t
viona_tx_copy_needed(void)
{
        boolean_t result;

        if (viona_default_tx_copy) {
                return (B_TRUE);
        }

        mutex_enter(&viona_force_copy_lock);
        if (viona_force_copy_state == VFC_UNINITALIZED) {
                major_t bnxe_major;

                /*
                 * The original code for viona featured an explicit check for
                 * the bnxe driver which, when found present, necessitated that
                 * all transmissions be copied into their own mblks instead of
                 * passing guest memory to the underlying device.
                 *
                 * The motivations for this are unclear, but until it can be
                 * proven unnecessary, the check lives on.
                 */
                viona_force_copy_state = VFC_COPY_UNEEDED;
                if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER))
                    != DDI_MAJOR_T_NONE) {
                        if (ddi_hold_installed_driver(bnxe_major) != NULL) {
                                viona_force_copy_state = VFC_COPY_REQUIRED;
                                ddi_rele_driver(bnxe_major);
                        }
                }
        }
        result = (viona_force_copy_state == VFC_COPY_REQUIRED);
        mutex_exit(&viona_force_copy_lock);

        return (result);
}

void
viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz)
{
        /* Allocate desb handles for TX ring if packet copying is disabled */
        if (!viona_tx_copy_needed()) {
                viona_desb_t *dp;

                dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP);
                ring->vr_txdesb = dp;
                for (uint_t i = 0; i < qsz; i++, dp++) {
                        dp->d_frtn.free_func = viona_desb_release;
                        dp->d_frtn.free_arg = (void *)dp;
                        dp->d_ring = ring;
                        dp->d_headers = kmem_zalloc(VIONA_MAX_HDRS_LEN,
                            KM_SLEEP);
                }
        }

        /* Allocate ring-sized iovec buffers for TX */
        ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP);
}

void
viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz)
{
        if (ring->vr_txdesb != NULL) {
                viona_desb_t *dp = ring->vr_txdesb;

                for (uint_t i = 0; i < qsz; i++, dp++) {
                        kmem_free(dp->d_headers, VIONA_MAX_HDRS_LEN);
                }
                kmem_free(ring->vr_txdesb, sizeof (viona_desb_t) * qsz);
                ring->vr_txdesb = NULL;
        }

        if (ring->vr_txiov != NULL) {
                kmem_free(ring->vr_txiov, sizeof (struct iovec) * qsz);
                ring->vr_txiov = NULL;
        }
}

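/*
 * Return a completed descriptor chain to the guest via the 'used' ring and
 * issue a ring interrupt (unless the guest has suppressed notifications).
 */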
static void
viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie)
{
        vq_pushchain(ring, len, cookie);

        membar_enter();
        viona_intr_ring(ring, B_FALSE);
}

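/*
 * Maximum number of packets to transmit before briefly dropping out of the
 * tight TX loop to check whether the ring has been asked to bail.
 */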
#define TX_BURST_THRESH 32

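/*
 * Worker loop for the TX side of a viona ring: drain available descriptors,
 * re-enable guest notifications, and then sleep until more work arrives or the
 * ring must stop (or its lease needs renewal).
 */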
void
viona_worker_tx(viona_vring_t *ring, viona_link_t *link)
{
        (void) thread_vsetname(curthread, "viona_tx_%p", ring);

        ASSERT(MUTEX_HELD(&ring->vr_lock));
        ASSERT3U(ring->vr_state, ==, VRS_RUN);

        mutex_exit(&ring->vr_lock);

        for (;;) {
                uint_t ntx = 0, burst = 0;

                viona_ring_disable_notify(ring);
                while (viona_ring_num_avail(ring) != 0) {
                        viona_tx(link, ring);
                        ntx++;
                        burst++;

                        /*
                         * It is advantageous for throughput to keep this
                         * transmission loop tight, but periodic breaks to
                         * check for other events are of value too.
                         */
                        if (burst >= TX_BURST_THRESH) {
                                mutex_enter(&ring->vr_lock);
                                const bool need_bail = vring_need_bail(ring);
                                mutex_exit(&ring->vr_lock);

                                if (need_bail) {
                                        break;
                                }
                                burst = 0;
                        }
                }

                VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx);

                /*
                 * Check for available descriptors on the ring once more in
                 * case a late addition raced with the NO_NOTIFY flag toggle.
                 *
                 * The barrier ensures that visibility of the no-notify
                 * store does not cross the viona_ring_num_avail() check below.
                 */
                viona_ring_enable_notify(ring);
                membar_enter();

                if (viona_ring_num_avail(ring) == 0 &&
                    (link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) {
                        /*
                         * The NOTIFY_ON_EMPTY interrupt should not pay heed to
                         * the presence of AVAIL_NO_INTERRUPT.
                         */
                        viona_intr_ring(ring, B_TRUE);
                }

                mutex_enter(&ring->vr_lock);
                for (;;) {
                        if (vring_need_bail(ring)) {
                                ring->vr_state = VRS_STOP;
                                viona_tx_wait_outstanding(ring);
                                return;
                        }

                        if (vmm_drv_lease_expired(ring->vr_lease)) {
                                ring->vr_state_flags |= VRSF_RENEW;
                                /*
                                 * When renewing the lease for the ring, no TX
                                 * frames may be outstanding, as they contain
                                 * references to guest memory.
                                 */
                                viona_tx_wait_outstanding(ring);

                                const boolean_t renewed =
                                    viona_ring_lease_renew(ring);
                                ring->vr_state_flags &= ~VRSF_RENEW;

                                if (!renewed) {
                                        /* stop ring on failed renewal */
                                        ring->vr_state = VRS_STOP;
                                        return;
                                }
                        }

                        if (viona_ring_num_avail(ring) != 0) {
                                break;
                        }

                        /* Wait for further activity on the ring */
                        (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
                }
                mutex_exit(&ring->vr_lock);
        }
        /* UNREACHABLE */
}

static void
viona_desb_release(viona_desb_t *dp)
{
        viona_vring_t *ring = dp->d_ring;
        uint_t ref;
        uint32_t len;
        uint16_t cookie;

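        /*
         * Drop this caller's hold.  Final cleanup happens only once all of the
         * desballoc'd mblks have been freed and just the initial reference
         * (taken in viona_tx()) remains.
         */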
        ref = atomic_dec_uint_nv(&dp->d_ref);
        if (ref > 1) {
                return;
        }

        /*
         * The desb corresponding to this index must be ready for reuse before
         * the descriptor is returned to the guest via the 'used' ring.
         */
        len = dp->d_len;
        cookie = dp->d_cookie;
        dp->d_len = 0;
        dp->d_cookie = 0;
        vmm_drv_page_release_chain(dp->d_pages);
        dp->d_pages = NULL;

        /*
         * Ensure all other changes to the desb are visible prior to zeroing its
         * refcount, signifying its readiness for reuse.
         */
        membar_exit();
        dp->d_ref = 0;

        viona_tx_done(ring, len, cookie);

        mutex_enter(&ring->vr_lock);
        if ((--ring->vr_xfer_outstanding) == 0) {
                cv_broadcast(&ring->vr_cv);
        }
        mutex_exit(&ring->vr_lock);
}

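/*
 * Translate the guest-provided virtio checksum/LSO request into the hardware
 * offload flags understood by MAC.  Returns B_FALSE when the request cannot be
 * satisfied by the underlying NIC capabilities, in which case the caller drops
 * the packet.
 */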
static boolean_t
viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr,
    mblk_t *mp, uint32_t len)
{
        viona_link_t *link = ring->vr_link;
        const struct ether_header *eth;
        uint_t eth_len = sizeof (struct ether_header);
        ushort_t ftype;
        ipha_t *ipha = NULL;
        uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */
        uint16_t flags = 0;
        const uint_t csum_start = hdr->vrh_csum_start;
        const uint_t csum_stuff = hdr->vrh_csum_offset + csum_start;

        /*
         * Validate that the checksum offsets provided by the guest are within
         * the bounds of the packet.  Additionally, ensure that the checksum
         * contents field is within the headers mblk copied by viona_tx().
         */
        if (csum_start >= len || csum_start < eth_len || csum_stuff >= len ||
            (csum_stuff + sizeof (uint16_t)) > MBLKL(mp)) {
                VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
                VIONA_RING_STAT_INCR(ring, fail_hcksum);
                return (B_FALSE);
        }

        /*
         * This is guaranteed to be safe thanks to the header copying
         * done in viona_tx().
         */
        eth = (const struct ether_header *)mp->b_rptr;
        ftype = ntohs(eth->ether_type);

        if (ftype == ETHERTYPE_VLAN) {
                const struct ether_vlan_header *veth;

                /* punt on QinQ for now */
                eth_len = sizeof (struct ether_vlan_header);
                veth = (const struct ether_vlan_header *)eth;
                ftype = ntohs(veth->ether_type);
        }

        if (ftype == ETHERTYPE_IP) {
                ipha = (ipha_t *)(mp->b_rptr + eth_len);

                ipproto = ipha->ipha_protocol;
        } else if (ftype == ETHERTYPE_IPV6) {
                ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len);

                ipproto = ip6h->ip6_nxt;
        }

        /*
         * We ignore hdr_len because the spec says it can't be
         * trusted. Besides, our own stack will determine the header
         * boundary.
         */
        if ((hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 &&
            ftype == ETHERTYPE_IP) {
                if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0) {
                        uint16_t        *cksump;
                        uint32_t        cksum;
                        ipaddr_t        src = ipha->ipha_src;
                        ipaddr_t        dst = ipha->ipha_dst;

                        /*
                         * Our native IP stack doesn't set the L4 length field
                         * of the pseudo header when LSO is in play. Other IP
                         * stacks, e.g. Linux, do include the length field.
                         * This is a problem because the hardware expects that
                         * the length field is not set. When it is set it will
                         * cause an incorrect TCP checksum to be generated.
                         * The reason this works in Linux is because Linux
                         * corrects the pseudo-header checksum in the driver
                         * code. In order to get the correct HW checksum we
                         * need to assume the guest's IP stack gave us a bogus
                         * TCP partial checksum and calculate it ourselves.
                         */
                        cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha));
                        cksum = IP_TCP_CSUM_COMP;
                        cksum += (dst >> 16) + (dst & 0xFFFF) +
                            (src >> 16) + (src & 0xFFFF);
                        cksum = (cksum & 0xFFFF) + (cksum >> 16);
                        *(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
                }

                /*
                 * Since viona is a "legacy device", the data stored
                 * by the driver will be in the guest's native endian
                 * format (see sections 2.4.3 and 5.1.6.1 of the
                 * VIRTIO 1.0 spec for more info). At this time the
                 * only guests using viona are x86 and we can assume
                 * little-endian.
                 */
                lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO);

                /*
                 * Hardware, like ixgbe, expects the client to request
                 * IP header checksum offload if it's sending LSO (see
                 * ixgbe_get_context()). Unfortunately, virtio makes
                 * no allowances for negotiating IP header checksum
                 * and HW offload, only TCP checksum. We add the flag
                 * and zero-out the checksum field. This mirrors the
                 * behavior of our native IP stack (which does this in
                 * the interest of HW that expects the field to be
                 * zero).
                 */
                flags |= HCK_IPV4_HDRCKSUM;
                ipha->ipha_hdr_checksum = 0;
        }

        /*
         * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure
         * HW_LSO, if present, is not lost.
         */
        flags |= DB_CKSUMFLAGS(mp);

        /*
         * Partial checksum support from the NIC is ideal, since it most
         * closely maps to the interface defined by virtio.
         */
        if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
            (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
                /*
                 * MAC expects these offsets to be relative to the
                 * start of the L3 header rather than the L2 frame.
                 */
                flags |= HCK_PARTIALCKSUM;
                mac_hcksum_set(mp, csum_start - eth_len, csum_stuff - eth_len,
                    len - eth_len, 0, flags);
                return (B_TRUE);
        }

        /*
         * Without partial checksum support, look to the L3/L4 protocol
         * information to see if the NIC can handle it.  If not, the
         * checksum will need to be calculated inline.
         */
        if (ftype == ETHERTYPE_IP) {
                if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 &&
                    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
                        uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
                        *csump = 0;
                        flags |= HCK_FULLCKSUM;
                        mac_hcksum_set(mp, 0, 0, 0, 0, flags);
                        return (B_TRUE);
                }

                /* XXX: Implement manual fallback checksumming? */
                VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
                VIONA_RING_STAT_INCR(ring, fail_hcksum);
                return (B_FALSE);
        } else if (ftype == ETHERTYPE_IPV6) {
                if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 &&
                    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
                        uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
                        *csump = 0;
                        flags |= HCK_FULLCKSUM;
                        mac_hcksum_set(mp, 0, 0, 0, 0, flags);
                        return (B_TRUE);
                }

                /* XXX: Implement manual fallback checksumming? */
                VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp);
                VIONA_RING_STAT_INCR(ring, fail_hcksum6);
                return (B_FALSE);
        }

        /* Cannot even emulate hcksum for unrecognized protocols */
        VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp);
        VIONA_RING_STAT_INCR(ring, fail_hcksum_proto);
        return (B_FALSE);
}

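/*
 * Transmit a single packet from the 'avail' ring: pop a descriptor chain, copy
 * at least the packet headers (and, when copying is forced, the entire
 * payload) into mblks, optionally loan the remaining guest buffers via
 * desballoc(), apply any requested checksum/LSO offloads, and hand the result
 * to MAC.
 */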
static void
viona_tx(viona_link_t *link, viona_vring_t *ring)
{
        struct iovec            *iov = ring->vr_txiov;
        const uint_t            max_segs = ring->vr_size;
        uint16_t                cookie;
        int                     i, n;
        uint32_t                len, base_off = 0;
        uint32_t                min_copy = VIONA_MAX_HDRS_LEN;
        mblk_t                  *mp_head, *mp_tail, *mp;
        viona_desb_t            *dp = NULL;
        mac_client_handle_t     link_mch = link->l_mch;
        const struct virtio_net_hdr *hdr;
        vmm_page_t *pages = NULL;

        mp_head = mp_tail = NULL;

        ASSERT(iov != NULL);

        n = vq_popchain(ring, iov, max_segs, &cookie, &pages);
        if (n == 0) {
                VIONA_PROBE1(tx_absent, viona_vring_t *, ring);
                VIONA_RING_STAT_INCR(ring, tx_absent);
                return;
        } else if (n < 0) {
                /*
                 * Any error encountered in vq_popchain has already resulted in
                 * specific probe and statistic handling.  Further action here
                 * is unnecessary.
                 */
                return;
        }

        /* Grab the header and ensure it is of adequate length */
        hdr = (const struct virtio_net_hdr *)iov[0].iov_base;
        len = iov[0].iov_len;
        if (len < sizeof (struct virtio_net_hdr)) {
                goto drop_fail;
        }

        /* Make sure the packet headers are always in the first mblk. */
        if (ring->vr_txdesb != NULL) {
                dp = &ring->vr_txdesb[cookie];

                /*
                 * If the guest driver is operating properly, each desb slot
                 * should be available for use when processing a TX descriptor
                 * from the 'avail' ring.  In the case of drivers that reuse a
                 * descriptor before it has been posted to the 'used' ring, the
                 * data is simply dropped.
                 */
                if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) {
                        dp = NULL;
                        goto drop_fail;
                }

                dp->d_cookie = cookie;
                mp_head = desballoc(dp->d_headers, VIONA_MAX_HDRS_LEN, 0,
                    &dp->d_frtn);

                /* Account for the successful desballoc. */
                if (mp_head != NULL)
                        dp->d_ref++;
        } else {
                mp_head = allocb(VIONA_MAX_HDRS_LEN, 0);
        }

        if (mp_head == NULL)
                goto drop_fail;

        mp_tail = mp_head;

        /*
         * We always copy enough of the guest data to cover the
         * headers. This protects us from TOCTOU attacks and allows
         * message block length assumptions to be made in subsequent
         * code. In many cases, this means copying more data than
         * strictly necessary. That's okay, as it is the larger packets
         * (such as LSO) that really benefit from desballoc().
         */
        for (i = 1; i < n; i++) {
                const uint32_t to_copy = MIN(min_copy, iov[i].iov_len);

                bcopy(iov[i].iov_base, mp_head->b_wptr, to_copy);
                mp_head->b_wptr += to_copy;
                len += to_copy;
                min_copy -= to_copy;

                /*
                 * We've met the minimum copy requirement. The rest of
                 * the guest data can be referenced.
                 */
                if (min_copy == 0) {
                        /*
                         * If we copied all contents of this
                         * descriptor then move onto the next one.
                         * Otherwise, record how far we are into the
                         * current descriptor.
                         */
                        if (iov[i].iov_len == to_copy)
                                i++;
                        else
                                base_off = to_copy;

                        break;
                }
        }

        ASSERT3P(mp_head, !=, NULL);
        ASSERT3P(mp_tail, !=, NULL);

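        /*
         * Attach the remaining guest buffers.  When a desb handle is in use,
         * the guest memory is loaned to MAC via desballoc(); otherwise each
         * segment is copied into a freshly allocated mblk.
         */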
        for (; i < n; i++) {
                uintptr_t base = (uintptr_t)iov[i].iov_base + base_off;
                uint32_t chunk = iov[i].iov_len - base_off;

                ASSERT3U(base_off, <, iov[i].iov_len);
                ASSERT3U(chunk, >, 0);

                if (dp != NULL) {
                        mp = desballoc((uchar_t *)base, chunk, 0, &dp->d_frtn);
                        if (mp == NULL) {
                                goto drop_fail;
                        }
                        dp->d_ref++;
                } else {
                        mp = allocb(chunk, BPRI_MED);
                        if (mp == NULL) {
                                goto drop_fail;
                        }
                        bcopy((uchar_t *)base, mp->b_wptr, chunk);
                }

                base_off = 0;
                len += chunk;
                mp->b_wptr += chunk;
                mp_tail->b_cont = mp;
                mp_tail = mp;
        }

        if (VNETHOOK_INTERESTED_OUT(link->l_neti)) {
                /*
                 * The hook consumer may elect to free the mblk_t and set
                 * our mblk_t ** to NULL.  When using a viona_desb_t
                 * (dp != NULL), we do not want the corresponding cleanup to
                 * occur during the viona_hook() call. We instead want to
                 * reset and recycle dp for future use.  To prevent cleanup
                 * during the viona_hook() call, we take a ref on dp (if being
                 * used), and release it on success.  On failure, the
                 * freemsgchain() call will release all the refs taken earlier
                 * in viona_tx() (aside from the initial ref and the one we
                 * take), and drop_hook will reset dp for reuse.
                 */
                if (dp != NULL)
                        dp->d_ref++;

                /*
                 * Pass &mp instead of &mp_head so we don't lose track of
                 * mp_head if the hook consumer (i.e. ipf) elects to free mp
                 * and set mp to NULL.
                 */
                mp = mp_head;
                if (viona_hook(link, ring, &mp, B_TRUE) != 0) {
                        if (mp != NULL)
                                freemsgchain(mp);
                        goto drop_hook;
                }

                if (dp != NULL) {
                        dp->d_ref--;

                        /*
                         * It is possible that the hook(s) accepted the packet,
                         * but as part of its processing, it issued a pull-up
                         * which released all references to the desb.  In that
                         * case, go back to acting like the packet is entirely
                         * copied (which it is).
                         */
                        if (dp->d_ref == 1) {
                                dp->d_cookie = 0;
                                dp->d_ref = 0;
                                dp = NULL;
                        }
                }
        }

        /*
         * Request hardware checksumming, if necessary. If the guest
         * sent an LSO packet then it must have also negotiated and
         * requested partial checksum; therefore the LSO logic is
         * contained within viona_tx_csum().
         */
        if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 &&
            (hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) {
                if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) {
                        goto drop_fail;
                }
        }

        if (dp != NULL) {
                dp->d_len = len;
                dp->d_pages = pages;
                mutex_enter(&ring->vr_lock);
                ring->vr_xfer_outstanding++;
                mutex_exit(&ring->vr_lock);
        } else {
                /*
                 * If the data was cloned out of the ring, the descriptors can
                 * be marked as 'used' now, rather than deferring that action
                 * until after successful packet transmission.
                 */
                vmm_drv_page_release_chain(pages);
                viona_tx_done(ring, len, cookie);
        }

        /*
         * From viona's point of view, this is a successful transmit, even if
         * something downstream decides to drop the packet.
         */
        viona_ring_stat_accept(ring, len);

        /*
         * We're potentially going deep into the networking layer; make sure
         * the guest can't run concurrently.
         */
        smt_begin_unsafe();
        /*
         * Ignore, for now, any signal from MAC about whether the outgoing
         * packet was dropped or not.
         */
        (void) mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL);
        smt_end_unsafe();
        return;

drop_fail:
        /*
         * On the off chance that memory is not available via the desballoc or
         * allocb calls, there are few options left besides failing and
         * dropping the frame on the floor.
         *
         * First account for it in the error stats.
         */
        viona_ring_stat_error(ring);

        if (dp != NULL) {
                /*
                 * Take an additional reference on the desb handle (if present)
                 * so any desballoc-sourced mblks can release their hold on it
                 * without the handle reaching its final state and executing
                 * its clean-up logic.
                 */
                dp->d_ref++;
        }

        /*
         * Free any already-allocated blocks and sum up the total length of the
         * dropped data to be released to the used ring.
         */
        freemsgchain(mp_head);

drop_hook:
        len = 0;
        for (uint_t i = 0; i < n; i++) {
                len += iov[i].iov_len;
        }

        if (dp != NULL) {
                VERIFY(dp->d_ref == 2);

                /* Clean up the desb handle, releasing the extra hold. */
                dp->d_len = 0;
                dp->d_cookie = 0;
                dp->d_ref = 0;
        }

        /* Count in the stats as a drop, rather than an error */
        viona_ring_stat_drop(ring);

        VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, len,
            uint16_t, cookie);
        vmm_drv_page_release_chain(pages);
        viona_tx_done(ring, len, cookie);
}