/*
 * Copyright (c) 2013  Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2024 Oxide Computer Company
 */


#include <sys/types.h>
#include <sys/smt.h>
#include <sys/strsubr.h>

#include <sys/pattr.h>
#include <sys/dlpi.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>

#include "viona_impl.h"

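/*
 * Name of the NIC driver which, when present on the system, forces viona to
 * copy all TX data rather than loaning guest pages to the underlying device
 * (see viona_tx_copy_needed()).
 */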
#define BNXE_NIC_DRIVER         "bnxe"

/*
 * Tunable controlling whether TX data is copied out of the guest by default.
 */
boolean_t viona_default_tx_copy = B_TRUE;

/*
 * Copy TX data out of the virtio ring to avoid having to wait for packet
 * transmission to complete before ring resources can be freed.
 */
kmutex_t viona_force_copy_lock;
static enum viona_force_copy {
        VFC_UNINITALIZED        = 0,
        VFC_COPY_UNEEDED        = 1,
        VFC_COPY_REQUIRED       = 2,
} viona_force_copy_state = VFC_UNINITALIZED;

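/*
 * Per-descriptor state for zero-copy ("loaned") TX: the free routine handed
 * to desballoc(), the owning ring, a reference count covering the mblks built
 * from this slot, the chain length and used-ring cookie to report on
 * completion, a buffer holding the copied packet headers, and the guest pages
 * held for the duration of the transmission.
 */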
struct viona_desb {
        frtn_t                  d_frtn;
        viona_vring_t           *d_ring;
        uint_t                  d_ref;
        uint32_t                d_len;
        uint16_t                d_cookie;
        uchar_t                 *d_headers;
        vmm_page_t              *d_pages;
};

static void viona_tx(viona_link_t *, viona_vring_t *);
static void viona_desb_release(viona_desb_t *);


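/*
 * Wait for all outstanding zero-copy transmissions, which hold references to
 * guest memory, to be reclaimed before proceeding.
 */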
static void
viona_tx_wait_outstanding(viona_vring_t *ring)
{
        ASSERT(MUTEX_HELD(&ring->vr_lock));

        while (ring->vr_xfer_outstanding != 0) {
                /*
                 * Paying heed to signals is counterproductive here.  This is a
                 * very tight loop if pending transfers take an extended amount
                 * of time to be reclaimed while the host process is exiting.
                 */
                cv_wait(&ring->vr_cv, &ring->vr_lock);
        }
}

/*
 * Check if full TX packet copying is needed.  This should not be called from
 * viona attach()/detach() context.
 */
static boolean_t
viona_tx_copy_needed(void)
{
        boolean_t result;

        if (viona_default_tx_copy) {
                return (B_TRUE);
        }

        mutex_enter(&viona_force_copy_lock);
        if (viona_force_copy_state == VFC_UNINITALIZED) {
                major_t bnxe_major;

                /*
                 * The original code for viona featured an explicit check for
                 * the bnxe driver which, when found present, necessitated that
                 * all transmissions be copied into their own mblks instead of
                 * passing guest memory to the underlying device.
                 *
                 * The motivations for this are unclear, but until it can be
                 * proven unnecessary, the check lives on.
                 */
                viona_force_copy_state = VFC_COPY_UNEEDED;
                if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER))
                    != DDI_MAJOR_T_NONE) {
                        if (ddi_hold_installed_driver(bnxe_major) != NULL) {
                                viona_force_copy_state = VFC_COPY_REQUIRED;
                                ddi_rele_driver(bnxe_major);
                        }
                }
        }
        result = (viona_force_copy_state == VFC_COPY_REQUIRED);
        mutex_exit(&viona_force_copy_lock);

        return (result);
}

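/*
 * Allocate TX resources for a ring: one desb handle (with header buffer) per
 * descriptor when zero-copy is in use, plus an iovec array sized to the ring.
 */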
void
viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz)
{
        /* Allocate desb handles for TX ring if packet copying is disabled */
        if (!viona_tx_copy_needed()) {
                viona_desb_t *dp;

                dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP);
                ring->vr_txdesb = dp;
                for (uint_t i = 0; i < qsz; i++, dp++) {
                        dp->d_frtn.free_func = viona_desb_release;
                        dp->d_frtn.free_arg = (void *)dp;
                        dp->d_ring = ring;
                        dp->d_headers = kmem_zalloc(VIONA_MAX_HDRS_LEN,
                            KM_SLEEP);
                }
        }

        /* Allocate ring-sized iovec buffers for TX */
        ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP);
}

void
viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz)
{
        if (ring->vr_txdesb != NULL) {
                viona_desb_t *dp = ring->vr_txdesb;

                for (uint_t i = 0; i < qsz; i++, dp++) {
                        kmem_free(dp->d_headers, VIONA_MAX_HDRS_LEN);
                }
                kmem_free(ring->vr_txdesb, sizeof (viona_desb_t) * qsz);
                ring->vr_txdesb = NULL;
        }

        if (ring->vr_txiov != NULL) {
                kmem_free(ring->vr_txiov, sizeof (struct iovec) * qsz);
                ring->vr_txiov = NULL;
        }
}

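/*
 * Post a completed descriptor chain to the 'used' ring and notify the guest
 * with an interrupt if one is required.
 */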
static void
viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie)
{
        vq_pushchain(ring, len, cookie);

        membar_enter();
        viona_intr_ring(ring, B_FALSE);
}

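/* Number of packets to transmit before pausing to check for ring-stop events */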
#define TX_BURST_THRESH 32

void
viona_worker_tx(viona_vring_t *ring, viona_link_t *link)
{
        (void) thread_vsetname(curthread, "viona_tx_%p", ring);

        ASSERT(MUTEX_HELD(&ring->vr_lock));
        ASSERT3U(ring->vr_state, ==, VRS_RUN);

        mutex_exit(&ring->vr_lock);

        for (;;) {
                uint_t ntx = 0, burst = 0;

                viona_ring_disable_notify(ring);
                while (viona_ring_num_avail(ring) != 0) {
                        viona_tx(link, ring);
                        ntx++;
                        burst++;

                        /*
                         * It is advantageous for throughput to keep this
                         * transmission loop tight, but periodic breaks to
                         * check for other events are of value too.
                         */
                        if (burst >= TX_BURST_THRESH) {
                                mutex_enter(&ring->vr_lock);
                                const bool need_bail = vring_need_bail(ring);
                                mutex_exit(&ring->vr_lock);

                                if (need_bail) {
                                        break;
                                }
                                burst = 0;
                        }
                }

                VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx);

                /*
                 * Check for available descriptors on the ring once more in
                 * case a late addition raced with the NO_NOTIFY flag toggle.
                 *
                 * The barrier ensures that visibility of the no-notify
                 * store does not cross the viona_ring_num_avail() check below.
                 */
                viona_ring_enable_notify(ring);
                membar_enter();

                if (viona_ring_num_avail(ring) == 0 &&
                    (link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) {
                        /*
                         * The NOTIFY_ON_EMPTY interrupt should not pay heed to
                         * the presence of AVAIL_NO_INTERRUPT.
                         */
                        viona_intr_ring(ring, B_TRUE);
                }

                mutex_enter(&ring->vr_lock);
                for (;;) {
                        if (vring_need_bail(ring)) {
                                ring->vr_state = VRS_STOP;
                                viona_tx_wait_outstanding(ring);
                                return;
                        }

                        if (vmm_drv_lease_expired(ring->vr_lease)) {
                                ring->vr_state_flags |= VRSF_RENEW;
                                /*
                                 * When renewing the lease for the ring, no TX
                                 * frames may be outstanding, as they contain
                                 * references to guest memory.
                                 */
                                viona_tx_wait_outstanding(ring);

                                const boolean_t renewed =
                                    viona_ring_lease_renew(ring);
                                ring->vr_state_flags &= ~VRSF_RENEW;

                                if (!renewed) {
                                        /* stop ring on failed renewal */
                                        ring->vr_state = VRS_STOP;
                                        return;
                                }
                        }

                        if (viona_ring_num_avail(ring) != 0) {
                                break;
                        }

                        /* Wait for further activity on the ring */
                        (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
                }
                mutex_exit(&ring->vr_lock);
        }
        /* UNREACHABLE */
}

static void
viona_desb_release(viona_desb_t *dp)
{
        viona_vring_t *ring = dp->d_ring;
        uint_t ref;
        uint32_t len;
        uint16_t cookie;

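        /*
         * Each mblk built from this desb holds a reference on it, in addition
         * to the initial reference taken when the slot was claimed in
         * viona_tx().  Clean-up may only proceed once the last mblk reference
         * has been dropped.
         */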
        ref = atomic_dec_uint_nv(&dp->d_ref);
        if (ref > 1) {
                return;
        }

        /*
         * The desb corresponding to this index must be ready for reuse before
         * the descriptor is returned to the guest via the 'used' ring.
         */
        len = dp->d_len;
        cookie = dp->d_cookie;
        dp->d_len = 0;
        dp->d_cookie = 0;
        vmm_drv_page_release_chain(dp->d_pages);
        dp->d_pages = NULL;

        /*
         * Ensure all other changes to the desb are visible prior to zeroing its
         * refcount, signifying its readiness for reuse.
         */
        membar_exit();
        dp->d_ref = 0;

        viona_tx_done(ring, len, cookie);

        mutex_enter(&ring->vr_lock);
        if ((--ring->vr_xfer_outstanding) == 0) {
                cv_broadcast(&ring->vr_cv);
        }
        mutex_exit(&ring->vr_lock);
}

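/*
 * Translate the checksum and LSO requests in the virtio-net header into MAC
 * hardware-offload metadata on the mblk, returning B_FALSE if the request
 * cannot be satisfied by the underlying link.
 */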
static boolean_t
viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr,
    mblk_t *mp, uint32_t len)
{
        viona_link_t *link = ring->vr_link;
        const struct ether_header *eth;
        uint_t eth_len = sizeof (struct ether_header);
        ushort_t ftype;
        ipha_t *ipha = NULL;
        uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */
        uint16_t flags = 0;
        const uint_t csum_start = hdr->vrh_csum_start;
        const uint_t csum_stuff = hdr->vrh_csum_offset + csum_start;

        /*
         * Validate that the checksum offsets provided by the guest are within
         * the bounds of the packet.  Additionally, ensure that the checksum
         * contents field is within the headers mblk copied by viona_tx().
         */
        if (csum_start >= len || csum_start < eth_len || csum_stuff >= len ||
            (csum_stuff + sizeof (uint16_t)) > MBLKL(mp)) {
                VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
                VIONA_RING_STAT_INCR(ring, fail_hcksum);
                return (B_FALSE);
        }

        /*
         * This is guaranteed to be safe thanks to the header copying
         * done in viona_tx().
         */
        eth = (const struct ether_header *)mp->b_rptr;
        ftype = ntohs(eth->ether_type);

        if (ftype == ETHERTYPE_VLAN) {
                const struct ether_vlan_header *veth;

                /* punt on QinQ for now */
                eth_len = sizeof (struct ether_vlan_header);
                veth = (const struct ether_vlan_header *)eth;
                ftype = ntohs(veth->ether_type);
        }

        if (ftype == ETHERTYPE_IP) {
                ipha = (ipha_t *)(mp->b_rptr + eth_len);

                ipproto = ipha->ipha_protocol;
        } else if (ftype == ETHERTYPE_IPV6) {
                ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len);

                ipproto = ip6h->ip6_nxt;
        }

        /*
         * We ignore hdr_len because the spec says it can't be
         * trusted. Besides, our own stack will determine the header
         * boundary.
         */
        if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
            (hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 &&
            ftype == ETHERTYPE_IP) {
                uint16_t        *cksump;
                uint32_t        cksum;
                ipaddr_t        src = ipha->ipha_src;
                ipaddr_t        dst = ipha->ipha_dst;

                /*
                 * Our native IP stack doesn't set the L4 length field
                 * of the pseudo header when LSO is in play. Other IP
                 * stacks, e.g. Linux, do include the length field.
                 * This is a problem because the hardware expects that
                 * the length field is not set. When it is set it will
                 * cause an incorrect TCP checksum to be generated.
                 * The reason this works in Linux is because Linux
                 * corrects the pseudo-header checksum in the driver
                 * code. In order to get the correct HW checksum we
                 * need to assume the guest's IP stack gave us a bogus
                 * TCP partial checksum and calculate it ourselves.
                 */
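                /*
                 * Recompute the TCP pseudo-header checksum from scratch:
                 * start with the protocol component, add the upper and lower
                 * 16-bit halves of the source and destination addresses, then
                 * fold the carries back into 16 bits before storing the
                 * result in the TCP header.  The L4 length is deliberately
                 * omitted, per the hardware expectation described above.
                 */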
                cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha));
                cksum = IP_TCP_CSUM_COMP;
                cksum += (dst >> 16) + (dst & 0xFFFF) +
                    (src >> 16) + (src & 0xFFFF);
                cksum = (cksum & 0xFFFF) + (cksum >> 16);
                *(cksump) = (cksum & 0xFFFF) + (cksum >> 16);

                /*
                 * Since viona is a "legacy device", the data stored
                 * by the driver will be in the guest's native endian
                 * format (see sections 2.4.3 and 5.1.6.1 of the
                 * VIRTIO 1.0 spec for more info). At this time the
                 * only guests using viona are x86 and we can assume
                 * little-endian.
                 */
                lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO);

                /*
                 * Hardware, like ixgbe, expects the client to request
                 * IP header checksum offload if it's sending LSO (see
                 * ixgbe_get_context()). Unfortunately, virtio makes
                 * no allowances for negotiating IP header checksum
                 * and HW offload, only TCP checksum. We add the flag
                 * and zero-out the checksum field. This mirrors the
                 * behavior of our native IP stack (which does this in
                 * the interest of HW that expects the field to be
                 * zero).
                 */
                flags |= HCK_IPV4_HDRCKSUM;
                ipha->ipha_hdr_checksum = 0;
        }

        /*
         * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure
         * HW_LSO, if present, is not lost.
         */
        flags |= DB_CKSUMFLAGS(mp);

        /*
         * Partial checksum support from the NIC is ideal, since it most
         * closely maps to the interface defined by virtio.
         */
        if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
            (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
                /*
                 * MAC expects these offsets to be relative to the
                 * start of the L3 header rather than the L2 frame.
                 */
                flags |= HCK_PARTIALCKSUM;
                mac_hcksum_set(mp, csum_start - eth_len, csum_stuff - eth_len,
                    len - eth_len, 0, flags);
                return (B_TRUE);
        }

        /*
         * Without partial checksum support, look to the L3/L4 protocol
         * information to see if the NIC can handle it.  If not, the
         * checksum will need to be calculated inline.
         */
        if (ftype == ETHERTYPE_IP) {
                if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 &&
                    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
                        uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
                        *csump = 0;
                        flags |= HCK_FULLCKSUM;
                        mac_hcksum_set(mp, 0, 0, 0, 0, flags);
                        return (B_TRUE);
                }

                /* XXX: Implement manual fallback checksumming? */
                VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
                VIONA_RING_STAT_INCR(ring, fail_hcksum);
                return (B_FALSE);
        } else if (ftype == ETHERTYPE_IPV6) {
                if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 &&
                    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
                        uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
                        *csump = 0;
                        flags |= HCK_FULLCKSUM;
                        mac_hcksum_set(mp, 0, 0, 0, 0, flags);
                        return (B_TRUE);
                }

                /* XXX: Implement manual fallback checksumming? */
                VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp);
                VIONA_RING_STAT_INCR(ring, fail_hcksum6);
                return (B_FALSE);
        }

        /* Cannot even emulate hcksum for unrecognized protocols */
        VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp);
        VIONA_RING_STAT_INCR(ring, fail_hcksum_proto);
        return (B_FALSE);
}

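/*
 * Transmit a single packet: pop a descriptor chain from the 'avail' ring,
 * copy the packet headers (and, when copying is forced, the payload) into
 * mblks or otherwise loan the guest pages via desballoc(), apply any
 * requested checksum offload, and hand the result to MAC for transmission.
 */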
static void
viona_tx(viona_link_t *link, viona_vring_t *ring)
{
        struct iovec            *iov = ring->vr_txiov;
        const uint_t            max_segs = ring->vr_size;
        uint16_t                cookie;
        int                     i, n;
        uint32_t                len, base_off = 0;
        uint32_t                min_copy = VIONA_MAX_HDRS_LEN;
        mblk_t                  *mp_head, *mp_tail, *mp;
        viona_desb_t            *dp = NULL;
        mac_client_handle_t     link_mch = link->l_mch;
        const struct virtio_net_hdr *hdr;
        vmm_page_t *pages = NULL;

        mp_head = mp_tail = NULL;

        ASSERT(iov != NULL);

        n = vq_popchain(ring, iov, max_segs, &cookie, &pages);
        if (n == 0) {
                VIONA_PROBE1(tx_absent, viona_vring_t *, ring);
                VIONA_RING_STAT_INCR(ring, tx_absent);
                return;
        } else if (n < 0) {
                /*
                 * Any error encountered in vq_popchain has already resulted in
                 * specific probe and statistic handling.  Further action here
                 * is unnecessary.
                 */
                return;
        }

        /* Grab the header and ensure it is of adequate length */
        hdr = (const struct virtio_net_hdr *)iov[0].iov_base;
        len = iov[0].iov_len;
        if (len < sizeof (struct virtio_net_hdr)) {
                goto drop_fail;
        }

        /* Make sure the packet headers are always in the first mblk. */
        if (ring->vr_txdesb != NULL) {
                dp = &ring->vr_txdesb[cookie];

                /*
                 * If the guest driver is operating properly, each desb slot
                 * should be available for use when processing a TX descriptor
                 * from the 'avail' ring.  In the case of drivers that reuse a
                 * descriptor before it has been posted to the 'used' ring, the
                 * data is simply dropped.
                 */
                if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) {
                        dp = NULL;
                        goto drop_fail;
                }

                dp->d_cookie = cookie;
                mp_head = desballoc(dp->d_headers, VIONA_MAX_HDRS_LEN, 0,
                    &dp->d_frtn);

                /* Account for the successful desballoc. */
                if (mp_head != NULL)
                        dp->d_ref++;
        } else {
                mp_head = allocb(VIONA_MAX_HDRS_LEN, 0);
        }

        if (mp_head == NULL)
                goto drop_fail;

        mp_tail = mp_head;

        /*
         * We always copy enough of the guest data to cover the
         * headers. This protects us from TOCTOU attacks and allows
         * message block length assumptions to be made in subsequent
         * code. In many cases, this means copying more data than
         * strictly necessary. That's okay, as it is the larger packets
         * (such as LSO) that really benefit from desballoc().
         */
        for (i = 1; i < n; i++) {
                const uint32_t to_copy = MIN(min_copy, iov[i].iov_len);

                bcopy(iov[i].iov_base, mp_head->b_wptr, to_copy);
                mp_head->b_wptr += to_copy;
                len += to_copy;
                min_copy -= to_copy;

                /*
                 * We've met the minimum copy requirement. The rest of
                 * the guest data can be referenced.
                 */
                if (min_copy == 0) {
                        /*
                         * If we copied all contents of this
                         * descriptor then move onto the next one.
                         * Otherwise, record how far we are into the
                         * current descriptor.
                         */
                        if (iov[i].iov_len == to_copy)
                                i++;
                        else
                                base_off = to_copy;

                        break;
                }
        }

        ASSERT3P(mp_head, !=, NULL);
        ASSERT3P(mp_tail, !=, NULL);

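        /*
         * Chain the remaining guest buffers onto the headers mblk, either by
         * loaning the guest pages via desballoc() or by copying them when
         * zero-copy is disabled.
         */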
        for (; i < n; i++) {
                uintptr_t base = (uintptr_t)iov[i].iov_base + base_off;
                uint32_t chunk = iov[i].iov_len - base_off;

                ASSERT3U(base_off, <, iov[i].iov_len);
                ASSERT3U(chunk, >, 0);

                if (dp != NULL) {
                        mp = desballoc((uchar_t *)base, chunk, 0, &dp->d_frtn);
                        if (mp == NULL) {
                                goto drop_fail;
                        }
                        dp->d_ref++;
                } else {
                        mp = allocb(chunk, BPRI_MED);
                        if (mp == NULL) {
                                goto drop_fail;
                        }
                        bcopy((uchar_t *)base, mp->b_wptr, chunk);
                }

                base_off = 0;
                len += chunk;
                mp->b_wptr += chunk;
                mp_tail->b_cont = mp;
                mp_tail = mp;
        }

        if (VNETHOOK_INTERESTED_OUT(link->l_neti)) {
                /*
                 * The hook consumer may elect to free the mblk_t and set
                 * our mblk_t ** to NULL.  When using a viona_desb_t
                 * (dp != NULL), we do not want the corresponding cleanup to
                 * occur during the viona_hook() call. We instead want to
                 * reset and recycle dp for future use.  To prevent cleanup
                 * during the viona_hook() call, we take a ref on dp (if being
                 * used), and release it on success.  On failure, the
                 * freemsgchain() call will release all the refs taken earlier
                 * in viona_tx() (aside from the initial ref and the one we
                 * take), and drop_hook will reset dp for reuse.
                 */
                if (dp != NULL)
                        dp->d_ref++;

                /*
                 * Pass &mp instead of &mp_head so we don't lose track of
                 * mp_head if the hook consumer (i.e. ipf) elects to free mp
                 * and set mp to NULL.
                 */
                mp = mp_head;
                if (viona_hook(link, ring, &mp, B_TRUE) != 0) {
                        if (mp != NULL)
                                freemsgchain(mp);
                        goto drop_hook;
                }

                if (dp != NULL) {
                        dp->d_ref--;

                        /*
                         * It is possible that the hook(s) accepted the packet,
                         * but as part of its processing, it issued a pull-up
                         * which released all references to the desb.  In that
                         * case, go back to acting like the packet is entirely
                         * copied (which it is).
                         */
                        if (dp->d_ref == 1) {
                                dp->d_cookie = 0;
                                dp->d_ref = 0;
                                dp = NULL;
                        }
                }
        }

        /*
         * Request hardware checksumming, if necessary. If the guest
         * sent an LSO packet then it must have also negotiated and
         * requested partial checksum; therefore the LSO logic is
         * contained within viona_tx_csum().
         */
        if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 &&
            (hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) {
                if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) {
                        goto drop_fail;
                }
        }

        if (dp != NULL) {
                dp->d_len = len;
                dp->d_pages = pages;
                mutex_enter(&ring->vr_lock);
                ring->vr_xfer_outstanding++;
                mutex_exit(&ring->vr_lock);
        } else {
                /*
                 * If the data was cloned out of the ring, the descriptors can
                 * be marked as 'used' now, rather than deferring that action
                 * until after successful packet transmission.
                 */
                vmm_drv_page_release_chain(pages);
                viona_tx_done(ring, len, cookie);
        }

        /*
         * From viona's point of view, this is a successful transmit, even if
         * something downstream decides to drop the packet.
         */
        viona_ring_stat_accept(ring, len);

        /*
         * We're potentially going deep into the networking layer; make sure
         * the guest can't run concurrently.
         */
        smt_begin_unsafe();
        /*
         * Ignore, for now, any signal from MAC about whether the outgoing
         * packet was dropped or not.
         */
        (void) mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL);
        smt_end_unsafe();
        return;

drop_fail:
        /*
         * On the off chance that memory is not available via the desballoc or
         * allocb calls, there are few options left besides failing and
         * dropping the frame on the floor.
         *
         * First account for it in the error stats.
         */
        viona_ring_stat_error(ring);

        if (dp != NULL) {
                /*
                 * Take an additional reference on the desb handle (if present)
                 * so any desballoc-sourced mblks can release their hold on it
                 * without the handle reaching its final state and executing
                 * its clean-up logic.
                 */
                dp->d_ref++;
        }

        /*
         * Free any already-allocated blocks and sum up the total length of the
         * dropped data to be released to the used ring.
         */
        freemsgchain(mp_head);

drop_hook:
        len = 0;
        for (uint_t i = 0; i < n; i++) {
                len += iov[i].iov_len;
        }

        if (dp != NULL) {
                VERIFY(dp->d_ref == 2);

                /* Clean up the desb handle, releasing the extra hold. */
                dp->d_len = 0;
                dp->d_cookie = 0;
                dp->d_ref = 0;
        }

        /* Count in the stats as a drop, rather than an error */
        viona_ring_stat_drop(ring);

        VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, len,
            uint16_t, cookie);
        vmm_drv_page_release_chain(pages);
        viona_tx_done(ring, len, cookie);
}