1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
  29  */
  30 
  31 /*
  32  *
  33  * Copyright (c) 2004 Christian Limpach.
  34  * All rights reserved.
  35  *
  36  * Redistribution and use in source and binary forms, with or without
  37  * modification, are permitted provided that the following conditions
  38  * are met:
  39  * 1. Redistributions of source code must retain the above copyright
  40  *    notice, this list of conditions and the following disclaimer.
  41  * 2. Redistributions in binary form must reproduce the above copyright
  42  *    notice, this list of conditions and the following disclaimer in the
  43  *    documentation and/or other materials provided with the distribution.
  44  * 3. This section intentionally left blank.
  45  * 4. The name of the author may not be used to endorse or promote products
  46  *    derived from this software without specific prior written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  49  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  50  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  51  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  52  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  53  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  54  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  55  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  56  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  57  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  58  */
  59 /*
  60  * Section 3 of the above license was updated in response to bug 6379571.
  61  */
  62 
  63 /*
  64  * xnf.c - GLDv3 network driver for domU.
  65  */
  66 
  67 /*
  68  * This driver uses four per-instance locks:
  69  *
  70  * xnf_gref_lock:
  71  *
  72  *    Protects access to the grant reference list stored in
  73  *    xnf_gref_head. Grant references should be acquired and released
  74  *    using gref_get() and gref_put() respectively.
  75  *
  76  * xnf_schedlock:
  77  *
  78  *    Protects:
  79  *    xnf_need_sched - used to record that a previous transmit attempt
  80  *       failed (and consequently it will be necessary to call
  81  *       mac_tx_update() when transmit resources are available).
  82  *    xnf_pending_multicast - the number of multicast requests that
  83  *       have been submitted to the backend for which we have not
  84  *       processed responses.
  85  *
  86  * xnf_txlock:
  87  *
  88  *    Protects the transmit ring (xnf_tx_ring) and associated
  89  *    structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head).
  90  *
  91  * xnf_rxlock:
  92  *
  93  *    Protects the receive ring (xnf_rx_ring) and associated
  94  *    structures (notably xnf_rx_pkt_info).
  95  *
  96  * If driver-global state that affects both the transmit and receive
  97  * rings is manipulated, both xnf_txlock and xnf_rxlock should be
  98  * held, in that order.
  99  *
 100  * xnf_schedlock is acquired both whilst holding xnf_txlock and
 101  * without. It should always be acquired after xnf_txlock if both are
 102  * held.
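      *
      * For example (sketch only; this mirrors what xnf_set_multicast()
      * does when it submits a request):
      *
      *    mutex_enter(&xnfp->xnf_txlock);
      *    ...
      *    mutex_enter(&xnfp->xnf_schedlock);
      *    xnfp->xnf_pending_multicast++;
      *    mutex_exit(&xnfp->xnf_schedlock);
      *    ...
      *    mutex_exit(&xnfp->xnf_txlock);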
 103  *
 104  * Notes:
 105  * - atomic_add_64() is used to manipulate counters where we require
 106  *   accuracy. For counters intended only for observation by humans,
 107  *   post increment/decrement are used instead.
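      *   E.g. xnf_stat_gref_outstanding is maintained with atomic_inc_64()
      *   and atomic_dec_64(), whereas xnf_stat_gref_failure is simply
      *   post-incremented.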
 108  */
 109 
 110 #include <sys/types.h>
 111 #include <sys/errno.h>
 112 #include <sys/param.h>
 113 #include <sys/sysmacros.h>
 114 #include <sys/systm.h>
 115 #include <sys/stream.h>
 116 #include <sys/strsubr.h>
 117 #include <sys/strsun.h>
 118 #include <sys/conf.h>
 119 #include <sys/ddi.h>
 120 #include <sys/devops.h>
 121 #include <sys/sunddi.h>
 122 #include <sys/sunndi.h>
 123 #include <sys/dlpi.h>
 124 #include <sys/ethernet.h>
 125 #include <sys/strsun.h>
 126 #include <sys/pattr.h>
 127 #include <inet/ip.h>
 128 #include <inet/ip_impl.h>
 129 #include <inet/tcp.h>
 130 #include <netinet/udp.h>
 131 #include <sys/gld.h>
 132 #include <sys/modctl.h>
 133 #include <sys/mac_provider.h>
 134 #include <sys/mac_ether.h>
 135 #include <sys/bootinfo.h>
 136 #include <sys/mach_mmu.h>
 137 #ifdef  XPV_HVM_DRIVER
 138 #include <sys/xpv_support.h>
 139 #include <sys/hypervisor.h>
 140 #else
 141 #include <sys/hypervisor.h>
 142 #include <sys/evtchn_impl.h>
 143 #include <sys/balloon_impl.h>
 144 #endif
 145 #include <xen/public/io/netif.h>
 146 #include <sys/gnttab.h>
 147 #include <xen/sys/xendev.h>
 148 #include <sys/sdt.h>
 149 #include <sys/note.h>
 150 #include <sys/debug.h>
 151 
 152 #include <io/xnf.h>
 153 
 154 #if defined(DEBUG) || defined(__lint)
 155 #define XNF_DEBUG
 156 #endif
 157 
 158 #ifdef XNF_DEBUG
 159 int xnf_debug = 0;
 160 xnf_t *xnf_debug_instance = NULL;
 161 #endif
 162 
 163 /*
 164  * On a 32 bit PAE system physical and machine addresses are larger
  165  * than 32 bits.  ddi_btop() on such systems takes an unsigned long
 166  * argument, and so addresses above 4G are truncated before ddi_btop()
 167  * gets to see them.  To avoid this, code the shift operation here.
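      *
      * For example, with 4k pages a machine address of 0x123456000 would be
      * truncated to 0x23456000, causing ddi_btop() to return frame 0x23456
      * rather than the correct 0x123456.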
 168  */
 169 #define xnf_btop(addr)  ((addr) >> PAGESHIFT)
 170 
 171 /*
 172  * The parameters below should only be changed in /etc/system, never in mdb.
 173  */
 174 
 175 /*
 176  * Should we use the multicast control feature if the backend provides
 177  * it?
 178  */
 179 boolean_t xnf_multicast_control = B_TRUE;
 180 
 181 /*
 182  * Should we allow scatter-gather for tx if backend allows it?
 183  */
 184 boolean_t xnf_enable_tx_sg = B_TRUE;
 185 
 186 /*
 187  * Should we allow scatter-gather for rx if backend allows it?
 188  */
 189 boolean_t xnf_enable_rx_sg = B_TRUE;
 190 
 191 /*
 192  * Should we allow lso for tx sends if backend allows it?
  193  * Requires xnf_enable_tx_sg to also be set to B_TRUE.
 194  */
 195 boolean_t xnf_enable_lso = B_TRUE;
 196 
 197 /*
 198  * Should we allow lro on rx if backend supports it?
  199  * Requires xnf_enable_rx_sg to also be set to B_TRUE.
 200  *
 201  * !! WARNING !!
 202  * LRO is not yet supported in the OS so this should be left as FALSE.
 203  * !! WARNING !!
 204  */
 205 boolean_t xnf_enable_lro = B_FALSE;
 206 
 207 /*
 208  * Received packets below this size are copied to a new streams buffer
 209  * rather than being desballoc'ed.
 210  *
 211  * This value is chosen to accommodate traffic where there are a large
 212  * number of small packets. For data showing a typical distribution,
 213  * see:
 214  *
 215  * Sinha07a:
 216  *      Rishi Sinha, Christos Papadopoulos, and John
 217  *      Heidemann. Internet Packet Size Distributions: Some
 218  *      Observations. Technical Report ISI-TR-2007-643,
  219  *      USC/Information Sciences Institute, May, 2007. Originally
 220  *      released October 2005 as web page
 221  *      http://netweb.usc.edu/~sinha/pkt-sizes/.
 222  *      <http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>.
 223  */
 224 size_t xnf_rx_copy_limit = 64;
 225 
 226 #define INVALID_GRANT_HANDLE    ((grant_handle_t)-1)
 227 #define INVALID_GRANT_REF       ((grant_ref_t)-1)
 228 #define INVALID_TX_ID           ((uint16_t)-1)
 229 
 230 #define TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)]))
 231 #define TX_ID_VALID(i) \
 232         (((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))
 233 
 234 /*
 235  * calculate how many pages are spanned by an mblk fragment
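      *
      * For example, with 4k pages a 100 byte fragment that starts 40 bytes
      * before a page boundary spans two pages, while a zero-length fragment
      * spans none.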
 236  */
 237 #define xnf_mblk_pages(mp)      (MBLKL(mp) == 0 ? 0 : \
 238     xnf_btop((uintptr_t)mp->b_wptr - 1) - xnf_btop((uintptr_t)mp->b_rptr) + 1)
 239 
 240 /* Required system entry points */
 241 static int      xnf_attach(dev_info_t *, ddi_attach_cmd_t);
 242 static int      xnf_detach(dev_info_t *, ddi_detach_cmd_t);
 243 
 244 /* Required driver entry points for Nemo */
 245 static int      xnf_start(void *);
 246 static void     xnf_stop(void *);
 247 static int      xnf_set_mac_addr(void *, const uint8_t *);
 248 static int      xnf_set_multicast(void *, boolean_t, const uint8_t *);
 249 static int      xnf_set_promiscuous(void *, boolean_t);
 250 static mblk_t   *xnf_send(void *, mblk_t *);
 251 static uint_t   xnf_intr(caddr_t);
 252 static int      xnf_stat(void *, uint_t, uint64_t *);
 253 static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
 254 static int xnf_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
 255 static int xnf_setprop(void *, const char *, mac_prop_id_t, uint_t,
 256     const void *);
 257 static void xnf_propinfo(void *, const char *, mac_prop_id_t,
 258     mac_prop_info_handle_t);
 259 
 260 /* Driver private functions */
 261 static int xnf_alloc_dma_resources(xnf_t *);
 262 static void xnf_release_dma_resources(xnf_t *);
 263 static void xnf_release_mblks(xnf_t *);
 264 
 265 static int xnf_buf_constructor(void *, void *, int);
 266 static void xnf_buf_destructor(void *, void *);
 267 static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t);
 268 #pragma inline(xnf_buf_get)
 269 static void xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t);
 270 #pragma inline(xnf_buf_put)
 271 static void xnf_buf_refresh(xnf_buf_t *);
 272 #pragma inline(xnf_buf_refresh)
 273 static void xnf_buf_recycle(xnf_buf_t *);
 274 
 275 static int xnf_tx_buf_constructor(void *, void *, int);
 276 static void xnf_tx_buf_destructor(void *, void *);
 277 
 278 static grant_ref_t xnf_gref_get(xnf_t *);
 279 #pragma inline(xnf_gref_get)
 280 static void xnf_gref_put(xnf_t *, grant_ref_t);
 281 #pragma inline(xnf_gref_put)
 282 
 283 static xnf_txid_t *xnf_txid_get(xnf_t *);
 284 #pragma inline(xnf_txid_get)
 285 static void xnf_txid_put(xnf_t *, xnf_txid_t *);
 286 #pragma inline(xnf_txid_put)
 287 
 288 static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *);
 289 static int xnf_tx_clean_ring(xnf_t  *);
 290 static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
 291     void *, void *);
 292 static boolean_t xnf_kstat_init(xnf_t *);
 293 static void xnf_rx_collect(xnf_t *);
 294 
 295 #define XNF_CALLBACK_FLAGS      (MC_GETCAPAB | MC_PROPERTIES)
 296 
 297 static mac_callbacks_t xnf_callbacks = {
 298         .mc_callbacks = XNF_CALLBACK_FLAGS,
 299         .mc_getstat = xnf_stat,
 300         .mc_start = xnf_start,
 301         .mc_stop = xnf_stop,
 302         .mc_setpromisc = xnf_set_promiscuous,
 303         .mc_multicst = xnf_set_multicast,
 304         .mc_unicst = xnf_set_mac_addr,
 305         .mc_tx = xnf_send,
 306         .mc_getcapab = xnf_getcapab,
 307         .mc_setprop = xnf_setprop,
 308         .mc_getprop = xnf_getprop,
 309         .mc_propinfo = xnf_propinfo,
 310 };
 311 
 312 /* DMA attributes for network ring buffer */
 313 static ddi_dma_attr_t ringbuf_dma_attr = {
 314         .dma_attr_version = DMA_ATTR_V0,
 315         .dma_attr_addr_lo = 0,
 316         .dma_attr_addr_hi = 0xffffffffffffffffULL,
 317         .dma_attr_count_max = 0x7fffffff,
 318         .dma_attr_align = MMU_PAGESIZE,
 319         .dma_attr_burstsizes = 0x7ff,
 320         .dma_attr_minxfer = 1,
 321         .dma_attr_maxxfer = 0xffffffffU,
 322         .dma_attr_seg = 0xffffffffffffffffULL,
 323         .dma_attr_sgllen = 1,
 324         .dma_attr_granular = 1,
 325         .dma_attr_flags = 0
 326 };
 327 
 328 /* DMA attributes for receive data */
 329 static ddi_dma_attr_t rx_buf_dma_attr = {
 330         .dma_attr_version = DMA_ATTR_V0,
 331         .dma_attr_addr_lo = 0,
 332         .dma_attr_addr_hi = 0xffffffffffffffffULL,
 333         .dma_attr_count_max = MMU_PAGEOFFSET,
 334         .dma_attr_align = MMU_PAGESIZE, /* allocation alignment */
 335         .dma_attr_burstsizes = 0x7ff,
 336         .dma_attr_minxfer = 1,
 337         .dma_attr_maxxfer = 0xffffffffU,
 338         .dma_attr_seg = 0xffffffffffffffffULL,
 339         .dma_attr_sgllen = 1,
 340         .dma_attr_granular = 1,
 341         .dma_attr_flags = 0
 342 };
 343 
 344 /* DMA attributes for transmit data */
 345 static ddi_dma_attr_t tx_buf_dma_attr = {
 346         .dma_attr_version = DMA_ATTR_V0,
 347         .dma_attr_addr_lo = 0,
 348         .dma_attr_addr_hi = 0xffffffffffffffffULL,
 349         .dma_attr_count_max = MMU_PAGEOFFSET,
 350         .dma_attr_align = 1,
 351         .dma_attr_burstsizes = 0x7ff,
 352         .dma_attr_minxfer = 1,
 353         .dma_attr_maxxfer = 0xffffffffU,
 354         .dma_attr_seg = XEN_DATA_BOUNDARY - 1, /* segment boundary */
 355         .dma_attr_sgllen = XEN_MAX_TX_DATA_PAGES, /* max number of segments */
 356         .dma_attr_granular = 1,
 357         .dma_attr_flags = 0
 358 };
 359 
 360 /* DMA access attributes for registers and descriptors */
 361 static ddi_device_acc_attr_t accattr = {
 362         DDI_DEVICE_ATTR_V0,
 363         DDI_STRUCTURE_LE_ACC,   /* This is a little-endian device */
 364         DDI_STRICTORDER_ACC
 365 };
 366 
 367 /* DMA access attributes for data: NOT to be byte swapped. */
 368 static ddi_device_acc_attr_t data_accattr = {
 369         DDI_DEVICE_ATTR_V0,
 370         DDI_NEVERSWAP_ACC,
 371         DDI_STRICTORDER_ACC
 372 };
 373 
 374 DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
 375     nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported);
 376 
 377 static struct modldrv xnf_modldrv = {
 378         &mod_driverops,
 379         "Virtual Ethernet driver",
 380         &xnf_dev_ops
 381 };
 382 
 383 static struct modlinkage modlinkage = {
 384         MODREV_1, &xnf_modldrv, NULL
 385 };
 386 
 387 int
 388 _init(void)
 389 {
 390         int r;
 391 
 392         mac_init_ops(&xnf_dev_ops, "xnf");
 393         r = mod_install(&modlinkage);
 394         if (r != DDI_SUCCESS)
 395                 mac_fini_ops(&xnf_dev_ops);
 396 
 397         return (r);
 398 }
 399 
 400 int
 401 _fini(void)
 402 {
 403         return (EBUSY); /* XXPV should be removable */
 404 }
 405 
 406 int
 407 _info(struct modinfo *modinfop)
 408 {
 409         return (mod_info(&modlinkage, modinfop));
 410 }
 411 
 412 /*
 413  * Acquire a grant reference.
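      *
      * If the free list is exhausted, a further batch of 16 references is
      * allocated from the hypervisor before giving up.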
 414  */
 415 static grant_ref_t
 416 xnf_gref_get(xnf_t *xnfp)
 417 {
 418         grant_ref_t gref;
 419 
 420         mutex_enter(&xnfp->xnf_gref_lock);
 421 
 422         do {
 423                 gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head);
 424 
 425         } while ((gref == INVALID_GRANT_REF) &&
 426             (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0));
 427 
 428         mutex_exit(&xnfp->xnf_gref_lock);
 429 
 430         if (gref == INVALID_GRANT_REF) {
 431                 xnfp->xnf_stat_gref_failure++;
 432         } else {
 433                 atomic_inc_64(&xnfp->xnf_stat_gref_outstanding);
 434                 if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak)
 435                         xnfp->xnf_stat_gref_peak =
 436                             xnfp->xnf_stat_gref_outstanding;
 437         }
 438 
 439         return (gref);
 440 }
 441 
 442 /*
 443  * Release a grant reference.
 444  */
 445 static void
 446 xnf_gref_put(xnf_t *xnfp, grant_ref_t gref)
 447 {
 448         ASSERT(gref != INVALID_GRANT_REF);
 449 
 450         mutex_enter(&xnfp->xnf_gref_lock);
 451         gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref);
 452         mutex_exit(&xnfp->xnf_gref_lock);
 453 
 454         atomic_dec_64(&xnfp->xnf_stat_gref_outstanding);
 455 }
 456 
 457 /*
 458  * Acquire a transmit id.
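      *
      * Free ids are kept on a list threaded through the xnf_tx_pkt_id[]
      * array and headed by xnf_tx_pkt_id_head; NULL is returned when the
      * list is empty.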
 459  */
 460 static xnf_txid_t *
 461 xnf_txid_get(xnf_t *xnfp)
 462 {
 463         xnf_txid_t *tidp;
 464 
 465         ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
 466 
 467         if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID)
 468                 return (NULL);
 469 
 470         ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head));
 471 
 472         tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head);
 473         xnfp->xnf_tx_pkt_id_head = tidp->next;
 474         tidp->next = INVALID_TX_ID;
 475 
 476         ASSERT(tidp->txbuf == NULL);
 477 
 478         return (tidp);
 479 }
 480 
 481 /*
 482  * Release a transmit id.
 483  */
 484 static void
 485 xnf_txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
 486 {
 487         ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
 488         ASSERT(TX_ID_VALID(tidp->id));
 489         ASSERT(tidp->next == INVALID_TX_ID);
 490 
 491         tidp->txbuf = NULL;
 492         tidp->next = xnfp->xnf_tx_pkt_id_head;
 493         xnfp->xnf_tx_pkt_id_head = tidp->id;
 494 }
 495 
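     /*
      * Free a single TX_DATA txbuf: either return its lookaside buffer or
      * release its grant reference and unbind its DMA handle, then free the
      * attached mblk (if any) and return the txbuf to its cache.
      */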
 496 static void
 497 xnf_data_txbuf_free(xnf_t *xnfp, xnf_txbuf_t *txp)
 498 {
 499         ASSERT3U(txp->tx_type, ==, TX_DATA);
 500 
 501         /*
 502          * We are either using a lookaside buffer or we are mapping existing
 503          * buffers.
 504          */
 505         if (txp->tx_bdesc != NULL) {
 506                 ASSERT(!txp->tx_handle_bound);
 507                 xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE);
 508         } else {
 509                 if (txp->tx_txreq.gref != INVALID_GRANT_REF) {
 510                         if (gnttab_query_foreign_access(txp->tx_txreq.gref) !=
 511                             0) {
 512                                 cmn_err(CE_PANIC, "tx grant %d still in use by "
 513                                     "backend domain", txp->tx_txreq.gref);
 514                         }
 515                         (void) gnttab_end_foreign_access_ref(
 516                             txp->tx_txreq.gref, 1);
 517                         xnf_gref_put(xnfp, txp->tx_txreq.gref);
 518                 }
 519 
 520                 if (txp->tx_handle_bound)
 521                         (void) ddi_dma_unbind_handle(txp->tx_dma_handle);
 522         }
 523 
 524         if (txp->tx_mp != NULL)
 525                 freemsg(txp->tx_mp);
 526 
 527         if (txp->tx_prev != NULL) {
 528                 ASSERT3P(txp->tx_prev->tx_next, ==, txp);
 529                 txp->tx_prev->tx_next = NULL;
 530         }
 531 
 532         if (txp->tx_txreq.id != INVALID_TX_ID) {
 533                 /*
  534                  * This should only be possible when resuming from a suspend.
 535                  */
 536                 ASSERT(!xnfp->xnf_connected);
 537                 xnf_txid_put(xnfp, TX_ID_TO_TXID(xnfp, txp->tx_txreq.id));
 538                 txp->tx_txreq.id = INVALID_TX_ID;
 539         }
 540 
 541         kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
 542 }
 543 
 544 static void
 545 xnf_data_txbuf_free_chain(xnf_t *xnfp, xnf_txbuf_t *txp)
 546 {
 547         if (txp == NULL)
 548                 return;
 549 
 550         while (txp->tx_next != NULL)
 551                 txp = txp->tx_next;
 552 
 553         /*
 554          * We free the chain in reverse order so that grants can be released
 555          * for all dma chunks before unbinding the dma handles. The mblk is
 556          * freed last, after all its fragments' dma handles are unbound.
 557          */
 558         xnf_txbuf_t *prev;
 559         for (; txp != NULL; txp = prev) {
 560                 prev = txp->tx_prev;
 561                 xnf_data_txbuf_free(xnfp, txp);
 562         }
 563 }
 564 
 565 static xnf_txbuf_t *
 566 xnf_data_txbuf_alloc(xnf_t *xnfp)
 567 {
 568         xnf_txbuf_t *txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
 569         txp->tx_type = TX_DATA;
 570         txp->tx_next = NULL;
 571         txp->tx_prev = NULL;
 572         txp->tx_head = txp;
 573         txp->tx_frags_to_ack = 0;
 574         txp->tx_mp = NULL;
 575         txp->tx_bdesc = NULL;
 576         txp->tx_handle_bound = B_FALSE;
 577         txp->tx_txreq.gref = INVALID_GRANT_REF;
 578         txp->tx_txreq.id = INVALID_TX_ID;
 579 
 580         return (txp);
 581 }
 582 
 583 /*
 584  * Get `wanted' slots in the transmit ring, waiting for at least that
 585  * number if `wait' is B_TRUE. Force the ring to be cleaned by setting
 586  * `wanted' to zero.
 587  *
 588  * Return the number of slots available.
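      *
      * For example (sketch only), a caller that merely wants to force a
      * clean of the ring without blocking can, with xnf_txlock held, use:
      *
      *    (void) xnf_tx_slots_get(xnfp, 0, B_FALSE);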
 589  */
 590 static int
 591 xnf_tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait)
 592 {
 593         int slotsfree;
 594         boolean_t forced_clean = (wanted == 0);
 595 
 596         ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
 597 
 598         /* LINTED: constant in conditional context */
 599         while (B_TRUE) {
 600                 slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring);
 601 
 602                 if ((slotsfree < wanted) || forced_clean)
 603                         slotsfree = xnf_tx_clean_ring(xnfp);
 604 
 605                 /*
 606                  * If there are more than we need free, tell other
 607                  * people to come looking again. We hold txlock, so we
 608                  * are able to take our slots before anyone else runs.
 609                  */
 610                 if (slotsfree > wanted)
 611                         cv_broadcast(&xnfp->xnf_cv_tx_slots);
 612 
 613                 if (slotsfree >= wanted)
 614                         break;
 615 
 616                 if (!wait)
 617                         break;
 618 
 619                 cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock);
 620         }
 621 
 622         ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring)));
 623 
 624         return (slotsfree);
 625 }
 626 
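     /*
      * Grant the backend access to the tx and rx ring pages, discard any
      * state left over from a previous incarnation of the rings (e.g. when
      * resuming after a migration), re-initialise both shared rings and
      * re-post receive buffers to the rx ring.
      */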
 627 static int
 628 xnf_setup_rings(xnf_t *xnfp)
 629 {
 630         domid_t                 oeid;
 631         struct xenbus_device    *xsd;
 632         RING_IDX                i;
 633         int                     err;
 634         xnf_txid_t              *tidp;
 635         xnf_buf_t **bdescp;
 636 
 637         oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
 638         xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
 639 
 640         if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
 641                 gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
 642 
 643         err = gnttab_grant_foreign_access(oeid,
 644             xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0);
 645         if (err <= 0) {
 646                 err = -err;
 647                 xenbus_dev_error(xsd, err, "granting access to tx ring page");
 648                 goto out;
 649         }
 650         xnfp->xnf_tx_ring_ref = (grant_ref_t)err;
 651 
 652         if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
 653                 gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
 654 
 655         err = gnttab_grant_foreign_access(oeid,
 656             xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0);
 657         if (err <= 0) {
 658                 err = -err;
 659                 xenbus_dev_error(xsd, err, "granting access to rx ring page");
 660                 goto out;
 661         }
 662         xnfp->xnf_rx_ring_ref = (grant_ref_t)err;
 663 
 664         mutex_enter(&xnfp->xnf_txlock);
 665 
 666         /*
 667          * We first cleanup the TX ring in case we are doing a resume.
 668          * Note that this can lose packets, but we expect to stagger on.
 669          */
  670         xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */
 671         for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
 672             i < NET_TX_RING_SIZE;
 673             i++, tidp++) {
 674                 xnf_txbuf_t *txp = tidp->txbuf;
 675                 if (txp == NULL)
 676                         continue;
 677 
 678                 switch (txp->tx_type) {
 679                 case TX_DATA:
 680                         /*
 681                          * txid_put() will be called for each txbuf's txid in
 682                          * the chain which will result in clearing tidp->txbuf.
 683                          */
 684                         xnf_data_txbuf_free_chain(xnfp, txp);
 685 
 686                         break;
 687 
 688                 case TX_MCAST_REQ:
 689                         txp->tx_type = TX_MCAST_RSP;
 690                         txp->tx_status = NETIF_RSP_DROPPED;
 691                         cv_broadcast(&xnfp->xnf_cv_multicast);
 692 
 693                         /*
 694                          * The request consumed two slots in the ring,
 695                          * yet only a single xnf_txid_t is used. Step
 696                          * over the empty slot.
 697                          */
 698                         i++;
 699                         ASSERT3U(i, <, NET_TX_RING_SIZE);
 700                         break;
 701 
 702                 case TX_MCAST_RSP:
 703                         break;
 704                 }
 705         }
 706 
 707         /*
 708          * Now purge old list and add each txid to the new free list.
 709          */
  710         xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */
 711         for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
 712             i < NET_TX_RING_SIZE;
 713             i++, tidp++) {
 714                 tidp->id = i;
 715                 ASSERT3P(tidp->txbuf, ==, NULL);
 716                 tidp->next = INVALID_TX_ID; /* Appease txid_put(). */
 717                 xnf_txid_put(xnfp, tidp);
 718         }
 719 
 720         /* LINTED: constant in conditional context */
 721         SHARED_RING_INIT(xnfp->xnf_tx_ring.sring);
 722         /* LINTED: constant in conditional context */
 723         FRONT_RING_INIT(&xnfp->xnf_tx_ring,
 724             xnfp->xnf_tx_ring.sring, PAGESIZE);
 725 
 726         mutex_exit(&xnfp->xnf_txlock);
 727 
 728         mutex_enter(&xnfp->xnf_rxlock);
 729 
 730         /*
 731          * Clean out any buffers currently posted to the receive ring
 732          * before we reset it.
 733          */
 734         for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0];
 735             i < NET_RX_RING_SIZE;
 736             i++, bdescp++) {
 737                 if (*bdescp != NULL) {
 738                         xnf_buf_put(xnfp, *bdescp, B_FALSE);
 739                         *bdescp = NULL;
 740                 }
 741         }
 742 
 743         /* LINTED: constant in conditional context */
 744         SHARED_RING_INIT(xnfp->xnf_rx_ring.sring);
 745         /* LINTED: constant in conditional context */
 746         FRONT_RING_INIT(&xnfp->xnf_rx_ring,
 747             xnfp->xnf_rx_ring.sring, PAGESIZE);
 748 
 749         /*
 750          * Fill the ring with buffers.
 751          */
 752         for (i = 0; i < NET_RX_RING_SIZE; i++) {
 753                 xnf_buf_t *bdesc;
 754 
 755                 bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE);
 756                 VERIFY(bdesc != NULL);
 757                 xnf_rxbuf_hang(xnfp, bdesc);
 758         }
 759 
 760         /* LINTED: constant in conditional context */
 761         RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);
 762 
 763         mutex_exit(&xnfp->xnf_rxlock);
 764 
 765         return (0);
 766 
 767 out:
 768         if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
 769                 gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
 770         xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
 771 
 772         if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
 773                 gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
 774         xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
 775 
 776         return (err);
 777 }
 778 
 779 /*
 780  * Connect driver to back end, called to set up communication with
 781  * back end driver both initially and on resume after restore/migrate.
 782  */
 783 void
 784 xnf_be_connect(xnf_t *xnfp)
 785 {
 786         const char      *message;
 787         xenbus_transaction_t xbt;
 788         struct          xenbus_device *xsd;
 789         char            *xsname;
 790         int             err;
 791 
 792         ASSERT(!xnfp->xnf_connected);
 793 
 794         xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
 795         xsname = xvdi_get_xsname(xnfp->xnf_devinfo);
 796 
 797         err = xnf_setup_rings(xnfp);
 798         if (err != 0) {
 799                 cmn_err(CE_WARN, "failed to set up tx/rx rings");
 800                 xenbus_dev_error(xsd, err, "setting up ring");
 801                 return;
 802         }
 803 
 804 again:
 805         err = xenbus_transaction_start(&xbt);
 806         if (err != 0) {
 807                 xenbus_dev_error(xsd, EIO, "starting transaction");
 808                 return;
 809         }
 810 
 811         err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
 812             xnfp->xnf_tx_ring_ref);
 813         if (err != 0) {
 814                 message = "writing tx ring-ref";
 815                 goto abort_transaction;
 816         }
 817 
 818         err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
 819             xnfp->xnf_rx_ring_ref);
 820         if (err != 0) {
 821                 message = "writing rx ring-ref";
 822                 goto abort_transaction;
 823         }
 824 
 825         err = xenbus_printf(xbt, xsname, "event-channel", "%u",
 826             xnfp->xnf_evtchn);
 827         if (err != 0) {
 828                 message = "writing event-channel";
 829                 goto abort_transaction;
 830         }
 831 
 832         err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
 833         if (err != 0) {
 834                 message = "writing feature-rx-notify";
 835                 goto abort_transaction;
 836         }
 837 
 838         err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1);
 839         if (err != 0) {
 840                 message = "writing request-rx-copy";
 841                 goto abort_transaction;
 842         }
 843 
 844         if (xnfp->xnf_be_mcast_control) {
 845                 err = xenbus_printf(xbt, xsname, "request-multicast-control",
 846                     "%d", 1);
 847                 if (err != 0) {
 848                         message = "writing request-multicast-control";
 849                         goto abort_transaction;
 850                 }
 851         }
 852 
 853         /*
 854          * Tell backend if we support scatter-gather lists on the rx side.
 855          */
 856         err = xenbus_printf(xbt, xsname, "feature-sg", "%d",
 857             xnf_enable_rx_sg ? 1 : 0);
 858         if (err != 0) {
 859                 message = "writing feature-sg";
 860                 goto abort_transaction;
 861         }
 862 
 863         /*
 864          * Tell backend if we support LRO for IPv4. Scatter-gather on rx is
 865          * a prerequisite.
 866          */
 867         err = xenbus_printf(xbt, xsname, "feature-gso-tcpv4", "%d",
 868             (xnf_enable_rx_sg && xnf_enable_lro) ? 1 : 0);
 869         if (err != 0) {
 870                 message = "writing feature-gso-tcpv4";
 871                 goto abort_transaction;
 872         }
 873 
 874         err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected);
 875         if (err != 0) {
 876                 message = "switching state to XenbusStateConnected";
 877                 goto abort_transaction;
 878         }
 879 
 880         err = xenbus_transaction_end(xbt, 0);
 881         if (err != 0) {
 882                 if (err == EAGAIN)
 883                         goto again;
 884                 xenbus_dev_error(xsd, err, "completing transaction");
 885         }
 886 
 887         return;
 888 
 889 abort_transaction:
 890         (void) xenbus_transaction_end(xbt, 1);
 891         xenbus_dev_error(xsd, err, "%s", message);
 892 }
 893 
 894 /*
 895  * Read configuration information from xenstore.
 896  */
 897 void
 898 xnf_read_config(xnf_t *xnfp)
 899 {
 900         int err, be_cap;
 901         char mac[ETHERADDRL * 3];
 902         char *oename = xvdi_get_oename(xnfp->xnf_devinfo);
 903 
 904         err = xenbus_scanf(XBT_NULL, oename, "mac",
 905             "%s", (char *)&mac[0]);
 906         if (err != 0) {
 907                 /*
  908                  * bad: we're supposed to be set up with a proper mac
  909                  * address at this point
 910                  */
 911                 cmn_err(CE_WARN, "%s%d: no mac address",
 912                     ddi_driver_name(xnfp->xnf_devinfo),
 913                     ddi_get_instance(xnfp->xnf_devinfo));
  914                 return;
 915         }
 916         if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) {
 917                 err = ENOENT;
 918                 xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT,
 919                     "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo));
 920                 return;
 921         }
 922 
 923         err = xenbus_scanf(XBT_NULL, oename,
 924             "feature-rx-copy", "%d", &be_cap);
 925         /*
 926          * If we fail to read the store we assume that the key is
 927          * absent, implying an older domain at the far end.  Older
 928          * domains cannot do HV copy.
 929          */
 930         if (err != 0)
 931                 be_cap = 0;
 932         xnfp->xnf_be_rx_copy = (be_cap != 0);
 933 
 934         err = xenbus_scanf(XBT_NULL, oename,
 935             "feature-multicast-control", "%d", &be_cap);
 936         /*
 937          * If we fail to read the store we assume that the key is
 938          * absent, implying an older domain at the far end.  Older
 939          * domains do not support multicast control.
 940          */
 941         if (err != 0)
 942                 be_cap = 0;
 943         xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control;
 944 
 945         /*
 946          * See if back-end supports scatter-gather for transmits. If not,
 947          * we will not support LSO and limit the mtu to 1500.
 948          */
 949         err = xenbus_scanf(XBT_NULL, oename, "feature-sg", "%d", &be_cap);
 950         if (err != 0) {
 951                 be_cap = 0;
 952                 dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading "
 953                     "'feature-sg' from backend driver");
 954         }
 955         if (be_cap == 0) {
 956                 dev_err(xnfp->xnf_devinfo, CE_WARN, "scatter-gather is not "
 957                     "supported for transmits in the backend driver. LSO is "
 958                     "disabled and MTU is restricted to 1500 bytes.");
 959         }
 960         xnfp->xnf_be_tx_sg = (be_cap != 0) && xnf_enable_tx_sg;
 961 
 962         if (xnfp->xnf_be_tx_sg) {
 963                 /*
 964                  * Check if LSO is supported. Currently we only check for
 965                  * IPv4 as Illumos doesn't support LSO for IPv6.
 966                  */
 967                 err = xenbus_scanf(XBT_NULL, oename, "feature-gso-tcpv4", "%d",
 968                     &be_cap);
 969                 if (err != 0) {
 970                         be_cap = 0;
 971                         dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading "
 972                             "'feature-gso-tcpv4' from backend driver");
 973                 }
 974                 if (be_cap == 0) {
 975                         dev_err(xnfp->xnf_devinfo, CE_WARN, "LSO is not "
 976                             "supported by the backend driver. Performance "
 977                             "will be affected.");
 978                 }
 979                 xnfp->xnf_be_lso = (be_cap != 0) && xnf_enable_lso;
 980         }
 981 }
 982 
 983 /*
 984  *  attach(9E) -- Attach a device to the system
 985  */
 986 static int
 987 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
 988 {
 989         mac_register_t *macp;
 990         xnf_t *xnfp;
 991         int err;
 992         char cachename[32];
 993 
 994 #ifdef XNF_DEBUG
 995         if (xnf_debug & XNF_DEBUG_DDI)
 996                 printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo),
 997                     (void *)devinfo);
 998 #endif
 999 
1000         switch (cmd) {
1001         case DDI_RESUME:
1002                 xnfp = ddi_get_driver_private(devinfo);
1003                 xnfp->xnf_gen++;
1004 
1005                 (void) xvdi_resume(devinfo);
1006                 (void) xvdi_alloc_evtchn(devinfo);
1007                 xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
1008 #ifdef XPV_HVM_DRIVER
1009                 ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr,
1010                     xnfp);
1011 #else
1012                 (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
1013                     (caddr_t)xnfp);
1014 #endif
1015                 return (DDI_SUCCESS);
1016 
1017         case DDI_ATTACH:
1018                 break;
1019 
1020         default:
1021                 return (DDI_FAILURE);
1022         }
1023 
1024         /*
1025          *  Allocate gld_mac_info_t and xnf_instance structures
1026          */
1027         macp = mac_alloc(MAC_VERSION);
1028         if (macp == NULL)
1029                 return (DDI_FAILURE);
1030         xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
1031 
1032         xnfp->xnf_tx_pkt_id =
1033             kmem_zalloc(sizeof (xnf_txid_t) * NET_TX_RING_SIZE, KM_SLEEP);
1034 
1035         xnfp->xnf_rx_pkt_info =
1036             kmem_zalloc(sizeof (xnf_buf_t *) * NET_RX_RING_SIZE, KM_SLEEP);
1037 
1038         macp->m_dip = devinfo;
1039         macp->m_driver = xnfp;
1040         xnfp->xnf_devinfo = devinfo;
1041 
1042         macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1043         macp->m_src_addr = xnfp->xnf_mac_addr;
1044         macp->m_callbacks = &xnf_callbacks;
1045         macp->m_min_sdu = 0;
1046         xnfp->xnf_mtu = ETHERMTU;
1047         macp->m_max_sdu = xnfp->xnf_mtu;
1048 
1049         xnfp->xnf_running = B_FALSE;
1050         xnfp->xnf_connected = B_FALSE;
1051         xnfp->xnf_be_rx_copy = B_FALSE;
1052         xnfp->xnf_be_mcast_control = B_FALSE;
1053         xnfp->xnf_need_sched = B_FALSE;
1054 
1055         xnfp->xnf_rx_head = NULL;
1056         xnfp->xnf_rx_tail = NULL;
1057         xnfp->xnf_rx_new_buffers_posted = B_FALSE;
1058 
1059 #ifdef XPV_HVM_DRIVER
1060         /* Report our version to dom0 */
1061         (void) xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d",
1062             HVMPV_XNF_VERS);
1063 #endif
1064 
1065         /*
1066          * Get the iblock cookie with which to initialize the mutexes.
1067          */
1068         if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
1069             != DDI_SUCCESS)
1070                 goto failure;
1071 
1072         mutex_init(&xnfp->xnf_txlock,
1073             NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1074         mutex_init(&xnfp->xnf_rxlock,
1075             NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1076         mutex_init(&xnfp->xnf_schedlock,
1077             NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1078         mutex_init(&xnfp->xnf_gref_lock,
1079             NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1080 
1081         cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL);
1082         cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL);
1083         cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL);
1084 
1085         (void) sprintf(cachename, "xnf_buf_cache_%d",
1086             ddi_get_instance(devinfo));
1087         xnfp->xnf_buf_cache = kmem_cache_create(cachename,
1088             sizeof (xnf_buf_t), 0,
1089             xnf_buf_constructor, xnf_buf_destructor,
1090             NULL, xnfp, NULL, 0);
1091         if (xnfp->xnf_buf_cache == NULL)
1092                 goto failure_0;
1093 
1094         (void) sprintf(cachename, "xnf_tx_buf_cache_%d",
1095             ddi_get_instance(devinfo));
1096         xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename,
1097             sizeof (xnf_txbuf_t), 0,
1098             xnf_tx_buf_constructor, xnf_tx_buf_destructor,
1099             NULL, xnfp, NULL, 0);
1100         if (xnfp->xnf_tx_buf_cache == NULL)
1101                 goto failure_1;
1102 
1103         xnfp->xnf_gref_head = INVALID_GRANT_REF;
1104 
1105         if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
1106                 cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
1107                     "driver data structures",
1108                     ddi_get_instance(xnfp->xnf_devinfo));
1109                 goto failure_2;
1110         }
1111 
1112         xnfp->xnf_rx_ring.sring->rsp_event =
1113             xnfp->xnf_tx_ring.sring->rsp_event = 1;
1114 
1115         xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
1116         xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
1117 
1118         /* set driver private pointer now */
1119         ddi_set_driver_private(devinfo, xnfp);
1120 
1121         if (!xnf_kstat_init(xnfp))
1122                 goto failure_3;
1123 
1124         /*
1125          * Allocate an event channel, add the interrupt handler and
1126          * bind it to the event channel.
1127          */
1128         (void) xvdi_alloc_evtchn(devinfo);
1129         xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
1130 #ifdef XPV_HVM_DRIVER
1131         ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp);
1132 #else
1133         (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
1134 #endif
1135 
1136         err = mac_register(macp, &xnfp->xnf_mh);
1137         mac_free(macp);
1138         macp = NULL;
1139         if (err != 0)
1140                 goto failure_4;
1141 
1142         if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL)
1143             != DDI_SUCCESS)
1144                 goto failure_5;
1145 
1146 #ifdef XPV_HVM_DRIVER
1147         /*
1148          * In the HVM case, this driver essentially replaces a driver for
1149          * a 'real' PCI NIC. Without the "model" property set to
1150          * "Ethernet controller", like the PCI code does, netbooting does
1151          * not work correctly, as strplumb_get_netdev_path() will not find
1152          * this interface.
1153          */
1154         (void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model",
1155             "Ethernet controller");
1156 #endif
1157 
1158 #ifdef XNF_DEBUG
1159         if (xnf_debug_instance == NULL)
1160                 xnf_debug_instance = xnfp;
1161 #endif
1162 
1163         return (DDI_SUCCESS);
1164 
1165 failure_5:
1166         (void) mac_unregister(xnfp->xnf_mh);
1167 
1168 failure_4:
1169 #ifdef XPV_HVM_DRIVER
1170         ec_unbind_evtchn(xnfp->xnf_evtchn);
1171         xvdi_free_evtchn(devinfo);
1172 #else
1173         ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1174 #endif
1175         xnfp->xnf_evtchn = INVALID_EVTCHN;
1176         kstat_delete(xnfp->xnf_kstat_aux);
1177 
1178 failure_3:
1179         xnf_release_dma_resources(xnfp);
1180 
1181 failure_2:
1182         kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
1183 
1184 failure_1:
1185         kmem_cache_destroy(xnfp->xnf_buf_cache);
1186 
1187 failure_0:
1188         cv_destroy(&xnfp->xnf_cv_tx_slots);
1189         cv_destroy(&xnfp->xnf_cv_multicast);
1190         cv_destroy(&xnfp->xnf_cv_state);
1191 
1192         mutex_destroy(&xnfp->xnf_gref_lock);
1193         mutex_destroy(&xnfp->xnf_schedlock);
1194         mutex_destroy(&xnfp->xnf_rxlock);
1195         mutex_destroy(&xnfp->xnf_txlock);
1196 
1197 failure:
1198         kmem_free(xnfp, sizeof (*xnfp));
1199         if (macp != NULL)
1200                 mac_free(macp);
1201 
1202         return (DDI_FAILURE);
1203 }
1204 
1205 /*  detach(9E) -- Detach a device from the system */
1206 static int
1207 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
1208 {
1209         xnf_t *xnfp;            /* Our private device info */
1210 
1211 #ifdef XNF_DEBUG
1212         if (xnf_debug & XNF_DEBUG_DDI)
1213                 printf("xnf_detach(0x%p)\n", (void *)devinfo);
1214 #endif
1215 
1216         xnfp = ddi_get_driver_private(devinfo);
1217 
1218         switch (cmd) {
1219         case DDI_SUSPEND:
1220 #ifdef XPV_HVM_DRIVER
1221                 ec_unbind_evtchn(xnfp->xnf_evtchn);
1222                 xvdi_free_evtchn(devinfo);
1223 #else
1224                 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1225 #endif
1226 
1227                 xvdi_suspend(devinfo);
1228 
1229                 mutex_enter(&xnfp->xnf_rxlock);
1230                 mutex_enter(&xnfp->xnf_txlock);
1231 
1232                 xnfp->xnf_evtchn = INVALID_EVTCHN;
1233                 xnfp->xnf_connected = B_FALSE;
1234                 mutex_exit(&xnfp->xnf_txlock);
1235                 mutex_exit(&xnfp->xnf_rxlock);
1236 
1237                 /* claim link to be down after disconnect */
1238                 mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN);
1239                 return (DDI_SUCCESS);
1240 
1241         case DDI_DETACH:
1242                 break;
1243 
1244         default:
1245                 return (DDI_FAILURE);
1246         }
1247 
1248         if (xnfp->xnf_connected)
1249                 return (DDI_FAILURE);
1250 
1251         /*
1252          * Cannot detach if we have xnf_buf_t outstanding.
1253          */
1254         if (xnfp->xnf_stat_buf_allocated > 0)
1255                 return (DDI_FAILURE);
1256 
1257         if (mac_unregister(xnfp->xnf_mh) != 0)
1258                 return (DDI_FAILURE);
1259 
1260         kstat_delete(xnfp->xnf_kstat_aux);
1261 
1262         /* Stop the receiver */
1263         xnf_stop(xnfp);
1264 
1265         xvdi_remove_event_handler(devinfo, XS_OE_STATE);
1266 
1267         /* Remove the interrupt */
1268 #ifdef XPV_HVM_DRIVER
1269         ec_unbind_evtchn(xnfp->xnf_evtchn);
1270         xvdi_free_evtchn(devinfo);
1271 #else
1272         ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1273 #endif
1274 
1275         /* Release any pending xmit mblks */
1276         xnf_release_mblks(xnfp);
1277 
1278         /* Release all DMA resources */
1279         xnf_release_dma_resources(xnfp);
1280 
1281         cv_destroy(&xnfp->xnf_cv_tx_slots);
1282         cv_destroy(&xnfp->xnf_cv_multicast);
1283         cv_destroy(&xnfp->xnf_cv_state);
1284 
1285         kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
1286         kmem_cache_destroy(xnfp->xnf_buf_cache);
1287 
1288         mutex_destroy(&xnfp->xnf_gref_lock);
1289         mutex_destroy(&xnfp->xnf_schedlock);
1290         mutex_destroy(&xnfp->xnf_rxlock);
1291         mutex_destroy(&xnfp->xnf_txlock);
1292 
1293         kmem_free(xnfp, sizeof (*xnfp));
1294 
1295         return (DDI_SUCCESS);
1296 }
1297 
1298 /*
1299  *  xnf_set_mac_addr() -- set the physical network address on the board.
1300  */
1301 static int
1302 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
1303 {
1304         _NOTE(ARGUNUSED(arg, macaddr));
1305 
1306         /*
1307          * We can't set our macaddr.
1308          */
1309         return (ENOTSUP);
1310 }
1311 
1312 /*
1313  *  xnf_set_multicast() -- set (enable) or disable a multicast address.
1314  *
1315  *  Program the hardware to enable/disable the multicast address
1316  *  in "mca".  Enable if "add" is true, disable if false.
1317  */
1318 static int
1319 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
1320 {
1321         xnf_t *xnfp = arg;
1322         xnf_txbuf_t *txp;
1323         int n_slots;
1324         RING_IDX slot;
1325         xnf_txid_t *tidp;
1326         netif_tx_request_t *txrp;
1327         struct netif_extra_info *erp;
1328         boolean_t notify, result;
1329 
1330         /*
1331          * If the backend does not support multicast control then we
1332          * must assume that the right packets will just arrive.
1333          */
1334         if (!xnfp->xnf_be_mcast_control)
1335                 return (0);
1336 
1337         txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
1338 
1339         mutex_enter(&xnfp->xnf_txlock);
1340 
1341         /*
1342          * If we're not yet connected then claim success. This is
1343          * acceptable because we refresh the entire set of multicast
1344          * addresses when we get connected.
1345          *
1346          * We can't wait around here because the MAC layer expects
1347          * this to be a non-blocking operation - waiting ends up
1348          * causing a deadlock during resume.
1349          */
1350         if (!xnfp->xnf_connected) {
1351                 mutex_exit(&xnfp->xnf_txlock);
1352                 return (0);
1353         }
1354 
1355         /*
1356          * 1. Acquire two slots in the ring.
1357          * 2. Fill in the slots.
1358          * 3. Request notification when the operation is done.
1359          * 4. Kick the peer.
1360          * 5. Wait for the response via xnf_tx_clean_ring().
1361          */
1362 
1363         n_slots = xnf_tx_slots_get(xnfp, 2, B_TRUE);
1364         ASSERT(n_slots >= 2);
1365 
1366         slot = xnfp->xnf_tx_ring.req_prod_pvt;
1367         tidp = xnf_txid_get(xnfp);
1368         VERIFY(tidp != NULL);
1369 
1370         txp->tx_type = TX_MCAST_REQ;
1371         txp->tx_slot = slot;
1372 
1373         txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1374         erp = (struct netif_extra_info *)
1375             RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1);
1376 
1377         txrp->gref = 0;
1378         txrp->size = 0;
1379         txrp->offset = 0;
1380         /* Set tx_txreq.id to appease xnf_tx_clean_ring(). */
1381         txrp->id = txp->tx_txreq.id = tidp->id;
1382         txrp->flags = NETTXF_extra_info;
1383 
1384         erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD :
1385             XEN_NETIF_EXTRA_TYPE_MCAST_DEL;
1386         bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL);
1387 
1388         tidp->txbuf = txp;
1389 
1390         xnfp->xnf_tx_ring.req_prod_pvt = slot + 2;
1391 
1392         mutex_enter(&xnfp->xnf_schedlock);
1393         xnfp->xnf_pending_multicast++;
1394         mutex_exit(&xnfp->xnf_schedlock);
1395 
1396         /* LINTED: constant in conditional context */
1397         RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
1398             notify);
1399         if (notify)
1400                 ec_notify_via_evtchn(xnfp->xnf_evtchn);
1401 
1402         while (txp->tx_type == TX_MCAST_REQ)
1403                 cv_wait(&xnfp->xnf_cv_multicast, &xnfp->xnf_txlock);
1404 
1405         ASSERT3U(txp->tx_type, ==, TX_MCAST_RSP);
1406 
1407         mutex_enter(&xnfp->xnf_schedlock);
1408         xnfp->xnf_pending_multicast--;
1409         mutex_exit(&xnfp->xnf_schedlock);
1410 
1411         result = (txp->tx_status == NETIF_RSP_OKAY);
1412 
1413         xnf_txid_put(xnfp, tidp);
1414 
1415         mutex_exit(&xnfp->xnf_txlock);
1416 
1417         kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1418 
1419         return (result ? 0 : 1);
1420 }
1421 
1422 /*
1423  * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
1424  *
1425  *  Program the hardware to enable/disable promiscuous mode.
1426  */
1427 static int
1428 xnf_set_promiscuous(void *arg, boolean_t on)
1429 {
1430         _NOTE(ARGUNUSED(arg, on));
1431 
1432         /*
1433          * We can't really do this, but we pretend that we can in
1434          * order that snoop will work.
1435          */
1436         return (0);
1437 }
1438 
1439 /*
1440  * Clean buffers that we have responses for from the transmit ring.
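      *
      * Returns the number of free request slots in the ring.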
1441  */
1442 static int
1443 xnf_tx_clean_ring(xnf_t *xnfp)
1444 {
1445         boolean_t work_to_do;
1446 
1447         ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1448 
1449 loop:
1450         while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) {
1451                 RING_IDX cons, prod, i;
1452 
1453                 cons = xnfp->xnf_tx_ring.rsp_cons;
1454                 prod = xnfp->xnf_tx_ring.sring->rsp_prod;
1455                 membar_consumer();
1456                 /*
1457                  * Clean tx requests from ring that we have responses
1458                  * for.
1459                  */
1460                 DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod);
1461                 for (i = cons; i != prod; i++) {
1462                         netif_tx_response_t *trp;
1463                         xnf_txid_t *tidp;
1464                         xnf_txbuf_t *txp;
1465 
1466                         trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i);
1467                         /*
1468                          * if this slot was occupied by netif_extra_info_t,
1469                          * then the response will be NETIF_RSP_NULL. In this
1470                          * case there are no resources to clean up.
1471                          */
1472                         if (trp->status == NETIF_RSP_NULL)
1473                                 continue;
1474 
1475                         ASSERT(TX_ID_VALID(trp->id));
1476 
1477                         tidp = TX_ID_TO_TXID(xnfp, trp->id);
1478                         ASSERT3U(tidp->id, ==, trp->id);
1479                         ASSERT3U(tidp->next, ==, INVALID_TX_ID);
1480 
1481                         txp = tidp->txbuf;
1482                         ASSERT(txp != NULL);
1483                         ASSERT3U(txp->tx_txreq.id, ==, trp->id);
1484 
1485                         switch (txp->tx_type) {
1486                         case TX_DATA:
1487                                 /*
1488                                  * We must put the txid for each response we
1489                                  * acknowledge to make sure that we never have
1490                                  * more free slots than txids. Because of this
1491                                  * we do it here instead of waiting for it to
1492                                  * be done in xnf_data_txbuf_free_chain().
1493                                  */
1494                                 xnf_txid_put(xnfp, tidp);
1495                                 txp->tx_txreq.id = INVALID_TX_ID;
1496                                 ASSERT3S(txp->tx_head->tx_frags_to_ack, >, 0);
1497                                 txp->tx_head->tx_frags_to_ack--;
1498 
1499                                 /*
1500                                  * We clean the whole chain once we got a
1501                                  * response for each fragment.
1502                                  */
1503                                 if (txp->tx_head->tx_frags_to_ack == 0)
1504                                         xnf_data_txbuf_free_chain(xnfp, txp);
1505 
1506                                 break;
1507 
1508                         case TX_MCAST_REQ:
1509                                 txp->tx_type = TX_MCAST_RSP;
1510                                 txp->tx_status = trp->status;
1511                                 cv_broadcast(&xnfp->xnf_cv_multicast);
1512 
1513                                 break;
1514 
1515                         default:
1516                                 cmn_err(CE_PANIC, "xnf_tx_clean_ring: "
1517                                     "invalid xnf_txbuf_t type: %d",
1518                                     txp->tx_type);
1519                                 break;
1520                         }
1521                 }
1522                 /*
1523                  * Record the last response we dealt with so that we
1524                  * know where to start next time around.
1525                  */
1526                 xnfp->xnf_tx_ring.rsp_cons = prod;
1527                 membar_enter();
1528         }
1529 
1530         /* LINTED: constant in conditional context */
1531         RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do);
1532         if (work_to_do)
1533                 goto loop;
1534 
1535         return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring));
1536 }
1537 
1538 /*
1539  * Allocate and fill in a look-aside buffer for the packet `mp'. Used
1540  * to ensure that the packet is physically contiguous and contained
1541  * within a single page.
1542  */
1543 static xnf_buf_t *
1544 xnf_tx_get_lookaside(xnf_t *xnfp, mblk_t *mp, size_t *plen)
1545 {
1546         xnf_buf_t *bd;
1547         caddr_t bp;
1548 
1549         bd = xnf_buf_get(xnfp, KM_SLEEP, B_TRUE);
1550         if (bd == NULL)
1551                 return (NULL);
1552 
1553         bp = bd->buf;
1554         while (mp != NULL) {
1555                 size_t len = MBLKL(mp);
1556 
1557                 bcopy(mp->b_rptr, bp, len);
1558                 bp += len;
1559 
1560                 mp = mp->b_cont;
1561         }
1562 
1563         *plen = bp - bd->buf;
1564         ASSERT3U(*plen, <=, PAGESIZE);
1565 
1566         xnfp->xnf_stat_tx_lookaside++;
1567 
1568         return (bd);
1569 }
1570 
1571 /*
1572  * Insert the pseudo-header checksum into the packet.
1573  * Assumes the packet is IPv4 and TCP or UDP, since we only advertised
1574  * support for HCKSUM_INET_FULL_V4.
1575  */
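     /*
      * For reference, the sum computed below is the standard RFC 793/768
      * pseudo-header checksum: source address, destination address,
      * protocol and TCP/UDP length (the IP datagram length minus the IP
      * header), added as 16-bit words in ones' complement arithmetic,
      * with IP_TCP_CSUM_COMP/IP_UDP_CSUM_COMP supplying the protocol term.
      */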
1576 int
1577 xnf_pseudo_cksum(mblk_t *mp)
1578 {
1579         struct ether_header *ehp;
1580         uint16_t sap, iplen, *stuff;
1581         uint32_t cksum;
1582         size_t len;
1583         ipha_t *ipha;
1584         ipaddr_t src, dst;
1585         uchar_t *ptr;
1586 
1587         ptr = mp->b_rptr;
1588         len = MBLKL(mp);
1589 
1590         /* Each header must fit completely in an mblk. */
1591         ASSERT3U(len, >=, sizeof (*ehp));
1592 
1593         ehp = (struct ether_header *)ptr;
1594 
1595         if (ntohs(ehp->ether_type) == VLAN_TPID) {
1596                 struct ether_vlan_header *evhp;
1597                 ASSERT3U(len, >=, sizeof (*evhp));
1598                 evhp = (struct ether_vlan_header *)ptr;
1599                 sap = ntohs(evhp->ether_type);
1600                 ptr += sizeof (*evhp);
1601                 len -= sizeof (*evhp);
1602         } else {
1603                 sap = ntohs(ehp->ether_type);
1604                 ptr += sizeof (*ehp);
1605                 len -= sizeof (*ehp);
1606         }
1607 
1608         ASSERT3U(sap, ==, ETHERTYPE_IP);
1609 
1610         /*
1611          * Ethernet and IP headers may be in different mblks.
1612          */
1613         ASSERT3P(ptr, <=, mp->b_wptr);
1614         if (ptr == mp->b_wptr) {
1615                 mp = mp->b_cont;
1616                 ptr = mp->b_rptr;
1617                 len = MBLKL(mp);
1618         }
1619 
1620         ASSERT3U(len, >=, sizeof (ipha_t));
1621         ipha = (ipha_t *)ptr;
1622 
1623         /*
1624          * We assume the IP header has no options. (This is enforced in
1625          * ire_send_wire_v4() -- search for IXAF_NO_HW_CKSUM).
1626          */
1627         ASSERT3U(IPH_HDR_LENGTH(ipha), ==, IP_SIMPLE_HDR_LENGTH);
1628         iplen = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
1629 
1630         ptr += IP_SIMPLE_HDR_LENGTH;
1631         len -= IP_SIMPLE_HDR_LENGTH;
1632 
1633         /*
1634          * IP and L4 headers may be in different mblks.
1635          */
1636         ASSERT3P(ptr, <=, mp->b_wptr);
1637         if (ptr == mp->b_wptr) {
1638                 mp = mp->b_cont;
1639                 ptr = mp->b_rptr;
1640                 len = MBLKL(mp);
1641         }
1642 
1643         switch (ipha->ipha_protocol) {
1644         case IPPROTO_TCP:
1645                 ASSERT3U(len, >=, sizeof (tcph_t));
1646                 stuff = (uint16_t *)(ptr + TCP_CHECKSUM_OFFSET);
1647                 cksum = IP_TCP_CSUM_COMP;
1648                 break;
1649         case IPPROTO_UDP:
1650                 ASSERT3U(len, >=, sizeof (struct udphdr));
1651                 stuff = (uint16_t *)(ptr + UDP_CHECKSUM_OFFSET);
1652                 cksum = IP_UDP_CSUM_COMP;
1653                 break;
1654         default:
1655                 cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d",
1656                     ipha->ipha_protocol);
1657                 return (EINVAL);
1658         }
1659 
1660         src = ipha->ipha_src;
1661         dst = ipha->ipha_dst;
1662 
1663         cksum += (dst >> 16) + (dst & 0xFFFF);
1664         cksum += (src >> 16) + (src & 0xFFFF);
1665         cksum += htons(iplen);
1666 
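             /*
              * Fold the 32-bit partial sum into 16 bits; the second fold
              * absorbs any carry generated by the first.
              */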
1667         cksum = (cksum >> 16) + (cksum & 0xFFFF);
1668         cksum = (cksum >> 16) + (cksum & 0xFFFF);
1669 
1670         ASSERT(cksum <= 0xFFFF);
1671 
1672         *stuff = (uint16_t)(cksum ? cksum : ~cksum);
1673 
1674         return (0);
1675 }
1676 
1677 /*
1678  * Push a packet into the transmit ring.
1679  *
1680  * Note: the format of a tx packet that spans multiple slots is similar to
1681  * what is described in xnf_rx_one_packet().
1682  */
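     /*
      * As a rough sketch (assuming LSO is in use, so that an extra info
      * slot is present), a tx packet spanning multiple slots is laid out
      * as follows:
      *
      * +------+---------------------+-------------------+-----------------------+
      * | SLOT | TYPE                | CONTENTS          | FLAGS                 |
      * +------+---------------------+-------------------+-----------------------+
      * | 1    | netif_tx_request_t  | 1st data fragment | extra_info, more_data |
      * +------+---------------------+-------------------+-----------------------+
      * | 2    | netif_extra_info_t  | LSO info          | [none]                |
      * +------+---------------------+-------------------+-----------------------+
      * | 3    | netif_tx_request_t  | 2nd data fragment | more_data             |
      * +------+---------------------+-------------------+-----------------------+
      * | 4    | netif_tx_request_t  | 3rd data fragment | [none]                |
      * +------+---------------------+-------------------+-----------------------+
      *
      * The first request carries the total packet length in its size
      * field; only the data fragments consume txids.
      */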
1683 static void
1684 xnf_tx_push_packet(xnf_t *xnfp, xnf_txbuf_t *head)
1685 {
1686         int nslots = 0;
1687         int extras = 0;
1688         RING_IDX slot;
1689         boolean_t notify;
1690 
1691         ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1692         ASSERT(xnfp->xnf_running);
1693 
1694         slot = xnfp->xnf_tx_ring.req_prod_pvt;
1695 
1696         /*
1697          * The caller has already checked that we have enough slots to proceed.
1698          */
1699         for (xnf_txbuf_t *txp = head; txp != NULL; txp = txp->tx_next) {
1700                 xnf_txid_t *tidp;
1701                 netif_tx_request_t *txrp;
1702 
1703                 tidp = xnf_txid_get(xnfp);
1704                 VERIFY(tidp != NULL);
1705                 txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1706 
1707                 txp->tx_slot = slot;
1708                 txp->tx_txreq.id = tidp->id;
1709                 *txrp = txp->tx_txreq;
1710 
1711                 tidp->txbuf = txp;
1712                 slot++;
1713                 nslots++;
1714 
1715                 /*
1716                  * When present, LSO info is placed in a slot after the first
1717                  * data segment, and doesn't require a txid.
1718                  */
1719                 if (txp->tx_txreq.flags & NETTXF_extra_info) {
1720                         netif_extra_info_t *extra;
1721                         ASSERT3U(nslots, ==, 1);
1722 
1723                         extra = (netif_extra_info_t *)
1724                             RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1725                         *extra = txp->tx_extra;
1726                         slot++;
1727                         nslots++;
1728                         extras = 1;
1729                 }
1730         }
1731 
1732         ASSERT3U(nslots, <=, XEN_MAX_SLOTS_PER_TX);
1733 
1734         /*
1735          * Store the number of data fragments.
1736          */
1737         head->tx_frags_to_ack = nslots - extras;
1738 
1739         xnfp->xnf_tx_ring.req_prod_pvt = slot;
1740 
1741         /*
1742          * Tell the peer that we sent something, if it cares.
1743          */
1744         /* LINTED: constant in conditional context */
1745         RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, notify);
1746         if (notify)
1747                 ec_notify_via_evtchn(xnfp->xnf_evtchn);
1748 }
1749 
1750 static xnf_txbuf_t *
1751 xnf_mblk_copy(xnf_t *xnfp, mblk_t *mp)
1752 {
1753         xnf_txbuf_t *txp = xnf_data_txbuf_alloc(xnfp);
1754         size_t length;
1755 
1756         txp->tx_bdesc = xnf_tx_get_lookaside(xnfp, mp, &length);
1757         if (txp->tx_bdesc == NULL) {
1758                 xnf_data_txbuf_free(xnfp, txp);
1759                 return (NULL);
1760         }
1761         txp->tx_mfn = txp->tx_bdesc->buf_mfn;
1762         txp->tx_txreq.gref = txp->tx_bdesc->grant_ref;
1763         txp->tx_txreq.size = length;
1764         txp->tx_txreq.offset = (uintptr_t)txp->tx_bdesc->buf & PAGEOFFSET;
1765         txp->tx_txreq.flags = 0;
1766 
1767         return (txp);
1768 }
1769 
1770 static xnf_txbuf_t *
1771 xnf_mblk_map(xnf_t *xnfp, mblk_t *mp, int *countp)
1772 {
1773         xnf_txbuf_t *head = NULL;
1774         xnf_txbuf_t *tail = NULL;
1775         domid_t oeid;
1776         int nsegs = 0;
1777 
1778         oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1779 
1780         for (mblk_t *ml = mp; ml != NULL; ml = ml->b_cont) {
1781                 ddi_dma_handle_t dma_handle;
1782                 ddi_dma_cookie_t dma_cookie;
1783                 uint_t ncookies;
1784                 xnf_txbuf_t *txp;
1785 
1786                 if (MBLKL(ml) == 0)
1787                         continue;
1788 
1789                 txp = xnf_data_txbuf_alloc(xnfp);
1790 
1791                 if (head == NULL) {
1792                         head = txp;
1793                 } else {
1794                         ASSERT(tail != NULL);
1795                         TXBUF_SETNEXT(tail, txp);
1796                         txp->tx_head = head;
1797                 }
1798 
1799                 /*
1800                  * The necessary segmentation rules (e.g. not crossing a page
1801                  * boundary) are enforced by the dma attributes of the handle.
1802                  */
1803                 dma_handle = txp->tx_dma_handle;
1804                 int ret = ddi_dma_addr_bind_handle(dma_handle,
1805                     NULL, (char *)ml->b_rptr, MBLKL(ml),
1806                     DDI_DMA_WRITE | DDI_DMA_STREAMING,
1807                     DDI_DMA_DONTWAIT, 0, &dma_cookie,
1808                     &ncookies);
1809                 if (ret != DDI_DMA_MAPPED) {
1810                         if (ret != DDI_DMA_NORESOURCES) {
1811                                 dev_err(xnfp->xnf_devinfo, CE_WARN,
1812                                     "ddi_dma_addr_bind_handle() failed "
1813                                     "[dma_error=%d]", ret);
1814                         }
1815                         goto error;
1816                 }
1817                 txp->tx_handle_bound = B_TRUE;
1818 
1819                 ASSERT(ncookies > 0);
1820                 for (int i = 0; i < ncookies; i++) {
1821                         if (nsegs == XEN_MAX_TX_DATA_PAGES) {
1822                                 dev_err(xnfp->xnf_devinfo, CE_WARN,
1823                                     "xnf_mblk_map() failed: "
1824                                     "too many segments");
1825                                 goto error;
1826                         }
1827                         if (i > 0) {
1828                                 txp = xnf_data_txbuf_alloc(xnfp);
1829                                 ASSERT(tail != NULL);
1830                                 TXBUF_SETNEXT(tail, txp);
1831                                 txp->tx_head = head;
1832                         }
1833 
1834                         txp->tx_mfn =
1835                             xnf_btop(pa_to_ma(dma_cookie.dmac_laddress));
1836                         txp->tx_txreq.gref = xnf_gref_get(xnfp);
1837                         if (txp->tx_txreq.gref == INVALID_GRANT_REF) {
1838                                 dev_err(xnfp->xnf_devinfo, CE_WARN,
1839                                     "xnf_mblk_map() failed: "
1840                                     "invalid grant ref");
1841                                 goto error;
1842                         }
1843                         gnttab_grant_foreign_access_ref(txp->tx_txreq.gref,
1844                             oeid, txp->tx_mfn, 1);
1845                         txp->tx_txreq.offset =
1846                             dma_cookie.dmac_laddress & PAGEOFFSET;
1847                         txp->tx_txreq.size = dma_cookie.dmac_size;
1848                         txp->tx_txreq.flags = 0;
1849 
1850                         ddi_dma_nextcookie(dma_handle, &dma_cookie);
1851                         nsegs++;
1852 
1853                         if (tail != NULL)
1854                                 tail->tx_txreq.flags = NETTXF_more_data;
1855                         tail = txp;
1856                 }
1857         }
1858 
1859         *countp = nsegs;
1860         return (head);
1861 
1862 error:
1863         xnf_data_txbuf_free_chain(xnfp, head);
1864         return (NULL);
1865 }
1866 
1867 static void
1868 xnf_tx_setup_offload(xnf_t *xnfp, xnf_txbuf_t *head,
1869     uint32_t cksum_flags, uint32_t lso_flags, uint32_t mss)
1870 {
1871         if (lso_flags != 0) {
1872                 ASSERT3U(lso_flags, ==, HW_LSO);
1873                 ASSERT3P(head->tx_bdesc, ==, NULL);
1874 
1875                 head->tx_txreq.flags |= NETTXF_extra_info;
1876                 netif_extra_info_t *extra = &head->tx_extra;
1877                 extra->type = XEN_NETIF_EXTRA_TYPE_GSO;
1878                 extra->flags = 0;
1879                 extra->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
1880                 extra->u.gso.size = mss;
1881                 extra->u.gso.features = 0;
1882                 extra->u.gso.pad = 0;
1883         } else if (cksum_flags != 0) {
1884                 ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM);
1885                 /*
1886                  * If the local protocol stack requests checksum
1887                  * offload we set the 'checksum blank' flag,
1888                  * indicating to the peer that we need the checksum
1889                  * calculated for us.
1890                  *
1891                  * We _don't_ set the validated flag, because we haven't
1892                  * validated that the data and the checksum match.
1893                  *
1894                  * Note: we already called xnf_pseudo_cksum() in
1895                  * xnf_send(), so we just set the txreq flag here.
1896                  */
1897                 head->tx_txreq.flags |= NETTXF_csum_blank;
1898                 xnfp->xnf_stat_tx_cksum_deferred++;
1899         }
1900 }
1901 
1902 /*
1903  * Send packet mp. Called by the MAC framework.
1904  */
1905 static mblk_t *
1906 xnf_send(void *arg, mblk_t *mp)
1907 {
1908         xnf_t *xnfp = arg;
1909         xnf_txbuf_t *head;
1910         mblk_t *ml;
1911         int length;
1912         int pages, chunks, slots, slots_free;
1913         uint32_t cksum_flags, lso_flags, mss;
1914         boolean_t pulledup = B_FALSE;
1915         boolean_t force_copy = B_FALSE;
1916 
1917         ASSERT3P(mp->b_next, ==, NULL);
1918 
1919         mutex_enter(&xnfp->xnf_txlock);
1920 
1921         /*
1922          * Wait until we are connected to the backend.
1923          */
1924         while (!xnfp->xnf_connected)
1925                 cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock);
1926 
1927         /*
1928          * To simplify logic and be in sync with the rescheduling mechanism,
1929          * we require the maximum number of slots that could be used by a
1930          * transaction to be free before proceeding. The only downside of doing
1931          * this is that it slightly reduces the effective size of the ring.
1932          */
1933         slots_free = xnf_tx_slots_get(xnfp, XEN_MAX_SLOTS_PER_TX, B_FALSE);
1934         if (slots_free < XEN_MAX_SLOTS_PER_TX) {
1935                 /*
1936                  * We need to ask for a re-schedule later as the ring is full.
1937                  */
1938                 mutex_enter(&xnfp->xnf_schedlock);
1939                 xnfp->xnf_need_sched = B_TRUE;
1940                 mutex_exit(&xnfp->xnf_schedlock);
1941 
1942                 xnfp->xnf_stat_tx_defer++;
1943                 mutex_exit(&xnfp->xnf_txlock);
1944                 return (mp);
1945         }
1946 
1947         /*
1948          * Get hw offload parameters.
1949          * This must be done before pulling up the mp as those parameters
1950          * are not copied over.
1951          */
1952         mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &cksum_flags);
1953         mac_lso_get(mp, &mss, &lso_flags);
1954 
1955         /*
1956          * XXX: fix MAC framework so that we can advertise support for
1957          * partial checksum for IPv4 only. This way we won't need to calculate
1958          * the pseudo header checksum ourselves.
1959          */
1960         if (cksum_flags != 0) {
1961                 ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM);
1962                 (void) xnf_pseudo_cksum(mp);
1963         }
1964 
1965 pulledup:
1966         for (ml = mp, pages = 0, chunks = 0, length = 0; ml != NULL;
1967             ml = ml->b_cont, chunks++) {
1968                 pages += xnf_mblk_pages(ml);
1969                 length += MBLKL(ml);
1970         }
1971         DTRACE_PROBE3(packet, int, length, int, chunks, int, pages);
1972         DTRACE_PROBE3(lso, int, length, uint32_t, lso_flags, uint32_t, mss);
1973 
1974         /*
1975          * If the Ethernet header crosses a page boundary, the packet
1976          * will be dropped by the backend. In practice this seems to
1977          * happen fairly rarely, so we do nothing unless the packet is
1978          * small enough to fit in a look-aside buffer.
1979          */
1980         if (((uintptr_t)mp->b_rptr & PAGEOFFSET) +
1981             sizeof (struct ether_header) > PAGESIZE) {
1982                 xnfp->xnf_stat_tx_eth_hdr_split++;
1983                 if (length <= PAGESIZE)
1984                         force_copy = B_TRUE;
1985         }
1986 
1987         if (force_copy || (pages > 1 && !xnfp->xnf_be_tx_sg)) {
1988                 /*
1989                  * If the packet spans several pages and scatter-gather is not
1990                  * supported then use a look-aside buffer.
1991                  */
1992                 ASSERT3U(length, <=, PAGESIZE);
1993                 head = xnf_mblk_copy(xnfp, mp);
1994                 if (head == NULL) {
1995                         dev_err(xnfp->xnf_devinfo, CE_WARN,
1996                             "xnf_mblk_copy() failed");
1997                         goto drop;
1998                 }
1999         } else {
2000                 /*
2001                  * There's a limit on how many pages can be passed to the
2002                  * backend. If we exceed that limit, the packet will be dropped
2003                  * and some backend implementations (e.g. Linux) could even
2004                  * offline the interface.
2005                  */
2006                 if (pages > XEN_MAX_TX_DATA_PAGES) {
2007                         if (pulledup) {
2008                                 dev_err(xnfp->xnf_devinfo, CE_WARN,
2009                                     "too many pages, even after pullup: %d.",
2010                                     pages);
2011                                 goto drop;
2012                         }
2013 
2014                         /*
2015                          * Defragment packet if it spans too many pages.
2016                          */
2017                         mblk_t *newmp = msgpullup(mp, -1);
2018                         freemsg(mp);
2019                         mp = newmp;
2020                         xnfp->xnf_stat_tx_pullup++;
2021                         pulledup = B_TRUE;
2022                         goto pulledup;
2023                 }
2024 
2025                 head = xnf_mblk_map(xnfp, mp, &slots);
2026                 if (head == NULL)
2027                         goto drop;
2028 
2029                 IMPLY(slots > 1, xnfp->xnf_be_tx_sg);
2030         }
2031 
2032         /*
2033          * Set tx_mp so that mblk is freed when the txbuf chain is freed.
2034          */
2035         head->tx_mp = mp;
2036 
2037         xnf_tx_setup_offload(xnfp, head, cksum_flags, lso_flags, mss);
2038 
2039         /*
2040          * The first request must store the total length of the packet.
2041          */
2042         head->tx_txreq.size = length;
2043 
2044         /*
2045          * Push the packet we have prepared into the ring.
2046          */
2047         xnf_tx_push_packet(xnfp, head);
2048         xnfp->xnf_stat_opackets++;
2049         xnfp->xnf_stat_obytes += length;
2050 
2051         mutex_exit(&xnfp->xnf_txlock);
2052         return (NULL);
2053 
2054 drop:
2055         freemsg(mp);
2056         xnfp->xnf_stat_tx_drop++;
2057         mutex_exit(&xnfp->xnf_txlock);
2058         return (NULL);
2059 }
2060 
2061 /*
2062  * Notification of RX packets. Currently no TX-complete interrupt is
2063  * used, as we clean the TX ring lazily.
2064  */
2065 static uint_t
2066 xnf_intr(caddr_t arg)
2067 {
2068         xnf_t *xnfp = (xnf_t *)arg;
2069         mblk_t *mp;
2070         boolean_t need_sched, clean_ring;
2071 
2072         mutex_enter(&xnfp->xnf_rxlock);
2073 
2074         /*
2075          * Interrupts before we are connected are spurious.
2076          */
2077         if (!xnfp->xnf_connected) {
2078                 mutex_exit(&xnfp->xnf_rxlock);
2079                 xnfp->xnf_stat_unclaimed_interrupts++;
2080                 return (DDI_INTR_UNCLAIMED);
2081         }
2082 
2083         /*
2084          * Receive side processing.
2085          */
2086         do {
2087                 /*
2088                  * Collect buffers from the ring.
2089                  */
2090                 xnf_rx_collect(xnfp);
2091 
2092                 /*
2093                  * Interrupt me when the next receive buffer is consumed.
2094                  */
2095                 xnfp->xnf_rx_ring.sring->rsp_event =
2096                     xnfp->xnf_rx_ring.rsp_cons + 1;
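                     /*
                      * Make the rsp_event update visible to the backend before
                      * we re-check for unconsumed responses, so that a response
                      * posted in this window is not missed.
                      */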
2097                 xen_mb();
2098 
2099         } while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring));
2100 
2101         if (xnfp->xnf_rx_new_buffers_posted) {
2102                 boolean_t notify;
2103 
2104                 /*
2105                  * Indicate to the peer that we have re-filled the
2106                  * receive ring, if it cares.
2107                  */
2108                 /* LINTED: constant in conditional context */
2109                 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
2110                 if (notify)
2111                         ec_notify_via_evtchn(xnfp->xnf_evtchn);
2112                 xnfp->xnf_rx_new_buffers_posted = B_FALSE;
2113         }
2114 
2115         mp = xnfp->xnf_rx_head;
2116         xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL;
2117 
2118         xnfp->xnf_stat_interrupts++;
2119         mutex_exit(&xnfp->xnf_rxlock);
2120 
2121         if (mp != NULL)
2122                 mac_rx(xnfp->xnf_mh, NULL, mp);
2123 
2124         /*
2125          * Transmit side processing.
2126          *
2127          * If a previous transmit attempt failed or we have pending
2128          * multicast requests, clean the ring.
2129          *
2130          * If we previously stalled transmission and cleaning produces
2131          * some free slots, tell upstream to attempt sending again.
2132          *
2133          * The odd style is to avoid acquiring xnf_txlock unless we
2134          * will actually look inside the tx machinery.
2135          */
2136         mutex_enter(&xnfp->xnf_schedlock);
2137         need_sched = xnfp->xnf_need_sched;
2138         clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0);
2139         mutex_exit(&xnfp->xnf_schedlock);
2140 
2141         if (clean_ring) {
2142                 int free_slots;
2143 
2144                 mutex_enter(&xnfp->xnf_txlock);
2145                 free_slots = xnf_tx_slots_get(xnfp, 0, B_FALSE);
2146 
2147                 if (need_sched && (free_slots >= XEN_MAX_SLOTS_PER_TX)) {
2148                         mutex_enter(&xnfp->xnf_schedlock);
2149                         xnfp->xnf_need_sched = B_FALSE;
2150                         mutex_exit(&xnfp->xnf_schedlock);
2151 
2152                         mac_tx_update(xnfp->xnf_mh);
2153                 }
2154                 mutex_exit(&xnfp->xnf_txlock);
2155         }
2156 
2157         return (DDI_INTR_CLAIMED);
2158 }
2159 
2160 /*
2161  *  xnf_start() -- start the device; called by the MAC framework.
2162  */
2163 static int
2164 xnf_start(void *arg)
2165 {
2166         xnf_t *xnfp = arg;
2167 
2168 #ifdef XNF_DEBUG
2169         if (xnf_debug & XNF_DEBUG_TRACE)
2170                 printf("xnf%d start(0x%p)\n",
2171                     ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
2172 #endif
2173 
2174         mutex_enter(&xnfp->xnf_rxlock);
2175         mutex_enter(&xnfp->xnf_txlock);
2176 
2177         /* Accept packets from above. */
2178         xnfp->xnf_running = B_TRUE;
2179 
2180         mutex_exit(&xnfp->xnf_txlock);
2181         mutex_exit(&xnfp->xnf_rxlock);
2182 
2183         return (0);
2184 }
2185 
2186 /* xnf_stop() - stop the device; called by the MAC framework. */
2187 static void
2188 xnf_stop(void *arg)
2189 {
2190         xnf_t *xnfp = arg;
2191 
2192 #ifdef XNF_DEBUG
2193         if (xnf_debug & XNF_DEBUG_TRACE)
2194                 printf("xnf%d stop(0x%p)\n",
2195                     ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
2196 #endif
2197 
2198         mutex_enter(&xnfp->xnf_rxlock);
2199         mutex_enter(&xnfp->xnf_txlock);
2200 
2201         xnfp->xnf_running = B_FALSE;
2202 
2203         mutex_exit(&xnfp->xnf_txlock);
2204         mutex_exit(&xnfp->xnf_rxlock);
2205 }
2206 
2207 /*
2208  * Hang buffer `bdesc' on the RX ring.
2209  */
2210 static void
2211 xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc)
2212 {
2213         netif_rx_request_t *reqp;
2214         RING_IDX hang_ix;
2215 
2216         ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
2217 
2218         reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
2219             xnfp->xnf_rx_ring.req_prod_pvt);
2220         hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
2221         ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL);
2222 
2223         reqp->id = bdesc->id = hang_ix;
2224         reqp->gref = bdesc->grant_ref;
2225 
2226         xnfp->xnf_rx_pkt_info[hang_ix] = bdesc;
2227         xnfp->xnf_rx_ring.req_prod_pvt++;
2228 
2229         xnfp->xnf_rx_new_buffers_posted = B_TRUE;
2230 }
2231 
2232 /*
2233  * Receive an entire packet from the ring, starting from slot *consp.
2234  * prod indicates the slot of the latest response.
2235  * On return, *consp will point to the head of the next packet.
2236  *
2237  * Note: If slot prod was reached before we could gather a full packet, we will
2238  * drop the partial packet; this would most likely indicate a bug in either
2239  * the front-end or the back-end driver.
2240  *
2241  * An rx packet can consist of several fragments and thus span multiple slots.
2242  * Each fragment can contain up to 4k of data.
2243  *
2244  * A typical 9000 MTU packet will look like this:
2245  * +------+---------------------+-------------------+-----------------------+
2246  * | SLOT | TYPE                | CONTENTS          | FLAGS                 |
2247  * +------+---------------------+-------------------+-----------------------+
2248  * | 1    | netif_rx_response_t | 1st data fragment | more_data             |
2249  * +------+---------------------+-------------------+-----------------------+
2250  * | 2    | netif_rx_response_t | 2nd data fragment | more_data             |
2251  * +------+---------------------+-------------------+-----------------------+
2252  * | 3    | netif_rx_response_t | 3rd data fragment | [none]                |
2253  * +------+---------------------+-------------------+-----------------------+
2254  *
2255  * Fragments are chained by setting NETRXF_more_data in the previous
2256  * response's flags. If there are additional flags, such as
2257  * NETRXF_data_validated or NETRXF_extra_info, those should be set on the
2258  * first fragment.
2259  *
2260  * Sometimes extra info can be present. If so, it will follow the first
2261  * fragment, with the NETRXF_extra_info flag set on the first response.
2262  * If LRO is set on a packet, it will be stored in the extra info. Conforming
2263  * to the spec, extra info can also be chained, but must all be present right
2264  * after the first fragment.
2265  *
2266  * Example of a packet with 2 extra infos:
2267  * +------+---------------------+-------------------+-----------------------+
2268  * | SLOT | TYPE                | CONTENTS          | FLAGS                 |
2269  * +------+---------------------+-------------------+-----------------------+
2270  * | 1    | netif_rx_response_t | 1st data fragment | extra_info, more_data |
2271  * +------+---------------------+-------------------+-----------------------+
2272  * | 2    | netif_extra_info_t  | 1st extra info    | EXTRA_FLAG_MORE       |
2273  * +------+---------------------+-------------------+-----------------------+
2274  * | 3    | netif_extra_info_t  | 2nd extra info    | [none]                |
2275  * +------+---------------------+-------------------+-----------------------+
2276  * | 4    | netif_rx_response_t | 2nd data fragment | more_data             |
2277  * +------+---------------------+-------------------+-----------------------+
2278  * | 5    | netif_rx_response_t | 3rd data fragment | more_data             |
2279  * +------+---------------------+-------------------+-----------------------+
2280  * | 6    | netif_rx_response_t | 4th data fragment | [none]                |
2281  * +------+---------------------+-------------------+-----------------------+
2282  *
2283  * In practice, the only extra we expect is for LRO, but only if we advertise
2284  * that we support it to the backend (xnf_enable_lro == TRUE).
2285  */
2286 static int
2287 xnf_rx_one_packet(xnf_t *xnfp, RING_IDX prod, RING_IDX *consp, mblk_t **mpp)
2288 {
2289         mblk_t *head = NULL;
2290         mblk_t *tail = NULL;
2291         mblk_t *mp;
2292         int error = 0;
2293         RING_IDX cons = *consp;
2294         netif_extra_info_t lro;
2295         boolean_t is_lro = B_FALSE;
2296         boolean_t is_extra = B_FALSE;
2297 
2298         netif_rx_response_t rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons);
2299 
2300         boolean_t hwcsum = (rsp.flags & NETRXF_data_validated) != 0;
2301         boolean_t more_data = (rsp.flags & NETRXF_more_data) != 0;
2302         boolean_t more_extra = (rsp.flags & NETRXF_extra_info) != 0;
2303 
2304         IMPLY(more_data, xnf_enable_rx_sg);
2305 
2306         while (cons != prod) {
2307                 xnf_buf_t *bdesc;
2308                 int len, off;
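                     /*
                      * NET_RX_RING_SIZE is a power of two, so masking with
                      * (NET_RX_RING_SIZE - 1) gives the ring index modulo the
                      * ring size.
                      */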
2309                 int rxidx = cons & (NET_RX_RING_SIZE - 1);
2310 
2311                 bdesc = xnfp->xnf_rx_pkt_info[rxidx];
2312                 xnfp->xnf_rx_pkt_info[rxidx] = NULL;
2313 
2314                 if (is_extra) {
2315                         netif_extra_info_t *extra = (netif_extra_info_t *)&rsp;
2316                         /*
2317                          * The only extra we expect is for LRO, and it should
2318                          * only be present once.
2319                          */
2320                         if (extra->type == XEN_NETIF_EXTRA_TYPE_GSO &&
2321                             !is_lro) {
2322                                 ASSERT(xnf_enable_lro);
2323                                 lro = *extra;
2324                                 is_lro = B_TRUE;
2325                                 DTRACE_PROBE1(lro, netif_extra_info_t *, &lro);
2326                         } else {
2327                                 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx packet "
2328                                     "contains unexpected extra info of type %d",
2329                                     extra->type);
2330                                 error = EINVAL;
2331                         }
2332                         more_extra =
2333                             (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE) != 0;
2334 
2335                         goto hang_buf;
2336                 }
2337 
2338                 ASSERT3U(bdesc->id, ==, rsp.id);
2339 
2340                 /*
2341                  * status is the packet length (>= 0) or an error code (< 0).
2342                  */
2343                 len = rsp.status;
2344                 off = rsp.offset;
2345                 more_data = (rsp.flags & NETRXF_more_data) != 0;
2346 
2347                 /*
2348                  * sanity checks.
2349                  */
2350                 if (!xnfp->xnf_running) {
2351                         error = EBUSY;
2352                 } else if (len <= 0) {
2353                         xnfp->xnf_stat_errrx++;
2354 
2355                         switch (len) {
2356                         case 0:
2357                                 xnfp->xnf_stat_runt++;
2358                                 break;
2359                         case NETIF_RSP_ERROR:
2360                                 xnfp->xnf_stat_mac_rcv_error++;
2361                                 break;
2362                         case NETIF_RSP_DROPPED:
2363                                 xnfp->xnf_stat_norxbuf++;
2364                                 break;
2365                         }
2366                         error = EINVAL;
2367                 } else if (bdesc->grant_ref == INVALID_GRANT_REF) {
2368                         dev_err(xnfp->xnf_devinfo, CE_WARN,
2369                             "Bad rx grant reference, rsp id %d", rsp.id);
2370                         error = EINVAL;
2371                 } else if ((off + len) > PAGESIZE) {
2372                         dev_err(xnfp->xnf_devinfo, CE_WARN, "Rx packet crosses "
2373                             "page boundary (offset %d, length %d)", off, len);
2374                         error = EINVAL;
2375                 }
2376 
2377                 if (error != 0) {
2378                         /*
2379                          * If an error has been detected, we do not attempt
2380                          * to read the data but we still need to replace
2381                          * the rx bufs.
2382                          */
2383                         goto hang_buf;
2384                 }
2385 
2386                 xnf_buf_t *nbuf = NULL;
2387 
2388                 /*
2389                  * If the packet is below a pre-determined size we will
2390                  * copy data out of the buf rather than replace it.
2391                  */
2392                 if (len > xnf_rx_copy_limit)
2393                         nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE);
2394 
2395                 if (nbuf != NULL) {
2396                         mp = desballoc((unsigned char *)bdesc->buf,
2397                             bdesc->len, 0, &bdesc->free_rtn);
2398 
2399                         if (mp == NULL) {
2400                                 xnfp->xnf_stat_rx_desballoc_fail++;
2401                                 xnfp->xnf_stat_norxbuf++;
2402                                 error = ENOMEM;
2403                                 /*
2404                                  * we free the buf we just allocated as we
2405                                  * will re-hang the old buf.
2406                                  */
2407                                 xnf_buf_put(xnfp, nbuf, B_FALSE);
2408                                 goto hang_buf;
2409                         }
2410 
2411                         mp->b_rptr = mp->b_rptr + off;
2412                         mp->b_wptr = mp->b_rptr + len;
2413 
2414                         /*
2415                          * Release the grant as the backend doesn't need to
2416                          * access this buffer anymore and grants are scarce.
2417                          */
2418                         (void) gnttab_end_foreign_access_ref(bdesc->grant_ref,
2419                             0);
2420                         xnf_gref_put(xnfp, bdesc->grant_ref);
2421                         bdesc->grant_ref = INVALID_GRANT_REF;
2422 
2423                         bdesc = nbuf;
2424                 } else {
2425                         /*
2426                          * We failed to allocate a new buf or decided to reuse
2427                          * the old one. In either case we copy the data off it
2428                          * and put it back into the ring.
2429                          */
2430                         mp = allocb(len, 0);
2431                         if (mp == NULL) {
2432                                 xnfp->xnf_stat_rx_allocb_fail++;
2433                                 xnfp->xnf_stat_norxbuf++;
2434                                 error = ENOMEM;
2435                                 goto hang_buf;
2436                         }
2437                         bcopy(bdesc->buf + off, mp->b_wptr, len);
2438                         mp->b_wptr += len;
2439                 }
2440 
2441                 if (head == NULL)
2442                         head = mp;
2443                 else
2444                         tail->b_cont = mp;
2445                 tail = mp;
2446 
2447 hang_buf:
2448                 /*
2449                  * No matter what happens, for each response we need to hang
2450                  * a buf back on the rx ring: either the old one, or a new one
2451                  * if the old buf has been loaned to the kernel via desballoc().
2452                  */
2453                 xnf_rxbuf_hang(xnfp, bdesc);
2454                 cons++;
2455 
2456                 /* next response is an extra */
2457                 is_extra = more_extra;
2458 
2459                 if (!more_data && !more_extra)
2460                         break;
2461 
2462                 /*
2463                  * Note that since requests and responses are union'd on the
2464                  * same ring, we copy the response to a local variable instead
2465                  * of keeping a pointer. Otherwise xnf_rxbuf_hang() would have
2466                  * overwritten the contents of rsp.
2467                  */
2468                 rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons);
2469         }
2470 
2471         /*
2472          * Check that we do not get stuck in a loop.
2473          */
2474         ASSERT3U(*consp, !=, cons);
2475         *consp = cons;
2476 
2477         /*
2478          * We ran out of responses but the flags indicate there is more data.
2479          */
2480         if (more_data) {
2481                 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments.");
2482                 error = EINVAL;
2483         }
2484         if (more_extra) {
2485                 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments "
2486                     "(extras).");
2487                 error = EINVAL;
2488         }
2489 
2490         /*
2491          * An error means the packet must be dropped. If we have already formed
2492          * a partial packet, then discard it.
2493          */
2494         if (error != 0) {
2495                 if (head != NULL)
2496                         freemsg(head);
2497                 xnfp->xnf_stat_rx_drop++;
2498                 return (error);
2499         }
2500 
2501         ASSERT(head != NULL);
2502 
2503         if (hwcsum) {
2504                 /*
2505                  * If the peer says that the data has been validated then we
2506                  * declare that the full checksum has been verified.
2507                  *
2508                  * We don't look at the "checksum blank" flag, and hence could
2509                  * have a packet here that we are asserting is good with
2510                  * a blank checksum.
2511                  */
2512                 mac_hcksum_set(head, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
2513                 xnfp->xnf_stat_rx_cksum_no_need++;
2514         }
2515 
2516         /* XXX: set lro info for packet once LRO is supported in OS. */
2517 
2518         *mpp = head;
2519 
2520         return (0);
2521 }
2522 
2523 /*
2524  * Collect packets from the RX ring, storing them in `xnfp' for later use.
2525  */
2526 static void
2527 xnf_rx_collect(xnf_t *xnfp)
2528 {
2529         RING_IDX prod;
2530 
2531         ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
2532 
2533         prod = xnfp->xnf_rx_ring.sring->rsp_prod;
2534         /*
2535          * Ensure we see queued responses up to 'prod'.
2536          */
2537         membar_consumer();
2538 
2539         while (xnfp->xnf_rx_ring.rsp_cons != prod) {
2540                 mblk_t *mp;
2541 
2542                 /*
2543                  * Collect a packet.
2544                  * rsp_cons is updated inside xnf_rx_one_packet().
2545                  */
2546                 int error = xnf_rx_one_packet(xnfp, prod,
2547                     &xnfp->xnf_rx_ring.rsp_cons, &mp);
2548                 if (error == 0) {
2549                         xnfp->xnf_stat_ipackets++;
2550                         xnfp->xnf_stat_rbytes += xmsgsize(mp);
2551 
2552                         /*
2553                          * Append the mblk to the rx list.
2554                          */
2555                         if (xnfp->xnf_rx_head == NULL) {
2556                                 ASSERT3P(xnfp->xnf_rx_tail, ==, NULL);
2557                                 xnfp->xnf_rx_head = mp;
2558                         } else {
2559                                 ASSERT(xnfp->xnf_rx_tail != NULL);
2560                                 xnfp->xnf_rx_tail->b_next = mp;
2561                         }
2562                         xnfp->xnf_rx_tail = mp;
2563                 }
2564         }
2565 }
2566 
2567 /*
2568  *  xnf_alloc_dma_resources() -- initialize the driver's DMA structures
2569  */
2570 static int
2571 xnf_alloc_dma_resources(xnf_t *xnfp)
2572 {
2573         dev_info_t              *devinfo = xnfp->xnf_devinfo;
2574         size_t                  len;
2575         ddi_dma_cookie_t        dma_cookie;
2576         uint_t                  ncookies;
2577         int                     rc;
2578         caddr_t                 rptr;
2579 
2580         /*
2581          * The code below allocates all the DMA data structures that
2582          * need to be released when the driver is detached.
2583          *
2584          * Allocate a page for the transmit descriptor ring.
2585          */
2586         if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2587             DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS)
2588                 goto alloc_error;
2589 
2590         if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle,
2591             PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2592             DDI_DMA_SLEEP, 0, &rptr, &len,
2593             &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) {
2594                 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2595                 xnfp->xnf_tx_ring_dma_handle = NULL;
2596                 goto alloc_error;
2597         }
2598 
2599         if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL,
2600             rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2601             DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2602                 ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2603                 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2604                 xnfp->xnf_tx_ring_dma_handle = NULL;
2605                 xnfp->xnf_tx_ring_dma_acchandle = NULL;
2606                 if (rc == DDI_DMA_NORESOURCES)
2607                         goto alloc_error;
2608                 else
2609                         goto error;
2610         }
2611 
2612         ASSERT(ncookies == 1);
2613         bzero(rptr, PAGESIZE);
2614         /* LINTED: constant in conditional context */
2615         SHARED_RING_INIT((netif_tx_sring_t *)rptr);
2616         /* LINTED: constant in conditional context */
2617         FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
2618         xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress;
2619 
2620         /*
2621          * Allocate a page for the receive descriptor ring.
2622          */
2623         if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2624             DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS)
2625                 goto alloc_error;
2626 
2627         if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle,
2628             PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2629             DDI_DMA_SLEEP, 0, &rptr, &len,
2630             &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) {
2631                 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2632                 xnfp->xnf_rx_ring_dma_handle = NULL;
2633                 goto alloc_error;
2634         }
2635 
2636         if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL,
2637             rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2638             DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2639                 ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2640                 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2641                 xnfp->xnf_rx_ring_dma_handle = NULL;
2642                 xnfp->xnf_rx_ring_dma_acchandle = NULL;
2643                 if (rc == DDI_DMA_NORESOURCES)
2644                         goto alloc_error;
2645                 else
2646                         goto error;
2647         }
2648 
2649         ASSERT(ncookies == 1);
2650         bzero(rptr, PAGESIZE);
2651         /* LINTED: constant in conditional context */
2652         SHARED_RING_INIT((netif_rx_sring_t *)rptr);
2653         /* LINTED: constant in conditional context */
2654         FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
2655         xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;
2656 
2657         return (DDI_SUCCESS);
2658 
2659 alloc_error:
2660         cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
2661             ddi_get_instance(xnfp->xnf_devinfo));
2662 error:
2663         xnf_release_dma_resources(xnfp);
2664         return (DDI_FAILURE);
2665 }
2666 
2667 /*
2668  * Release all DMA resources in the opposite order from acquisition.
2669  */
2670 static void
2671 xnf_release_dma_resources(xnf_t *xnfp)
2672 {
2673         int i;
2674 
2675         /*
2676          * Free receive buffers which are currently associated with
2677          * descriptors.
2678          */
2679         mutex_enter(&xnfp->xnf_rxlock);
2680         for (i = 0; i < NET_RX_RING_SIZE; i++) {
2681                 xnf_buf_t *bp;
2682 
2683                 if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL)
2684                         continue;
2685                 xnfp->xnf_rx_pkt_info[i] = NULL;
2686                 xnf_buf_put(xnfp, bp, B_FALSE);
2687         }
2688         mutex_exit(&xnfp->xnf_rxlock);
2689 
2690         /* Free the receive ring buffer. */
2691         if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
2692                 (void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
2693                 ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2694                 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2695                 xnfp->xnf_rx_ring_dma_acchandle = NULL;
2696         }
2697         /* Free the transmit ring buffer. */
2698         if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
2699                 (void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
2700                 ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2701                 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2702                 xnfp->xnf_tx_ring_dma_acchandle = NULL;
2703         }
2705 }
2706 
2707 /*
2708  * Release any packets and associated structures used by the TX ring.
2709  */
2710 static void
2711 xnf_release_mblks(xnf_t *xnfp)
2712 {
2713         RING_IDX i;
2714         xnf_txid_t *tidp;
2715 
2716         for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
2717             i < NET_TX_RING_SIZE;
2718             i++, tidp++) {
2719                 xnf_txbuf_t *txp = tidp->txbuf;
2720 
2721                 if (txp != NULL) {
2722                         ASSERT(txp->tx_mp != NULL);
2723                         freemsg(txp->tx_mp);
2724 
2725                         xnf_txid_put(xnfp, tidp);
2726                         kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
2727                 }
2728         }
2729 }
2730 
2731 static int
2732 xnf_buf_constructor(void *buf, void *arg, int kmflag)
2733 {
2734         int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2735         xnf_buf_t *bdesc = buf;
2736         xnf_t *xnfp = arg;
2737         ddi_dma_cookie_t dma_cookie;
2738         uint_t ncookies;
2739         size_t len;
2740 
2741         if (kmflag & KM_NOSLEEP)
2742                 ddiflags = DDI_DMA_DONTWAIT;
2743 
2744         /* Allocate a DMA access handle for the buffer. */
2745         if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buf_dma_attr,
2746             ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2747                 goto failure;
2748 
2749         /* Allocate DMA-able memory for buffer. */
2750         if (ddi_dma_mem_alloc(bdesc->dma_handle,
2751             PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0,
2752             &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2753                 goto failure_1;
2754 
2755         /* Bind to virtual address of buffer to get physical address. */
2756         if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
2757             bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING,
2758             ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
2759                 goto failure_2;
2760         ASSERT(ncookies == 1);
2761 
2762         bdesc->free_rtn.free_func = xnf_buf_recycle;
2763         bdesc->free_rtn.free_arg = (caddr_t)bdesc;
2764         bdesc->xnfp = xnfp;
2765         bdesc->buf_phys = dma_cookie.dmac_laddress;
2766         bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2767         bdesc->len = dma_cookie.dmac_size;
2768         bdesc->grant_ref = INVALID_GRANT_REF;
2769         bdesc->gen = xnfp->xnf_gen;
2770 
2771         atomic_inc_64(&xnfp->xnf_stat_buf_allocated);
2772 
2773         return (0);
2774 
2775 failure_2:
2776         ddi_dma_mem_free(&bdesc->acc_handle);
2777 
2778 failure_1:
2779         ddi_dma_free_handle(&bdesc->dma_handle);
2780 
2781 failure:
2782 
2783         ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
2784         return (-1);
2785 }
2786 
2787 static void
2788 xnf_buf_destructor(void *buf, void *arg)
2789 {
2790         xnf_buf_t *bdesc = buf;
2791         xnf_t *xnfp = arg;
2792 
2793         (void) ddi_dma_unbind_handle(bdesc->dma_handle);
2794         ddi_dma_mem_free(&bdesc->acc_handle);
2795         ddi_dma_free_handle(&bdesc->dma_handle);
2796 
2797         atomic_dec_64(&xnfp->xnf_stat_buf_allocated);
2798 }
2799 
2800 static xnf_buf_t *
2801 xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly)
2802 {
2803         grant_ref_t gref;
2804         xnf_buf_t *bufp;
2805 
2806         /*
2807          * Usually grant references are more scarce than memory, so we
2808          * attempt to acquire a grant reference first.
2809          */
2810         gref = xnf_gref_get(xnfp);
2811         if (gref == INVALID_GRANT_REF)
2812                 return (NULL);
2813 
2814         bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags);
2815         if (bufp == NULL) {
2816                 xnf_gref_put(xnfp, gref);
2817                 return (NULL);
2818         }
2819 
2820         ASSERT3U(bufp->grant_ref, ==, INVALID_GRANT_REF);
2821 
2822         bufp->grant_ref = gref;
2823 
2824         if (bufp->gen != xnfp->xnf_gen)
2825                 xnf_buf_refresh(bufp);
2826 
2827         gnttab_grant_foreign_access_ref(bufp->grant_ref,
2828             xvdi_get_oeid(bufp->xnfp->xnf_devinfo),
2829             bufp->buf_mfn, readonly ? 1 : 0);
2830 
2831         atomic_inc_64(&xnfp->xnf_stat_buf_outstanding);
2832 
2833         return (bufp);
2834 }
2835 
2836 static void
2837 xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly)
2838 {
2839         if (bufp->grant_ref != INVALID_GRANT_REF) {
2840                 (void) gnttab_end_foreign_access_ref(
2841                     bufp->grant_ref, readonly ? 1 : 0);
2842                 xnf_gref_put(xnfp, bufp->grant_ref);
2843                 bufp->grant_ref = INVALID_GRANT_REF;
2844         }
2845 
2846         kmem_cache_free(xnfp->xnf_buf_cache, bufp);
2847 
2848         atomic_dec_64(&xnfp->xnf_stat_buf_outstanding);
2849 }
2850 
2851 /*
2852  * Refresh any cached data about a buffer after resume.
2853  */
2854 static void
2855 xnf_buf_refresh(xnf_buf_t *bdesc)
2856 {
2857         bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2858         bdesc->gen = bdesc->xnfp->xnf_gen;
2859 }
2860 
2861 /*
2862  * Streams `freeb' routine for `xnf_buf_t' when used as transmit
2863  * look-aside buffers.
2864  */
2865 static void
2866 xnf_buf_recycle(xnf_buf_t *bdesc)
2867 {
2868         xnf_t *xnfp = bdesc->xnfp;
2869 
2870         xnf_buf_put(xnfp, bdesc, B_TRUE);
2871 }
2872 
2873 static int
2874 xnf_tx_buf_constructor(void *buf, void *arg, int kmflag)
2875 {
2876         int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2877         xnf_txbuf_t *txp = buf;
2878         xnf_t *xnfp = arg;
2879 
2880         if (kmflag & KM_NOSLEEP)
2881                 ddiflags = DDI_DMA_DONTWAIT;
2882 
2883         if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buf_dma_attr,
2884             ddiflags, 0, &txp->tx_dma_handle) != DDI_SUCCESS) {
2885                 ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
2886                 return (-1);
2887         }
2888 
2889         return (0);
2890 }
2891 
2892 static void
2893 xnf_tx_buf_destructor(void *buf, void *arg)
2894 {
2895         _NOTE(ARGUNUSED(arg));
2896         xnf_txbuf_t *txp = buf;
2897 
2898         ddi_dma_free_handle(&txp->tx_dma_handle);
2899 }
2900 
2901 /*
2902  * Statistics.
2903  */
2904 static char *xnf_aux_statistics[] = {
2905         "tx_cksum_deferred",
2906         "rx_cksum_no_need",
2907         "interrupts",
2908         "unclaimed_interrupts",
2909         "tx_pullup",
2910         "tx_lookaside",
2911         "tx_drop",
2912         "tx_eth_hdr_split",
2913         "buf_allocated",
2914         "buf_outstanding",
2915         "gref_outstanding",
2916         "gref_failure",
2917         "gref_peak",
2918         "rx_allocb_fail",
2919         "rx_desballoc_fail",
2920 };
2921 
2922 static int
2923 xnf_kstat_aux_update(kstat_t *ksp, int flag)
2924 {
2925         xnf_t *xnfp;
2926         kstat_named_t *knp;
2927 
2928         if (flag != KSTAT_READ)
2929                 return (EACCES);
2930 
2931         xnfp = ksp->ks_private;
2932         knp = ksp->ks_data;
2933 
2934         /*
2935          * Assignment order must match that of the names in
2936          * xnf_aux_statistics.
2937          */
2938         (knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred;
2939         (knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need;
2940 
2941         (knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
2942         (knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
2943         (knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
2944         (knp++)->value.ui64 = xnfp->xnf_stat_tx_lookaside;
2945         (knp++)->value.ui64 = xnfp->xnf_stat_tx_drop;
2946         (knp++)->value.ui64 = xnfp->xnf_stat_tx_eth_hdr_split;
2947 
2948         (knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated;
2949         (knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding;
2950         (knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding;
2951         (knp++)->value.ui64 = xnfp->xnf_stat_gref_failure;
2952         (knp++)->value.ui64 = xnfp->xnf_stat_gref_peak;
2953         (knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail;
2954         (knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail;
2955 
2956         return (0);
2957 }
2958 
2959 static boolean_t
2960 xnf_kstat_init(xnf_t *xnfp)
2961 {
2962         int nstat = sizeof (xnf_aux_statistics) /
2963             sizeof (xnf_aux_statistics[0]);
2964         char **cp = xnf_aux_statistics;
2965         kstat_named_t *knp;
2966 
2967         /*
2968          * Create and initialise kstats.
2969          */
2970         if ((xnfp->xnf_kstat_aux = kstat_create("xnf",
2971             ddi_get_instance(xnfp->xnf_devinfo),
2972             "aux_statistics", "net", KSTAT_TYPE_NAMED,
2973             nstat, 0)) == NULL)
2974                 return (B_FALSE);
2975 
2976         xnfp->xnf_kstat_aux->ks_private = xnfp;
2977         xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update;
2978 
2979         knp = xnfp->xnf_kstat_aux->ks_data;
2980         while (nstat > 0) {
2981                 kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
2982 
2983                 knp++;
2984                 cp++;
2985                 nstat--;
2986         }
2987 
2988         kstat_install(xnfp->xnf_kstat_aux);
2989 
2990         return (B_TRUE);
2991 }
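     /*
      * Illustration only: the counters published above can be read from
      * userland, e.g. "kstat -p xnf:0:aux_statistics" (instance 0 assumed),
      * or via libkstat along these lines:
      *
      *      kstat_ctl_t *kc = kstat_open();
      *      kstat_t *ksp = kstat_lookup(kc, "xnf", 0, "aux_statistics");
      *      if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
      *              kstat_named_t *kn =
      *                  kstat_data_lookup(ksp, "tx_drop");
      *              (the counter is then in kn->value.ui64)
      *      }
      *      (void) kstat_close(kc);
      */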
2992 
2993 static int
2994 xnf_stat(void *arg, uint_t stat, uint64_t *val)
2995 {
2996         xnf_t *xnfp = arg;
2997 
2998         mutex_enter(&xnfp->xnf_rxlock);
2999         mutex_enter(&xnfp->xnf_txlock);
3000 
3001 #define mac_stat(q, r)                          \
3002         case (MAC_STAT_##q):                    \
3003                 *val = xnfp->xnf_stat_##r;      \
3004                 break
3005 
3006 #define ether_stat(q, r)                        \
3007         case (ETHER_STAT_##q):                  \
3008                 *val = xnfp->xnf_stat_##r;      \
3009                 break
3010 
3011         switch (stat) {
3012 
3013         mac_stat(IPACKETS, ipackets);
3014         mac_stat(OPACKETS, opackets);
3015         mac_stat(RBYTES, rbytes);
3016         mac_stat(OBYTES, obytes);
3017         mac_stat(NORCVBUF, norxbuf);
3018         mac_stat(IERRORS, errrx);
3019         mac_stat(NOXMTBUF, tx_defer);
3020 
3021         ether_stat(MACRCV_ERRORS, mac_rcv_error);
3022         ether_stat(TOOSHORT_ERRORS, runt);
3023 
3024         /* always claim to be in full duplex mode */
3025         case ETHER_STAT_LINK_DUPLEX:
3026                 *val = LINK_DUPLEX_FULL;
3027                 break;
3028 
3029         /* always claim to be at 1Gb/s link speed */
3030         case MAC_STAT_IFSPEED:
3031                 *val = 1000000000ull;
3032                 break;
3033 
3034         default:
3035                 mutex_exit(&xnfp->xnf_txlock);
3036                 mutex_exit(&xnfp->xnf_rxlock);
3037 
3038                 return (ENOTSUP);
3039         }
3040 
3041 #undef mac_stat
3042 #undef ether_stat
3043 
3044         mutex_exit(&xnfp->xnf_txlock);
3045         mutex_exit(&xnfp->xnf_rxlock);
3046 
3047         return (0);
3048 }
3049 
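     /*
      * Change the MTU. An MTU larger than ETHERMTU requires scatter-gather
      * support on both the transmit and receive paths, in the driver
      * configuration as well as in the backend, and must not exceed
      * XNF_MAXPKT.
      */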
3050 static int
3051 xnf_change_mtu(xnf_t *xnfp, uint32_t mtu)
3052 {
3053         if (mtu > ETHERMTU) {
3054                 if (!xnf_enable_tx_sg) {
3055                         dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3056                             "because scatter-gather is disabled for transmit "
3057                             "in driver settings", ETHERMTU);
3058                         return (EINVAL);
3059                 } else if (!xnf_enable_rx_sg) {
3060                         dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3061                             "because scatter-gather is disabled for receive "
3062                             "in driver settings", ETHERMTU);
3063                         return (EINVAL);
3064                 } else if (!xnfp->xnf_be_tx_sg) {
3065                         dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3066                             "because backend doesn't support scatter-gather",
3067                             ETHERMTU);
3068                         return (EINVAL);
3069                 }
3070                 if (mtu > XNF_MAXPKT)
3071                         return (EINVAL);
3072         }
3073         int error = mac_maxsdu_update(xnfp->xnf_mh, mtu);
3074         if (error == 0)
3075                 xnfp->xnf_mtu = mtu;
3076 
3077         return (error);
3078 }
3079 
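     /*
      * Retrieve the current value of a MAC property. Only MAC_PROP_MTU is
      * supported.
      */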
3080 /*ARGSUSED*/
3081 static int
3082 xnf_getprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
3083     uint_t prop_val_size, void *prop_val)
3084 {
3085         xnf_t *xnfp = data;
3086 
3087         switch (prop_id) {
3088         case MAC_PROP_MTU:
3089                 ASSERT(prop_val_size >= sizeof (uint32_t));
3090                 bcopy(&xnfp->xnf_mtu, prop_val, sizeof (uint32_t));
3091                 break;
3092         default:
3093                 return (ENOTSUP);
3094         }
3095         return (0);
3096 }
3097 
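     /*
      * Set a MAC property. Only MAC_PROP_MTU is supported; the change is
      * handled by xnf_change_mtu().
      */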
3098 /*ARGSUSED*/
3099 static int
3100 xnf_setprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
3101     uint_t prop_val_size, const void *prop_val)
3102 {
3103         xnf_t *xnfp = data;
3104         uint32_t new_mtu;
3105         int error;
3106 
3107         switch (prop_id) {
3108         case MAC_PROP_MTU:
3109                 ASSERT(prop_val_size >= sizeof (uint32_t));
3110                 bcopy(prop_val, &new_mtu, sizeof (new_mtu));
3111                 error = xnf_change_mtu(xnfp, new_mtu);
3112                 break;
3113         default:
3114                 return (ENOTSUP);
3115         }
3116 
3117         return (error);
3118 }
3119 
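     /*
      * Describe the range of values accepted for each supported MAC
      * property; the MTU may be set anywhere from 0 to XNF_MAXPKT.
      */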
3120 /*ARGSUSED*/
3121 static void
3122 xnf_propinfo(void *data, const char *prop_name, mac_prop_id_t prop_id,
3123     mac_prop_info_handle_t prop_handle)
3124 {
3125         switch (prop_id) {
3126         case MAC_PROP_MTU:
3127                 mac_prop_info_set_range_uint32(prop_handle, 0, XNF_MAXPKT);
3128                 break;
3129         default:
3130                 break;
3131         }
3132 }
3133 
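     /*
      * Report device capabilities (hardware checksum offload and, when the
      * backend supports it, LSO) to the MAC layer.
      */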
3134 static boolean_t
3135 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
3136 {
3137         xnf_t *xnfp = arg;
3138 
3139         switch (cap) {
3140         case MAC_CAPAB_HCKSUM: {
3141                 uint32_t *capab = cap_data;
3142 
3143                 /*
3144                  * Whilst the flag used to communicate with the IO
3145                  * domain is called "NETTXF_csum_blank", the checksum
3146                  * in the packet must contain the pseudo-header
3147                  * checksum and not zero.
3148                  *
3149                  * To help out the IO domain, we might use
3150                  * HCKSUM_INET_PARTIAL. Unfortunately our stack will
3151                  * then use checksum offload for IPv6 packets, which
3152                  * the IO domain can't handle.
3153                  *
3154                  * As a result, we declare ourselves capable of
3155                  * HCKSUM_INET_FULL_V4. This means that we receive
3156                  * IPv4 packets from the stack with a blank checksum
3157                  * field and must insert the pseudo-header checksum
3158                  * before passing the packet to the IO domain.
3159                  */
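                     /*
                      * For reference, the IPv4 pseudo-header that must be
                      * included in that checksum consists of the source and
                      * destination addresses, the protocol number and the
                      * TCP/UDP length (RFC 768, RFC 793).
                      */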
3160                 *capab = HCKSUM_INET_FULL_V4;
3161 
3162                 /*
3163                  * TODO: query the "feature-ipv6-csum-offload" capability.
3164                  * If enabled, that could allow us to use HCKSUM_INET_PARTIAL.
3165                  */
3166 
3167                 break;
3168         }
3169         case MAC_CAPAB_LSO: {
3170                 if (!xnfp->xnf_be_lso)
3171                         return (B_FALSE);
3172 
3173                 mac_capab_lso_t *lso = cap_data;
3174                 lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
3175                 lso->lso_basic_tcp_ipv4.lso_max = IP_MAXPACKET;
3176                 break;
3177         }
3178         default:
3179                 return (B_FALSE);
3180         }
3181 
3182         return (B_TRUE);
3183 }
3184 
3185 /*
3186  * The state of the peer has changed - react accordingly.
3187  */
3188 static void
3189 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
3190     void *arg, void *impl_data)
3191 {
3192         _NOTE(ARGUNUSED(id, arg));
3193         xnf_t *xnfp = ddi_get_driver_private(dip);
3194         XenbusState new_state = *(XenbusState *)impl_data;
3195 
3196         ASSERT(xnfp != NULL);
3197 
3198         switch (new_state) {
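             /*
              * No local action is required for these states.
              */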
3199         case XenbusStateUnknown:
3200         case XenbusStateInitialising:
3201         case XenbusStateInitialised:
3202         case XenbusStateClosing:
3203         case XenbusStateClosed:
3204         case XenbusStateReconfiguring:
3205         case XenbusStateReconfigured:
3206                 break;
3207 
3208         case XenbusStateInitWait:
3209                 xnf_read_config(xnfp);
3210 
3211                 if (!xnfp->xnf_be_rx_copy) {
3212                         cmn_err(CE_WARN,
3213                             "The xnf driver requires a dom0 that "
3214                             "supports 'feature-rx-copy'.");
3215                         (void) xvdi_switch_state(xnfp->xnf_devinfo,
3216                             XBT_NULL, XenbusStateClosed);
3217                         break;
3218                 }
3219 
3220                 /*
3221                  * Connect to the backend.
3222                  */
3223                 xnf_be_connect(xnfp);
3224 
3225                 /*
3226                  * Advertise the MAC address found by xnf_read_config().
3227                  */
3228                 mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
3229 
3230                 /*
3231                  * We do not know whether features such as LSO are supported
3232                  * until we connect to the backend, so we ask the MAC layer
3233                  * to query our capabilities again.
3234                  */
3235                 mac_capab_update(xnfp->xnf_mh);
3236 
3237                 break;
3238 
3239         case XenbusStateConnected:
3240                 mutex_enter(&xnfp->xnf_rxlock);
3241                 mutex_enter(&xnfp->xnf_txlock);
3242 
3243                 xnfp->xnf_connected = B_TRUE;
3244                 /*
3245                  * Wake up any threads waiting to send data to the
3246                  * backend.
3247                  */
3248                 cv_broadcast(&xnfp->xnf_cv_state);
3249 
3250                 mutex_exit(&xnfp->xnf_txlock);
3251                 mutex_exit(&xnfp->xnf_rxlock);
3252 
3253                 /*
3254                  * Kick the peer in case it missed any transmit
3255                  * requests in the TX ring.
3256                  */
3257                 ec_notify_via_evtchn(xnfp->xnf_evtchn);
3258 
3259                 /*
3260                  * The backend may have placed completed receive requests
3261                  * in the ring after it connected but before we observed
3262                  * its state change here, so call xnf_intr() to handle
3263                  * any that are pending.
3264                  */
3265                 (void) xnf_intr((caddr_t)xnfp);
3266 
3267                 /*
3268                  * Mark the link up now that we are connected.
3269                  */
3270                 mac_link_update(xnfp->xnf_mh, LINK_STATE_UP);
3271 
3272                 /*
3273                  * Tell the backend about the multicast addresses in
3274                  * which we are interested.
3275                  */
3276                 mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE);
3277 
3278                 break;
3279 
3280         default:
3281                 break;
3282         }
3283 }