1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
  29  */
  30 
  31 /*
  32  *
  33  * Copyright (c) 2004 Christian Limpach.
  34  * All rights reserved.
  35  *
  36  * Redistribution and use in source and binary forms, with or without
  37  * modification, are permitted provided that the following conditions
  38  * are met:
  39  * 1. Redistributions of source code must retain the above copyright
  40  *    notice, this list of conditions and the following disclaimer.
  41  * 2. Redistributions in binary form must reproduce the above copyright
  42  *    notice, this list of conditions and the following disclaimer in the
  43  *    documentation and/or other materials provided with the distribution.
  44  * 3. This section intentionally left blank.
  45  * 4. The name of the author may not be used to endorse or promote products
  46  *    derived from this software without specific prior written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  49  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  50  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  51  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  52  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  53  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  54  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  55  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  56  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  57  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  58  */
  59 /*
  60  * Section 3 of the above license was updated in response to bug 6379571.
  61  */
  62 
  63 /*
  64  * xnf.c - GLDv3 network driver for domU.
  65  */
  66 
  67 /*
  68  * This driver uses four per-instance locks:
  69  *
  70  * xnf_gref_lock:
  71  *
  72  *    Protects access to the grant reference list stored in
  73  *    xnf_gref_head. Grant references should be acquired and released
  74  *    using gref_get() and gref_put() respectively.
  75  *
  76  * xnf_schedlock:
  77  *
  78  *    Protects:
  79  *    xnf_need_sched - used to record that a previous transmit attempt
  80  *       failed (and consequently it will be necessary to call
  81  *       mac_tx_update() when transmit resources are available).
  82  *    xnf_pending_multicast - the number of multicast requests that
  83  *       have been submitted to the backend for which we have not
  84  *       processed responses.
  85  *
  86  * xnf_txlock:
  87  *
  88  *    Protects the transmit ring (xnf_tx_ring) and associated
  89  *    structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head).
  90  *
  91  * xnf_rxlock:
  92  *
  93  *    Protects the receive ring (xnf_rx_ring) and associated
  94  *    structures (notably xnf_rx_pkt_info).
  95  *
  96  * If driver-global state that affects both the transmit and receive
  97  * rings is manipulated, both xnf_txlock and xnf_rxlock should be
  98  * held, in that order.
  99  *
 100  * xnf_schedlock is acquired both whilst holding xnf_txlock and
 101  * without. It should always be acquired after xnf_txlock if both are
 102  * held.
 103  *
 104  * Notes:
 105  * - atomic_add_64() is used to manipulate counters where we require
 106  *   accuracy. For counters intended only for observation by humans,
 107  *   post increment/decrement are used instead.
 108  */
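
     /*
      * As an illustrative sketch of the ordering described above, a
      * transmit-side path that must record a failed send while already
      * holding the ring lock does so as follows:
      *
      *     mutex_enter(&xnfp->xnf_txlock);
      *     ...
      *     mutex_enter(&xnfp->xnf_schedlock);
      *     xnfp->xnf_need_sched = B_TRUE;
      *     mutex_exit(&xnfp->xnf_schedlock);
      *     ...
      *     mutex_exit(&xnfp->xnf_txlock);
      */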
 109 
 110 #include <sys/types.h>
 111 #include <sys/errno.h>
 112 #include <sys/param.h>
 113 #include <sys/sysmacros.h>
 114 #include <sys/systm.h>
 115 #include <sys/stream.h>
 116 #include <sys/strsubr.h>
 117 #include <sys/strsun.h>
 118 #include <sys/conf.h>
 119 #include <sys/ddi.h>
 120 #include <sys/devops.h>
 121 #include <sys/sunddi.h>
 122 #include <sys/sunndi.h>
 123 #include <sys/dlpi.h>
 124 #include <sys/ethernet.h>
 125 #include <sys/strsun.h>
 126 #include <sys/pattr.h>
 127 #include <inet/ip.h>
 128 #include <inet/ip_impl.h>
 129 #include <inet/tcp.h>
 130 #include <netinet/udp.h>
 131 #include <sys/gld.h>
 132 #include <sys/modctl.h>
 133 #include <sys/mac_provider.h>
 134 #include <sys/mac_ether.h>
 135 #include <sys/bootinfo.h>
 136 #include <sys/mach_mmu.h>
 137 #ifdef  XPV_HVM_DRIVER
 138 #include <sys/xpv_support.h>
 139 #include <sys/hypervisor.h>
 140 #else
 141 #include <sys/hypervisor.h>
 142 #include <sys/evtchn_impl.h>
 143 #include <sys/balloon_impl.h>
 144 #endif
 145 #include <xen/public/io/netif.h>
 146 #include <sys/gnttab.h>
 147 #include <xen/sys/xendev.h>
 148 #include <sys/sdt.h>
 149 #include <sys/note.h>
 150 #include <sys/debug.h>
 151 
 152 #include <io/xnf.h>
 153 
 154 #if defined(DEBUG) || defined(__lint)
 155 #define XNF_DEBUG
 156 #endif
 157 
 158 #ifdef XNF_DEBUG
 159 int xnf_debug = 0;
 160 xnf_t *xnf_debug_instance = NULL;
 161 #endif
 162 
 163 /*
 164  * On a 32 bit PAE system physical and machine addresses are larger
 165  * than 32 bits.  ddi_btop() on such systems takes an unsigned long
 166  * argument, and so addresses above 4G are truncated before ddi_btop()
 167  * gets to see them.  To avoid this, code the shift operation here.
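      *
      * For example, with 4 KiB pages (PAGESHIFT == 12) a machine address
      * of 0x123456000 lies above 4G and shifts down to page frame
      * 0x123456; passing the same address through ddi_btop() on a 32 bit
      * kernel would truncate it to 32 bits first and yield the wrong
      * frame number.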
 168  */
 169 #define xnf_btop(addr)  ((addr) >> PAGESHIFT)
 170 
 171 /*
 172  * The parameters below should only be changed in /etc/system, never in mdb.
 173  */
 174 
 175 /*
 176  * Should we use the multicast control feature if the backend provides
 177  * it?
 178  */
 179 boolean_t xnf_multicast_control = B_TRUE;
 180 
 181 /*
 182  * Should we allow scatter-gather for tx if backend allows it?
 183  */
 184 boolean_t xnf_enable_tx_sg = B_TRUE;
 185 
 186 /*
 187  * Should we allow scatter-gather for rx if backend allows it?
 188  */
 189 boolean_t xnf_enable_rx_sg = B_TRUE;
 190 
 191 /*
 192  * Should we allow lso for tx sends if backend allows it?
 193  * Requires xnf_enable_tx_sg to also be set to B_TRUE.
 194  */
 195 boolean_t xnf_enable_lso = B_TRUE;
 196 
 197 /*
 198  * Should we allow lro on rx if backend supports it?
 199  * Requires xnf_enable_rx_sg to also be set to B_TRUE.
 200  *
 201  * !! WARNING !!
 202  * LRO is not yet supported in the OS so this should be left as FALSE.
 203  * !! WARNING !!
 204  */
 205 boolean_t xnf_enable_lro = B_FALSE;
 206 
 207 /*
 208  * Received packets below this size are copied to a new streams buffer
 209  * rather than being desballoc'ed.
 210  *
 211  * This value is chosen to accommodate traffic where there are a large
 212  * number of small packets. For data showing a typical distribution,
 213  * see:
 214  *
 215  * Sinha07a:
 216  *      Rishi Sinha, Christos Papadopoulos, and John
 217  *      Heidemann. Internet Packet Size Distributions: Some
 218  *      Observations. Technical Report ISI-TR-2007-643,
 219  *      USC/Information Sciences Institute, May, 2007. Originally
 220  *      released October 2005 as web page
 221  *      http://netweb.usc.edu/~sinha/pkt-sizes/.
 222  *      <http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>.
 223  */
 224 size_t xnf_rx_copy_limit = 64;
 225 
 226 #define INVALID_GRANT_HANDLE    ((grant_handle_t)-1)
 227 #define INVALID_GRANT_REF       ((grant_ref_t)-1)
 228 #define INVALID_TX_ID           ((uint16_t)-1)
 229 
 230 #define TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)]))
 231 #define TX_ID_VALID(i) \
 232         (((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))
 233 
 234 /*
 235  * calculate how many pages are spanned by an mblk fragment
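      *
      * A hypothetical example, assuming 4 KiB pages: a 64 byte fragment
      * whose b_rptr sits at offset 0xfe0 within a page has its last byte
      * at offset 0x1f of the following page and so spans 2 pages, while
      * the same 64 bytes starting at offset 0 span only 1.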
 236  */
 237 #define xnf_mblk_pages(mp)      (MBLKL(mp) == 0 ? 0 : \
 238     xnf_btop((uintptr_t)mp->b_wptr - 1) - xnf_btop((uintptr_t)mp->b_rptr) + 1)
 239 
 240 /* Required system entry points */
 241 static int      xnf_attach(dev_info_t *, ddi_attach_cmd_t);
 242 static int      xnf_detach(dev_info_t *, ddi_detach_cmd_t);
 243 
 244 /* Required driver entry points for Nemo */
 245 static int      xnf_start(void *);
 246 static void     xnf_stop(void *);
 247 static int      xnf_set_mac_addr(void *, const uint8_t *);
 248 static int      xnf_set_multicast(void *, boolean_t, const uint8_t *);
 249 static int      xnf_set_promiscuous(void *, boolean_t);
 250 static mblk_t   *xnf_send(void *, mblk_t *);
 251 static uint_t   xnf_intr(caddr_t);
 252 static int      xnf_stat(void *, uint_t, uint64_t *);
 253 static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
 254 static int xnf_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
 255 static int xnf_setprop(void *, const char *, mac_prop_id_t, uint_t,
 256     const void *);
 257 static void xnf_propinfo(void *, const char *, mac_prop_id_t,
 258     mac_prop_info_handle_t);
 259 
 260 /* Driver private functions */
 261 static int xnf_alloc_dma_resources(xnf_t *);
 262 static void xnf_release_dma_resources(xnf_t *);
 263 static void xnf_release_mblks(xnf_t *);
 264 
 265 static int xnf_buf_constructor(void *, void *, int);
 266 static void xnf_buf_destructor(void *, void *);
 267 static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t);
 268 #pragma inline(xnf_buf_get)
 269 static void xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t);
 270 #pragma inline(xnf_buf_put)
 271 static void xnf_buf_refresh(xnf_buf_t *);
 272 #pragma inline(xnf_buf_refresh)
 273 static void xnf_buf_recycle(xnf_buf_t *);
 274 
 275 static int xnf_tx_buf_constructor(void *, void *, int);
 276 static void xnf_tx_buf_destructor(void *, void *);
 277 
 278 static grant_ref_t xnf_gref_get(xnf_t *);
 279 #pragma inline(xnf_gref_get)
 280 static void xnf_gref_put(xnf_t *, grant_ref_t);
 281 #pragma inline(xnf_gref_put)
 282 
 283 static xnf_txid_t *xnf_txid_get(xnf_t *);
 284 #pragma inline(xnf_txid_get)
 285 static void xnf_txid_put(xnf_t *, xnf_txid_t *);
 286 #pragma inline(xnf_txid_put)
 287 
 288 static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *);
 289 static int xnf_tx_clean_ring(xnf_t  *);
 290 static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
 291     void *, void *);
 292 static boolean_t xnf_kstat_init(xnf_t *);
 293 static void xnf_rx_collect(xnf_t *);
 294 
 295 #define XNF_CALLBACK_FLAGS      (MC_GETCAPAB | MC_PROPERTIES)
 296 
 297 static mac_callbacks_t xnf_callbacks = {
 298         .mc_callbacks = XNF_CALLBACK_FLAGS,
 299         .mc_getstat = xnf_stat,
 300         .mc_start = xnf_start,
 301         .mc_stop = xnf_stop,
 302         .mc_setpromisc = xnf_set_promiscuous,
 303         .mc_multicst = xnf_set_multicast,
 304         .mc_unicst = xnf_set_mac_addr,
 305         .mc_tx = xnf_send,
 306         .mc_getcapab = xnf_getcapab,
 307         .mc_setprop = xnf_setprop,
 308         .mc_getprop = xnf_getprop,
 309         .mc_propinfo = xnf_propinfo,
 310 };
 311 
 312 /* DMA attributes for network ring buffer */
 313 static ddi_dma_attr_t ringbuf_dma_attr = {
 314         .dma_attr_version = DMA_ATTR_V0,
 315         .dma_attr_addr_lo = 0,
 316         .dma_attr_addr_hi = 0xffffffffffffffffULL,
 317         .dma_attr_count_max = 0x7fffffff,
 318         .dma_attr_align = MMU_PAGESIZE,
 319         .dma_attr_burstsizes = 0x7ff,
 320         .dma_attr_minxfer = 1,
 321         .dma_attr_maxxfer = 0xffffffffU,
 322         .dma_attr_seg = 0xffffffffffffffffULL,
 323         .dma_attr_sgllen = 1,
 324         .dma_attr_granular = 1,
 325         .dma_attr_flags = 0
 326 };
 327 
 328 /* DMA attributes for receive data */
 329 static ddi_dma_attr_t rx_buf_dma_attr = {
 330         .dma_attr_version = DMA_ATTR_V0,
 331         .dma_attr_addr_lo = 0,
 332         .dma_attr_addr_hi = 0xffffffffffffffffULL,
 333         .dma_attr_count_max = MMU_PAGEOFFSET,
 334         .dma_attr_align = MMU_PAGESIZE, /* allocation alignment */
 335         .dma_attr_burstsizes = 0x7ff,
 336         .dma_attr_minxfer = 1,
 337         .dma_attr_maxxfer = 0xffffffffU,
 338         .dma_attr_seg = 0xffffffffffffffffULL,
 339         .dma_attr_sgllen = 1,
 340         .dma_attr_granular = 1,
 341         .dma_attr_flags = 0
 342 };
 343 
 344 /* DMA attributes for transmit data */
 345 static ddi_dma_attr_t tx_buf_dma_attr = {
 346         .dma_attr_version = DMA_ATTR_V0,
 347         .dma_attr_addr_lo = 0,
 348         .dma_attr_addr_hi = 0xffffffffffffffffULL,
 349         .dma_attr_count_max = MMU_PAGEOFFSET,
 350         .dma_attr_align = 1,
 351         .dma_attr_burstsizes = 0x7ff,
 352         .dma_attr_minxfer = 1,
 353         .dma_attr_maxxfer = 0xffffffffU,
 354         .dma_attr_seg = XEN_DATA_BOUNDARY - 1, /* segment boundary */
 355         .dma_attr_sgllen = XEN_MAX_TX_DATA_PAGES, /* max number of segments */
 356         .dma_attr_granular = 1,
 357         .dma_attr_flags = 0
 358 };
 359 
 360 /* DMA access attributes for registers and descriptors */
 361 static ddi_device_acc_attr_t accattr = {
 362         DDI_DEVICE_ATTR_V0,
 363         DDI_STRUCTURE_LE_ACC,   /* This is a little-endian device */
 364         DDI_STRICTORDER_ACC
 365 };
 366 
 367 /* DMA access attributes for data: NOT to be byte swapped. */
 368 static ddi_device_acc_attr_t data_accattr = {
 369         DDI_DEVICE_ATTR_V0,
 370         DDI_NEVERSWAP_ACC,
 371         DDI_STRICTORDER_ACC
 372 };
 373 
 374 DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
 375     nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported);
 376 
 377 static struct modldrv xnf_modldrv = {
 378         &mod_driverops,
 379         "Virtual Ethernet driver",
 380         &xnf_dev_ops
 381 };
 382 
 383 static struct modlinkage modlinkage = {
 384         MODREV_1, &xnf_modldrv, NULL
 385 };
 386 
 387 int
 388 _init(void)
 389 {
 390         int r;
 391 
 392         mac_init_ops(&xnf_dev_ops, "xnf");
 393         r = mod_install(&modlinkage);
 394         if (r != DDI_SUCCESS)
 395                 mac_fini_ops(&xnf_dev_ops);
 396 
 397         return (r);
 398 }
 399 
 400 int
 401 _fini(void)
 402 {
 403         return (EBUSY); /* XXPV should be removable */
 404 }
 405 
 406 int
 407 _info(struct modinfo *modinfop)
 408 {
 409         return (mod_info(&modlinkage, modinfop));
 410 }
 411 
 412 /*
 413  * Acquire a grant reference.
 414  */
 415 static grant_ref_t
 416 xnf_gref_get(xnf_t *xnfp)
 417 {
 418         grant_ref_t gref;
 419 
 420         mutex_enter(&xnfp->xnf_gref_lock);
 421 
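             /*
              * Claim a reference from the free list; if the list is
              * exhausted, try to grow it by another batch of 16
              * references before giving up.
              */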
 422         do {
 423                 gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head);
 424 
 425         } while ((gref == INVALID_GRANT_REF) &&
 426             (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0));
 427 
 428         mutex_exit(&xnfp->xnf_gref_lock);
 429 
 430         if (gref == INVALID_GRANT_REF) {
 431                 xnfp->xnf_stat_gref_failure++;
 432         } else {
 433                 atomic_inc_64(&xnfp->xnf_stat_gref_outstanding);
 434                 if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak)
 435                         xnfp->xnf_stat_gref_peak =
 436                             xnfp->xnf_stat_gref_outstanding;
 437         }
 438 
 439         return (gref);
 440 }
 441 
 442 /*
 443  * Release a grant reference.
 444  */
 445 static void
 446 xnf_gref_put(xnf_t *xnfp, grant_ref_t gref)
 447 {
 448         ASSERT(gref != INVALID_GRANT_REF);
 449 
 450         mutex_enter(&xnfp->xnf_gref_lock);
 451         gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref);
 452         mutex_exit(&xnfp->xnf_gref_lock);
 453 
 454         atomic_dec_64(&xnfp->xnf_stat_gref_outstanding);
 455 }
 456 
 457 /*
 458  * Acquire a transmit id.
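      *
      * Free transmit ids form a singly-linked list threaded through the
      * xnf_tx_pkt_id[] array itself: xnf_tx_pkt_id_head indexes the
      * first free entry and each entry's `next' field indexes the one
      * after it, with INVALID_TX_ID terminating the list.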
 459  */
 460 static xnf_txid_t *
 461 xnf_txid_get(xnf_t *xnfp)
 462 {
 463         xnf_txid_t *tidp;
 464 
 465         ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
 466 
 467         if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID)
 468                 return (NULL);
 469 
 470         ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head));
 471 
 472         tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head);
 473         xnfp->xnf_tx_pkt_id_head = tidp->next;
 474         tidp->next = INVALID_TX_ID;
 475 
 476         ASSERT(tidp->txbuf == NULL);
 477 
 478         return (tidp);
 479 }
 480 
 481 /*
 482  * Release a transmit id.
 483  */
 484 static void
 485 xnf_txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
 486 {
 487         ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
 488         ASSERT(TX_ID_VALID(tidp->id));
 489         ASSERT(tidp->next == INVALID_TX_ID);
 490 
 491         tidp->txbuf = NULL;
 492         tidp->next = xnfp->xnf_tx_pkt_id_head;
 493         xnfp->xnf_tx_pkt_id_head = tidp->id;
 494 }
 495 
 496 static void
 497 xnf_data_txbuf_free(xnf_t *xnfp, xnf_txbuf_t *txp)
 498 {
 499         ASSERT3U(txp->tx_type, ==, TX_DATA);
 500 
 501         /*
 502          * We are either using a lookaside buffer or we are mapping existing
 503          * buffers.
 504          */
 505         if (txp->tx_bdesc != NULL) {
 506                 ASSERT(!txp->tx_handle_bound);
 507                 xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE);
 508         } else {
 509                 if (txp->tx_txreq.gref != INVALID_GRANT_REF) {
 510                         if (gnttab_query_foreign_access(txp->tx_txreq.gref) !=
 511                             0) {
 512                                 cmn_err(CE_PANIC, "tx grant %d still in use by "
 513                                     "backend domain", txp->tx_txreq.gref);
 514                         }
 515                         (void) gnttab_end_foreign_access_ref(
 516                             txp->tx_txreq.gref, 1);
 517                         xnf_gref_put(xnfp, txp->tx_txreq.gref);
 518                 }
 519 
 520                 if (txp->tx_handle_bound)
 521                         (void) ddi_dma_unbind_handle(txp->tx_dma_handle);
 522         }
 523 
 524         if (txp->tx_mp != NULL)
 525                 freemsg(txp->tx_mp);
 526 
 527         if (txp->tx_prev != NULL) {
 528                 ASSERT3P(txp->tx_prev->tx_next, ==, txp);
 529                 txp->tx_prev->tx_next = NULL;
 530         }
 531 
 532         if (txp->tx_txreq.id != INVALID_TX_ID) {
 533                 /*
 534                  * This should only be possible when resuming from a suspend.
 535                  */
 536                 ASSERT(!xnfp->xnf_connected);
 537                 xnf_txid_put(xnfp, TX_ID_TO_TXID(xnfp, txp->tx_txreq.id));
 538                 txp->tx_txreq.id = INVALID_TX_ID;
 539         }
 540 
 541         kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
 542 }
 543 
 544 static void
 545 xnf_data_txbuf_free_chain(xnf_t *xnfp, xnf_txbuf_t *txp)
 546 {
 547         if (txp == NULL)
 548                 return;
 549 
 550         while (txp->tx_next != NULL)
 551                 txp = txp->tx_next;
 552 
 553         /*
 554          * We free the chain in reverse order so that grants can be released
 555          * for all dma chunks before unbinding the dma handles. The mblk is
 556          * freed last, after all its fragments' dma handles are unbound.
 557          */
 558         xnf_txbuf_t *prev;
 559         for (; txp != NULL; txp = prev) {
 560                 prev = txp->tx_prev;
 561                 xnf_data_txbuf_free(xnfp, txp);
 562         }
 563 }
 564 
 565 static xnf_txbuf_t *
 566 xnf_data_txbuf_alloc(xnf_t *xnfp)
 567 {
 568         xnf_txbuf_t *txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
 569         txp->tx_type = TX_DATA;
 570         txp->tx_next = NULL;
 571         txp->tx_prev = NULL;
 572         txp->tx_head = txp;
 573         txp->tx_frags_to_ack = 0;
 574         txp->tx_mp = NULL;
 575         txp->tx_bdesc = NULL;
 576         txp->tx_handle_bound = B_FALSE;
 577         txp->tx_txreq.gref = INVALID_GRANT_REF;
 578         txp->tx_txreq.id = INVALID_TX_ID;
 579 
 580         return (txp);
 581 }
 582 
 583 /*
 584  * Get `wanted' slots in the transmit ring, waiting for at least that
 585  * number if `wait' is B_TRUE. Force the ring to be cleaned by setting
 586  * `wanted' to zero.
 587  *
 588  * Return the number of slots available.
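      *
      * For example, the multicast control path below needs two
      * contiguous slots (a request plus its extra-info slot) and is
      * prepared to block until they are available:
      *
      *     n_slots = xnf_tx_slots_get(xnfp, 2, B_TRUE);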
 589  */
 590 static int
 591 xnf_tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait)
 592 {
 593         int slotsfree;
 594         boolean_t forced_clean = (wanted == 0);
 595 
 596         ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
 597 
 598         /* LINTED: constant in conditional context */
 599         while (B_TRUE) {
 600                 slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring);
 601 
 602                 if ((slotsfree < wanted) || forced_clean)
 603                         slotsfree = xnf_tx_clean_ring(xnfp);
 604 
 605                 /*
 606                  * If there are more than we need free, tell other
 607                  * people to come looking again. We hold txlock, so we
 608                  * are able to take our slots before anyone else runs.
 609                  */
 610                 if (slotsfree > wanted)
 611                         cv_broadcast(&xnfp->xnf_cv_tx_slots);
 612 
 613                 if (slotsfree >= wanted)
 614                         break;
 615 
 616                 if (!wait)
 617                         break;
 618 
 619                 cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock);
 620         }
 621 
 622         ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring)));
 623 
 624         return (slotsfree);
 625 }
 626 
 627 static int
 628 xnf_setup_rings(xnf_t *xnfp)
 629 {
 630         domid_t                 oeid;
 631         struct xenbus_device    *xsd;
 632         RING_IDX                i;
 633         int                     err;
 634         xnf_txid_t              *tidp;
 635         xnf_buf_t **bdescp;
 636 
 637         oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
 638         xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
 639 
 640         if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
 641                 gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
 642 
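             /*
              * Grant the backend domain (oeid) access to the page holding
              * the tx ring; on success the return value is the grant
              * reference that is later advertised to the backend via the
              * xenstore "tx-ring-ref" key.
              */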
 643         err = gnttab_grant_foreign_access(oeid,
 644             xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0);
 645         if (err <= 0) {
 646                 err = -err;
 647                 xenbus_dev_error(xsd, err, "granting access to tx ring page");
 648                 goto out;
 649         }
 650         xnfp->xnf_tx_ring_ref = (grant_ref_t)err;
 651 
 652         if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
 653                 gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
 654 
 655         err = gnttab_grant_foreign_access(oeid,
 656             xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0);
 657         if (err <= 0) {
 658                 err = -err;
 659                 xenbus_dev_error(xsd, err, "granting access to rx ring page");
 660                 goto out;
 661         }
 662         xnfp->xnf_rx_ring_ref = (grant_ref_t)err;
 663 
 664         mutex_enter(&xnfp->xnf_txlock);
 665 
 666         /*
 667          * We first clean up the TX ring in case we are doing a resume.
 668          * Note that this can lose packets, but we expect to stagger on.
 669          */
 670         xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */
 671         for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
 672             i < NET_TX_RING_SIZE;
 673             i++, tidp++) {
 674                 xnf_txbuf_t *txp = tidp->txbuf;
 675                 if (txp == NULL)
 676                         continue;
 677 
 678                 switch (txp->tx_type) {
 679                 case TX_DATA:
 680                         /*
 681                          * txid_put() will be called for each txbuf's txid in
 682                          * the chain which will result in clearing tidp->txbuf.
 683                          */
 684                         xnf_data_txbuf_free_chain(xnfp, txp);
 685 
 686                         break;
 687 
 688                 case TX_MCAST_REQ:
 689                         txp->tx_type = TX_MCAST_RSP;
 690                         txp->tx_status = NETIF_RSP_DROPPED;
 691                         cv_broadcast(&xnfp->xnf_cv_multicast);
 692 
 693                         /*
 694                          * The request consumed two slots in the ring,
 695                          * yet only a single xnf_txid_t is used. Step
 696                          * over the empty slot.
 697                          */
 698                         i++;
 699                         ASSERT3U(i, <, NET_TX_RING_SIZE);
 700                         break;
 701 
 702                 case TX_MCAST_RSP:
 703                         break;
 704                 }
 705         }
 706 
 707         /*
 708          * Now purge old list and add each txid to the new free list.
 709          */
 710         xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */
 711         for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
 712             i < NET_TX_RING_SIZE;
 713             i++, tidp++) {
 714                 tidp->id = i;
 715                 ASSERT3P(tidp->txbuf, ==, NULL);
 716                 tidp->next = INVALID_TX_ID; /* Appease txid_put(). */
 717                 xnf_txid_put(xnfp, tidp);
 718         }
 719 
 720         /* LINTED: constant in conditional context */
 721         SHARED_RING_INIT(xnfp->xnf_tx_ring.sring);
 722         /* LINTED: constant in conditional context */
 723         FRONT_RING_INIT(&xnfp->xnf_tx_ring,
 724             xnfp->xnf_tx_ring.sring, PAGESIZE);
 725 
 726         mutex_exit(&xnfp->xnf_txlock);
 727 
 728         mutex_enter(&xnfp->xnf_rxlock);
 729 
 730         /*
 731          * Clean out any buffers currently posted to the receive ring
 732          * before we reset it.
 733          */
 734         for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0];
 735             i < NET_RX_RING_SIZE;
 736             i++, bdescp++) {
 737                 if (*bdescp != NULL) {
 738                         xnf_buf_put(xnfp, *bdescp, B_FALSE);
 739                         *bdescp = NULL;
 740                 }
 741         }
 742 
 743         /* LINTED: constant in conditional context */
 744         SHARED_RING_INIT(xnfp->xnf_rx_ring.sring);
 745         /* LINTED: constant in conditional context */
 746         FRONT_RING_INIT(&xnfp->xnf_rx_ring,
 747             xnfp->xnf_rx_ring.sring, PAGESIZE);
 748 
 749         /*
 750          * Fill the ring with buffers.
 751          */
 752         for (i = 0; i < NET_RX_RING_SIZE; i++) {
 753                 xnf_buf_t *bdesc;
 754 
 755                 bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE);
 756                 VERIFY(bdesc != NULL);
 757                 xnf_rxbuf_hang(xnfp, bdesc);
 758         }
 759 
 760         /* LINTED: constant in conditional context */
 761         RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);
 762 
 763         mutex_exit(&xnfp->xnf_rxlock);
 764 
 765         return (0);
 766 
 767 out:
 768         if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
 769                 gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
 770         xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
 771 
 772         if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
 773                 gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
 774         xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
 775 
 776         return (err);
 777 }
 778 
 779 /*
 780  * Connect driver to back end, called to set up communication with
 781  * back end driver both initially and on resume after restore/migrate.
 782  */
 783 void
 784 xnf_be_connect(xnf_t *xnfp)
 785 {
 786         const char      *message;
 787         xenbus_transaction_t xbt;
 788         struct          xenbus_device *xsd;
 789         char            *xsname;
 790         int             err;
 791 
 792         ASSERT(!xnfp->xnf_connected);
 793 
 794         xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
 795         xsname = xvdi_get_xsname(xnfp->xnf_devinfo);
 796 
 797         err = xnf_setup_rings(xnfp);
 798         if (err != 0) {
 799                 cmn_err(CE_WARN, "failed to set up tx/rx rings");
 800                 xenbus_dev_error(xsd, err, "setting up ring");
 801                 return;
 802         }
 803 
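             /*
              * The ring references, event channel and feature flags are
              * all published in a single xenbus transaction.  If the
              * transaction ends with EAGAIN it collided with a concurrent
              * xenstore update and is simply retried from the top.
              */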
 804 again:
 805         err = xenbus_transaction_start(&xbt);
 806         if (err != 0) {
 807                 xenbus_dev_error(xsd, EIO, "starting transaction");
 808                 return;
 809         }
 810 
 811         err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
 812             xnfp->xnf_tx_ring_ref);
 813         if (err != 0) {
 814                 message = "writing tx ring-ref";
 815                 goto abort_transaction;
 816         }
 817 
 818         err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
 819             xnfp->xnf_rx_ring_ref);
 820         if (err != 0) {
 821                 message = "writing rx ring-ref";
 822                 goto abort_transaction;
 823         }
 824 
 825         err = xenbus_printf(xbt, xsname, "event-channel", "%u",
 826             xnfp->xnf_evtchn);
 827         if (err != 0) {
 828                 message = "writing event-channel";
 829                 goto abort_transaction;
 830         }
 831 
 832         err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
 833         if (err != 0) {
 834                 message = "writing feature-rx-notify";
 835                 goto abort_transaction;
 836         }
 837 
 838         err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1);
 839         if (err != 0) {
 840                 message = "writing request-rx-copy";
 841                 goto abort_transaction;
 842         }
 843 
 844         if (xnfp->xnf_be_mcast_control) {
 845                 err = xenbus_printf(xbt, xsname, "request-multicast-control",
 846                     "%d", 1);
 847                 if (err != 0) {
 848                         message = "writing request-multicast-control";
 849                         goto abort_transaction;
 850                 }
 851         }
 852 
 853         /*
 854          * Tell backend if we support scatter-gather lists on the rx side.
 855          */
 856         err = xenbus_printf(xbt, xsname, "feature-sg", "%d",
 857             xnf_enable_rx_sg ? 1 : 0);
 858         if (err != 0) {
 859                 message = "writing feature-sg";
 860                 goto abort_transaction;
 861         }
 862 
 863         /*
 864          * Tell backend if we support LRO for IPv4. Scatter-gather on rx is
 865          * a prerequisite.
 866          */
 867         err = xenbus_printf(xbt, xsname, "feature-gso-tcpv4", "%d",
 868             (xnf_enable_rx_sg && xnf_enable_lro) ? 1 : 0);
 869         if (err != 0) {
 870                 message = "writing feature-gso-tcpv4";
 871                 goto abort_transaction;
 872         }
 873 
 874         err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected);
 875         if (err != 0) {
 876                 message = "switching state to XenbusStateConnected";
 877                 goto abort_transaction;
 878         }
 879 
 880         err = xenbus_transaction_end(xbt, 0);
 881         if (err != 0) {
 882                 if (err == EAGAIN)
 883                         goto again;
 884                 xenbus_dev_error(xsd, err, "completing transaction");
 885         }
 886 
 887         return;
 888 
 889 abort_transaction:
 890         (void) xenbus_transaction_end(xbt, 1);
 891         xenbus_dev_error(xsd, err, "%s", message);
 892 }
 893 
 894 /*
 895  * Read configuration information from xenstore.
 896  */
 897 void
 898 xnf_read_config(xnf_t *xnfp)
 899 {
 900         int err, be_cap;
 901         char mac[ETHERADDRL * 3];
 902         char *oename = xvdi_get_oename(xnfp->xnf_devinfo);
 903 
 904         err = xenbus_scanf(XBT_NULL, oename, "mac",
 905             "%s", (char *)&mac[0]);
 906         if (err != 0) {
 907                 /*
 908                  * bad: we're supposed to be set up with a proper mac
 909                  * address at this point
 910                  */
 911                 cmn_err(CE_WARN, "%s%d: no mac address",
 912                     ddi_driver_name(xnfp->xnf_devinfo),
 913                     ddi_get_instance(xnfp->xnf_devinfo));
 914                 return;
 915         }
 916         if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) {
 917                 err = ENOENT;
 918                 xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT,
 919                     "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo));
 920                 return;
 921         }
 922 
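             /*
              * Backend capabilities are advertised as keys in the other
              * end's (oename) xenstore directory; a key that cannot be
              * read is treated below as "feature not supported".
              */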
 923         err = xenbus_scanf(XBT_NULL, oename,
 924             "feature-rx-copy", "%d", &be_cap);
 925         /*
 926          * If we fail to read the store we assume that the key is
 927          * absent, implying an older domain at the far end.  Older
 928          * domains cannot do HV copy.
 929          */
 930         if (err != 0)
 931                 be_cap = 0;
 932         xnfp->xnf_be_rx_copy = (be_cap != 0);
 933 
 934         err = xenbus_scanf(XBT_NULL, oename,
 935             "feature-multicast-control", "%d", &be_cap);
 936         /*
 937          * If we fail to read the store we assume that the key is
 938          * absent, implying an older domain at the far end.  Older
 939          * domains do not support multicast control.
 940          */
 941         if (err != 0)
 942                 be_cap = 0;
 943         xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control;
 944 
 945         /*
 946          * See if back-end supports scatter-gather for transmits. If not,
 947          * we will not support LSO and limit the mtu to 1500.
 948          */
 949         err = xenbus_scanf(XBT_NULL, oename, "feature-sg", "%d", &be_cap);
 950         if (err != 0) {
 951                 be_cap = 0;
 952                 dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading "
 953                     "'feature-sg' from backend driver");
 954         }
 955         if (be_cap == 0) {
 956                 dev_err(xnfp->xnf_devinfo, CE_WARN, "scatter-gather is not "
 957                     "supported for transmits in the backend driver. LSO is "
 958                     "disabled and MTU is restricted to 1500 bytes.");
 959         }
 960         xnfp->xnf_be_tx_sg = (be_cap != 0) && xnf_enable_tx_sg;
 961 
 962         if (xnfp->xnf_be_tx_sg) {
 963                 /*
 964                  * Check if LSO is supported. Currently we only check for
 965                  * IPv4 as Illumos doesn't support LSO for IPv6.
 966                  */
 967                 err = xenbus_scanf(XBT_NULL, oename, "feature-gso-tcpv4", "%d",
 968                     &be_cap);
 969                 if (err != 0) {
 970                         be_cap = 0;
 971                         dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading "
 972                             "'feature-gso-tcpv4' from backend driver");
 973                 }
 974                 if (be_cap == 0) {
 975                         dev_err(xnfp->xnf_devinfo, CE_WARN, "LSO is not "
 976                             "supported by the backend driver. Performance "
 977                             "will be affected.");
 978                 }
 979                 xnfp->xnf_be_lso = (be_cap != 0) && xnf_enable_lso;
 980         }
 981 }
 982 
 983 /*
 984  *  attach(9E) -- Attach a device to the system
 985  */
 986 static int
 987 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
 988 {
 989         mac_register_t *macp;
 990         xnf_t *xnfp;
 991         int err;
 992         char cachename[32];
 993 
 994 #ifdef XNF_DEBUG
 995         if (xnf_debug & XNF_DEBUG_DDI)
 996                 printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo),
 997                     (void *)devinfo);
 998 #endif
 999 
1000         switch (cmd) {
1001         case DDI_RESUME:
1002                 xnfp = ddi_get_driver_private(devinfo);
1003                 xnfp->xnf_gen++;
1004 
1005                 (void) xvdi_resume(devinfo);
1006                 (void) xvdi_alloc_evtchn(devinfo);
1007                 xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
1008 #ifdef XPV_HVM_DRIVER
1009                 ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr,
1010                     xnfp);
1011 #else
1012                 (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
1013                     (caddr_t)xnfp);
1014 #endif
1015                 return (DDI_SUCCESS);
1016 
1017         case DDI_ATTACH:
1018                 break;
1019 
1020         default:
1021                 return (DDI_FAILURE);
1022         }
1023 
1024         /*
1025          *  Allocate mac_register_t and xnf_t structures
1026          */
1027         macp = mac_alloc(MAC_VERSION);
1028         if (macp == NULL)
1029                 return (DDI_FAILURE);
1030         xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
1031 
1032         xnfp->xnf_tx_pkt_id =
1033             kmem_zalloc(sizeof (xnf_txid_t) * NET_TX_RING_SIZE, KM_SLEEP);
1034 
1035         xnfp->xnf_rx_pkt_info =
1036             kmem_zalloc(sizeof (xnf_buf_t *) * NET_RX_RING_SIZE, KM_SLEEP);
1037 
1038         macp->m_dip = devinfo;
1039         macp->m_driver = xnfp;
1040         xnfp->xnf_devinfo = devinfo;
1041 
1042         macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1043         macp->m_src_addr = xnfp->xnf_mac_addr;
1044         macp->m_callbacks = &xnf_callbacks;
1045         macp->m_min_sdu = 0;
1046         xnfp->xnf_mtu = ETHERMTU;
1047         macp->m_max_sdu = xnfp->xnf_mtu;
1048 
1049         xnfp->xnf_running = B_FALSE;
1050         xnfp->xnf_connected = B_FALSE;
1051         xnfp->xnf_be_rx_copy = B_FALSE;
1052         xnfp->xnf_be_mcast_control = B_FALSE;
1053         xnfp->xnf_need_sched = B_FALSE;
1054 
1055         xnfp->xnf_rx_head = NULL;
1056         xnfp->xnf_rx_tail = NULL;
1057         xnfp->xnf_rx_new_buffers_posted = B_FALSE;
1058 
1059 #ifdef XPV_HVM_DRIVER
1060         /*
1061          * Report our version to dom0.
1062          */
1063         if (xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d",
1064             HVMPV_XNF_VERS))
1065                 cmn_err(CE_WARN, "xnf: couldn't write version\n");
1066 #endif
1067 
1068         /*
1069          * Get the iblock cookie with which to initialize the mutexes.
1070          */
1071         if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
1072             != DDI_SUCCESS)
1073                 goto failure;
1074 
1075         mutex_init(&xnfp->xnf_txlock,
1076             NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1077         mutex_init(&xnfp->xnf_rxlock,
1078             NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1079         mutex_init(&xnfp->xnf_schedlock,
1080             NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1081         mutex_init(&xnfp->xnf_gref_lock,
1082             NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1083 
1084         cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL);
1085         cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL);
1086         cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL);
1087 
1088         (void) sprintf(cachename, "xnf_buf_cache_%d",
1089             ddi_get_instance(devinfo));
1090         xnfp->xnf_buf_cache = kmem_cache_create(cachename,
1091             sizeof (xnf_buf_t), 0,
1092             xnf_buf_constructor, xnf_buf_destructor,
1093             NULL, xnfp, NULL, 0);
1094         if (xnfp->xnf_buf_cache == NULL)
1095                 goto failure_0;
1096 
1097         (void) sprintf(cachename, "xnf_tx_buf_cache_%d",
1098             ddi_get_instance(devinfo));
1099         xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename,
1100             sizeof (xnf_txbuf_t), 0,
1101             xnf_tx_buf_constructor, xnf_tx_buf_destructor,
1102             NULL, xnfp, NULL, 0);
1103         if (xnfp->xnf_tx_buf_cache == NULL)
1104                 goto failure_1;
1105 
1106         xnfp->xnf_gref_head = INVALID_GRANT_REF;
1107 
1108         if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
1109                 cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
1110                     "driver data structures",
1111                     ddi_get_instance(xnfp->xnf_devinfo));
1112                 goto failure_2;
1113         }
1114 
1115         xnfp->xnf_rx_ring.sring->rsp_event =
1116             xnfp->xnf_tx_ring.sring->rsp_event = 1;
1117 
1118         xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
1119         xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
1120 
1121         /* set driver private pointer now */
1122         ddi_set_driver_private(devinfo, xnfp);
1123 
1124         if (!xnf_kstat_init(xnfp))
1125                 goto failure_3;
1126 
1127         /*
1128          * Allocate an event channel, add the interrupt handler and
1129          * bind it to the event channel.
1130          */
1131         (void) xvdi_alloc_evtchn(devinfo);
1132         xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
1133 #ifdef XPV_HVM_DRIVER
1134         ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp);
1135 #else
1136         (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
1137 #endif
1138 
1139         err = mac_register(macp, &xnfp->xnf_mh);
1140         mac_free(macp);
1141         macp = NULL;
1142         if (err != 0)
1143                 goto failure_4;
1144 
1145         if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL)
1146             != DDI_SUCCESS)
1147                 goto failure_5;
1148 
1149 #ifdef XPV_HVM_DRIVER
1150         /*
1151          * In the HVM case, this driver essentially replaces a driver for
1152          * a 'real' PCI NIC. Without the "model" property set to
1153          * "Ethernet controller", like the PCI code does, netbooting does
1154          * not work correctly, as strplumb_get_netdev_path() will not find
1155          * this interface.
1156          */
1157         (void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model",
1158             "Ethernet controller");
1159 #endif
1160 
1161 #ifdef XNF_DEBUG
1162         if (xnf_debug_instance == NULL)
1163                 xnf_debug_instance = xnfp;
1164 #endif
1165 
1166         return (DDI_SUCCESS);
1167 
1168 failure_5:
1169         (void) mac_unregister(xnfp->xnf_mh);
1170 
1171 failure_4:
1172 #ifdef XPV_HVM_DRIVER
1173         ec_unbind_evtchn(xnfp->xnf_evtchn);
1174         xvdi_free_evtchn(devinfo);
1175 #else
1176         ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1177 #endif
1178         xnfp->xnf_evtchn = INVALID_EVTCHN;
1179         kstat_delete(xnfp->xnf_kstat_aux);
1180 
1181 failure_3:
1182         xnf_release_dma_resources(xnfp);
1183 
1184 failure_2:
1185         kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
1186 
1187 failure_1:
1188         kmem_cache_destroy(xnfp->xnf_buf_cache);
1189 
1190 failure_0:
1191         cv_destroy(&xnfp->xnf_cv_tx_slots);
1192         cv_destroy(&xnfp->xnf_cv_multicast);
1193         cv_destroy(&xnfp->xnf_cv_state);
1194 
1195         mutex_destroy(&xnfp->xnf_gref_lock);
1196         mutex_destroy(&xnfp->xnf_schedlock);
1197         mutex_destroy(&xnfp->xnf_rxlock);
1198         mutex_destroy(&xnfp->xnf_txlock);
1199 
1200 failure:
1201         kmem_free(xnfp, sizeof (*xnfp));
1202         if (macp != NULL)
1203                 mac_free(macp);
1204 
1205         return (DDI_FAILURE);
1206 }
1207 
1208 /*  detach(9E) -- Detach a device from the system */
1209 static int
1210 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
1211 {
1212         xnf_t *xnfp;            /* Our private device info */
1213 
1214 #ifdef XNF_DEBUG
1215         if (xnf_debug & XNF_DEBUG_DDI)
1216                 printf("xnf_detach(0x%p)\n", (void *)devinfo);
1217 #endif
1218 
1219         xnfp = ddi_get_driver_private(devinfo);
1220 
1221         switch (cmd) {
1222         case DDI_SUSPEND:
1223 #ifdef XPV_HVM_DRIVER
1224                 ec_unbind_evtchn(xnfp->xnf_evtchn);
1225                 xvdi_free_evtchn(devinfo);
1226 #else
1227                 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1228 #endif
1229 
1230                 xvdi_suspend(devinfo);
1231 
1232                 mutex_enter(&xnfp->xnf_rxlock);
1233                 mutex_enter(&xnfp->xnf_txlock);
1234 
1235                 xnfp->xnf_evtchn = INVALID_EVTCHN;
1236                 xnfp->xnf_connected = B_FALSE;
1237                 mutex_exit(&xnfp->xnf_txlock);
1238                 mutex_exit(&xnfp->xnf_rxlock);
1239 
1240                 /* claim link to be down after disconnect */
1241                 mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN);
1242                 return (DDI_SUCCESS);
1243 
1244         case DDI_DETACH:
1245                 break;
1246 
1247         default:
1248                 return (DDI_FAILURE);
1249         }
1250 
1251         if (xnfp->xnf_connected)
1252                 return (DDI_FAILURE);
1253 
1254         /*
1255          * Cannot detach if we have xnf_buf_t outstanding.
1256          */
1257         if (xnfp->xnf_stat_buf_allocated > 0)
1258                 return (DDI_FAILURE);
1259 
1260         if (mac_unregister(xnfp->xnf_mh) != 0)
1261                 return (DDI_FAILURE);
1262 
1263         kstat_delete(xnfp->xnf_kstat_aux);
1264 
1265         /* Stop the receiver */
1266         xnf_stop(xnfp);
1267 
1268         xvdi_remove_event_handler(devinfo, XS_OE_STATE);
1269 
1270         /* Remove the interrupt */
1271 #ifdef XPV_HVM_DRIVER
1272         ec_unbind_evtchn(xnfp->xnf_evtchn);
1273         xvdi_free_evtchn(devinfo);
1274 #else
1275         ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1276 #endif
1277 
1278         /* Release any pending xmit mblks */
1279         xnf_release_mblks(xnfp);
1280 
1281         /* Release all DMA resources */
1282         xnf_release_dma_resources(xnfp);
1283 
1284         cv_destroy(&xnfp->xnf_cv_tx_slots);
1285         cv_destroy(&xnfp->xnf_cv_multicast);
1286         cv_destroy(&xnfp->xnf_cv_state);
1287 
1288         kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
1289         kmem_cache_destroy(xnfp->xnf_buf_cache);
1290 
1291         mutex_destroy(&xnfp->xnf_gref_lock);
1292         mutex_destroy(&xnfp->xnf_schedlock);
1293         mutex_destroy(&xnfp->xnf_rxlock);
1294         mutex_destroy(&xnfp->xnf_txlock);
1295 
1296         kmem_free(xnfp, sizeof (*xnfp));
1297 
1298         return (DDI_SUCCESS);
1299 }
1300 
1301 /*
1302  *  xnf_set_mac_addr() -- set the physical network address on the board.
1303  */
1304 static int
1305 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
1306 {
1307         _NOTE(ARGUNUSED(arg, macaddr));
1308 
1309         /*
1310          * We can't set our macaddr.
1311          */
1312         return (ENOTSUP);
1313 }
1314 
1315 /*
1316  *  xnf_set_multicast() -- set (enable) or disable a multicast address.
1317  *
1318  *  Program the hardware to enable/disable the multicast address
1319  *  in "mca".  Enable if "add" is true, disable if false.
1320  */
1321 static int
1322 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
1323 {
1324         xnf_t *xnfp = arg;
1325         xnf_txbuf_t *txp;
1326         int n_slots;
1327         RING_IDX slot;
1328         xnf_txid_t *tidp;
1329         netif_tx_request_t *txrp;
1330         struct netif_extra_info *erp;
1331         boolean_t notify, result;
1332 
1333         /*
1334          * If the backend does not support multicast control then we
1335          * must assume that the right packets will just arrive.
1336          */
1337         if (!xnfp->xnf_be_mcast_control)
1338                 return (0);
1339 
1340         txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
1341 
1342         mutex_enter(&xnfp->xnf_txlock);
1343 
1344         /*
1345          * If we're not yet connected then claim success. This is
1346          * acceptable because we refresh the entire set of multicast
1347          * addresses when we get connected.
1348          *
1349          * We can't wait around here because the MAC layer expects
1350          * this to be a non-blocking operation - waiting ends up
1351          * causing a deadlock during resume.
1352          */
1353         if (!xnfp->xnf_connected) {
1354                 mutex_exit(&xnfp->xnf_txlock);
1355                 return (0);
1356         }
1357 
1358         /*
1359          * 1. Acquire two slots in the ring.
1360          * 2. Fill in the slots.
1361          * 3. Request notification when the operation is done.
1362          * 4. Kick the peer.
1363          * 5. Wait for the response via xnf_tx_clean_ring().
1364          */
1365 
1366         n_slots = xnf_tx_slots_get(xnfp, 2, B_TRUE);
1367         ASSERT(n_slots >= 2);
1368 
1369         slot = xnfp->xnf_tx_ring.req_prod_pvt;
1370         tidp = xnf_txid_get(xnfp);
1371         VERIFY(tidp != NULL);
1372 
1373         txp->tx_type = TX_MCAST_REQ;
1374         txp->tx_slot = slot;
1375 
1376         txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1377         erp = (struct netif_extra_info *)
1378             RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1);
1379 
1380         txrp->gref = 0;
1381         txrp->size = 0;
1382         txrp->offset = 0;
1383         /* Set tx_txreq.id to appease xnf_tx_clean_ring(). */
1384         txrp->id = txp->tx_txreq.id = tidp->id;
1385         txrp->flags = NETTXF_extra_info;
1386 
1387         erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD :
1388             XEN_NETIF_EXTRA_TYPE_MCAST_DEL;
1389         bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL);
1390 
1391         tidp->txbuf = txp;
1392 
1393         xnfp->xnf_tx_ring.req_prod_pvt = slot + 2;
1394 
1395         mutex_enter(&xnfp->xnf_schedlock);
1396         xnfp->xnf_pending_multicast++;
1397         mutex_exit(&xnfp->xnf_schedlock);
1398 
1399         /* LINTED: constant in conditional context */
1400         RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
1401             notify);
1402         if (notify)
1403                 ec_notify_via_evtchn(xnfp->xnf_evtchn);
1404 
1405         while (txp->tx_type == TX_MCAST_REQ)
1406                 cv_wait(&xnfp->xnf_cv_multicast, &xnfp->xnf_txlock);
1407 
1408         ASSERT3U(txp->tx_type, ==, TX_MCAST_RSP);
1409 
1410         mutex_enter(&xnfp->xnf_schedlock);
1411         xnfp->xnf_pending_multicast--;
1412         mutex_exit(&xnfp->xnf_schedlock);
1413 
1414         result = (txp->tx_status == NETIF_RSP_OKAY);
1415 
1416         xnf_txid_put(xnfp, tidp);
1417 
1418         mutex_exit(&xnfp->xnf_txlock);
1419 
1420         kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1421 
1422         return (result ? 0 : 1);
1423 }
1424 
1425 /*
1426  * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
1427  *
1428  *  Program the hardware to enable/disable promiscuous mode.
1429  */
1430 static int
1431 xnf_set_promiscuous(void *arg, boolean_t on)
1432 {
1433         _NOTE(ARGUNUSED(arg, on));
1434 
1435         /*
1436          * We can't really do this, but we pretend that we can in
1437          * order that snoop will work.
1438          */
1439         return (0);
1440 }
1441 
1442 /*
1443  * Clean buffers that we have responses for from the transmit ring.
1444  */
1445 static int
1446 xnf_tx_clean_ring(xnf_t *xnfp)
1447 {
1448         boolean_t work_to_do;
1449 
1450         ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1451 
1452 loop:
1453         while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) {
1454                 RING_IDX cons, prod, i;
1455 
1456                 cons = xnfp->xnf_tx_ring.rsp_cons;
1457                 prod = xnfp->xnf_tx_ring.sring->rsp_prod;
1458                 membar_consumer();
1459                 /*
1460                  * Clean tx requests from ring that we have responses
1461                  * for.
1462                  */
1463                 DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod);
1464                 for (i = cons; i != prod; i++) {
1465                         netif_tx_response_t *trp;
1466                         xnf_txid_t *tidp;
1467                         xnf_txbuf_t *txp;
1468 
1469                         trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i);
1470                         /*
1471                          * if this slot was occupied by netif_extra_info_t,
1472                          * then the response will be NETIF_RSP_NULL. In this
1473                          * case there are no resources to clean up.
1474                          */
1475                         if (trp->status == NETIF_RSP_NULL)
1476                                 continue;
1477 
1478                         ASSERT(TX_ID_VALID(trp->id));
1479 
1480                         tidp = TX_ID_TO_TXID(xnfp, trp->id);
1481                         ASSERT3U(tidp->id, ==, trp->id);
1482                         ASSERT3U(tidp->next, ==, INVALID_TX_ID);
1483 
1484                         txp = tidp->txbuf;
1485                         ASSERT(txp != NULL);
1486                         ASSERT3U(txp->tx_txreq.id, ==, trp->id);
1487 
1488                         switch (txp->tx_type) {
1489                         case TX_DATA:
1490                                 /*
1491                                  * We must put the txid for each response we
1492                                  * acknowledge to make sure that we never have
1493                                  * more free slots than txids. Because of this
1494                                  * we do it here instead of waiting for it to
1495                                  * be done in xnf_data_txbuf_free_chain().
1496                                  */
1497                                 xnf_txid_put(xnfp, tidp);
1498                                 txp->tx_txreq.id = INVALID_TX_ID;
1499                                 ASSERT3S(txp->tx_head->tx_frags_to_ack, >, 0);
1500                                 txp->tx_head->tx_frags_to_ack--;
1501 
1502                                 /*
1503                                  * We clean the whole chain once we have
1504                                  * received a response for each fragment.
1505                                  */
1506                                 if (txp->tx_head->tx_frags_to_ack == 0)
1507                                         xnf_data_txbuf_free_chain(xnfp, txp);
1508 
1509                                 break;
1510 
1511                         case TX_MCAST_REQ:
1512                                 txp->tx_type = TX_MCAST_RSP;
1513                                 txp->tx_status = trp->status;
1514                                 cv_broadcast(&xnfp->xnf_cv_multicast);
1515 
1516                                 break;
1517 
1518                         default:
1519                                 cmn_err(CE_PANIC, "xnf_tx_clean_ring: "
1520                                     "invalid xnf_txbuf_t type: %d",
1521                                     txp->tx_type);
1522                                 break;
1523                         }
1524                 }
1525                 /*
1526                  * Record the last response we dealt with so that we
1527                  * know where to start next time around.
1528                  */
1529                 xnfp->xnf_tx_ring.rsp_cons = prod;
1530                 membar_enter();
1531         }
1532 
1533         /* LINTED: constant in conditional context */
1534         RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do);
1535         if (work_to_do)
1536                 goto loop;
1537 
1538         return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring));
1539 }
1540 
1541 /*
1542  * Allocate and fill in a look-aside buffer for the packet `mp'. Used
1543  * to ensure that the packet is physically contiguous and contained
1544  * within a single page.
1545  */
1546 static xnf_buf_t *
1547 xnf_tx_get_lookaside(xnf_t *xnfp, mblk_t *mp, size_t *plen)
1548 {
1549         xnf_buf_t *bd;
1550         caddr_t bp;
1551 
1552         bd = xnf_buf_get(xnfp, KM_SLEEP, B_TRUE);
1553         if (bd == NULL)
1554                 return (NULL);
1555 
1556         bp = bd->buf;
1557         while (mp != NULL) {
1558                 size_t len = MBLKL(mp);
1559 
1560                 bcopy(mp->b_rptr, bp, len);
1561                 bp += len;
1562 
1563                 mp = mp->b_cont;
1564         }
1565 
1566         *plen = bp - bd->buf;
1567         ASSERT3U(*plen, <=, PAGESIZE);
1568 
1569         xnfp->xnf_stat_tx_lookaside++;
1570 
1571         return (bd);
1572 }
1573 
1574 /*
1575  * Insert the pseudo-header checksum into the packet.
1576  * Assumes the packet is IPv4 and TCP or UDP, since we only advertised
1577  * support for HCKSUM_INET_FULL_V4.
1578  */
1579 int
1580 xnf_pseudo_cksum(mblk_t *mp)
1581 {
1582         struct ether_header *ehp;
1583         uint16_t sap, iplen, *stuff;
1584         uint32_t cksum;
1585         size_t len;
1586         ipha_t *ipha;
1587         ipaddr_t src, dst;
1588         uchar_t *ptr;
1589 
1590         ptr = mp->b_rptr;
1591         len = MBLKL(mp);
1592 
1593         /* Each header must fit completely in an mblk. */
1594         ASSERT3U(len, >=, sizeof (*ehp));
1595 
1596         ehp = (struct ether_header *)ptr;
1597 
1598         if (ntohs(ehp->ether_type) == VLAN_TPID) {
1599                 struct ether_vlan_header *evhp;
1600                 ASSERT3U(len, >=, sizeof (*evhp));
1601                 evhp = (struct ether_vlan_header *)ptr;
1602                 sap = ntohs(evhp->ether_type);
1603                 ptr += sizeof (*evhp);
1604                 len -= sizeof (*evhp);
1605         } else {
1606                 sap = ntohs(ehp->ether_type);
1607                 ptr += sizeof (*ehp);
1608                 len -= sizeof (*ehp);
1609         }
1610 
1611         ASSERT3U(sap, ==, ETHERTYPE_IP);
1612 
1613         /*
1614          * Ethernet and IP headers may be in different mblks.
1615          */
1616         ASSERT3P(ptr, <=, mp->b_wptr);
1617         if (ptr == mp->b_wptr) {
1618                 mp = mp->b_cont;
1619                 ptr = mp->b_rptr;
1620                 len = MBLKL(mp);
1621         }
1622 
1623         ASSERT3U(len, >=, sizeof (ipha_t));
1624         ipha = (ipha_t *)ptr;
1625 
1626         /*
1627          * We assume the IP header has no options. (This is enforced in
1628          * ire_send_wire_v4() -- search for IXAF_NO_HW_CKSUM).
1629          */
1630         ASSERT3U(IPH_HDR_LENGTH(ipha), ==, IP_SIMPLE_HDR_LENGTH);
1631         iplen = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
1632 
1633         ptr += IP_SIMPLE_HDR_LENGTH;
1634         len -= IP_SIMPLE_HDR_LENGTH;
1635 
1636         /*
1637          * IP and L4 headers may be in different mblks.
1638          */
1639         ASSERT3P(ptr, <=, mp->b_wptr);
1640         if (ptr == mp->b_wptr) {
1641                 mp = mp->b_cont;
1642                 ptr = mp->b_rptr;
1643                 len = MBLKL(mp);
1644         }
1645 
1646         switch (ipha->ipha_protocol) {
1647         case IPPROTO_TCP:
1648                 ASSERT3U(len, >=, sizeof (tcph_t));
1649                 stuff = (uint16_t *)(ptr + TCP_CHECKSUM_OFFSET);
1650                 cksum = IP_TCP_CSUM_COMP;
1651                 break;
1652         case IPPROTO_UDP:
1653                 ASSERT3U(len, >=, sizeof (struct udphdr));
1654                 stuff = (uint16_t *)(ptr + UDP_CHECKSUM_OFFSET);
1655                 cksum = IP_UDP_CSUM_COMP;
1656                 break;
1657         default:
1658                 cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d",
1659                     ipha->ipha_protocol);
1660                 return (EINVAL);
1661         }
1662 
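             /*
              * Fold in the remaining pseudo-header fields: the source and
              * destination addresses and the L4 length. The IP_*_CSUM_COMP
              * starting value chosen above covers the protocol field.
              */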
1663         src = ipha->ipha_src;
1664         dst = ipha->ipha_dst;
1665 
1666         cksum += (dst >> 16) + (dst & 0xFFFF);
1667         cksum += (src >> 16) + (src & 0xFFFF);
1668         cksum += htons(iplen);
1669 
1670         cksum = (cksum >> 16) + (cksum & 0xFFFF);
1671         cksum = (cksum >> 16) + (cksum & 0xFFFF);
1672 
1673         ASSERT(cksum <= 0xFFFF);
1674 
1675         *stuff = (uint16_t)(cksum ? cksum : ~cksum);
1676 
1677         return (0);
1678 }
1679 
1680 /*
1681  * Push a packet into the transmit ring.
1682  *
1683  * Note: the format of a tx packet that spans multiple slots is similar to
1684  * what is described in xnf_rx_one_packet().
1685  */
1686 static void
1687 xnf_tx_push_packet(xnf_t *xnfp, xnf_txbuf_t *head)
1688 {
1689         int nslots = 0;
1690         int extras = 0;
1691         RING_IDX slot;
1692         boolean_t notify;
1693 
1694         ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1695         ASSERT(xnfp->xnf_running);
1696 
1697         slot = xnfp->xnf_tx_ring.req_prod_pvt;
1698 
1699         /*
1700          * The caller has already checked that we have enough slots to proceed.
1701          */
1702         for (xnf_txbuf_t *txp = head; txp != NULL; txp = txp->tx_next) {
1703                 xnf_txid_t *tidp;
1704                 netif_tx_request_t *txrp;
1705 
1706                 tidp = xnf_txid_get(xnfp);
1707                 VERIFY(tidp != NULL);
1708                 txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1709 
1710                 txp->tx_slot = slot;
1711                 txp->tx_txreq.id = tidp->id;
1712                 *txrp = txp->tx_txreq;
1713 
1714                 tidp->txbuf = txp;
1715                 slot++;
1716                 nslots++;
1717 
1718                 /*
1719                  * When present, LSO info is placed in a slot after the first
1720                  * data segment, and doesn't require a txid.
1721                  */
1722                 if (txp->tx_txreq.flags & NETTXF_extra_info) {
1723                         netif_extra_info_t *extra;
1724                         ASSERT3U(nslots, ==, 1);
1725 
1726                         extra = (netif_extra_info_t *)
1727                             RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1728                         *extra = txp->tx_extra;
1729                         slot++;
1730                         nslots++;
1731                         extras = 1;
1732                 }
1733         }
1734 
1735         ASSERT3U(nslots, <=, XEN_MAX_SLOTS_PER_TX);
1736 
1737         /*
1738          * Store the number of data fragments.
1739          */
1740         head->tx_frags_to_ack = nslots - extras;
1741 
1742         xnfp->xnf_tx_ring.req_prod_pvt = slot;
1743 
1744         /*
1745          * Tell the peer that we sent something, if it cares.
1746          */
1747         /* LINTED: constant in conditional context */
1748         RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, notify);
1749         if (notify)
1750                 ec_notify_via_evtchn(xnfp->xnf_evtchn);
1751 }
1752 
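     /*
      * Copy the packet `mp' into a single look-aside buffer and prepare a
      * txbuf describing it. Used when the packet must be presented to the
      * backend as a single page-contained fragment.
      */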
1753 static xnf_txbuf_t *
1754 xnf_mblk_copy(xnf_t *xnfp, mblk_t *mp)
1755 {
1756         xnf_txbuf_t *txp = xnf_data_txbuf_alloc(xnfp);
1757         size_t length;
1758 
1759         txp->tx_bdesc = xnf_tx_get_lookaside(xnfp, mp, &length);
1760         if (txp->tx_bdesc == NULL) {
1761                 xnf_data_txbuf_free(xnfp, txp);
1762                 return (NULL);
1763         }
1764         txp->tx_mfn = txp->tx_bdesc->buf_mfn;
1765         txp->tx_txreq.gref = txp->tx_bdesc->grant_ref;
1766         txp->tx_txreq.size = length;
1767         txp->tx_txreq.offset = (uintptr_t)txp->tx_bdesc->buf & PAGEOFFSET;
1768         txp->tx_txreq.flags = 0;
1769 
1770         return (txp);
1771 }
1772 
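     /*
      * Map the data in `mp' for DMA, building a chain of txbufs with one
      * entry per DMA cookie. Each fragment is granted to the backend. On
      * success the number of fragments is returned via `countp'.
      */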
1773 static xnf_txbuf_t *
1774 xnf_mblk_map(xnf_t *xnfp, mblk_t *mp, int *countp)
1775 {
1776         xnf_txbuf_t *head = NULL;
1777         xnf_txbuf_t *tail = NULL;
1778         domid_t oeid;
1779         int nsegs = 0;
1780 
1781         oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1782 
1783         for (mblk_t *ml = mp; ml != NULL; ml = ml->b_cont) {
1784                 ddi_dma_handle_t dma_handle;
1785                 ddi_dma_cookie_t dma_cookie;
1786                 uint_t ncookies;
1787                 xnf_txbuf_t *txp;
1788 
1789                 if (MBLKL(ml) == 0)
1790                         continue;
1791 
1792                 txp = xnf_data_txbuf_alloc(xnfp);
1793 
1794                 if (head == NULL) {
1795                         head = txp;
1796                 } else {
1797                         ASSERT(tail != NULL);
1798                         TXBUF_SETNEXT(tail, txp);
1799                         txp->tx_head = head;
1800                 }
1801 
1802                 /*
1803                  * The necessary segmentation rules (e.g. not crossing a page
1804                  * boundary) are enforced by the dma attributes of the handle.
1805                  */
1806                 dma_handle = txp->tx_dma_handle;
1807                 int ret = ddi_dma_addr_bind_handle(dma_handle,
1808                     NULL, (char *)ml->b_rptr, MBLKL(ml),
1809                     DDI_DMA_WRITE | DDI_DMA_STREAMING,
1810                     DDI_DMA_DONTWAIT, 0, &dma_cookie,
1811                     &ncookies);
1812                 if (ret != DDI_DMA_MAPPED) {
1813                         if (ret != DDI_DMA_NORESOURCES) {
1814                                 dev_err(xnfp->xnf_devinfo, CE_WARN,
1815                                     "ddi_dma_addr_bind_handle() failed "
1816                                     "[dma_error=%d]", ret);
1817                         }
1818                         goto error;
1819                 }
1820                 txp->tx_handle_bound = B_TRUE;
1821 
1822                 ASSERT(ncookies > 0);
1823                 for (int i = 0; i < ncookies; i++) {
1824                         if (nsegs == XEN_MAX_TX_DATA_PAGES) {
1825                                 dev_err(xnfp->xnf_devinfo, CE_WARN,
1826                                     "xnf_mblk_map() failed: "
1827                                     "too many segments");
1828                                 goto error;
1829                         }
1830                         if (i > 0) {
1831                                 txp = xnf_data_txbuf_alloc(xnfp);
1832                                 ASSERT(tail != NULL);
1833                                 TXBUF_SETNEXT(tail, txp);
1834                                 txp->tx_head = head;
1835                         }
1836 
1837                         txp->tx_mfn =
1838                             xnf_btop(pa_to_ma(dma_cookie.dmac_laddress));
1839                         txp->tx_txreq.gref = xnf_gref_get(xnfp);
1840                         if (txp->tx_txreq.gref == INVALID_GRANT_REF) {
1841                                 dev_err(xnfp->xnf_devinfo, CE_WARN,
1842                                     "xnf_mblk_map() failed: "
1843                                     "invalid grant ref");
1844                                 goto error;
1845                         }
1846                         gnttab_grant_foreign_access_ref(txp->tx_txreq.gref,
1847                             oeid, txp->tx_mfn, 1);
1848                         txp->tx_txreq.offset =
1849                             dma_cookie.dmac_laddress & PAGEOFFSET;
1850                         txp->tx_txreq.size = dma_cookie.dmac_size;
1851                         txp->tx_txreq.flags = 0;
1852 
1853                         ddi_dma_nextcookie(dma_handle, &dma_cookie);
1854                         nsegs++;
1855 
1856                         if (tail != NULL)
1857                                 tail->tx_txreq.flags = NETTXF_more_data;
1858                         tail = txp;
1859                 }
1860         }
1861 
1862         *countp = nsegs;
1863         return (head);
1864 
1865 error:
1866         xnf_data_txbuf_free_chain(xnfp, head);
1867         return (NULL);
1868 }
1869 
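     /*
      * Record the requested hardware offloads (LSO or checksum offload)
      * in the first request of the chain headed by `head'.
      */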
1870 static void
1871 xnf_tx_setup_offload(xnf_t *xnfp, xnf_txbuf_t *head,
1872     uint32_t cksum_flags, uint32_t lso_flags, uint32_t mss)
1873 {
1874         if (lso_flags != 0) {
1875                 ASSERT3U(lso_flags, ==, HW_LSO);
1876                 ASSERT3P(head->tx_bdesc, ==, NULL);
1877 
1878                 head->tx_txreq.flags |= NETTXF_extra_info;
1879                 netif_extra_info_t *extra = &head->tx_extra;
1880                 extra->type = XEN_NETIF_EXTRA_TYPE_GSO;
1881                 extra->flags = 0;
1882                 extra->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
1883                 extra->u.gso.size = mss;
1884                 extra->u.gso.features = 0;
1885                 extra->u.gso.pad = 0;
1886         } else if (cksum_flags != 0) {
1887                 ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM);
1888                 /*
1889                  * If the local protocol stack requests checksum
1890                  * offload we set the 'checksum blank' flag,
1891                  * indicating to the peer that we need the checksum
1892                  * calculated for us.
1893                  *
1894                  * We _don't_ set the validated flag, because we haven't
1895                  * validated that the data and the checksum match.
1896                  *
1897                  * Note: we already called xnf_pseudo_cksum() in
1898                  * xnf_send(), so we just set the txreq flag here.
1899                  */
1900                 head->tx_txreq.flags |= NETTXF_csum_blank;
1901                 xnfp->xnf_stat_tx_cksum_deferred++;
1902         }
1903 }
1904 
1905 /*
1906  * Send packet mp. Called by the MAC framework.
1907  */
1908 static mblk_t *
1909 xnf_send(void *arg, mblk_t *mp)
1910 {
1911         xnf_t *xnfp = arg;
1912         xnf_txbuf_t *head;
1913         mblk_t *ml;
1914         int length;
1915         int pages, chunks, slots, slots_free;
1916         uint32_t cksum_flags, lso_flags, mss;
1917         boolean_t pulledup = B_FALSE;
1918         boolean_t force_copy = B_FALSE;
1919 
1920         ASSERT3P(mp->b_next, ==, NULL);
1921 
1922         mutex_enter(&xnfp->xnf_txlock);
1923 
1924         /*
1925          * Wait until we are connected to the backend.
1926          */
1927         while (!xnfp->xnf_connected)
1928                 cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock);
1929 
1930         /*
1931          * To simplify logic and be in sync with the rescheduling mechanism,
1932          * we require the maximum number of slots that could be used by a
1933          * transaction to be free before proceeding. The only downside of doing
1934          * this is that it slightly reduces the effective size of the ring.
1935          */
1936         slots_free = xnf_tx_slots_get(xnfp, XEN_MAX_SLOTS_PER_TX, B_FALSE);
1937         if (slots_free < XEN_MAX_SLOTS_PER_TX) {
1938                 /*
1939                  * We need to ask for a re-schedule later as the ring is full.
1940                  */
1941                 mutex_enter(&xnfp->xnf_schedlock);
1942                 xnfp->xnf_need_sched = B_TRUE;
1943                 mutex_exit(&xnfp->xnf_schedlock);
1944 
1945                 xnfp->xnf_stat_tx_defer++;
1946                 mutex_exit(&xnfp->xnf_txlock);
1947                 return (mp);
1948         }
1949 
1950         /*
1951          * Get hw offload parameters.
1952          * This must be done before pulling up the mp as those parameters
1953          * are not copied over.
1954          */
1955         mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &cksum_flags);
1956         mac_lso_get(mp, &mss, &lso_flags);
1957 
1958         /*
1959          * XXX: fix MAC framework so that we can advertise support for
1960          * partial checksum for IPv4 only. This way we won't need to calculate
1961          * the pseudo header checksum ourselves.
1962          */
1963         if (cksum_flags != 0) {
1964                 ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM);
1965                 (void) xnf_pseudo_cksum(mp);
1966         }
1967 
1968 pulledup:
1969         for (ml = mp, pages = 0, chunks = 0, length = 0; ml != NULL;
1970             ml = ml->b_cont, chunks++) {
1971                 pages += xnf_mblk_pages(ml);
1972                 length += MBLKL(ml);
1973         }
1974         DTRACE_PROBE3(packet, int, length, int, chunks, int, pages);
1975         DTRACE_PROBE3(lso, int, length, uint32_t, lso_flags, uint32_t, mss);
1976 
1977         /*
1978          * If the ethernet header crosses a page boundary the packet
1979          * will be dropped by the backend. In practice it seems like
1980          * this happens fairly rarely so we'll do nothing unless the
1981          * packet is small enough to fit in a look-aside buffer.
1982          */
1983         if (((uintptr_t)mp->b_rptr & PAGEOFFSET) +
1984             sizeof (struct ether_header) > PAGESIZE) {
1985                 xnfp->xnf_stat_tx_eth_hdr_split++;
1986                 if (length <= PAGESIZE)
1987                         force_copy = B_TRUE;
1988         }
1989 
1990         if (force_copy || (pages > 1 && !xnfp->xnf_be_tx_sg)) {
1991                 /*
1992                  * If the packet spans several pages and scatter-gather is not
1993                  * supported then use a look-aside buffer.
1994                  */
1995                 ASSERT3U(length, <=, PAGESIZE);
1996                 head = xnf_mblk_copy(xnfp, mp);
1997                 if (head == NULL) {
1998                         dev_err(xnfp->xnf_devinfo, CE_WARN,
1999                             "xnf_mblk_copy() failed");
2000                         goto drop;
2001                 }
2002         } else {
2003                 /*
2004                  * There's a limit on how many pages can be passed to the
2005                  * backend. If we exceed that limit, the packet will be dropped
2006                  * and some backend implementations (e.g. Linux) could even
2007                  * offline the interface.
2008                  */
2009                 if (pages > XEN_MAX_TX_DATA_PAGES) {
2010                         if (pulledup) {
2011                                 dev_err(xnfp->xnf_devinfo, CE_WARN,
2012                                     "too many pages, even after pullup: %d.",
2013                                     pages);
2014                                 goto drop;
2015                         }
2016 
2017                         /*
2018                          * Defragment packet if it spans too many pages.
2019                          */
2020                         mblk_t *newmp = msgpullup(mp, -1);
2021                         freemsg(mp);
2022                         mp = newmp;
                             if (mp == NULL)
                                     goto drop;
2023                         xnfp->xnf_stat_tx_pullup++;
2024                         pulledup = B_TRUE;
2025                         goto pulledup;
2026                 }
2027 
2028                 head = xnf_mblk_map(xnfp, mp, &slots);
2029                 if (head == NULL)
2030                         goto drop;
2031 
2032                 IMPLY(slots > 1, xnfp->xnf_be_tx_sg);
2033         }
2034 
2035         /*
2036          * Set tx_mp so that the mblk is freed when the txbuf chain is freed.
2037          */
2038         head->tx_mp = mp;
2039 
2040         xnf_tx_setup_offload(xnfp, head, cksum_flags, lso_flags, mss);
2041 
2042         /*
2043          * The first request must store the total length of the packet.
2044          */
2045         head->tx_txreq.size = length;
2046 
2047         /*
2048          * Push the packet we have prepared into the ring.
2049          */
2050         xnf_tx_push_packet(xnfp, head);
2051         xnfp->xnf_stat_opackets++;
2052         xnfp->xnf_stat_obytes += length;
2053 
2054         mutex_exit(&xnfp->xnf_txlock);
2055         return (NULL);
2056 
2057 drop:
2058         freemsg(mp);
2059         xnfp->xnf_stat_tx_drop++;
2060         mutex_exit(&xnfp->xnf_txlock);
2061         return (NULL);
2062 }
2063 
2064 /*
2065  * Notification of RX packets. Currently no TX-complete interrupt is
2066  * used, as we clean the TX ring lazily.
2067  */
2068 static uint_t
2069 xnf_intr(caddr_t arg)
2070 {
2071         xnf_t *xnfp = (xnf_t *)arg;
2072         mblk_t *mp;
2073         boolean_t need_sched, clean_ring;
2074 
2075         mutex_enter(&xnfp->xnf_rxlock);
2076 
2077         /*
2078          * Interrupts before we are connected are spurious.
2079          */
2080         if (!xnfp->xnf_connected) {
2081                 mutex_exit(&xnfp->xnf_rxlock);
2082                 xnfp->xnf_stat_unclaimed_interrupts++;
2083                 return (DDI_INTR_UNCLAIMED);
2084         }
2085 
2086         /*
2087          * Receive side processing.
2088          */
2089         do {
2090                 /*
2091                  * Collect buffers from the ring.
2092                  */
2093                 xnf_rx_collect(xnfp);
2094 
2095                 /*
2096                  * Interrupt me when the next receive buffer is consumed.
2097                  */
2098                 xnfp->xnf_rx_ring.sring->rsp_event =
2099                     xnfp->xnf_rx_ring.rsp_cons + 1;
2100                 xen_mb();
2101 
2102         } while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring));
2103 
2104         if (xnfp->xnf_rx_new_buffers_posted) {
2105                 boolean_t notify;
2106 
2107                 /*
2108                  * Indicate to the peer that we have re-filled the
2109                  * receive ring, if it cares.
2110                  */
2111                 /* LINTED: constant in conditional context */
2112                 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
2113                 if (notify)
2114                         ec_notify_via_evtchn(xnfp->xnf_evtchn);
2115                 xnfp->xnf_rx_new_buffers_posted = B_FALSE;
2116         }
2117 
2118         mp = xnfp->xnf_rx_head;
2119         xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL;
2120 
2121         xnfp->xnf_stat_interrupts++;
2122         mutex_exit(&xnfp->xnf_rxlock);
2123 
2124         if (mp != NULL)
2125                 mac_rx(xnfp->xnf_mh, NULL, mp);
2126 
2127         /*
2128          * Transmit side processing.
2129          *
2130          * If a previous transmit attempt failed or we have pending
2131          * multicast requests, clean the ring.
2132          *
2133          * If we previously stalled transmission and cleaning produces
2134          * some free slots, tell upstream to attempt sending again.
2135          *
2136          * The odd style is to avoid acquiring xnf_txlock unless we
2137          * will actually look inside the tx machinery.
2138          */
2139         mutex_enter(&xnfp->xnf_schedlock);
2140         need_sched = xnfp->xnf_need_sched;
2141         clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0);
2142         mutex_exit(&xnfp->xnf_schedlock);
2143 
2144         if (clean_ring) {
2145                 int free_slots;
2146 
2147                 mutex_enter(&xnfp->xnf_txlock);
2148                 free_slots = xnf_tx_slots_get(xnfp, 0, B_FALSE);
2149 
2150                 if (need_sched && (free_slots >= XEN_MAX_SLOTS_PER_TX)) {
2151                         mutex_enter(&xnfp->xnf_schedlock);
2152                         xnfp->xnf_need_sched = B_FALSE;
2153                         mutex_exit(&xnfp->xnf_schedlock);
2154 
2155                         mac_tx_update(xnfp->xnf_mh);
2156                 }
2157                 mutex_exit(&xnfp->xnf_txlock);
2158         }
2159 
2160         return (DDI_INTR_CLAIMED);
2161 }
2162 
2163 /*
2164  *  xnf_start() -- mark the interface running; allow packets to flow.
2165  */
2166 static int
2167 xnf_start(void *arg)
2168 {
2169         xnf_t *xnfp = arg;
2170 
2171 #ifdef XNF_DEBUG
2172         if (xnf_debug & XNF_DEBUG_TRACE)
2173                 printf("xnf%d start(0x%p)\n",
2174                     ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
2175 #endif
2176 
2177         mutex_enter(&xnfp->xnf_rxlock);
2178         mutex_enter(&xnfp->xnf_txlock);
2179 
2180         /* Accept packets from above. */
2181         xnfp->xnf_running = B_TRUE;
2182 
2183         mutex_exit(&xnfp->xnf_txlock);
2184         mutex_exit(&xnfp->xnf_rxlock);
2185 
2186         return (0);
2187 }
2188 
2189 /* xnf_stop() -- mark the interface as no longer running. */
2190 static void
2191 xnf_stop(void *arg)
2192 {
2193         xnf_t *xnfp = arg;
2194 
2195 #ifdef XNF_DEBUG
2196         if (xnf_debug & XNF_DEBUG_TRACE)
2197                 printf("xnf%d stop(0x%p)\n",
2198                     ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
2199 #endif
2200 
2201         mutex_enter(&xnfp->xnf_rxlock);
2202         mutex_enter(&xnfp->xnf_txlock);
2203 
2204         xnfp->xnf_running = B_FALSE;
2205 
2206         mutex_exit(&xnfp->xnf_txlock);
2207         mutex_exit(&xnfp->xnf_rxlock);
2208 }
2209 
2210 /*
2211  * Hang buffer `bdesc' on the RX ring.
2212  */
2213 static void
2214 xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc)
2215 {
2216         netif_rx_request_t *reqp;
2217         RING_IDX hang_ix;
2218 
2219         ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
2220 
2221         reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
2222             xnfp->xnf_rx_ring.req_prod_pvt);
2223         hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
2224         ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL);
2225 
2226         reqp->id = bdesc->id = hang_ix;
2227         reqp->gref = bdesc->grant_ref;
2228 
2229         xnfp->xnf_rx_pkt_info[hang_ix] = bdesc;
2230         xnfp->xnf_rx_ring.req_prod_pvt++;
2231 
2232         xnfp->xnf_rx_new_buffers_posted = B_TRUE;
2233 }
2234 
2235 /*
2236  * Receive an entire packet from the ring, starting from slot *consp.
2237  * prod indicates the slot of the latest response.
2238  * On return, *consp will point to the head of the next packet.
2239  *
2240  * Note: If slot prod was reached before we could gather a full packet, we will
2241  * drop the partial packet; this would most likely indicate a bug in either
2242  * the front-end or the back-end driver.
2243  *
2244  * An rx packet can consist of several fragments and thus span multiple slots.
2245  * Each fragment can contain up to 4k of data.
2246  *
2247  * A typical 9000 MTU packet will look like this:
2248  * +------+---------------------+-------------------+-----------------------+
2249  * | SLOT | TYPE                | CONTENTS          | FLAGS                 |
2250  * +------+---------------------+-------------------+-----------------------+
2251  * | 1    | netif_rx_response_t | 1st data fragment | more_data             |
2252  * +------+---------------------+-------------------+-----------------------+
2253  * | 2    | netif_rx_response_t | 2nd data fragment | more_data             |
2254  * +------+---------------------+-------------------+-----------------------+
2255  * | 3    | netif_rx_response_t | 3rd data fragment | [none]                |
2256  * +------+---------------------+-------------------+-----------------------+
2257  *
2258  * Fragments are chained by setting NETRXF_more_data in the previous
2259  * response's flags. If there are additional flags, such as
2260  * NETRXF_data_validated or NETRXF_extra_info, those should be set on the
2261  * first fragment.
2262  *
2263  * Sometimes extra info can be present. If so, it will follow the first
2264  * fragment, and NETRXF_extra_info will be set on the first response.
2265  * If LRO applies to a packet, its details are carried in the extra info.
2266  * Conforming to the spec, extra info entries can also be chained, but they
2267  * must all appear immediately after the first fragment.
2268  *
2269  * Example of a packet with 2 extra infos:
2270  * +------+---------------------+-------------------+-----------------------+
2271  * | SLOT | TYPE                | CONTENTS          | FLAGS                 |
2272  * +------+---------------------+-------------------+-----------------------+
2273  * | 1    | netif_rx_response_t | 1st data fragment | extra_info, more_data |
2274  * +------+---------------------+-------------------+-----------------------+
2275  * | 2    | netif_extra_info_t  | 1st extra info    | EXTRA_FLAG_MORE       |
2276  * +------+---------------------+-------------------+-----------------------+
2277  * | 3    | netif_extra_info_t  | 2nd extra info    | [none]                |
2278  * +------+---------------------+-------------------+-----------------------+
2279  * | 4    | netif_rx_response_t | 2nd data fragment | more_data             |
2280  * +------+---------------------+-------------------+-----------------------+
2281  * | 5    | netif_rx_response_t | 3rd data fragment | more_data             |
2282  * +------+---------------------+-------------------+-----------------------+
2283  * | 6    | netif_rx_response_t | 4th data fragment | [none]                |
2284  * +------+---------------------+-------------------+-----------------------+
2285  *
2286  * In practice, the only extra we expect is for LRO, but only if we advertise
2287  * that we support it to the backend (xnf_enable_lro == TRUE).
2288  */
2289 static int
2290 xnf_rx_one_packet(xnf_t *xnfp, RING_IDX prod, RING_IDX *consp, mblk_t **mpp)
2291 {
2292         mblk_t *head = NULL;
2293         mblk_t *tail = NULL;
2294         mblk_t *mp;
2295         int error = 0;
2296         RING_IDX cons = *consp;
2297         netif_extra_info_t lro;
2298         boolean_t is_lro = B_FALSE;
2299         boolean_t is_extra = B_FALSE;
2300 
2301         netif_rx_response_t rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons);
2302 
2303         boolean_t hwcsum = (rsp.flags & NETRXF_data_validated) != 0;
2304         boolean_t more_data = (rsp.flags & NETRXF_more_data) != 0;
2305         boolean_t more_extra = (rsp.flags & NETRXF_extra_info) != 0;
2306 
2307         IMPLY(more_data, xnf_enable_rx_sg);
2308 
2309         while (cons != prod) {
2310                 xnf_buf_t *bdesc;
2311                 int len, off;
2312                 int rxidx = cons & (NET_RX_RING_SIZE - 1);
2313 
2314                 bdesc = xnfp->xnf_rx_pkt_info[rxidx];
2315                 xnfp->xnf_rx_pkt_info[rxidx] = NULL;
2316 
2317                 if (is_extra) {
2318                         netif_extra_info_t *extra = (netif_extra_info_t *)&rsp;
2319                         /*
2320                          * The only extra we expect is for LRO, and it should
2321                          * only be present once.
2322                          */
2323                         if (extra->type == XEN_NETIF_EXTRA_TYPE_GSO &&
2324                             !is_lro) {
2325                                 ASSERT(xnf_enable_lro);
2326                                 lro = *extra;
2327                                 is_lro = B_TRUE;
2328                                 DTRACE_PROBE1(lro, netif_extra_info_t *, &lro);
2329                         } else {
2330                                 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx packet "
2331                                     "contains unexpected extra info of type %d",
2332                                     extra->type);
2333                                 error = EINVAL;
2334                         }
2335                         more_extra =
2336                             (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE) != 0;
2337 
2338                         goto hang_buf;
2339                 }
2340 
2341                 ASSERT3U(bdesc->id, ==, rsp.id);
2342 
2343                 /*
2344                  * status stores packet length when >= 0, or errors when < 0.
2345                  */
2346                 len = rsp.status;
2347                 off = rsp.offset;
2348                 more_data = (rsp.flags & NETRXF_more_data) != 0;
2349 
2350                 /*
2351                  * sanity checks.
2352                  */
2353                 if (!xnfp->xnf_running) {
2354                         error = EBUSY;
2355                 } else if (len <= 0) {
2356                         xnfp->xnf_stat_errrx++;
2357 
2358                         switch (len) {
2359                         case 0:
2360                                 xnfp->xnf_stat_runt++;
2361                                 break;
2362                         case NETIF_RSP_ERROR:
2363                                 xnfp->xnf_stat_mac_rcv_error++;
2364                                 break;
2365                         case NETIF_RSP_DROPPED:
2366                                 xnfp->xnf_stat_norxbuf++;
2367                                 break;
2368                         }
2369                         error = EINVAL;
2370                 } else if (bdesc->grant_ref == INVALID_GRANT_REF) {
2371                         dev_err(xnfp->xnf_devinfo, CE_WARN,
2372                             "Bad rx grant reference, rsp id %d", rsp.id);
2373                         error = EINVAL;
2374                 } else if ((off + len) > PAGESIZE) {
2375                         dev_err(xnfp->xnf_devinfo, CE_WARN, "Rx packet crosses "
2376                             "page boundary (offset %d, length %d)", off, len);
2377                         error = EINVAL;
2378                 }
2379 
2380                 if (error != 0) {
2381                         /*
2382                          * If an error has been detected, we do not attempt
2383                          * to read the data but we still need to replace
2384                          * the rx bufs.
2385                          */
2386                         goto hang_buf;
2387                 }
2388 
2389                 xnf_buf_t *nbuf = NULL;
2390 
2391                 /*
2392                  * If the packet is below a pre-determined size we will
2393                  * copy data out of the buf rather than replace it.
2394                  */
2395                 if (len > xnf_rx_copy_limit)
2396                         nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE);
2397 
2398                 if (nbuf != NULL) {
2399                         mp = desballoc((unsigned char *)bdesc->buf,
2400                             bdesc->len, 0, &bdesc->free_rtn);
2401 
2402                         if (mp == NULL) {
2403                                 xnfp->xnf_stat_rx_desballoc_fail++;
2404                                 xnfp->xnf_stat_norxbuf++;
2405                                 error = ENOMEM;
2406                                 /*
2407                                  * we free the buf we just allocated as we
2408                                  * will re-hang the old buf.
2409                                  */
2410                                 xnf_buf_put(xnfp, nbuf, B_FALSE);
2411                                 goto hang_buf;
2412                         }
2413 
2414                         mp->b_rptr = mp->b_rptr + off;
2415                         mp->b_wptr = mp->b_rptr + len;
2416 
2417                         /*
2418                          * Release the grant as the backend doesn't need to
2419                          * access this buffer anymore and grants are scarce.
2420                          */
2421                         (void) gnttab_end_foreign_access_ref(bdesc->grant_ref,
2422                             0);
2423                         xnf_gref_put(xnfp, bdesc->grant_ref);
2424                         bdesc->grant_ref = INVALID_GRANT_REF;
2425 
2426                         bdesc = nbuf;
2427                 } else {
2428                         /*
2429                          * We failed to allocate a new buf or decided to reuse
2430                          * the old one. In either case we copy the data off it
2431                          * and put it back into the ring.
2432                          */
2433                         mp = allocb(len, 0);
2434                         if (mp == NULL) {
2435                                 xnfp->xnf_stat_rx_allocb_fail++;
2436                                 xnfp->xnf_stat_norxbuf++;
2437                                 error = ENOMEM;
2438                                 goto hang_buf;
2439                         }
2440                         bcopy(bdesc->buf + off, mp->b_wptr, len);
2441                         mp->b_wptr += len;
2442                 }
2443 
2444                 if (head == NULL)
2445                         head = mp;
2446                 else
2447                         tail->b_cont = mp;
2448                 tail = mp;
2449 
2450 hang_buf:
2451                 /*
2452                  * No matter what happens, for each response we need to hang
2453                  * a buf back on the rx ring: either the old one, or a new one
2454                  * if the old one was loaned to the kernel via desballoc().
2455                  */
2456                 xnf_rxbuf_hang(xnfp, bdesc);
2457                 cons++;
2458 
2459                 /* next response is an extra */
2460                 is_extra = more_extra;
2461 
2462                 if (!more_data && !more_extra)
2463                         break;
2464 
2465                 /*
2466                  * Note that since requests and responses are union'd on the
2467                  * same ring, we copy the response to a local variable instead
2468                  * of keeping a pointer. Otherwise xnf_rxbuf_hang() would have
2469                  * overwritten the contents of rsp.
2470                  */
2471                 rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons);
2472         }
2473 
2474         /*
2475          * Check that we do not get stuck in a loop.
2476          */
2477         ASSERT3U(*consp, !=, cons);
2478         *consp = cons;
2479 
2480         /*
2481          * We ran out of responses but the flags indicate there is more data.
2482          */
2483         if (more_data) {
2484                 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments.");
2485                 error = EINVAL;
2486         }
2487         if (more_extra) {
2488                 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments "
2489                     "(extras).");
2490                 error = EINVAL;
2491         }
2492 
2493         /*
2494          * An error means the packet must be dropped. If we have already formed
2495          * a partial packet, then discard it.
2496          */
2497         if (error != 0) {
2498                 if (head != NULL)
2499                         freemsg(head);
2500                 xnfp->xnf_stat_rx_drop++;
2501                 return (error);
2502         }
2503 
2504         ASSERT(head != NULL);
2505 
2506         if (hwcsum) {
2507                 /*
2508                  * If the peer says that the data has been validated then we
2509                  * declare that the full checksum has been verified.
2510                  *
2511                  * We don't look at the "checksum blank" flag, and hence could
2512                  * have a packet here that we are asserting is good with
2513                  * a blank checksum.
2514                  */
2515                 mac_hcksum_set(head, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
2516                 xnfp->xnf_stat_rx_cksum_no_need++;
2517         }
2518 
2519         /* XXX: set lro info for packet once LRO is supported in OS. */
2520 
2521         *mpp = head;
2522 
2523         return (0);
2524 }
2525 
2526 /*
2527  * Collect packets from the RX ring, storing them in `xnfp' for later use.
2528  */
2529 static void
2530 xnf_rx_collect(xnf_t *xnfp)
2531 {
2532         RING_IDX prod;
2533 
2534         ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
2535 
2536         prod = xnfp->xnf_rx_ring.sring->rsp_prod;
2537         /*
2538          * Ensure we see queued responses up to 'prod'.
2539          */
2540         membar_consumer();
2541 
2542         while (xnfp->xnf_rx_ring.rsp_cons != prod) {
2543                 mblk_t *mp;
2544 
2545                 /*
2546                  * Collect a packet.
2547                  * rsp_cons is updated inside xnf_rx_one_packet().
2548                  */
2549                 int error = xnf_rx_one_packet(xnfp, prod,
2550                     &xnfp->xnf_rx_ring.rsp_cons, &mp);
2551                 if (error == 0) {
2552                         xnfp->xnf_stat_ipackets++;
2553                         xnfp->xnf_stat_rbytes += xmsgsize(mp);
2554 
2555                         /*
2556                          * Append the mblk to the rx list.
2557                          */
2558                         if (xnfp->xnf_rx_head == NULL) {
2559                                 ASSERT3P(xnfp->xnf_rx_tail, ==, NULL);
2560                                 xnfp->xnf_rx_head = mp;
2561                         } else {
2562                                 ASSERT(xnfp->xnf_rx_tail != NULL);
2563                                 xnfp->xnf_rx_tail->b_next = mp;
2564                         }
2565                         xnfp->xnf_rx_tail = mp;
2566                 }
2567         }
2568 }
2569 
2570 /*
2571  *  xnf_alloc_dma_resources() -- allocate the descriptor ring DMA resources.
2572  */
2573 static int
2574 xnf_alloc_dma_resources(xnf_t *xnfp)
2575 {
2576         dev_info_t              *devinfo = xnfp->xnf_devinfo;
2577         size_t                  len;
2578         ddi_dma_cookie_t        dma_cookie;
2579         uint_t                  ncookies;
2580         int                     rc;
2581         caddr_t                 rptr;
2582 
2583         /*
2584          * The code below allocates all the DMA data structures that
2585          * need to be released when the driver is detached.
2586          *
2587  * Allocate a page for the transmit descriptor ring.
2588          */
2589         if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2590             DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS)
2591                 goto alloc_error;
2592 
2593         if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle,
2594             PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2595             DDI_DMA_SLEEP, 0, &rptr, &len,
2596             &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) {
2597                 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2598                 xnfp->xnf_tx_ring_dma_handle = NULL;
2599                 goto alloc_error;
2600         }
2601 
2602         if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL,
2603             rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2604             DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2605                 ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2606                 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2607                 xnfp->xnf_tx_ring_dma_handle = NULL;
2608                 xnfp->xnf_tx_ring_dma_acchandle = NULL;
2609                 if (rc == DDI_DMA_NORESOURCES)
2610                         goto alloc_error;
2611                 else
2612                         goto error;
2613         }
2614 
2615         ASSERT(ncookies == 1);
2616         bzero(rptr, PAGESIZE);
2617         /* LINTED: constant in conditional context */
2618         SHARED_RING_INIT((netif_tx_sring_t *)rptr);
2619         /* LINTED: constant in conditional context */
2620         FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
2621         xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress;
2622 
2623         /*
2624  * Allocate a page for the receive descriptor ring.
2625          */
2626         if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2627             DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS)
2628                 goto alloc_error;
2629 
2630         if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle,
2631             PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2632             DDI_DMA_SLEEP, 0, &rptr, &len,
2633             &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) {
2634                 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2635                 xnfp->xnf_rx_ring_dma_handle = NULL;
2636                 goto alloc_error;
2637         }
2638 
2639         if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL,
2640             rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2641             DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2642                 ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2643                 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2644                 xnfp->xnf_rx_ring_dma_handle = NULL;
2645                 xnfp->xnf_rx_ring_dma_acchandle = NULL;
2646                 if (rc == DDI_DMA_NORESOURCES)
2647                         goto alloc_error;
2648                 else
2649                         goto error;
2650         }
2651 
2652         ASSERT(ncookies == 1);
2653         bzero(rptr, PAGESIZE);
2654         /* LINTED: constant in conditional context */
2655         SHARED_RING_INIT((netif_rx_sring_t *)rptr);
2656         /* LINTED: constant in conditional context */
2657         FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
2658         xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;
2659 
2660         return (DDI_SUCCESS);
2661 
2662 alloc_error:
2663         cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
2664             ddi_get_instance(xnfp->xnf_devinfo));
2665 error:
2666         xnf_release_dma_resources(xnfp);
2667         return (DDI_FAILURE);
2668 }
2669 
2670 /*
2671  * Release all DMA resources in the opposite order from acquisition.
2672  */
2673 static void
2674 xnf_release_dma_resources(xnf_t *xnfp)
2675 {
2676         int i;
2677 
2678         /*
2679          * Free receive buffers which are currently associated with
2680          * descriptors.
2681          */
2682         mutex_enter(&xnfp->xnf_rxlock);
2683         for (i = 0; i < NET_RX_RING_SIZE; i++) {
2684                 xnf_buf_t *bp;
2685 
2686                 if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL)
2687                         continue;
2688                 xnfp->xnf_rx_pkt_info[i] = NULL;
2689                 xnf_buf_put(xnfp, bp, B_FALSE);
2690         }
2691         mutex_exit(&xnfp->xnf_rxlock);
2692 
2693         /* Free the receive ring buffer. */
2694         if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
2695                 (void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
2696                 ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2697                 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2698                 xnfp->xnf_rx_ring_dma_acchandle = NULL;
2699         }
2700         /* Free the transmit ring buffer. */
2701         if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
2702                 (void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
2703                 ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2704                 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2705                 xnfp->xnf_tx_ring_dma_acchandle = NULL;
2706         }
2707 
2708 }
2709 
2710 /*
2711  * Release any packets and associated structures used by the TX ring.
2712  */
2713 static void
2714 xnf_release_mblks(xnf_t *xnfp)
2715 {
2716         RING_IDX i;
2717         xnf_txid_t *tidp;
2718 
2719         for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
2720             i < NET_TX_RING_SIZE;
2721             i++, tidp++) {
2722                 xnf_txbuf_t *txp = tidp->txbuf;
2723 
2724                 if (txp != NULL) {
2725                         ASSERT(txp->tx_mp != NULL);
2726                         freemsg(txp->tx_mp);
2727 
2728                         xnf_txid_put(xnfp, tidp);
2729                         kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
2730                 }
2731         }
2732 }
2733 
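     /*
      * kmem cache constructor for xnf_buf_t: allocate and bind a page of
      * DMA-able memory and cache its machine frame number so that it can
      * later be granted to the backend.
      */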
2734 static int
2735 xnf_buf_constructor(void *buf, void *arg, int kmflag)
2736 {
2737         int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2738         xnf_buf_t *bdesc = buf;
2739         xnf_t *xnfp = arg;
2740         ddi_dma_cookie_t dma_cookie;
2741         uint_t ncookies;
2742         size_t len;
2743 
2744         if (kmflag & KM_NOSLEEP)
2745                 ddiflags = DDI_DMA_DONTWAIT;
2746 
2747         /* Allocate a DMA access handle for the buffer. */
2748         if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buf_dma_attr,
2749             ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2750                 goto failure;
2751 
2752         /* Allocate DMA-able memory for buffer. */
2753         if (ddi_dma_mem_alloc(bdesc->dma_handle,
2754             PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0,
2755             &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2756                 goto failure_1;
2757 
2758         /* Bind to virtual address of buffer to get physical address. */
2759         if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
2760             bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING,
2761             ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
2762                 goto failure_2;
2763         ASSERT(ncookies == 1);
2764 
2765         bdesc->free_rtn.free_func = xnf_buf_recycle;
2766         bdesc->free_rtn.free_arg = (caddr_t)bdesc;
2767         bdesc->xnfp = xnfp;
2768         bdesc->buf_phys = dma_cookie.dmac_laddress;
2769         bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2770         bdesc->len = dma_cookie.dmac_size;
2771         bdesc->grant_ref = INVALID_GRANT_REF;
2772         bdesc->gen = xnfp->xnf_gen;
2773 
2774         atomic_inc_64(&xnfp->xnf_stat_buf_allocated);
2775 
2776         return (0);
2777 
2778 failure_2:
2779         ddi_dma_mem_free(&bdesc->acc_handle);
2780 
2781 failure_1:
2782         ddi_dma_free_handle(&bdesc->dma_handle);
2783 
2784 failure:
2785 
2786         ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
2787         return (-1);
2788 }
2789 
2790 static void
2791 xnf_buf_destructor(void *buf, void *arg)
2792 {
2793         xnf_buf_t *bdesc = buf;
2794         xnf_t *xnfp = arg;
2795 
2796         (void) ddi_dma_unbind_handle(bdesc->dma_handle);
2797         ddi_dma_mem_free(&bdesc->acc_handle);
2798         ddi_dma_free_handle(&bdesc->dma_handle);
2799 
2800         atomic_dec_64(&xnfp->xnf_stat_buf_allocated);
2801 }
2802 
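     /*
      * Allocate a buffer: acquire a grant reference, take a buffer from
      * the cache and grant the peer access to its page.
      */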
2803 static xnf_buf_t *
2804 xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly)
2805 {
2806         grant_ref_t gref;
2807         xnf_buf_t *bufp;
2808 
2809         /*
2810          * Usually grant references are more scarce than memory, so we
2811          * attempt to acquire a grant reference first.
2812          */
2813         gref = xnf_gref_get(xnfp);
2814         if (gref == INVALID_GRANT_REF)
2815                 return (NULL);
2816 
2817         bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags);
2818         if (bufp == NULL) {
2819                 xnf_gref_put(xnfp, gref);
2820                 return (NULL);
2821         }
2822 
2823         ASSERT3U(bufp->grant_ref, ==, INVALID_GRANT_REF);
2824 
2825         bufp->grant_ref = gref;
2826 
2827         if (bufp->gen != xnfp->xnf_gen)
2828                 xnf_buf_refresh(bufp);
2829 
2830         gnttab_grant_foreign_access_ref(bufp->grant_ref,
2831             xvdi_get_oeid(bufp->xnfp->xnf_devinfo),
2832             bufp->buf_mfn, readonly ? 1 : 0);
2833 
2834         atomic_inc_64(&xnfp->xnf_stat_buf_outstanding);
2835 
2836         return (bufp);
2837 }
2838 
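     /*
      * Free a buffer, first revoking the peer's access and returning the
      * grant reference.
      */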
2839 static void
2840 xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly)
2841 {
2842         if (bufp->grant_ref != INVALID_GRANT_REF) {
2843                 (void) gnttab_end_foreign_access_ref(
2844                     bufp->grant_ref, readonly ? 1 : 0);
2845                 xnf_gref_put(xnfp, bufp->grant_ref);
2846                 bufp->grant_ref = INVALID_GRANT_REF;
2847         }
2848 
2849         kmem_cache_free(xnfp->xnf_buf_cache, bufp);
2850 
2851         atomic_dec_64(&xnfp->xnf_stat_buf_outstanding);
2852 }
2853 
2854 /*
2855  * Refresh any cached data about a buffer after resume.
2856  */
2857 static void
2858 xnf_buf_refresh(xnf_buf_t *bdesc)
2859 {
2860         bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2861         bdesc->gen = bdesc->xnfp->xnf_gen;
2862 }
2863 
2864 /*
2865  * Streams `freeb' routine for `xnf_buf_t' when used as transmit
2866  * look-aside buffers.
2867  */
2868 static void
2869 xnf_buf_recycle(xnf_buf_t *bdesc)
2870 {
2871         xnf_t *xnfp = bdesc->xnfp;
2872 
2873         xnf_buf_put(xnfp, bdesc, B_TRUE);
2874 }
2875 
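     /*
      * kmem cache constructor for xnf_txbuf_t: pre-allocate the DMA handle
      * used to map packet data for transmit.
      */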
2876 static int
2877 xnf_tx_buf_constructor(void *buf, void *arg, int kmflag)
2878 {
2879         int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2880         xnf_txbuf_t *txp = buf;
2881         xnf_t *xnfp = arg;
2882 
2883         if (kmflag & KM_NOSLEEP)
2884                 ddiflags = DDI_DMA_DONTWAIT;
2885 
2886         if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buf_dma_attr,
2887             ddiflags, 0, &txp->tx_dma_handle) != DDI_SUCCESS) {
2888                 ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
2889                 return (-1);
2890         }
2891 
2892         return (0);
2893 }
2894 
2895 static void
2896 xnf_tx_buf_destructor(void *buf, void *arg)
2897 {
2898         _NOTE(ARGUNUSED(arg));
2899         xnf_txbuf_t *txp = buf;
2900 
2901         ddi_dma_free_handle(&txp->tx_dma_handle);
2902 }
2903 
2904 /*
2905  * Statistics.
2906  */
2907 static char *xnf_aux_statistics[] = {
2908         "tx_cksum_deferred",
2909         "rx_cksum_no_need",
2910         "interrupts",
2911         "unclaimed_interrupts",
2912         "tx_pullup",
2913         "tx_lookaside",
2914         "tx_drop",
2915         "tx_eth_hdr_split",
2916         "buf_allocated",
2917         "buf_outstanding",
2918         "gref_outstanding",
2919         "gref_failure",
2920         "gref_peak",
2921         "rx_allocb_fail",
2922         "rx_desballoc_fail",
2923 };
2924 
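     /*
      * kstat update callback for the auxiliary statistics; copies the
      * per-instance counters into ks_data in the order given by
      * xnf_aux_statistics.
      */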
2925 static int
2926 xnf_kstat_aux_update(kstat_t *ksp, int flag)
2927 {
2928         xnf_t *xnfp;
2929         kstat_named_t *knp;
2930 
2931         if (flag != KSTAT_READ)
2932                 return (EACCES);
2933 
2934         xnfp = ksp->ks_private;
2935         knp = ksp->ks_data;
2936 
2937         /*
2938          * Assignment order must match that of the names in
2939          * xnf_aux_statistics.
2940          */
2941         (knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred;
2942         (knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need;
2943 
2944         (knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
2945         (knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
2946         (knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
2947         (knp++)->value.ui64 = xnfp->xnf_stat_tx_lookaside;
2948         (knp++)->value.ui64 = xnfp->xnf_stat_tx_drop;
2949         (knp++)->value.ui64 = xnfp->xnf_stat_tx_eth_hdr_split;
2950 
2951         (knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated;
2952         (knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding;
2953         (knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding;
2954         (knp++)->value.ui64 = xnfp->xnf_stat_gref_failure;
2955         (knp++)->value.ui64 = xnfp->xnf_stat_gref_peak;
2956         (knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail;
2957         (knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail;
2958 
2959         return (0);
2960 }
2961 
2962 static boolean_t
2963 xnf_kstat_init(xnf_t *xnfp)
2964 {
2965         int nstat = sizeof (xnf_aux_statistics) /
2966             sizeof (xnf_aux_statistics[0]);
2967         char **cp = xnf_aux_statistics;
2968         kstat_named_t *knp;
2969 
2970         /*
2971          * Create and initialise kstats.
2972          */
2973         if ((xnfp->xnf_kstat_aux = kstat_create("xnf",
2974             ddi_get_instance(xnfp->xnf_devinfo),
2975             "aux_statistics", "net", KSTAT_TYPE_NAMED,
2976             nstat, 0)) == NULL)
2977                 return (B_FALSE);
2978 
2979         xnfp->xnf_kstat_aux->ks_private = xnfp;
2980         xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update;
2981 
2982         knp = xnfp->xnf_kstat_aux->ks_data;
2983         while (nstat > 0) {
2984                 kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
2985 
2986                 knp++;
2987                 cp++;
2988                 nstat--;
2989         }
2990 
2991         kstat_install(xnfp->xnf_kstat_aux);
2992 
2993         return (B_TRUE);
2994 }
2995 
2996 static int
2997 xnf_stat(void *arg, uint_t stat, uint64_t *val)
2998 {
2999         xnf_t *xnfp = arg;
3000 
3001         mutex_enter(&xnfp->xnf_rxlock);
3002         mutex_enter(&xnfp->xnf_txlock);
3003 
3004 #define mac_stat(q, r)                          \
3005         case (MAC_STAT_##q):                    \
3006                 *val = xnfp->xnf_stat_##r;   \
3007                 break
3008 
3009 #define ether_stat(q, r)                        \
3010         case (ETHER_STAT_##q):                  \
3011                 *val = xnfp->xnf_stat_##r;   \
3012                 break
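
             /*
              * For example, mac_stat(IPACKETS, ipackets) expands to:
              *
              *      case (MAC_STAT_IPACKETS):
              *              *val = xnfp->xnf_stat_ipackets;
              *              break;
              */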
3013 
3014         switch (stat) {
3015 
3016         mac_stat(IPACKETS, ipackets);
3017         mac_stat(OPACKETS, opackets);
3018         mac_stat(RBYTES, rbytes);
3019         mac_stat(OBYTES, obytes);
3020         mac_stat(NORCVBUF, norxbuf);
3021         mac_stat(IERRORS, errrx);
3022         mac_stat(NOXMTBUF, tx_defer);
3023 
3024         ether_stat(MACRCV_ERRORS, mac_rcv_error);
3025         ether_stat(TOOSHORT_ERRORS, runt);
3026 
3027         /* always claim to be in full duplex mode */
3028         case ETHER_STAT_LINK_DUPLEX:
3029                 *val = LINK_DUPLEX_FULL;
3030                 break;
3031 
3032         /* always claim to be at 1Gb/s link speed */
3033         case MAC_STAT_IFSPEED:
3034                 *val = 1000000000ull;
3035                 break;
3036 
3037         default:
3038                 mutex_exit(&xnfp->xnf_txlock);
3039                 mutex_exit(&xnfp->xnf_rxlock);
3040 
3041                 return (ENOTSUP);
3042         }
3043 
3044 #undef mac_stat
3045 #undef ether_stat
3046 
3047         mutex_exit(&xnfp->xnf_txlock);
3048         mutex_exit(&xnfp->xnf_rxlock);
3049 
3050         return (0);
3051 }
3052 
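     /*
      * Apply a new MTU. Values larger than ETHERMTU require
      * scatter-gather support in the driver (for both transmit and
      * receive) and in the backend, and must not exceed XNF_MAXPKT.
      */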
3053 static int
3054 xnf_change_mtu(xnf_t *xnfp, uint32_t mtu)
3055 {
3056         if (mtu > ETHERMTU) {
3057                 if (!xnf_enable_tx_sg) {
3058                         dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3059                             "because scatter-gather is disabled for transmit "
3060                             "in driver settings", ETHERMTU);
3061                         return (EINVAL);
3062                 } else if (!xnf_enable_rx_sg) {
3063                         dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3064                             "because scatter-gather is disabled for receive "
3065                             "in driver settings", ETHERMTU);
3066                         return (EINVAL);
3067                 } else if (!xnfp->xnf_be_tx_sg) {
3068                         dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3069                             "because backend doesn't support scatter-gather",
3070                             ETHERMTU);
3071                         return (EINVAL);
3072                 }
3073                 if (mtu > XNF_MAXPKT)
3074                         return (EINVAL);
3075         }
3076         int error = mac_maxsdu_update(xnfp->xnf_mh, mtu);
3077         if (error == 0)
3078                 xnfp->xnf_mtu = mtu;
3079 
3080         return (error);
3081 }
3082 
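     /*
      * GLDv3 entry point to retrieve a MAC property (mc_getprop). Only
      * the MTU is currently reported.
      */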
3083 /*ARGSUSED*/
3084 static int
3085 xnf_getprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
3086     uint_t prop_val_size, void *prop_val)
3087 {
3088         xnf_t *xnfp = data;
3089 
3090         switch (prop_id) {
3091         case MAC_PROP_MTU:
3092                 ASSERT(prop_val_size >= sizeof (uint32_t));
3093                 bcopy(&xnfp->xnf_mtu, prop_val, sizeof (uint32_t));
3094                 break;
3095         default:
3096                 return (ENOTSUP);
3097         }
3098         return (0);
3099 }
3100 
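     /*
      * GLDv3 entry point to set a MAC property (mc_setprop). Only the
      * MTU may be changed; see xnf_change_mtu() for the restrictions.
      */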
3101 /*ARGSUSED*/
3102 static int
3103 xnf_setprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
3104     uint_t prop_val_size, const void *prop_val)
3105 {
3106         xnf_t *xnfp = data;
3107         uint32_t new_mtu;
3108         int error;
3109 
3110         switch (prop_id) {
3111         case MAC_PROP_MTU:
3112                 ASSERT(prop_val_size >= sizeof (uint32_t));
3113                 bcopy(prop_val, &new_mtu, sizeof (new_mtu));
3114                 error = xnf_change_mtu(xnfp, new_mtu);
3115                 break;
3116         default:
3117                 return (ENOTSUP);
3118         }
3119 
3120         return (error);
3121 }
3122 
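     /*
      * GLDv3 entry point to describe MAC property attributes
      * (mc_propinfo). Reports the permitted range of the MTU.
      */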
3123 /*ARGSUSED*/
3124 static void
3125 xnf_propinfo(void *data, const char *prop_name, mac_prop_id_t prop_id,
3126     mac_prop_info_handle_t prop_handle)
3127 {
3128         switch (prop_id) {
3129         case MAC_PROP_MTU:
3130                 mac_prop_info_set_range_uint32(prop_handle, 0, XNF_MAXPKT);
3131                 break;
3132         default:
3133                 break;
3134         }
3135 }
3136 
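     /*
      * GLDv3 entry point to report device capabilities (mc_getcapab):
      * hardware checksum offload and, when the backend supports it, LSO.
      */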
3137 static boolean_t
3138 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
3139 {
3140         xnf_t *xnfp = arg;
3141 
3142         switch (cap) {
3143         case MAC_CAPAB_HCKSUM: {
3144                 uint32_t *capab = cap_data;
3145 
3146                 /*
3147                  * Whilst the flag used to communicate with the IO
3148                  * domain is called "NETTXF_csum_blank", the checksum
3149                  * in the packet must contain the pseudo-header
3150                  * checksum and not zero.
3151                  *
3152                  * To help out the IO domain, we might use
3153                  * HCKSUM_INET_PARTIAL. Unfortunately our stack will
3154                  * then use checksum offload for IPv6 packets, which
3155                  * the IO domain can't handle.
3156                  *
3157                  * As a result, we declare ourselves capable of
3158                  * HCKSUM_INET_FULL_V4. This means that we receive
3159                  * IPv4 packets from the stack with a blank checksum
3160                  * field and must insert the pseudo-header checksum
3161                  * before passing the packet to the IO domain.
3162                  */
3163                 *capab = HCKSUM_INET_FULL_V4;
3164 
3165                 /*
3166                  * TODO: query the "feature-ipv6-csum-offload" capability.
3167                  * If enabled, that could allow us to use HCKSUM_INET_PARTIAL.
3168                  */
3169 
3170                 break;
3171         }
3172         case MAC_CAPAB_LSO: {
3173                 if (!xnfp->xnf_be_lso)
3174                         return (B_FALSE);
3175 
3176                 mac_capab_lso_t *lso = cap_data;
3177                 lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
3178                 lso->lso_basic_tcp_ipv4.lso_max = IP_MAXPACKET;
3179                 break;
3180         }
3181         default:
3182                 return (B_FALSE);
3183         }
3184 
3185         return (B_TRUE);
3186 }
3187 
3188 /*
3189  * The state of the peer has changed - react accordingly.
3190  */
3191 static void
3192 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
3193     void *arg, void *impl_data)
3194 {
3195         _NOTE(ARGUNUSED(id, arg));
3196         xnf_t *xnfp = ddi_get_driver_private(dip);
3197         XenbusState new_state = *(XenbusState *)impl_data;
3198 
3199         ASSERT(xnfp != NULL);
3200 
3201         switch (new_state) {
3202         case XenbusStateUnknown:
3203         case XenbusStateInitialising:
3204         case XenbusStateInitialised:
3205         case XenbusStateClosing:
3206         case XenbusStateClosed:
3207         case XenbusStateReconfiguring:
3208         case XenbusStateReconfigured:
3209                 break;
3210 
3211         case XenbusStateInitWait:
3212                 xnf_read_config(xnfp);
3213 
3214                 if (!xnfp->xnf_be_rx_copy) {
3215                         cmn_err(CE_WARN,
3216                             "The xnf driver requires a dom0 that "
3217                             "supports 'feature-rx-copy'.");
3218                         (void) xvdi_switch_state(xnfp->xnf_devinfo,
3219                             XBT_NULL, XenbusStateClosed);
3220                         break;
3221                 }
3222 
3223                 /*
3224                  * Connect to the backend.
3225                  */
3226                 xnf_be_connect(xnfp);
3227 
3228                 /*
3229                  * Update the MAC layer with our MAC address, as
                      * discovered by xnf_read_config().
3230                  */
3231                 mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
3232 
3233                 /*
3234                  * We do not know whether features such as LSO are
3235                  * supported until we connect to the backend, so ask the
3236                  * MAC layer to query our capabilities again.
3237                  */
3238                 mac_capab_update(xnfp->xnf_mh);
3239 
3240                 break;
3241 
3242         case XenbusStateConnected:
3243                 mutex_enter(&xnfp->xnf_rxlock);
3244                 mutex_enter(&xnfp->xnf_txlock);
3245 
3246                 xnfp->xnf_connected = B_TRUE;
3247                 /*
3248                  * Wake up any threads waiting to send data to the
3249                  * backend.
3250                  */
3251                 cv_broadcast(&xnfp->xnf_cv_state);
3252 
3253                 mutex_exit(&xnfp->xnf_txlock);
3254                 mutex_exit(&xnfp->xnf_rxlock);
3255 
3256                 /*
3257                  * Kick the peer in case it missed any transmit
3258                  * requests in the TX ring.
3259                  */
3260                 ec_notify_via_evtchn(xnfp->xnf_evtchn);
3261 
3262                 /*
3263                  * The backend may have placed completed receive
3264                  * requests in the ring after it connected, but before
3265                  * we saw its state change here, so call xnf_intr()
3266                  * to handle any that are pending.
3267                  */
3268                 (void) xnf_intr((caddr_t)xnfp);
3269 
3270                 /*
3271                  * Mark the link up now that we are connected.
3272                  */
3273                 mac_link_update(xnfp->xnf_mh, LINK_STATE_UP);
3274 
3275                 /*
3276                  * Tell the backend about the multicast addresses in
3277                  * which we are interested.
3278                  */
3279                 mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE);
3280 
3281                 break;
3282 
3283         default:
3284                 break;
3285         }
3286 }