1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
  29  * Copyright 2017 Nexenta Systems, Inc.
  30  */
  31 
  32 /*
  33  * xdf.c - Xen Virtual Block Device Driver
  34  * TODO:
  35  *      - support alternate block size (currently only DEV_BSIZE supported)
  36  *      - revalidate geometry for removable devices
  37  *
  38  * This driver exports disk device nodes, accepts IO requests from those
  39  * nodes, and services those requests by talking to a backend device
  40  * in another domain.
  41  *
  42  * Communication with the backend device is done via a ringbuffer (which is
  43  * managed via xvdi interfaces) and dma memory (which is managed via ddi
  44  * interfaces).
  45  *
 * Communication with the backend device is dependent upon establishing a
  47  * connection to the backend device.  This connection process involves
  48  * reading device configuration information from xenbus and publishing
  49  * some frontend runtime configuration parameters via the xenbus (for
  50  * consumption by the backend).  Once we've published runtime configuration
  51  * information via the xenbus, the backend device can enter the connected
  52  * state and we'll enter the XD_CONNECTED state.  But before we can allow
  53  * random IO to begin, we need to do IO to the backend device to determine
  54  * the device label and if flush operations are supported.  Once this is
  55  * done we enter the XD_READY state and can process any IO operations.
  56  *
 * We receive notifications of xenbus state changes for the backend device
 * (aka, the "other end") via the xdf_oe_change() callback.  This callback
 * is single threaded, meaning that we can't receive new notification of
 * other end state changes while we're processing an outstanding
 * notification of an other end state change.  Therefore we can't do any
 * blocking operations from the xdf_oe_change() callback.  This is why we
 * have a separate taskq (xdf_ready_tq) which exists to do the necessary
 * IO to get us from the XD_CONNECTED to the XD_READY state.  All IO
 * generated by the xdf_ready_tq thread (xdf_ready_tq_thread) will go
 * through xdf_lb_rdwr(), which is a synchronous IO interface.  IOs
 * generated by the xdf_ready_tq_thread thread have priority over all
 * other IO requests.
  69  *
  70  * We also communicate with the backend device via the xenbus "media-req"
  71  * (XBP_MEDIA_REQ) property.  For more information on this see the
  72  * comments in blkif.h.
  73  */
  74 
  75 #include <io/xdf.h>
  76 
  77 #include <sys/conf.h>
  78 #include <sys/dkio.h>
  79 #include <sys/promif.h>
  80 #include <sys/sysmacros.h>
  81 #include <sys/kstat.h>
  82 #include <sys/mach_mmu.h>
  83 #ifdef XPV_HVM_DRIVER
  84 #include <sys/xpv_support.h>
  85 #else /* !XPV_HVM_DRIVER */
  86 #include <sys/evtchn_impl.h>
  87 #endif /* !XPV_HVM_DRIVER */
  88 #include <sys/sunndi.h>
  89 #include <public/io/xenbus.h>
  90 #include <xen/sys/xenbus_impl.h>
  91 #include <sys/scsi/generic/inquiry.h>
  92 #include <xen/io/blkif_impl.h>
  93 #include <sys/fdio.h>
  94 #include <sys/cdio.h>
  95 
/*
 * DEBUG_EVAL can be used to include debug only statements without
 * having to use '#ifdef DEBUG' statements.  In non-DEBUG builds the
 * argument is discarded entirely, so it is never evaluated there.
 */
#ifdef DEBUG
#define DEBUG_EVAL(x)   (x)
#else /* !DEBUG */
#define DEBUG_EVAL(x)
#endif /* !DEBUG */

/*
 * Timing constants.  Going by the per-line annotations, the two time
 * values are expressed in microseconds even though one of them has "MSEC"
 * in its name (50*1000 units == 0.05 sec).  200 retries at 0.05 sec each
 * give the 10 sec total noted on XDF_DRAIN_RETRY_COUNT.
 */
#define XDF_DRAIN_MSEC_DELAY            (50*1000)       /* 00.05 sec */
#define XDF_DRAIN_RETRY_COUNT           200             /* 10.00 sec */
#define XDF_STATE_TIMEOUT               (30*1000*1000)  /* 30.00 sec */

#define INVALID_DOMID   ((domid_t)-1)

/* values stored in v_req_t's v_flush_diskcache field (see vreq_setup()) */
#define FLUSH_DISKCACHE 0x1
#define WRITE_BARRIER   0x2
#define DEFAULT_FLUSH_BLOCK     156 /* block to write to cause a cache flush */

/*
 * Both flush strategies require xdf_feature_barrier to be set; which one
 * is used is then selected by xdf_flush_supported.
 */
#define USE_WRITE_BARRIER(vdp)                                          \
        ((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
#define USE_FLUSH_DISKCACHE(vdp)                                        \
        ((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)

/*
 * A write barrier request is a non-read buf whose data pointer is the
 * device's own xdf_cache_flush_block buffer.
 */
#define IS_WRITE_BARRIER(vdp, bp)                                       \
        (!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&                      \
        ((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))

/*
 * A flush-diskcache request is a zero-length non-read buf.
 *
 * NOTE: this macro takes only 'bp' but its expansion also references
 * 'vdp', so it can only be used where a variable named 'vdp' is in scope.
 */
#define IS_FLUSH_DISKCACHE(bp)                                          \
        (!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))

/*
 * A vreq is done once its current DMA window has completed and either it
 * is a flush-diskcache request or that window was the last one.
 */
#define VREQ_DONE(vreq)                                                 \
        VOID2BOOLEAN(((vreq)->v_status == VREQ_DMAWIN_DONE) &&               \
            (((vreq)->v_flush_diskcache == FLUSH_DISKCACHE) ||               \
            (((vreq)->v_dmaw + 1) == (vreq)->v_ndmaws)))

/* the v_req_t associated with a buf is stashed in the buf's av_back field */
#define BP_VREQ(bp)             ((v_req_t *)((bp)->av_back))
#define BP_VREQ_SET(bp, vreq)   (((bp)->av_back = (buf_t *)(vreq)))
 131 
/* defined outside this file */
extern int              do_polled_io;

/* run-time tunables that we don't want the compiler to optimize away */
volatile int            xdf_debug = 0;
volatile boolean_t      xdf_barrier_flush_disable = B_FALSE;

/* per module globals */
major_t                 xdf_major;
static void             *xdf_ssp;
static kmem_cache_t     *xdf_vreq_cache;        /* v_req_t allocations */
static kmem_cache_t     *xdf_gs_cache;          /* ge_slot_t allocations */
static int              xdf_maxphys = XB_MAXPHYS;
/* block number watched by check_fbwrite() for re-writes */
static diskaddr_t       xdf_flush_block = DEFAULT_FLUSH_BLOCK;
static int              xdf_fbrewrites; /* flush block re-write count */

/* misc public functions */
int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t, void *);
int xdf_lb_getinfo(dev_info_t *, int, void *, void *);

/*  misc private functions */
static void xdf_io_start(xdf_t *);
static void xdf_devid_setup(xdf_t *);

/*
 * callbacks from common label; this table is handed to cmlb_attach() by
 * xdf_cmlb_attach()
 */
static cmlb_tg_ops_t xdf_lb_ops = {
        TG_DK_OPS_VERSION_1,
        xdf_lb_rdwr,
        xdf_lb_getinfo
};
 161 
 162 /*
 163  * I/O buffer DMA attributes
 164  * Make sure: one DMA window contains BLKIF_MAX_SEGMENTS_PER_REQUEST at most
 165  */
 166 static ddi_dma_attr_t xb_dma_attr = {
 167         DMA_ATTR_V0,
 168         (uint64_t)0,                    /* lowest address */
 169         (uint64_t)0xffffffffffffffff,   /* highest usable address */
 170         (uint64_t)0xffffff,             /* DMA counter limit max */
 171         (uint64_t)XB_BSIZE,             /* alignment in bytes */
 172         XB_BSIZE - 1,                   /* bitmap of burst sizes */
 173         XB_BSIZE,                       /* min transfer */
 174         (uint64_t)XB_MAX_XFER,          /* maximum transfer */
 175         (uint64_t)PAGEOFFSET,           /* 1 page segment length  */
 176         BLKIF_MAX_SEGMENTS_PER_REQUEST, /* maximum number of segments */
 177         XB_BSIZE,                       /* granularity */
 178         0,                              /* flags (reserved) */
 179 };
 180 
 181 static ddi_device_acc_attr_t xc_acc_attr = {
 182         DDI_DEVICE_ATTR_V0,
 183         DDI_NEVERSWAP_ACC,
 184         DDI_STRICTORDER_ACC
 185 };
 186 
 187 static void
 188 xdf_timeout_handler(void *arg)
 189 {
 190         xdf_t *vdp = arg;
 191 
 192         mutex_enter(&vdp->xdf_dev_lk);
 193         vdp->xdf_timeout_id = 0;
 194         mutex_exit(&vdp->xdf_dev_lk);
 195 
 196         /* new timeout thread could be re-scheduled */
 197         xdf_io_start(vdp);
 198 }
 199 
 200 /*
 201  * callback func when DMA/GTE resources is available
 202  *
 203  * Note: we only register one callback function to grant table subsystem
 204  * since we only have one 'struct gnttab_free_callback' in xdf_t.
 205  */
 206 static int
 207 xdf_dmacallback(caddr_t arg)
 208 {
 209         xdf_t *vdp = (xdf_t *)arg;
 210         ASSERT(vdp != NULL);
 211 
 212         DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
 213             vdp->xdf_addr));
 214 
 215         ddi_trigger_softintr(vdp->xdf_softintr_id);
 216         return (DDI_DMA_CALLBACK_DONE);
 217 }
 218 
 219 static ge_slot_t *
 220 gs_get(xdf_t *vdp, int isread)
 221 {
 222         grant_ref_t gh;
 223         ge_slot_t *gs;
 224 
 225         /* try to alloc GTEs needed in this slot, first */
 226         if (gnttab_alloc_grant_references(
 227             BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
 228                 if (vdp->xdf_gnt_callback.next == NULL) {
 229                         SETDMACBON(vdp);
 230                         gnttab_request_free_callback(
 231                             &vdp->xdf_gnt_callback,
 232                             (void (*)(void *))xdf_dmacallback,
 233                             (void *)vdp,
 234                             BLKIF_MAX_SEGMENTS_PER_REQUEST);
 235                 }
 236                 return (NULL);
 237         }
 238 
 239         gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
 240         if (gs == NULL) {
 241                 gnttab_free_grant_references(gh);
 242                 if (vdp->xdf_timeout_id == 0)
 243                         /* restart I/O after one second */
 244                         vdp->xdf_timeout_id =
 245                             timeout(xdf_timeout_handler, vdp, hz);
 246                 return (NULL);
 247         }
 248 
 249         /* init gs_slot */
 250         gs->gs_oeid = vdp->xdf_peer;
 251         gs->gs_isread = isread;
 252         gs->gs_ghead = gh;
 253         gs->gs_ngrefs = 0;
 254 
 255         return (gs);
 256 }
 257 
 258 static void
 259 gs_free(ge_slot_t *gs)
 260 {
 261         int             i;
 262 
 263         /* release all grant table entry resources used in this slot */
 264         for (i = 0; i < gs->gs_ngrefs; i++)
 265                 gnttab_end_foreign_access(gs->gs_ge[i], !gs->gs_isread, 0);
 266         gnttab_free_grant_references(gs->gs_ghead);
 267         list_remove(&gs->gs_vreq->v_gs, gs);
 268         kmem_cache_free(xdf_gs_cache, gs);
 269 }
 270 
 271 static grant_ref_t
 272 gs_grant(ge_slot_t *gs, mfn_t mfn)
 273 {
 274         grant_ref_t gr = gnttab_claim_grant_reference(&gs->gs_ghead);
 275 
 276         ASSERT(gr != -1);
 277         ASSERT(gs->gs_ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
 278         gs->gs_ge[gs->gs_ngrefs++] = gr;
 279         gnttab_grant_foreign_access_ref(gr, gs->gs_oeid, mfn, !gs->gs_isread);
 280 
 281         return (gr);
 282 }
 283 
 284 /*
 285  * Alloc a vreq for this bp
 286  * bp->av_back contains the pointer to the vreq upon return
 287  */
 288 static v_req_t *
 289 vreq_get(xdf_t *vdp, buf_t *bp)
 290 {
 291         v_req_t *vreq = NULL;
 292 
 293         ASSERT(BP_VREQ(bp) == NULL);
 294 
 295         vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
 296         if (vreq == NULL) {
 297                 if (vdp->xdf_timeout_id == 0)
 298                         /* restart I/O after one second */
 299                         vdp->xdf_timeout_id =
 300                             timeout(xdf_timeout_handler, vdp, hz);
 301                 return (NULL);
 302         }
 303         bzero(vreq, sizeof (v_req_t));
 304         list_create(&vreq->v_gs, sizeof (ge_slot_t),
 305             offsetof(ge_slot_t, gs_vreq_link));
 306         vreq->v_buf = bp;
 307         vreq->v_status = VREQ_INIT;
 308         vreq->v_runq = B_FALSE;
 309         BP_VREQ_SET(bp, vreq);
 310         /* init of other fields in vreq is up to the caller */
 311 
 312         list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
 313 
 314         return (vreq);
 315 }
 316 
 317 static void
 318 vreq_free(xdf_t *vdp, v_req_t *vreq)
 319 {
 320         buf_t   *bp = vreq->v_buf;
 321 
 322         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
 323         ASSERT(BP_VREQ(bp) == vreq);
 324 
 325         list_remove(&vdp->xdf_vreq_act, vreq);
 326 
 327         if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
 328                 goto done;
 329 
 330         switch (vreq->v_status) {
 331         case VREQ_DMAWIN_DONE:
 332         case VREQ_GS_ALLOCED:
 333         case VREQ_DMABUF_BOUND:
 334                 (void) ddi_dma_unbind_handle(vreq->v_dmahdl);
 335                 /*FALLTHRU*/
 336         case VREQ_DMAMEM_ALLOCED:
 337                 if (!ALIGNED_XFER(bp)) {
 338                         ASSERT(vreq->v_abuf != NULL);
 339                         if (!IS_ERROR(bp) && IS_READ(bp))
 340                                 bcopy(vreq->v_abuf, bp->b_un.b_addr,
 341                                     bp->b_bcount);
 342                         ddi_dma_mem_free(&vreq->v_align);
 343                 }
 344                 /*FALLTHRU*/
 345         case VREQ_MEMDMAHDL_ALLOCED:
 346                 if (!ALIGNED_XFER(bp))
 347                         ddi_dma_free_handle(&vreq->v_memdmahdl);
 348                 /*FALLTHRU*/
 349         case VREQ_DMAHDL_ALLOCED:
 350                 ddi_dma_free_handle(&vreq->v_dmahdl);
 351                 break;
 352         default:
 353                 break;
 354         }
 355 done:
 356         ASSERT(!vreq->v_runq);
 357         list_destroy(&vreq->v_gs);
 358         kmem_cache_free(xdf_vreq_cache, vreq);
 359 }
 360 
 361 /*
 362  * Snarf new data if our flush block was re-written
 363  */
 364 static void
 365 check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
 366 {
 367         int nblks;
 368         boolean_t mapin;
 369 
 370         if (IS_WRITE_BARRIER(vdp, bp))
 371                 return; /* write was a flush write */
 372 
 373         mapin = B_FALSE;
 374         nblks = bp->b_bcount >> DEV_BSHIFT;
 375         if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
 376                 xdf_fbrewrites++;
 377                 if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
 378                         mapin = B_TRUE;
 379                         bp_mapin(bp);
 380                 }
 381                 bcopy(bp->b_un.b_addr +
 382                     ((xdf_flush_block - blkno) << DEV_BSHIFT),
 383                     vdp->xdf_cache_flush_block, DEV_BSIZE);
 384                 if (mapin)
 385                         bp_mapout(bp);
 386         }
 387 }
 388 
/*
 * Initialize the DMA and grant table resources for the buf
 *
 * This is a resumable state machine keyed on vreq->v_status.  Each stage
 * records its completion in v_status before falling through to the next
 * one, so when a stage fails (DDI_FAILURE is returned) a later call with
 * the same vreq picks up at the stage that failed instead of starting
 * over.
 *
 * The DDI allocation and bind calls are all given xdf_dmacallback as their
 * resource callback; when one of them fails SETDMACBON() is invoked on the
 * device before returning.  Grant slot failures rely on gs_get() to
 * arrange a retry.
 *
 * Returns DDI_SUCCESS once the vreq has a grant entry slot for its current
 * DMA window (v_status == VREQ_GS_ALLOCED), DDI_FAILURE otherwise.
 */
static int
vreq_setup(xdf_t *vdp, v_req_t *vreq)
{
        int rc;
        ddi_dma_attr_t dmaattr;
        uint_t ndcs, ndws;
        ddi_dma_handle_t dh;
        ddi_dma_handle_t mdh;
        ddi_dma_cookie_t dc;
        ddi_acc_handle_t abh;
        caddr_t aba;
        ge_slot_t *gs;
        size_t bufsz;
        off_t off;
        size_t sz;
        buf_t *bp = vreq->v_buf;
        int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
            DDI_DMA_STREAMING | DDI_DMA_PARTIAL;

        switch (vreq->v_status) {
        case VREQ_INIT:
                /*
                 * A flush-diskcache request carries no data: it only needs
                 * a grant entry slot, so all the DMA stages are skipped.
                 * (IS_FLUSH_DISKCACHE() picks up 'vdp' from this scope.)
                 */
                if (IS_FLUSH_DISKCACHE(bp)) {
                        if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
                                DPRINTF(DMA_DBG, ("xdf@%s: "
                                    "get ge_slotfailed\n", vdp->xdf_addr));
                                return (DDI_FAILURE);
                        }
                        vreq->v_blkno = 0;
                        vreq->v_nslots = 1;
                        vreq->v_flush_diskcache = FLUSH_DISKCACHE;
                        vreq->v_status = VREQ_GS_ALLOCED;
                        gs->gs_vreq = vreq;
                        list_insert_head(&vreq->v_gs, gs);
                        return (DDI_SUCCESS);
                }

                if (IS_WRITE_BARRIER(vdp, bp))
                        vreq->v_flush_diskcache = WRITE_BARRIER;
                /* b_private holds an offset that is added to b_blkno */
                vreq->v_blkno = bp->b_blkno +
                    (diskaddr_t)(uintptr_t)bp->b_private;
                /* See if we wrote new data to our flush block */
                if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
                        check_fbwrite(vdp, bp, vreq->v_blkno);
                vreq->v_status = VREQ_INIT_DONE;
                /*FALLTHRU*/

        case VREQ_INIT_DONE:
                /*
                 * alloc DMA handle
                 */
                rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
                    xdf_dmacallback, (caddr_t)vdp, &dh);
                if (rc != DDI_SUCCESS) {
                        SETDMACBON(vdp);
                        DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
                            vdp->xdf_addr));
                        return (DDI_FAILURE);
                }

                vreq->v_dmahdl = dh;
                vreq->v_status = VREQ_DMAHDL_ALLOCED;
                /*FALLTHRU*/

        case VREQ_DMAHDL_ALLOCED:
                /*
                 * alloc dma handle for 512-byte aligned buf
                 *
                 * Only needed when the caller's buffer is not suitably
                 * aligned; for aligned transfers v_status stays at
                 * VREQ_DMAHDL_ALLOCED.
                 */
                if (!ALIGNED_XFER(bp)) {
                        /*
                         * XXPV: we need to temporarily enlarge the seg
                         * boundary and s/g length to work round CR6381968
                         */
                        dmaattr = xb_dma_attr;
                        dmaattr.dma_attr_seg = (uint64_t)-1;
                        dmaattr.dma_attr_sgllen = INT_MAX;
                        rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
                            xdf_dmacallback, (caddr_t)vdp, &mdh);
                        if (rc != DDI_SUCCESS) {
                                SETDMACBON(vdp);
                                DPRINTF(DMA_DBG, ("xdf@%s: "
                                    "unaligned buf DMAhandle alloc failed\n",
                                    vdp->xdf_addr));
                                return (DDI_FAILURE);
                        }
                        vreq->v_memdmahdl = mdh;
                        vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
                }
                /*FALLTHRU*/

        case VREQ_MEMDMAHDL_ALLOCED:
                /*
                 * alloc 512-byte aligned buf
                 *
                 * The bounce buffer is sized to b_bcount rounded up to a
                 * multiple of XB_BSIZE.  For writes the caller's data is
                 * copied into it here; for reads vreq_free() copies the
                 * data back out.
                 */
                if (!ALIGNED_XFER(bp)) {
                        if (bp->b_flags & (B_PAGEIO | B_PHYS))
                                bp_mapin(bp);
                        rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
                            roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
                            DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
                            &aba, &bufsz, &abh);
                        if (rc != DDI_SUCCESS) {
                                SETDMACBON(vdp);
                                DPRINTF(DMA_DBG, ("xdf@%s: "
                                    "DMA mem allocation failed\n",
                                    vdp->xdf_addr));
                                return (DDI_FAILURE);
                        }

                        vreq->v_abuf = aba;
                        vreq->v_align = abh;
                        vreq->v_status = VREQ_DMAMEM_ALLOCED;

                        ASSERT(bufsz >= bp->b_bcount);
                        if (!IS_READ(bp))
                                bcopy(bp->b_un.b_addr, vreq->v_abuf,
                                    bp->b_bcount);
                }
                /*FALLTHRU*/

        case VREQ_DMAMEM_ALLOCED:
                /*
                 * dma bind
                 *
                 * Aligned transfers bind the buf directly; unaligned ones
                 * bind the bounce buffer allocated above.
                 */
                if (ALIGNED_XFER(bp)) {
                        rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
                            dma_flags, xdf_dmacallback, (caddr_t)vdp,
                            &dc, &ndcs);
                } else {
                        rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
                            NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
                            xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
                }
                if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
                        /* get num of dma windows */
                        if (rc == DDI_DMA_PARTIAL_MAP) {
                                rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
                                ASSERT(rc == DDI_SUCCESS);
                        } else {
                                ndws = 1;
                        }
                } else {
                        SETDMACBON(vdp);
                        DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
                            vdp->xdf_addr));
                        return (DDI_FAILURE);
                }

                /* start at window 0; one slot is needed per DMA window */
                vreq->v_dmac = dc;
                vreq->v_dmaw = 0;
                vreq->v_ndmacs = ndcs;
                vreq->v_ndmaws = ndws;
                vreq->v_nslots = ndws;
                vreq->v_status = VREQ_DMABUF_BOUND;
                /*FALLTHRU*/

        case VREQ_DMABUF_BOUND:
                /*
                 * get ge_slot, callback is set upon failure from gs_get(),
                 * if not set previously
                 */
                if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
                        DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
                            vdp->xdf_addr));
                        return (DDI_FAILURE);
                }

                vreq->v_status = VREQ_GS_ALLOCED;
                gs->gs_vreq = vreq;
                list_insert_head(&vreq->v_gs, gs);
                break;

        case VREQ_GS_ALLOCED:
                /* nothing need to be done */
                break;

        case VREQ_DMAWIN_DONE:
                /*
                 * move to the next dma window
                 */
                ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);

                /* get a ge_slot for this DMA window */
                if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
                        DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
                            vdp->xdf_addr));
                        return (DDI_FAILURE);
                }

                /*
                 * v_dmaw is only advanced once the slot is in hand, so a
                 * failed attempt above can simply be retried.
                 */
                vreq->v_dmaw++;
                VERIFY(ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
                    &vreq->v_dmac, &vreq->v_ndmacs) == DDI_SUCCESS);
                vreq->v_status = VREQ_GS_ALLOCED;
                gs->gs_vreq = vreq;
                list_insert_head(&vreq->v_gs, gs);
                break;

        default:
                /* unknown state */
                return (DDI_FAILURE);
        }

        return (DDI_SUCCESS);
}
 594 
 595 static int
 596 xdf_cmlb_attach(xdf_t *vdp)
 597 {
 598         dev_info_t      *dip = vdp->xdf_dip;
 599 
 600         return (cmlb_attach(dip, &xdf_lb_ops,
 601             XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
 602             XD_IS_RM(vdp), B_TRUE,
 603             XD_IS_CD(vdp) ? DDI_NT_CD_XVMD : DDI_NT_BLOCK_XVMD,
 604             0, vdp->xdf_vd_lbl, NULL));
 605 }
 606 
 607 static void
 608 xdf_io_err(buf_t *bp, int err, size_t resid)
 609 {
 610         bioerror(bp, err);
 611         if (resid == 0)
 612                 bp->b_resid = bp->b_bcount;
 613         biodone(bp);
 614 }
 615 
 616 static void
 617 xdf_kstat_enter(xdf_t *vdp, buf_t *bp)
 618 {
 619         v_req_t *vreq = BP_VREQ(bp);
 620 
 621         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
 622 
 623         if (vdp->xdf_xdev_iostat == NULL)
 624                 return;
 625         if ((vreq != NULL) && vreq->v_runq) {
 626                 kstat_runq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
 627         } else {
 628                 kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
 629         }
 630 }
 631 
 632 static void
 633 xdf_kstat_exit(xdf_t *vdp, buf_t *bp)
 634 {
 635         v_req_t *vreq = BP_VREQ(bp);
 636 
 637         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
 638 
 639         if (vdp->xdf_xdev_iostat == NULL)
 640                 return;
 641 
 642         if ((vreq != NULL) && vreq->v_runq) {
 643                 kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
 644         } else {
 645                 kstat_waitq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
 646         }
 647 
 648         if (bp->b_flags & B_READ) {
 649                 KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->reads++;
 650                 KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->nread += bp->b_bcount;
 651         } else if (bp->b_flags & B_WRITE) {
 652                 KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->writes++;
 653                 KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->nwritten += bp->b_bcount;
 654         }
 655 }
 656 
 657 static void
 658 xdf_kstat_waitq_to_runq(xdf_t *vdp, buf_t *bp)
 659 {
 660         v_req_t *vreq = BP_VREQ(bp);
 661 
 662         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
 663         ASSERT(!vreq->v_runq);
 664 
 665         vreq->v_runq = B_TRUE;
 666         if (vdp->xdf_xdev_iostat == NULL)
 667                 return;
 668         kstat_waitq_to_runq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
 669 }
 670 
 671 static void
 672 xdf_kstat_runq_to_waitq(xdf_t *vdp, buf_t *bp)
 673 {
 674         v_req_t *vreq = BP_VREQ(bp);
 675 
 676         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
 677         ASSERT(vreq->v_runq);
 678 
 679         vreq->v_runq = B_FALSE;
 680         if (vdp->xdf_xdev_iostat == NULL)
 681                 return;
 682         kstat_runq_back_to_waitq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
 683 }
 684 
 685 int
 686 xdf_kstat_create(dev_info_t *dip)
 687 {
 688         xdf_t           *vdp = (xdf_t *)ddi_get_driver_private(dip);
 689         kstat_t         *kstat;
 690         buf_t           *bp;
 691 
 692         if ((kstat = kstat_create("xdf", ddi_get_instance(dip), NULL, "disk",
 693             KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL)
 694                 return (-1);
 695 
 696         /* See comment about locking in xdf_kstat_delete(). */
 697         mutex_enter(&vdp->xdf_iostat_lk);
 698         mutex_enter(&vdp->xdf_dev_lk);
 699 
 700         /* only one kstat can exist at a time */
 701         if (vdp->xdf_xdev_iostat != NULL) {
 702                 mutex_exit(&vdp->xdf_dev_lk);
 703                 mutex_exit(&vdp->xdf_iostat_lk);
 704                 kstat_delete(kstat);
 705                 return (-1);
 706         }
 707 
 708         vdp->xdf_xdev_iostat = kstat;
 709         vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
 710         kstat_install(vdp->xdf_xdev_iostat);
 711 
 712         /*
 713          * Now that we've created a kstat, we need to update the waitq and
 714          * runq counts for the kstat to reflect our current state.
 715          *
 716          * For a buf_t structure to be on the runq, it must have a ring
 717          * buffer slot associated with it.  To get a ring buffer slot the
 718          * buf must first have a v_req_t and a ge_slot_t associated with it.
 719          * Then when it is granted a ring buffer slot, v_runq will be set to
 720          * true.
 721          *
 722          * For a buf_t structure to be on the waitq, it must not be on the
 723          * runq.  So to find all the buf_t's that should be on waitq, we
 724          * walk the active buf list and add any buf_t's which aren't on the
 725          * runq to the waitq.
 726          */
 727         bp = vdp->xdf_f_act;
 728         while (bp != NULL) {
 729                 xdf_kstat_enter(vdp, bp);
 730                 bp = bp->av_forw;
 731         }
 732         if (vdp->xdf_ready_tq_bp != NULL)
 733                 xdf_kstat_enter(vdp, vdp->xdf_ready_tq_bp);
 734 
 735         mutex_exit(&vdp->xdf_dev_lk);
 736         mutex_exit(&vdp->xdf_iostat_lk);
 737         return (0);
 738 }
 739 
 740 void
 741 xdf_kstat_delete(dev_info_t *dip)
 742 {
 743         xdf_t           *vdp = (xdf_t *)ddi_get_driver_private(dip);
 744         kstat_t         *kstat;
 745         buf_t           *bp;
 746 
 747         /*
 748          * The locking order here is xdf_iostat_lk and then xdf_dev_lk.
 749          * xdf_dev_lk is used to protect the xdf_xdev_iostat pointer
 750          * and the contents of the our kstat.  xdf_iostat_lk is used
 751          * to protect the allocation and freeing of the actual kstat.
 752          * xdf_dev_lk can't be used for this purpose because kstat
 753          * readers use it to access the contents of the kstat and
 754          * hence it can't be held when calling kstat_delete().
 755          */
 756         mutex_enter(&vdp->xdf_iostat_lk);
 757         mutex_enter(&vdp->xdf_dev_lk);
 758 
 759         if (vdp->xdf_xdev_iostat == NULL) {
 760                 mutex_exit(&vdp->xdf_dev_lk);
 761                 mutex_exit(&vdp->xdf_iostat_lk);
 762                 return;
 763         }
 764 
 765         /*
 766          * We're about to destroy the kstat structures, so it isn't really
 767          * necessary to update the runq and waitq counts.  But, since this
 768          * isn't a hot code path we can afford to be a little pedantic and
 769          * go ahead and decrement the runq and waitq kstat counters to zero
 770          * before free'ing them.  This helps us ensure that we've gotten all
 771          * our accounting correct.
 772          *
 773          * For an explanation of how we determine which buffers go on the
 774          * runq vs which go on the waitq, see the comments in
 775          * xdf_kstat_create().
 776          */
 777         bp = vdp->xdf_f_act;
 778         while (bp != NULL) {
 779                 xdf_kstat_exit(vdp, bp);
 780                 bp = bp->av_forw;
 781         }
 782         if (vdp->xdf_ready_tq_bp != NULL)
 783                 xdf_kstat_exit(vdp, vdp->xdf_ready_tq_bp);
 784 
 785         kstat = vdp->xdf_xdev_iostat;
 786         vdp->xdf_xdev_iostat = NULL;
 787         mutex_exit(&vdp->xdf_dev_lk);
 788         kstat_delete(kstat);
 789         mutex_exit(&vdp->xdf_iostat_lk);
 790 }
 791 
 792 /*
 793  * Add an IO requests onto the active queue.
 794  *
 795  * We have to detect IOs generated by xdf_ready_tq_thread.  These IOs
 796  * are used to establish a connection to the backend, so they receive
 797  * priority over all other IOs.  Since xdf_ready_tq_thread only does
 798  * synchronous IO, there can only be one xdf_ready_tq_thread request at any
 799  * given time and we record the buf associated with that request in
 800  * xdf_ready_tq_bp.
 801  */
 802 static void
 803 xdf_bp_push(xdf_t *vdp, buf_t *bp)
 804 {
 805         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
 806         ASSERT(bp->av_forw == NULL);
 807 
 808         xdf_kstat_enter(vdp, bp);
 809 
 810         if (curthread == vdp->xdf_ready_tq_thread) {
 811                 /* new IO requests from the ready thread */
 812                 ASSERT(vdp->xdf_ready_tq_bp == NULL);
 813                 vdp->xdf_ready_tq_bp = bp;
 814                 return;
 815         }
 816 
 817         /* this is normal IO request */
 818         ASSERT(bp != vdp->xdf_ready_tq_bp);
 819 
 820         if (vdp->xdf_f_act == NULL) {
 821                 /* this is only only IO on the active queue */
 822                 ASSERT(vdp->xdf_l_act == NULL);
 823                 ASSERT(vdp->xdf_i_act == NULL);
 824                 vdp->xdf_f_act = vdp->xdf_l_act = vdp->xdf_i_act = bp;
 825                 return;
 826         }
 827 
 828         /* add this IO to the tail of the active queue */
 829         vdp->xdf_l_act->av_forw = bp;
 830         vdp->xdf_l_act = bp;
 831         if (vdp->xdf_i_act == NULL)
 832                 vdp->xdf_i_act = bp;
 833 }
 834 
 835 static void
 836 xdf_bp_pop(xdf_t *vdp, buf_t *bp)
 837 {
 838         buf_t   *bp_iter;
 839 
 840         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
 841         ASSERT(VREQ_DONE(BP_VREQ(bp)));
 842 
 843         if (vdp->xdf_ready_tq_bp == bp) {
 844                 /* we're done with a ready thread IO request */
 845                 ASSERT(bp->av_forw == NULL);
 846                 vdp->xdf_ready_tq_bp = NULL;
 847                 return;
 848         }
 849 
 850         /* we're done with a normal IO request */
 851         ASSERT((bp->av_forw != NULL) || (bp == vdp->xdf_l_act));
 852         ASSERT((bp->av_forw == NULL) || (bp != vdp->xdf_l_act));
 853         ASSERT(VREQ_DONE(BP_VREQ(vdp->xdf_f_act)));
 854         ASSERT(vdp->xdf_f_act != vdp->xdf_i_act);
 855 
 856         if (bp == vdp->xdf_f_act) {
 857                 /* This IO was at the head of our active queue. */
 858                 vdp->xdf_f_act = bp->av_forw;
 859                 if (bp == vdp->xdf_l_act)
 860                         vdp->xdf_l_act = NULL;
 861         } else {
 862                 /* There IO finished before some other pending IOs. */
 863                 bp_iter = vdp->xdf_f_act;
 864                 while (bp != bp_iter->av_forw) {
 865                         bp_iter = bp_iter->av_forw;
 866                         ASSERT(VREQ_DONE(BP_VREQ(bp_iter)));
 867                         ASSERT(bp_iter != vdp->xdf_i_act);
 868                 }
 869                 bp_iter->av_forw = bp->av_forw;
 870                 if (bp == vdp->xdf_l_act)
 871                         vdp->xdf_l_act = bp_iter;
 872         }
 873         bp->av_forw = NULL;
 874 }
 875 
 876 static buf_t *
 877 xdf_bp_next(xdf_t *vdp)
 878 {
 879         v_req_t *vreq;
 880         buf_t   *bp;
 881 
 882         if (vdp->xdf_state == XD_CONNECTED) {
 883                 /*
 884                  * If we're in the XD_CONNECTED state, we only service IOs
 885                  * from the xdf_ready_tq_thread thread.
 886                  */
 887                 if ((bp = vdp->xdf_ready_tq_bp) == NULL)
 888                         return (NULL);
 889                 if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
 890                         return (bp);
 891                 return (NULL);
 892         }
 893 
 894         /* if we're not in the XD_CONNECTED or XD_READY state we can't do IO */
 895         if (vdp->xdf_state != XD_READY)
 896                 return (NULL);
 897 
 898         ASSERT(vdp->xdf_ready_tq_bp == NULL);
 899         for (;;) {
 900                 if ((bp = vdp->xdf_i_act) == NULL)
 901                         return (NULL);
 902                 if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
 903                         return (bp);
 904 
 905                 /* advance the active buf index pointer */
 906                 vdp->xdf_i_act = bp->av_forw;
 907         }
 908 }
 909 
 910 static void
 911 xdf_io_fini(xdf_t *vdp, uint64_t id, int bioerr)
 912 {
 913         ge_slot_t       *gs = (ge_slot_t *)(uintptr_t)id;
 914         v_req_t         *vreq = gs->gs_vreq;
 915         buf_t           *bp = vreq->v_buf;
 916 
 917         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
 918         ASSERT(BP_VREQ(bp) == vreq);
 919 
 920         gs_free(gs);
 921 
 922         if (bioerr != 0)
 923                 bioerror(bp, bioerr);
 924         ASSERT(vreq->v_nslots > 0);
 925         if (--vreq->v_nslots > 0)
 926                 return;
 927 
 928         /* remove this IO from our active queue */
 929         xdf_bp_pop(vdp, bp);
 930 
 931         ASSERT(vreq->v_runq);
 932         xdf_kstat_exit(vdp, bp);
 933         vreq->v_runq = B_FALSE;
 934         vreq_free(vdp, vreq);
 935 
 936         if (IS_ERROR(bp)) {
 937                 xdf_io_err(bp, geterror(bp), 0);
 938         } else if (bp->b_resid != 0) {
 939                 /* Partial transfers are an error */
 940                 xdf_io_err(bp, EIO, bp->b_resid);
 941         } else {
 942                 biodone(bp);
 943         }
 944 }
 945 
 946 /*
 947  * xdf interrupt handler
 948  */
 949 static uint_t
 950 xdf_intr_locked(xdf_t *vdp)
 951 {
 952         xendev_ring_t *xbr;
 953         blkif_response_t *resp;
 954         int bioerr;
 955         uint64_t id;
 956         uint8_t op;
 957         uint16_t status;
 958         ddi_acc_handle_t acchdl;
 959 
 960         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
 961 
 962         if ((xbr = vdp->xdf_xb_ring) == NULL)
 963                 return (DDI_INTR_UNCLAIMED);
 964 
 965         acchdl = vdp->xdf_xb_ring_hdl;
 966 
 967         /*
 968          * complete all requests which have a response
 969          */
 970         while (resp = xvdi_ring_get_response(xbr)) {
 971                 id = ddi_get64(acchdl, &resp->id);
 972                 op = ddi_get8(acchdl, &resp->operation);
 973                 status = ddi_get16(acchdl, (uint16_t *)&resp->status);
 974                 DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
 975                     op, id, status));
 976 
 977                 if (status != BLKIF_RSP_OKAY) {
 978                         DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
 979                             vdp->xdf_addr,
 980                             (op == BLKIF_OP_READ) ? "reading" : "writing"));
 981                         bioerr = EIO;
 982                 } else {
 983                         bioerr = 0;
 984                 }
 985 
 986                 xdf_io_fini(vdp, id, bioerr);
 987         }
 988         return (DDI_INTR_CLAIMED);
 989 }
 990 
 991 /*
 992  * xdf_intr runs at PIL 5, so no one else can grab xdf_dev_lk and
 993  * block at a lower pil.
 994  */
 995 static uint_t
 996 xdf_intr(caddr_t arg)
 997 {
 998         xdf_t *vdp = (xdf_t *)arg;
 999         int rv;
1000 
1001         mutex_enter(&vdp->xdf_dev_lk);
1002         rv = xdf_intr_locked(vdp);
1003         mutex_exit(&vdp->xdf_dev_lk);
1004 
1005         if (!do_polled_io)
1006                 xdf_io_start(vdp);
1007 
1008         return (rv);
1009 }
1010 
1011 static void
1012 xdf_ring_push(xdf_t *vdp)
1013 {
1014         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1015 
1016         if (vdp->xdf_xb_ring == NULL)
1017                 return;
1018 
1019         if (xvdi_ring_push_request(vdp->xdf_xb_ring)) {
1020                 DPRINTF(IO_DBG, (
1021                     "xdf@%s: xdf_ring_push: sent request(s) to backend\n",
1022                     vdp->xdf_addr));
1023         }
1024 
1025         if (xvdi_get_evtchn(vdp->xdf_dip) != INVALID_EVTCHN)
1026                 xvdi_notify_oe(vdp->xdf_dip);
1027 }
1028 
1029 static int
1030 xdf_ring_drain_locked(xdf_t *vdp)
1031 {
1032         int             pollc, rv = 0;
1033 
1034         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1035 
1036         if (xdf_debug & SUSRES_DBG)
1037                 xen_printf("xdf_ring_drain: start\n");
1038 
1039         for (pollc = 0; pollc < XDF_DRAIN_RETRY_COUNT; pollc++) {
1040                 if (vdp->xdf_xb_ring == NULL)
1041                         goto out;
1042 
1043                 if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
1044                         (void) xdf_intr_locked(vdp);
1045                 if (!xvdi_ring_has_incomp_request(vdp->xdf_xb_ring))
1046                         goto out;
1047                 xdf_ring_push(vdp);
1048 
1049                 /* file-backed devices can be slow */
1050                 mutex_exit(&vdp->xdf_dev_lk);
1051 #ifdef XPV_HVM_DRIVER
1052                 (void) HYPERVISOR_yield();
1053 #endif /* XPV_HVM_DRIVER */
1054                 delay(drv_usectohz(XDF_DRAIN_MSEC_DELAY));
1055                 mutex_enter(&vdp->xdf_dev_lk);
1056         }
1057         cmn_err(CE_WARN, "xdf@%s: xdf_ring_drain: timeout", vdp->xdf_addr);
1058 
1059 out:
1060         if (vdp->xdf_xb_ring != NULL) {
1061                 if (xvdi_ring_has_incomp_request(vdp->xdf_xb_ring) ||
1062                     xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
1063                         rv = EIO;
1064         }
1065         if (xdf_debug & SUSRES_DBG)
1066                 xen_printf("xdf@%s: xdf_ring_drain: end, err=%d\n",
1067                     vdp->xdf_addr, rv);
1068         return (rv);
1069 }
1070 
1071 static int
1072 xdf_ring_drain(xdf_t *vdp)
1073 {
1074         int rv;
1075         mutex_enter(&vdp->xdf_dev_lk);
1076         rv = xdf_ring_drain_locked(vdp);
1077         mutex_exit(&vdp->xdf_dev_lk);
1078         return (rv);
1079 }
1080 
1081 /*
1082  * Destroy all v_req_t, grant table entries, and our ring buffer.
1083  */
1084 static void
1085 xdf_ring_destroy(xdf_t *vdp)
1086 {
1087         v_req_t         *vreq;
1088         buf_t           *bp;
1089         ge_slot_t       *gs;
1090 
1091         ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1092         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1093 
1094         if ((vdp->xdf_state != XD_INIT) &&
1095             (vdp->xdf_state != XD_CONNECTED) &&
1096             (vdp->xdf_state != XD_READY)) {
1097                 ASSERT(vdp->xdf_xb_ring == NULL);
1098                 ASSERT(vdp->xdf_xb_ring_hdl == NULL);
1099                 ASSERT(vdp->xdf_peer == INVALID_DOMID);
1100                 ASSERT(vdp->xdf_evtchn == INVALID_EVTCHN);
1101                 ASSERT(list_is_empty(&vdp->xdf_vreq_act));
1102                 return;
1103         }
1104 
1105         /*
1106          * We don't want to receive async notifications from the backend
1107          * when it finishes processing ring entries.
1108          */
1109 #ifdef XPV_HVM_DRIVER
1110         ec_unbind_evtchn(vdp->xdf_evtchn);
1111 #else /* !XPV_HVM_DRIVER */
1112         (void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1113 #endif /* !XPV_HVM_DRIVER */
1114 
1115         /*
1116          * Drain any requests in the ring.  We need to do this before we
1117          * can free grant table entries, because if active ring entries
1118          * point to grants, then the backend could be trying to access
1119          * those grants.
1120          */
1121         (void) xdf_ring_drain_locked(vdp);
1122 
1123         /* We're done talking to the backend so free up our event channel */
1124         xvdi_free_evtchn(vdp->xdf_dip);
1125         vdp->xdf_evtchn = INVALID_EVTCHN;
1126 
1127         while ((vreq = list_head(&vdp->xdf_vreq_act)) != NULL) {
1128                 bp = vreq->v_buf;
1129                 ASSERT(BP_VREQ(bp) == vreq);
1130 
1131                 /* Free up any grant table entries associaed with this IO */
1132                 while ((gs = list_head(&vreq->v_gs)) != NULL)
1133                         gs_free(gs);
1134 
1135                 /* If this IO was on the runq, move it back to the waitq. */
1136                 if (vreq->v_runq)
1137                         xdf_kstat_runq_to_waitq(vdp, bp);
1138 
1139                 /*
1140                  * Reset any buf IO state since we're going to re-issue the
1141                  * IO when we reconnect.
1142                  */
1143                 vreq_free(vdp, vreq);
1144                 BP_VREQ_SET(bp, NULL);
1145                 bioerror(bp, 0);
1146         }
1147 
1148         /* reset the active queue index pointer */
1149         vdp->xdf_i_act = vdp->xdf_f_act;
1150 
1151         /* Destroy the ring */
1152         xvdi_free_ring(vdp->xdf_xb_ring);
1153         vdp->xdf_xb_ring = NULL;
1154         vdp->xdf_xb_ring_hdl = NULL;
1155         vdp->xdf_peer = INVALID_DOMID;
1156 }
1157 
1158 void
1159 xdfmin(struct buf *bp)
1160 {
1161         if (bp->b_bcount > xdf_maxphys)
1162                 bp->b_bcount = xdf_maxphys;
1163 }
1164 
1165 /*
1166  * Check if we have a pending "eject" media request.
1167  */
1168 static int
1169 xdf_eject_pending(xdf_t *vdp)
1170 {
1171         dev_info_t      *dip = vdp->xdf_dip;
1172         char            *xsname, *str;
1173 
1174         if (!vdp->xdf_media_req_supported)
1175                 return (B_FALSE);
1176 
1177         if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
1178             (xenbus_read_str(xsname, XBP_MEDIA_REQ, &str) != 0))
1179                 return (B_FALSE);
1180 
1181         if (strcmp(str, XBV_MEDIA_REQ_EJECT) != 0) {
1182                 strfree(str);
1183                 return (B_FALSE);
1184         }
1185         strfree(str);
1186         return (B_TRUE);
1187 }
1188 
1189 /*
1190  * Generate a media request.
1191  */
1192 static int
1193 xdf_media_req(xdf_t *vdp, char *req, boolean_t media_required)
1194 {
1195         dev_info_t      *dip = vdp->xdf_dip;
1196         char            *xsname;
1197 
1198         /*
1199          * we can't be holding xdf_dev_lk because xenbus_printf() can
1200          * block while waiting for a PIL 1 interrupt message.  this
1201          * would cause a deadlock with xdf_intr() which needs to grab
1202          * xdf_dev_lk as well and runs at PIL 5.
1203          */
1204         ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1205         ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1206 
1207         if ((xsname = xvdi_get_xsname(dip)) == NULL)
1208                 return (ENXIO);
1209 
1210         /* Check if we support media requests */
1211         if (!XD_IS_CD(vdp) || !vdp->xdf_media_req_supported)
1212                 return (ENOTTY);
1213 
1214         /* If an eject is pending then don't allow any new requests */
1215         if (xdf_eject_pending(vdp))
1216                 return (ENXIO);
1217 
1218         /* Make sure that there is media present */
1219         if (media_required && (vdp->xdf_xdev_nblocks == 0))
1220                 return (ENXIO);
1221 
1222         /* We only allow operations when the device is ready and connected */
1223         if (vdp->xdf_state != XD_READY)
1224                 return (EIO);
1225 
1226         if (xenbus_printf(XBT_NULL, xsname, XBP_MEDIA_REQ, "%s", req) != 0)
1227                 return (EIO);
1228 
1229         return (0);
1230 }
1231 
1232 /*
1233  * populate a single blkif_request_t w/ a buf
1234  */
1235 static void
1236 xdf_process_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1237 {
1238         grant_ref_t     gr;
1239         uint8_t         fsect, lsect;
1240         size_t          bcnt;
1241         paddr_t         dma_addr;
1242         off_t           blk_off;
1243         dev_info_t      *dip = vdp->xdf_dip;
1244         blkif_vdev_t    vdev = xvdi_get_vdevnum(dip);
1245         v_req_t         *vreq = BP_VREQ(bp);
1246         uint64_t        blkno = vreq->v_blkno;
1247         uint_t          ndmacs = vreq->v_ndmacs;
1248         ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1249         int             seg = 0;
1250         int             isread = IS_READ(bp);
1251         ge_slot_t       *gs = list_head(&vreq->v_gs);
1252 
1253         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1254         ASSERT(vreq->v_status == VREQ_GS_ALLOCED);
1255 
1256         if (isread)
1257                 ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1258         else {
1259                 switch (vreq->v_flush_diskcache) {
1260                 case FLUSH_DISKCACHE:
1261                         ddi_put8(acchdl, &rreq->operation,
1262                             BLKIF_OP_FLUSH_DISKCACHE);
1263                         ddi_put16(acchdl, &rreq->handle, vdev);
1264                         ddi_put64(acchdl, &rreq->id,
1265                             (uint64_t)(uintptr_t)(gs));
1266                         ddi_put8(acchdl, &rreq->nr_segments, 0);
1267                         vreq->v_status = VREQ_DMAWIN_DONE;
1268                         return;
1269                 case WRITE_BARRIER:
1270                         ddi_put8(acchdl, &rreq->operation,
1271                             BLKIF_OP_WRITE_BARRIER);
1272                         break;
1273                 default:
1274                         if (!vdp->xdf_wce)
1275                                 ddi_put8(acchdl, &rreq->operation,
1276                                     BLKIF_OP_WRITE_BARRIER);
1277                         else
1278                                 ddi_put8(acchdl, &rreq->operation,
1279                                     BLKIF_OP_WRITE);
1280                         break;
1281                 }
1282         }
1283 
1284         ddi_put16(acchdl, &rreq->handle, vdev);
1285         ddi_put64(acchdl, &rreq->sector_number, blkno);
1286         ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(gs));
1287 
1288         /*
1289          * loop until all segments are populated or no more dma cookie in buf
1290          */
1291         for (;;) {
1292                 /*
1293                  * Each segment of a blkif request can transfer up to
1294                  * one 4K page of data.
1295                  */
1296                 bcnt = vreq->v_dmac.dmac_size;
1297                 dma_addr = vreq->v_dmac.dmac_laddress;
1298                 blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1299                 fsect = blk_off >> XB_BSHIFT;
1300                 lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1301 
1302                 ASSERT(bcnt <= PAGESIZE);
1303                 ASSERT((bcnt % XB_BSIZE) == 0);
1304                 ASSERT((blk_off & XB_BMASK) == 0);
1305                 ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1306                     lsect < XB_MAX_SEGLEN / XB_BSIZE);
1307 
1308                 gr = gs_grant(gs, PATOMA(dma_addr) >> PAGESHIFT);
1309                 ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1310                 ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1311                 ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1312 
1313                 DPRINTF(IO_DBG, (
1314                     "xdf@%s: seg%d: dmacS %lu blk_off %ld\n",
1315                     vdp->xdf_addr, seg, vreq->v_dmac.dmac_size, blk_off));
1316                 DPRINTF(IO_DBG, (
1317                     "xdf@%s: seg%d: fs %d ls %d gr %d dma 0x%"PRIx64"\n",
1318                     vdp->xdf_addr, seg, fsect, lsect, gr, dma_addr));
1319 
1320                 blkno += (bcnt >> XB_BSHIFT);
1321                 seg++;
1322                 ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1323                 if (--ndmacs) {
1324                         ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1325                         continue;
1326                 }
1327 
1328                 vreq->v_status = VREQ_DMAWIN_DONE;
1329                 vreq->v_blkno = blkno;
1330                 break;
1331         }
1332         ddi_put8(acchdl,  &rreq->nr_segments, seg);
1333         DPRINTF(IO_DBG, (
1334             "xdf@%s: xdf_process_rreq: request id=%"PRIx64" ready\n",
1335             vdp->xdf_addr, rreq->id));
1336 }
1337 
1338 static void
1339 xdf_io_start(xdf_t *vdp)
1340 {
1341         struct buf      *bp;
1342         v_req_t         *vreq;
1343         blkif_request_t *rreq;
1344         boolean_t       rreqready = B_FALSE;
1345 
1346         mutex_enter(&vdp->xdf_dev_lk);
1347 
1348         /*
1349          * Populate the ring request(s).  Loop until there is no buf to
1350          * transfer or no free slot available in I/O ring.
1351          */
1352         for (;;) {
1353                 /* don't start any new IO if we're suspending */
1354                 if (vdp->xdf_suspending)
1355                         break;
1356                 if ((bp = xdf_bp_next(vdp)) == NULL)
1357                         break;
1358 
1359                 /* if the buf doesn't already have a vreq, allocate one */
1360                 if (((vreq = BP_VREQ(bp)) == NULL) &&
1361                     ((vreq = vreq_get(vdp, bp)) == NULL))
1362                         break;
1363 
1364                 /* alloc DMA/GTE resources */
1365                 if (vreq_setup(vdp, vreq) != DDI_SUCCESS)
1366                         break;
1367 
1368                 /* get next blkif_request in the ring */
1369                 if ((rreq = xvdi_ring_get_request(vdp->xdf_xb_ring)) == NULL)
1370                         break;
1371                 bzero(rreq, sizeof (blkif_request_t));
1372                 rreqready = B_TRUE;
1373 
1374                 /* populate blkif_request with this buf */
1375                 xdf_process_rreq(vdp, bp, rreq);
1376 
1377                 /*
1378                  * This buffer/vreq pair is has been allocated a ring buffer
1379                  * resources, so if it isn't already in our runq, add it.
1380                  */
1381                 if (!vreq->v_runq)
1382                         xdf_kstat_waitq_to_runq(vdp, bp);
1383         }
1384 
1385         /* Send the request(s) to the backend */
1386         if (rreqready)
1387                 xdf_ring_push(vdp);
1388 
1389         mutex_exit(&vdp->xdf_dev_lk);
1390 }
1391 
1392 
1393 /* check if partition is open, -1 - check all partitions on the disk */
1394 static boolean_t
1395 xdf_isopen(xdf_t *vdp, int partition)
1396 {
1397         int i;
1398         ulong_t parbit;
1399         boolean_t rval = B_FALSE;
1400 
1401         ASSERT((partition == -1) ||
1402             ((partition >= 0) || (partition < XDF_PEXT)));
1403 
1404         if (partition == -1)
1405                 parbit = (ulong_t)-1;
1406         else
1407                 parbit = 1 << partition;
1408 
1409         for (i = 0; i < OTYPCNT; i++) {
1410                 if (vdp->xdf_vd_open[i] & parbit)
1411                         rval = B_TRUE;
1412         }
1413 
1414         return (rval);
1415 }
1416 
1417 /*
1418  * The connection should never be closed as long as someone is holding
1419  * us open, there is pending IO, or someone is waiting waiting for a
1420  * connection.
1421  */
1422 static boolean_t
1423 xdf_busy(xdf_t *vdp)
1424 {
1425         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1426 
1427         if ((vdp->xdf_xb_ring != NULL) &&
1428             xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
1429                 ASSERT(vdp->xdf_state != XD_CLOSED);
1430                 return (B_TRUE);
1431         }
1432 
1433         if (!list_is_empty(&vdp->xdf_vreq_act) || (vdp->xdf_f_act != NULL)) {
1434                 ASSERT(vdp->xdf_state != XD_CLOSED);
1435                 return (B_TRUE);
1436         }
1437 
1438         if (xdf_isopen(vdp, -1)) {
1439                 ASSERT(vdp->xdf_state != XD_CLOSED);
1440                 return (B_TRUE);
1441         }
1442 
1443         if (vdp->xdf_connect_req > 0) {
1444                 ASSERT(vdp->xdf_state != XD_CLOSED);
1445                 return (B_TRUE);
1446         }
1447 
1448         return (B_FALSE);
1449 }
1450 
1451 static void
1452 xdf_set_state(xdf_t *vdp, xdf_state_t new_state)
1453 {
1454         ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1455         ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1456         DPRINTF(DDI_DBG, ("xdf@%s: state change %d -> %d\n",
1457             vdp->xdf_addr, vdp->xdf_state, new_state));
1458         vdp->xdf_state = new_state;
1459         cv_broadcast(&vdp->xdf_dev_cv);
1460 }
1461 
1462 static void
1463 xdf_disconnect(xdf_t *vdp, xdf_state_t new_state, boolean_t quiet)
1464 {
1465         dev_info_t      *dip = vdp->xdf_dip;
1466         boolean_t       busy;
1467 
1468         ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1469         ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1470         ASSERT((new_state == XD_UNKNOWN) || (new_state == XD_CLOSED));
1471 
1472         /* Check if we're already there. */
1473         if (vdp->xdf_state == new_state)
1474                 return;
1475 
1476         mutex_enter(&vdp->xdf_dev_lk);
1477         busy = xdf_busy(vdp);
1478 
1479         /* If we're already closed then there's nothing todo. */
1480         if (vdp->xdf_state == XD_CLOSED) {
1481                 ASSERT(!busy);
1482                 xdf_set_state(vdp, new_state);
1483                 mutex_exit(&vdp->xdf_dev_lk);
1484                 return;
1485         }
1486 
1487 #ifdef DEBUG
1488         /* UhOh.  Warn the user that something bad has happened. */
1489         if (!quiet && busy && (vdp->xdf_state == XD_READY) &&
1490             (vdp->xdf_xdev_nblocks != 0)) {
1491                 cmn_err(CE_WARN, "xdf@%s: disconnected while in use",
1492                     vdp->xdf_addr);
1493         }
1494 #endif /* DEBUG */
1495 
1496         xdf_ring_destroy(vdp);
1497 
1498         /* If we're busy then we can only go into the unknown state */
1499         xdf_set_state(vdp, (busy) ? XD_UNKNOWN : new_state);
1500         mutex_exit(&vdp->xdf_dev_lk);
1501 
1502         /* if we're closed now, let the other end know */
1503         if (vdp->xdf_state == XD_CLOSED)
1504                 (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1505 }
1506 
1507 
1508 /*
1509  * Kick-off connect process
1510  * Status should be XD_UNKNOWN or XD_CLOSED
1511  * On success, status will be changed to XD_INIT
1512  * On error, it will be changed to XD_UNKNOWN
1513  */
1514 static int
1515 xdf_setstate_init(xdf_t *vdp)
1516 {
1517         dev_info_t              *dip = vdp->xdf_dip;
1518         xenbus_transaction_t    xbt;
1519         grant_ref_t             gref;
1520         char                    *xsname, *str;
1521         int                     rv;
1522 
1523         ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1524         ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1525         ASSERT((vdp->xdf_state == XD_UNKNOWN) ||
1526             (vdp->xdf_state == XD_CLOSED));
1527 
1528         DPRINTF(DDI_DBG,
1529             ("xdf@%s: starting connection process\n", vdp->xdf_addr));
1530 
1531         /*
1532          * If an eject is pending then don't allow a new connection.
1533          * (Only the backend can clear media request eject request.)
1534          */
1535         if (xdf_eject_pending(vdp))
1536                 return (DDI_FAILURE);
1537 
1538         if ((xsname = xvdi_get_xsname(dip)) == NULL)
1539                 goto errout;
1540 
1541         if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == INVALID_DOMID)
1542                 goto errout;
1543 
1544         (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitialising);
1545 
1546         /*
1547          * Sanity check for the existance of the xenbus device-type property.
1548          * This property might not exist if our xenbus device nodes were
1549          * force destroyed while we were still connected to the backend.
1550          */
1551         if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0)
1552                 goto errout;
1553         strfree(str);
1554 
1555         if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS)
1556                 goto errout;
1557 
1558         vdp->xdf_evtchn = xvdi_get_evtchn(dip);
1559 #ifdef XPV_HVM_DRIVER
1560         ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp);
1561 #else /* !XPV_HVM_DRIVER */
1562         if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1563             DDI_SUCCESS) {
1564                 cmn_err(CE_WARN, "xdf@%s: xdf_setstate_init: "
1565                     "failed to add intr handler", vdp->xdf_addr);
1566                 goto errout1;
1567         }
1568 #endif /* !XPV_HVM_DRIVER */
1569 
1570         if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1571             sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1572             DDI_SUCCESS) {
1573                 cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1574                     vdp->xdf_addr);
1575                 goto errout2;
1576         }
1577         vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1578 
1579         /*
1580          * Write into xenstore the info needed by backend
1581          */
1582 trans_retry:
1583         if (xenbus_transaction_start(&xbt)) {
1584                 cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1585                     vdp->xdf_addr);
1586                 xvdi_fatal_error(dip, EIO, "connect transaction init");
1587                 goto fail_trans;
1588         }
1589 
1590         /*
1591          * XBP_PROTOCOL is written by the domain builder in the case of PV
1592          * domains. However, it is not written for HVM domains, so let's
1593          * write it here.
1594          */
1595         if (((rv = xenbus_printf(xbt, xsname,
1596             XBP_MEDIA_REQ, "%s", XBV_MEDIA_REQ_NONE)) != 0) ||
1597             ((rv = xenbus_printf(xbt, xsname,
1598             XBP_RING_REF, "%u", gref)) != 0) ||
1599             ((rv = xenbus_printf(xbt, xsname,
1600             XBP_EVENT_CHAN, "%u", vdp->xdf_evtchn)) != 0) ||
1601             ((rv = xenbus_printf(xbt, xsname,
1602             XBP_PROTOCOL, "%s", XEN_IO_PROTO_ABI_NATIVE)) != 0) ||
1603             ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0)) {
1604                 (void) xenbus_transaction_end(xbt, 1);
1605                 xvdi_fatal_error(dip, rv, "connect transaction setup");
1606                 goto fail_trans;
1607         }
1608 
1609         /* kick-off connect process */
1610         if (rv = xenbus_transaction_end(xbt, 0)) {
1611                 if (rv == EAGAIN)
1612                         goto trans_retry;
1613                 xvdi_fatal_error(dip, rv, "connect transaction commit");
1614                 goto fail_trans;
1615         }
1616 
1617         ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1618         mutex_enter(&vdp->xdf_dev_lk);
1619         xdf_set_state(vdp, XD_INIT);
1620         mutex_exit(&vdp->xdf_dev_lk);
1621 
1622         return (DDI_SUCCESS);
1623 
1624 fail_trans:
1625         xvdi_free_ring(vdp->xdf_xb_ring);
1626 errout2:
1627 #ifdef XPV_HVM_DRIVER
1628         ec_unbind_evtchn(vdp->xdf_evtchn);
1629 #else /* !XPV_HVM_DRIVER */
1630         (void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1631 #endif /* !XPV_HVM_DRIVER */
1632 errout1:
1633         xvdi_free_evtchn(dip);
1634         vdp->xdf_evtchn = INVALID_EVTCHN;
1635 errout:
1636         xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1637         cmn_err(CE_WARN, "xdf@%s: failed to start connection to backend",
1638             vdp->xdf_addr);
1639         return (DDI_FAILURE);
1640 }
1641 
1642 int
1643 xdf_get_flush_block(xdf_t *vdp)
1644 {
1645         /*
1646          * Get a DEV_BSIZE aligned bufer
1647          */
1648         vdp->xdf_flush_mem = kmem_alloc(vdp->xdf_xdev_secsize * 2, KM_SLEEP);
1649         vdp->xdf_cache_flush_block =
1650             (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem),
1651             (int)vdp->xdf_xdev_secsize);
1652 
1653         if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1654             xdf_flush_block, vdp->xdf_xdev_secsize, NULL) != 0)
1655                 return (DDI_FAILURE);
1656         return (DDI_SUCCESS);
1657 }
1658 
/*
 * Task queue callback (dispatched from xdf_setstate_connected()) that
 * finishes the XD_CONNECTED -> XD_READY transition.
 *
 * arg is the xdf_t for the device.  The routine:
 *   - re-attaches cmlb if xdf_cmlb_reattach was set,
 *   - gives up if the device is no longer in the XD_CONNECTED state,
 *   - probes whether the backend supports the cache flush op and, if it
 *     does not, caches a block for barrier-writes,
 *   - moves the device to XD_READY and restarts queued I/O.
 *
 * On a cmlb attach failure or a failure to read the flush block the
 * device is disconnected (state XD_UNKNOWN).
 */
static void
xdf_setstate_ready(void *arg)
{
        xdf_t           *vdp = (xdf_t *)arg;
        dev_info_t      *dip = vdp->xdf_dip;

        /*
         * Remember which thread runs this task; xdf_lb_rdwr() compares
         * against it to decide whether it must drain the ring itself.
         */
        vdp->xdf_ready_tq_thread = curthread;

        /* Create minor nodes now when we are almost connected */
        mutex_enter(&vdp->xdf_dev_lk);
        if (vdp->xdf_cmlb_reattach) {
                vdp->xdf_cmlb_reattach = B_FALSE;
                /* xdf_cmlb_attach() is called without xdf_dev_lk held */
                mutex_exit(&vdp->xdf_dev_lk);
                if (xdf_cmlb_attach(vdp) != 0) {
                        cmn_err(CE_WARN,
                            "xdf@%s: cmlb attach failed",
                            ddi_get_name_addr(dip));
                        xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
                        return;
                }
                mutex_enter(&vdp->xdf_dev_lk);
        }

        /* If we're not still trying to get to the ready state, then bail. */
        if (vdp->xdf_state != XD_CONNECTED) {
                mutex_exit(&vdp->xdf_dev_lk);
                return;
        }
        mutex_exit(&vdp->xdf_dev_lk);

        /*
         * If backend has feature-barrier, see if it supports disk
         * cache flush op.
         */
        vdp->xdf_flush_supported = B_FALSE;
        if (vdp->xdf_feature_barrier) {
                /*
                 * Pretend we already know flush is supported so probe
                 * will attempt the correct op.
                 */
                vdp->xdf_flush_supported = B_TRUE;
                /* The probe is a zero-length write with no buffer. */
                if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
                        vdp->xdf_flush_supported = B_TRUE;
                } else {
                        vdp->xdf_flush_supported = B_FALSE;
                        /*
                         * If the other end does not support the cache flush op
                         * then we must use a barrier-write to force disk
                         * cache flushing.  Barrier writes require that a data
                         * block actually be written.
                         * Cache a block to barrier-write when we are
                         * asked to perform a flush.
                         * XXX - would it be better to just copy 1 block
                         * (512 bytes) from whatever write we did last
                         * and rewrite that block?
                         */
                        if (xdf_get_flush_block(vdp) != DDI_SUCCESS) {
                                xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
                                return;
                        }
                }
        }

        /*
         * Re-check the state under both locks: it may have changed while
         * the probe I/O above ran without them.
         */
        mutex_enter(&vdp->xdf_cb_lk);
        mutex_enter(&vdp->xdf_dev_lk);
        if (vdp->xdf_state == XD_CONNECTED)
                xdf_set_state(vdp, XD_READY);
        mutex_exit(&vdp->xdf_dev_lk);

        /* Restart any currently queued up io */
        xdf_io_start(vdp);

        mutex_exit(&vdp->xdf_cb_lk);
}
1733 
1734 /*
1735  * synthetic geometry
1736  */
1737 #define XDF_NSECTS      256
1738 #define XDF_NHEADS      16
1739 
1740 static void
1741 xdf_synthetic_pgeom(dev_info_t *dip, cmlb_geom_t *geomp)
1742 {
1743         xdf_t *vdp;
1744         uint_t ncyl;
1745 
1746         vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
1747 
1748         ncyl = vdp->xdf_xdev_nblocks / (XDF_NHEADS * XDF_NSECTS);
1749 
1750         bzero(geomp, sizeof (*geomp));
1751         geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1752         geomp->g_acyl = 0;
1753         geomp->g_nhead = XDF_NHEADS;
1754         geomp->g_nsect = XDF_NSECTS;
1755         geomp->g_secsize = vdp->xdf_xdev_secsize;
1756         geomp->g_capacity = vdp->xdf_xdev_nblocks;
1757         geomp->g_intrlv = 0;
1758         geomp->g_rpm = 7200;
1759 }
1760 
1761 /*
1762  * Finish other initialization after we've connected to backend
1763  * Status should be XD_INIT before calling this routine
1764  * On success, status should be changed to XD_CONNECTED.
1765  * On error, status should stay XD_INIT
1766  */
1767 static int
1768 xdf_setstate_connected(xdf_t *vdp)
1769 {
1770         dev_info_t      *dip = vdp->xdf_dip;
1771         cmlb_geom_t     pgeom;
1772         diskaddr_t      nblocks = 0;
1773         uint_t          secsize = 0;
1774         char            *oename, *xsname, *str;
1775         uint_t          dinfo;
1776 
1777         ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1778         ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1779         ASSERT(vdp->xdf_state == XD_INIT);
1780 
1781         if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
1782             ((oename = xvdi_get_oename(dip)) == NULL))
1783                 return (DDI_FAILURE);
1784 
1785         /* Make sure the other end is XenbusStateConnected */
1786         if (xenbus_read_driver_state(oename) != XenbusStateConnected)
1787                 return (DDI_FAILURE);
1788 
1789         /* Determine if feature barrier is supported by backend */
1790         if (!(vdp->xdf_feature_barrier = xenbus_exists(oename, XBP_FB)))
1791                 cmn_err(CE_NOTE, "!xdf@%s: feature-barrier not supported",
1792                     vdp->xdf_addr);
1793 
1794         /*
1795          * Probe backend.  Read the device size into xdf_xdev_nblocks
1796          * and set the VDISK_READONLY, VDISK_CDROM, and VDISK_REMOVABLE
1797          * flags in xdf_dinfo.  If the emulated device type is "cdrom",
1798          * we always set VDISK_CDROM, regardless of if it's present in
1799          * the xenbus info parameter.
1800          */
1801         if (xenbus_gather(XBT_NULL, oename,
1802             XBP_SECTORS, "%"SCNu64, &nblocks,
1803             XBP_SECTOR_SIZE, "%u", &secsize,
1804             XBP_INFO, "%u", &dinfo,
1805             NULL) != 0) {
1806                 cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
1807                     "cannot read backend info", vdp->xdf_addr);
1808                 return (DDI_FAILURE);
1809         }
1810         if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
1811                 cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
1812                     vdp->xdf_addr);
1813                 return (DDI_FAILURE);
1814         }
1815         if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
1816                 dinfo |= VDISK_CDROM;
1817         strfree(str);
1818 
1819         if (secsize == 0 || !(ISP2(secsize / DEV_BSIZE)))
1820                 secsize = DEV_BSIZE;
1821         vdp->xdf_xdev_nblocks = nblocks;
1822         vdp->xdf_xdev_secsize = secsize;
1823 #ifdef _ILP32
1824         if (vdp->xdf_xdev_nblocks > DK_MAX_BLOCKS) {
1825                 cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
1826                     "backend disk device too large with %llu blocks for"
1827                     " 32-bit kernel", vdp->xdf_addr, vdp->xdf_xdev_nblocks);
1828                 xvdi_fatal_error(dip, EFBIG, "reading backend info");
1829                 return (DDI_FAILURE);
1830         }
1831 #endif
1832 
1833         /*
1834          * If the physical geometry for a fixed disk has been explicity
1835          * set then make sure that the specified physical geometry isn't
1836          * larger than the device we connected to.
1837          */
1838         if (vdp->xdf_pgeom_fixed &&
1839             (vdp->xdf_pgeom.g_capacity > vdp->xdf_xdev_nblocks)) {
1840                 cmn_err(CE_WARN,
1841                     "xdf@%s: connect failed, fixed geometry too large",
1842                     vdp->xdf_addr);
1843                 return (DDI_FAILURE);
1844         }
1845 
1846         vdp->xdf_media_req_supported = xenbus_exists(oename, XBP_MEDIA_REQ_SUP);
1847 
1848         /* mark vbd is ready for I/O */
1849         mutex_enter(&vdp->xdf_dev_lk);
1850         xdf_set_state(vdp, XD_CONNECTED);
1851 
1852         /* check if the cmlb label should be updated */
1853         xdf_synthetic_pgeom(dip, &pgeom);
1854         if ((vdp->xdf_dinfo != dinfo) ||
1855             (!vdp->xdf_pgeom_fixed &&
1856             (memcmp(&vdp->xdf_pgeom, &pgeom, sizeof (pgeom)) != 0))) {
1857                 vdp->xdf_cmlb_reattach = B_TRUE;
1858 
1859                 vdp->xdf_dinfo = dinfo;
1860                 if (!vdp->xdf_pgeom_fixed)
1861                         vdp->xdf_pgeom = pgeom;
1862         }
1863 
1864         if (XD_IS_CD(vdp) || XD_IS_RM(vdp)) {
1865                 if (vdp->xdf_xdev_nblocks == 0) {
1866                         vdp->xdf_mstate = DKIO_EJECTED;
1867                         cv_broadcast(&vdp->xdf_mstate_cv);
1868                 } else {
1869                         vdp->xdf_mstate = DKIO_INSERTED;
1870                         cv_broadcast(&vdp->xdf_mstate_cv);
1871                 }
1872         } else {
1873                 if (vdp->xdf_mstate != DKIO_NONE) {
1874                         vdp->xdf_mstate = DKIO_NONE;
1875                         cv_broadcast(&vdp->xdf_mstate_cv);
1876                 }
1877         }
1878 
1879         mutex_exit(&vdp->xdf_dev_lk);
1880 
1881         cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", vdp->xdf_addr,
1882             (uint64_t)vdp->xdf_xdev_nblocks);
1883 
1884         /* Restart any currently queued up io */
1885         xdf_io_start(vdp);
1886 
1887         /*
1888          * To get to the ready state we have to do IO to the backend device,
1889          * but we can't initiate IO from the other end change callback thread
1890          * (which is the current context we're executing in.)  This is because
1891          * if the other end disconnects while we're doing IO from the callback
1892          * thread, then we can't receive that disconnect event and we hang
1893          * waiting for an IO that can never complete.
1894          */
1895         (void) ddi_taskq_dispatch(vdp->xdf_ready_tq, xdf_setstate_ready, vdp,
1896             DDI_SLEEP);
1897 
1898         (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1899         return (DDI_SUCCESS);
1900 }
1901 
/*
 * Callback invoked when the other end (backend) changes its xenbus state.
 * impl_data points at the new XenbusState.
 *
 * The whole callback runs under xdf_cb_lk.  Backend state changes are
 * ignored while the device is suspending or suspended.  Otherwise the
 * frontend state machine is driven according to the new backend state,
 * and anybody waiting on xdf_dev_cv is woken up afterwards.
 */
/*ARGSUSED*/
static void
xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
{
        XenbusState new_state = *(XenbusState *)impl_data;
        xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);

        DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
            vdp->xdf_addr, new_state));

        mutex_enter(&vdp->xdf_cb_lk);

        /* We assume that this callback is single threaded */
        ASSERT(vdp->xdf_oe_change_thread == NULL);
        /* Recorded so xdf_lb_rdwr() can assert no IO comes from here. */
        DEBUG_EVAL(vdp->xdf_oe_change_thread = curthread);

        /* ignore any backend state changes if we're suspending/suspended */
        if (vdp->xdf_suspending || (vdp->xdf_state == XD_SUSPEND)) {
                DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
                mutex_exit(&vdp->xdf_cb_lk);
                return;
        }

        switch (new_state) {
        case XenbusStateUnknown:
        case XenbusStateInitialising:
        case XenbusStateInitWait:
        case XenbusStateInitialised:
                /* Already initialising on our side: nothing to do. */
                if (vdp->xdf_state == XD_INIT)
                        break;

                /* Otherwise tear down and start the connection over. */
                xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
                if (xdf_setstate_init(vdp) != DDI_SUCCESS)
                        break;
                ASSERT(vdp->xdf_state == XD_INIT);
                break;

        case XenbusStateConnected:
                /* Already connected (or beyond): nothing to do. */
                if ((vdp->xdf_state == XD_CONNECTED) ||
                    (vdp->xdf_state == XD_READY))
                        break;

                /* Get (back) to XD_INIT before finishing the connect. */
                if (vdp->xdf_state != XD_INIT) {
                        xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
                        if (xdf_setstate_init(vdp) != DDI_SUCCESS)
                                break;
                        ASSERT(vdp->xdf_state == XD_INIT);
                }

                if (xdf_setstate_connected(vdp) != DDI_SUCCESS) {
                        xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
                        break;
                }
                ASSERT(vdp->xdf_state == XD_CONNECTED);
                break;

        case XenbusStateClosing:
                /* Refuse the hot-unplug while the device is still open. */
                if (xdf_isopen(vdp, -1)) {
                        cmn_err(CE_NOTE,
                            "xdf@%s: hot-unplug failed, still in use",
                            vdp->xdf_addr);
                        break;
                }
                /*FALLTHROUGH*/
        case XenbusStateClosed:
                xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
                break;
        }

        /* notify anybody waiting for oe state change */
        cv_broadcast(&vdp->xdf_dev_cv);
        DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
        mutex_exit(&vdp->xdf_cb_lk);
}
1976 
/*
 * Try to bring the device to the XD_READY state.
 *
 * Must be called with both xdf_cb_lk and xdf_dev_lk held; both are held
 * again on return, but both are dropped and re-acquired along the way.
 *
 * If wait is B_FALSE a single connection attempt is made; otherwise the
 * caller blocks until the device is ready or the wait returns 0 (the
 * *_sig wait variants are used, so presumably a signal).
 *
 * Only one caller at a time acts as the "connection thread" that drives
 * the state machine; other callers just wait on xdf_dev_cv.
 *
 * Returns the device state at the time of return (XD_CLOSED right away
 * if the device was already closed).
 */
static int
xdf_connect_locked(xdf_t *vdp, boolean_t wait)
{
        int     rv, timeouts = 0, reset = 20;

        ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
        ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));

        /* we can't connect once we're in the closed state */
        if (vdp->xdf_state == XD_CLOSED)
                return (XD_CLOSED);

        vdp->xdf_connect_req++;
        while (vdp->xdf_state != XD_READY) {
                /* the state machine routines expect xdf_dev_lk not held */
                mutex_exit(&vdp->xdf_dev_lk);

                /* only one thread at a time can be the connection thread */
                if (vdp->xdf_connect_thread == NULL)
                        vdp->xdf_connect_thread = curthread;

                if (vdp->xdf_connect_thread == curthread) {
                        if ((timeouts > 0) && ((timeouts % reset) == 0)) {
                                /*
                                 * If we haven't established a connection
                                 * within the reset time, then disconnect
                                 * so we can try again, and double the reset
                                 * time.  The reset time starts at 2 sec.
                                 */
                                (void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
                                reset *= 2;
                        }
                        if (vdp->xdf_state == XD_UNKNOWN)
                                (void) xdf_setstate_init(vdp);
                        if (vdp->xdf_state == XD_INIT)
                                (void) xdf_setstate_connected(vdp);
                }

                mutex_enter(&vdp->xdf_dev_lk);
                if (!wait || (vdp->xdf_state == XD_READY))
                        goto out;

                /*
                 * Wait with only xdf_dev_lk held.  xdf_cb_lk is dropped
                 * for the duration of the wait and both locks are then
                 * re-acquired in cb -> dev order.
                 */
                mutex_exit((&vdp->xdf_cb_lk));
                if (vdp->xdf_connect_thread != curthread) {
                        rv = cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk);
                } else {
                        /* delay for 0.1 sec */
                        rv = cv_reltimedwait_sig(&vdp->xdf_dev_cv,
                            &vdp->xdf_dev_lk, drv_usectohz(100*1000),
                            TR_CLOCK_TICK);
                        /* -1 means the 0.1 sec delay expired */
                        if (rv == -1)
                                timeouts++;
                }
                mutex_exit((&vdp->xdf_dev_lk));
                mutex_enter((&vdp->xdf_cb_lk));
                mutex_enter((&vdp->xdf_dev_lk));
                /* a wait result of 0 ends the attempt */
                if (rv == 0)
                        goto out;
        }

out:
        ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
        ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));

        if (vdp->xdf_connect_thread == curthread) {
                /*
                 * wake up someone else so they can become the connection
                 * thread.
                 */
                cv_signal(&vdp->xdf_dev_cv);
                vdp->xdf_connect_thread = NULL;
        }

        /* Try to lock the media */
        mutex_exit((&vdp->xdf_dev_lk));
        (void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
        mutex_enter((&vdp->xdf_dev_lk));

        vdp->xdf_connect_req--;
        return (vdp->xdf_state);
}
2057 
2058 static uint_t
2059 xdf_iorestart(caddr_t arg)
2060 {
2061         xdf_t *vdp = (xdf_t *)arg;
2062 
2063         ASSERT(vdp != NULL);
2064 
2065         mutex_enter(&vdp->xdf_dev_lk);
2066         ASSERT(ISDMACBON(vdp));
2067         SETDMACBOFF(vdp);
2068         mutex_exit(&vdp->xdf_dev_lk);
2069 
2070         xdf_io_start(vdp);
2071 
2072         return (DDI_INTR_CLAIMED);
2073 }
2074 
2075 #ifdef XPV_HVM_DRIVER
2076 
/*
 * One registered xdf device, kept on xdf_hvm_list so that a device can be
 * looked up either by its device path or by its dev_info node.
 */
typedef struct xdf_hvm_entry {
        list_node_t     xdf_he_list;    /* linkage on xdf_hvm_list */
        char            *xdf_he_path;   /* private copy of the device path */
        dev_info_t      *xdf_he_dip;    /* dev_info node of the device */
} xdf_hvm_entry_t;

/* List of xdf_hvm_entry_t; accessed with xdf_hvm_list_lock held. */
static list_t xdf_hvm_list;
static kmutex_t xdf_hvm_list_lock;
2085 
2086 static xdf_hvm_entry_t *
2087 i_xdf_hvm_find(const char *path, dev_info_t *dip)
2088 {
2089         xdf_hvm_entry_t *i;
2090 
2091         ASSERT((path != NULL) || (dip != NULL));
2092         ASSERT(MUTEX_HELD(&xdf_hvm_list_lock));
2093 
2094         i = list_head(&xdf_hvm_list);
2095         while (i != NULL) {
2096                 if ((path != NULL) && strcmp(i->xdf_he_path, path) != 0) {
2097                         i = list_next(&xdf_hvm_list, i);
2098                         continue;
2099                 }
2100                 if ((dip != NULL) && (i->xdf_he_dip != dip)) {
2101                         i = list_next(&xdf_hvm_list, i);
2102                         continue;
2103                 }
2104                 break;
2105         }
2106         return (i);
2107 }
2108 
2109 dev_info_t *
2110 xdf_hvm_hold(const char *path)
2111 {
2112         xdf_hvm_entry_t *i;
2113         dev_info_t      *dip;
2114 
2115         mutex_enter(&xdf_hvm_list_lock);
2116         i = i_xdf_hvm_find(path, NULL);
2117         if (i == NULL) {
2118                 mutex_exit(&xdf_hvm_list_lock);
2119                 return (B_FALSE);
2120         }
2121         ndi_hold_devi(dip = i->xdf_he_dip);
2122         mutex_exit(&xdf_hvm_list_lock);
2123         return (dip);
2124 }
2125 
2126 static void
2127 xdf_hvm_add(dev_info_t *dip)
2128 {
2129         xdf_hvm_entry_t *i;
2130         char            *path;
2131 
2132         /* figure out the path for the dip */
2133         path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
2134         (void) ddi_pathname(dip, path);
2135 
2136         i = kmem_alloc(sizeof (*i), KM_SLEEP);
2137         i->xdf_he_dip = dip;
2138         i->xdf_he_path = i_ddi_strdup(path, KM_SLEEP);
2139 
2140         mutex_enter(&xdf_hvm_list_lock);
2141         ASSERT(i_xdf_hvm_find(path, NULL) == NULL);
2142         ASSERT(i_xdf_hvm_find(NULL, dip) == NULL);
2143         list_insert_head(&xdf_hvm_list, i);
2144         mutex_exit(&xdf_hvm_list_lock);
2145 
2146         kmem_free(path, MAXPATHLEN);
2147 }
2148 
2149 static void
2150 xdf_hvm_rm(dev_info_t *dip)
2151 {
2152         xdf_hvm_entry_t *i;
2153 
2154         mutex_enter(&xdf_hvm_list_lock);
2155         VERIFY((i = i_xdf_hvm_find(NULL, dip)) != NULL);
2156         list_remove(&xdf_hvm_list, i);
2157         mutex_exit(&xdf_hvm_list_lock);
2158 
2159         kmem_free(i->xdf_he_path, strlen(i->xdf_he_path) + 1);
2160         kmem_free(i, sizeof (*i));
2161 }
2162 
2163 static void
2164 xdf_hvm_init(void)
2165 {
2166         list_create(&xdf_hvm_list, sizeof (xdf_hvm_entry_t),
2167             offsetof(xdf_hvm_entry_t, xdf_he_list));
2168         mutex_init(&xdf_hvm_list_lock, NULL, MUTEX_DEFAULT, NULL);
2169 }
2170 
2171 static void
2172 xdf_hvm_fini(void)
2173 {
2174         ASSERT(list_head(&xdf_hvm_list) == NULL);
2175         list_destroy(&xdf_hvm_list);
2176         mutex_destroy(&xdf_hvm_list_lock);
2177 }
2178 
2179 boolean_t
2180 xdf_hvm_connect(dev_info_t *dip)
2181 {
2182         xdf_t   *vdp = (xdf_t *)ddi_get_driver_private(dip);
2183         char    *oename, *str;
2184         int     rv;
2185 
2186         mutex_enter(&vdp->xdf_cb_lk);
2187 
2188         /*
2189          * Before try to establish a connection we need to wait for the
2190          * backend hotplug scripts to have run.  Once they are run the
2191          * "<oename>/hotplug-status" property will be set to "connected".
2192          */
2193         for (;;) {
2194                 ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2195 
2196                 /*
2197                  * Get the xenbus path to the backend device.  Note that
2198                  * we can't cache this path (and we look it up on each pass
2199                  * through this loop) because it could change during
2200                  * suspend, resume, and migration operations.
2201                  */
2202                 if ((oename = xvdi_get_oename(dip)) == NULL) {
2203                         mutex_exit(&vdp->xdf_cb_lk);
2204                         return (B_FALSE);
2205                 }
2206 
2207                 str = NULL;
2208                 if ((xenbus_read_str(oename, XBP_HP_STATUS, &str) == 0) &&
2209                     (strcmp(str, XBV_HP_STATUS_CONN) == 0))
2210                         break;
2211 
2212                 if (str != NULL)
2213                         strfree(str);
2214 
2215                 /* wait for an update to "<oename>/hotplug-status" */
2216                 if (cv_wait_sig(&vdp->xdf_hp_status_cv, &vdp->xdf_cb_lk) == 0) {
2217                         /* we got interrupted by a signal */
2218                         mutex_exit(&vdp->xdf_cb_lk);
2219                         return (B_FALSE);
2220                 }
2221         }
2222 
2223         /* Good news.  The backend hotplug scripts have been run. */
2224         ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2225         ASSERT(strcmp(str, XBV_HP_STATUS_CONN) == 0);
2226         strfree(str);
2227 
2228         /*
2229          * If we're emulating a cd device and if the backend doesn't support
2230          * media request opreations, then we're not going to bother trying
2231          * to establish a connection for a couple reasons.  First off, media
2232          * requests support is required to support operations like eject and
2233          * media locking.  Second, other backend platforms like Linux don't
2234          * support hvm pv cdrom access.  They don't even have a backend pv
2235          * driver for cdrom device nodes, so we don't want to block forever
2236          * waiting for a connection to a backend driver that doesn't exist.
2237          */
2238         if (XD_IS_CD(vdp) && !xenbus_exists(oename, XBP_MEDIA_REQ_SUP)) {
2239                 mutex_exit(&vdp->xdf_cb_lk);
2240                 return (B_FALSE);
2241         }
2242 
2243         mutex_enter(&vdp->xdf_dev_lk);
2244         rv = xdf_connect_locked(vdp, B_TRUE);
2245         mutex_exit(&vdp->xdf_dev_lk);
2246         mutex_exit(&vdp->xdf_cb_lk);
2247 
2248         return ((rv == XD_READY) ? B_TRUE : B_FALSE);
2249 }
2250 
2251 int
2252 xdf_hvm_setpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2253 {
2254         xdf_t   *vdp = (xdf_t *)ddi_get_driver_private(dip);
2255 
2256         /* sanity check the requested physical geometry */
2257         mutex_enter(&vdp->xdf_dev_lk);
2258         if ((geomp->g_secsize != XB_BSIZE) ||
2259             (geomp->g_capacity == 0)) {
2260                 mutex_exit(&vdp->xdf_dev_lk);
2261                 return (EINVAL);
2262         }
2263 
2264         /*
2265          * If we've already connected to the backend device then make sure
2266          * we're not defining a physical geometry larger than our backend
2267          * device.
2268          */
2269         if ((vdp->xdf_xdev_nblocks != 0) &&
2270             (geomp->g_capacity > vdp->xdf_xdev_nblocks)) {
2271                 mutex_exit(&vdp->xdf_dev_lk);
2272                 return (EINVAL);
2273         }
2274 
2275         bzero(&vdp->xdf_pgeom, sizeof (vdp->xdf_pgeom));
2276         vdp->xdf_pgeom.g_ncyl = geomp->g_ncyl;
2277         vdp->xdf_pgeom.g_acyl = geomp->g_acyl;
2278         vdp->xdf_pgeom.g_nhead = geomp->g_nhead;
2279         vdp->xdf_pgeom.g_nsect = geomp->g_nsect;
2280         vdp->xdf_pgeom.g_secsize = geomp->g_secsize;
2281         vdp->xdf_pgeom.g_capacity = geomp->g_capacity;
2282         vdp->xdf_pgeom.g_intrlv = geomp->g_intrlv;
2283         vdp->xdf_pgeom.g_rpm = geomp->g_rpm;
2284 
2285         vdp->xdf_pgeom_fixed = B_TRUE;
2286         mutex_exit(&vdp->xdf_dev_lk);
2287 
2288         /* force a re-validation */
2289         cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
2290 
2291         return (0);
2292 }
2293 
2294 boolean_t
2295 xdf_is_cd(dev_info_t *dip)
2296 {
2297         xdf_t           *vdp = (xdf_t *)ddi_get_driver_private(dip);
2298         boolean_t       rv;
2299 
2300         mutex_enter(&vdp->xdf_cb_lk);
2301         rv = XD_IS_CD(vdp);
2302         mutex_exit(&vdp->xdf_cb_lk);
2303         return (rv);
2304 }
2305 
2306 boolean_t
2307 xdf_is_rm(dev_info_t *dip)
2308 {
2309         xdf_t           *vdp = (xdf_t *)ddi_get_driver_private(dip);
2310         boolean_t       rv;
2311 
2312         mutex_enter(&vdp->xdf_cb_lk);
2313         rv = XD_IS_RM(vdp);
2314         mutex_exit(&vdp->xdf_cb_lk);
2315         return (rv);
2316 }
2317 
2318 boolean_t
2319 xdf_media_req_supported(dev_info_t *dip)
2320 {
2321         xdf_t           *vdp = (xdf_t *)ddi_get_driver_private(dip);
2322         boolean_t       rv;
2323 
2324         mutex_enter(&vdp->xdf_cb_lk);
2325         rv = vdp->xdf_media_req_supported;
2326         mutex_exit(&vdp->xdf_cb_lk);
2327         return (rv);
2328 }
2329 
2330 #endif /* XPV_HVM_DRIVER */
2331 
2332 static int
2333 xdf_lb_getcap(dev_info_t *dip, diskaddr_t *capp)
2334 {
2335         xdf_t *vdp;
2336         vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
2337 
2338         if (vdp == NULL)
2339                 return (ENXIO);
2340 
2341         mutex_enter(&vdp->xdf_dev_lk);
2342         *capp = vdp->xdf_pgeom.g_capacity;
2343         DPRINTF(LBL_DBG, ("xdf@%s:capacity %llu\n", vdp->xdf_addr, *capp));
2344         mutex_exit(&vdp->xdf_dev_lk);
2345         return (0);
2346 }
2347 
2348 static int
2349 xdf_lb_getpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2350 {
2351         xdf_t *vdp;
2352 
2353         if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
2354                 return (ENXIO);
2355         *geomp = vdp->xdf_pgeom;
2356         return (0);
2357 }
2358 
2359 /*
2360  * No real HBA, no geometry available from it
2361  */
2362 /*ARGSUSED*/
2363 static int
2364 xdf_lb_getvgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2365 {
2366         return (EINVAL);
2367 }
2368 
2369 static int
2370 xdf_lb_getattribute(dev_info_t *dip, tg_attribute_t *tgattributep)
2371 {
2372         xdf_t *vdp;
2373 
2374         if (!(vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))))
2375                 return (ENXIO);
2376 
2377         if (XD_IS_RO(vdp))
2378                 tgattributep->media_is_writable = 0;
2379         else
2380                 tgattributep->media_is_writable = 1;
2381         tgattributep->media_is_rotational = 0;
2382         return (0);
2383 }
2384 
2385 /* ARGSUSED3 */
2386 int
2387 xdf_lb_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
2388 {
2389         int instance;
2390         xdf_t   *vdp;
2391 
2392         instance = ddi_get_instance(dip);
2393 
2394         if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
2395                 return (ENXIO);
2396 
2397         switch (cmd) {
2398         case TG_GETPHYGEOM:
2399                 return (xdf_lb_getpgeom(dip, (cmlb_geom_t *)arg));
2400         case TG_GETVIRTGEOM:
2401                 return (xdf_lb_getvgeom(dip, (cmlb_geom_t *)arg));
2402         case TG_GETCAPACITY:
2403                 return (xdf_lb_getcap(dip, (diskaddr_t *)arg));
2404         case TG_GETBLOCKSIZE:
2405                 mutex_enter(&vdp->xdf_cb_lk);
2406                 *(uint32_t *)arg = vdp->xdf_xdev_secsize;
2407                 mutex_exit(&vdp->xdf_cb_lk);
2408                 return (0);
2409         case TG_GETATTR:
2410                 return (xdf_lb_getattribute(dip, (tg_attribute_t *)arg));
2411         default:
2412                 return (ENOTTY);
2413         }
2414 }
2415 
2416 /* ARGSUSED5 */
2417 int
2418 xdf_lb_rdwr(dev_info_t *dip, uchar_t cmd, void *bufp,
2419     diskaddr_t start, size_t reqlen, void *tg_cookie)
2420 {
2421         xdf_t *vdp;
2422         struct buf *bp;
2423         int err = 0;
2424 
2425         vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
2426 
2427         /* We don't allow IO from the oe_change callback thread */
2428         ASSERT(curthread != vdp->xdf_oe_change_thread);
2429 
2430         /*
2431          * Having secsize of 0 means that device isn't connected yet.
2432          * FIXME This happens for CD devices, and there's nothing we
2433          * can do about it at the moment.
2434          */
2435         if (vdp->xdf_xdev_secsize == 0)
2436                 return (EIO);
2437 
2438         if ((start + ((reqlen / (vdp->xdf_xdev_secsize / DEV_BSIZE))
2439             >> DEV_BSHIFT)) > vdp->xdf_pgeom.g_capacity)
2440                 return (EINVAL);
2441 
2442         bp = getrbuf(KM_SLEEP);
2443         if (cmd == TG_READ)
2444                 bp->b_flags = B_BUSY | B_READ;
2445         else
2446                 bp->b_flags = B_BUSY | B_WRITE;
2447 
2448         bp->b_un.b_addr = bufp;
2449         bp->b_bcount = reqlen;
2450         bp->b_blkno = start * (vdp->xdf_xdev_secsize / DEV_BSIZE);
2451         bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
2452 
2453         mutex_enter(&vdp->xdf_dev_lk);
2454         xdf_bp_push(vdp, bp);
2455         mutex_exit(&vdp->xdf_dev_lk);
2456         xdf_io_start(vdp);
2457         if (curthread == vdp->xdf_ready_tq_thread)
2458                 (void) xdf_ring_drain(vdp);
2459         err = biowait(bp);
2460         ASSERT(bp->b_flags & B_DONE);
2461         freerbuf(bp);
2462         return (err);
2463 }
2464 
2465 /*
2466  * Lock the current media.  Set the media state to "lock".
2467  * (Media locks are only respected by the backend driver.)
2468  */
2469 static int
2470 xdf_ioctl_mlock(xdf_t *vdp)
2471 {
2472         int rv;
2473         mutex_enter(&vdp->xdf_cb_lk);
2474         rv = xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2475         mutex_exit(&vdp->xdf_cb_lk);
2476         return (rv);
2477 }
2478 
2479 /*
2480  * Release a media lock.  Set the media state to "none".
2481  */
2482 static int
2483 xdf_ioctl_munlock(xdf_t *vdp)
2484 {
2485         int rv;
2486         mutex_enter(&vdp->xdf_cb_lk);
2487         rv = xdf_media_req(vdp, XBV_MEDIA_REQ_NONE, B_TRUE);
2488         mutex_exit(&vdp->xdf_cb_lk);
2489         return (rv);
2490 }
2491 
2492 /*
2493  * Eject the current media.  Ignores any media locks.  (Media locks
2494  * are only for benifit of the the backend.)
2495  */
2496 static int
2497 xdf_ioctl_eject(xdf_t *vdp)
2498 {
2499         int rv;
2500 
2501         mutex_enter(&vdp->xdf_cb_lk);
2502         if ((rv = xdf_media_req(vdp, XBV_MEDIA_REQ_EJECT, B_FALSE)) != 0) {
2503                 mutex_exit(&vdp->xdf_cb_lk);
2504                 return (rv);
2505         }
2506 
2507         /*
2508          * We've set the media requests xenbus parameter to eject, so now
2509          * disconnect from the backend, wait for the backend to clear
2510          * the media requets xenbus paramter, and then we can reconnect
2511          * to the backend.
2512          */
2513         (void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
2514         mutex_enter(&vdp->xdf_dev_lk);
2515         if (xdf_connect_locked(vdp, B_TRUE) != XD_READY) {
2516                 mutex_exit(&vdp->xdf_dev_lk);
2517                 mutex_exit(&vdp->xdf_cb_lk);
2518                 return (EIO);
2519         }
2520         mutex_exit(&vdp->xdf_dev_lk);
2521         mutex_exit(&vdp->xdf_cb_lk);
2522         return (0);
2523 }
2524 
2525 /*
2526  * Watch for media state changes.  This can be an insertion of a device
2527  * (triggered by a 'xm block-configure' request in another domain) or
2528  * the ejection of a device (triggered by a local "eject" operation).
2529  * For a full description of the DKIOCSTATE ioctl behavior see dkio(7I).
2530  */
2531 static int
2532 xdf_dkstate(xdf_t *vdp, enum dkio_state mstate)
2533 {
2534         enum dkio_state         prev_state;
2535 
2536         mutex_enter(&vdp->xdf_cb_lk);
2537         prev_state = vdp->xdf_mstate;
2538 
2539         if (vdp->xdf_mstate == mstate) {
2540                 while (vdp->xdf_mstate == prev_state) {
2541                         if (cv_wait_sig(&vdp->xdf_mstate_cv,
2542                             &vdp->xdf_cb_lk) == 0) {
2543                                 mutex_exit(&vdp->xdf_cb_lk);
2544                                 return (EINTR);
2545                         }
2546                 }
2547         }
2548 
2549         if ((prev_state != DKIO_INSERTED) &&
2550             (vdp->xdf_mstate == DKIO_INSERTED)) {
2551                 (void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2552                 mutex_exit(&vdp->xdf_cb_lk);
2553                 return (0);
2554         }
2555 
2556         mutex_exit(&vdp->xdf_cb_lk);
2557         return (0);
2558 }
2559 
2560 /*ARGSUSED*/
2561 static int
2562 xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2563     int *rvalp)
2564 {
2565         minor_t         minor = getminor(dev);
2566         int             part = XDF_PART(minor);
2567         xdf_t           *vdp;
2568         int             rv;
2569 
2570         if (((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL) ||
2571             (!xdf_isopen(vdp, part)))
2572                 return (ENXIO);
2573 
2574         DPRINTF(IOCTL_DBG, ("xdf@%s:ioctl: cmd %d (0x%x)\n",
2575             vdp->xdf_addr, cmd, cmd));
2576 
2577         switch (cmd) {
2578         default:
2579                 return (ENOTTY);
2580         case DKIOCG_PHYGEOM:
2581         case DKIOCG_VIRTGEOM:
2582         case DKIOCGGEOM:
2583         case DKIOCSGEOM:
2584         case DKIOCGAPART:
2585         case DKIOCSAPART:
2586         case DKIOCGVTOC:
2587         case DKIOCSVTOC:
2588         case DKIOCPARTINFO:
2589         case DKIOCGEXTVTOC:
2590         case DKIOCSEXTVTOC:
2591         case DKIOCEXTPARTINFO:
2592         case DKIOCGMBOOT:
2593         case DKIOCSMBOOT:
2594         case DKIOCGETEFI:
2595         case DKIOCSETEFI:
2596         case DKIOCSETEXTPART:
2597         case DKIOCPARTITION:
2598                 rv = cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
2599                     rvalp, NULL);
2600                 if (rv != 0)
2601                         return (rv);
2602                 /*
2603                  * If we're labelling the disk, we have to update the geometry
2604                  * in the cmlb data structures, and we also have to write a new
2605                  * devid to the disk.  Note that writing an EFI label currently
2606                  * requires 4 ioctls, and devid setup will fail on all but the
2607                  * last.
2608                  */
2609                 if (cmd == DKIOCSEXTVTOC || cmd == DKIOCSVTOC ||
2610                     cmd == DKIOCSETEFI) {
2611                         rv = cmlb_validate(vdp->xdf_vd_lbl, 0, 0);
2612                         if (rv == 0) {
2613                                 xdf_devid_setup(vdp);
2614                         } else {
2615                                 cmn_err(CE_WARN,
2616                                     "xdf@%s, labeling failed on validate",
2617                                     vdp->xdf_addr);
2618                         }
2619                 }
2620                 return (rv);
2621         case FDEJECT:
2622         case DKIOCEJECT:
2623         case CDROMEJECT:
2624                 return (xdf_ioctl_eject(vdp));
2625         case DKIOCLOCK:
2626                 return (xdf_ioctl_mlock(vdp));
2627         case DKIOCUNLOCK:
2628                 return (xdf_ioctl_munlock(vdp));
2629         case CDROMREADOFFSET: {
2630                 int offset = 0;
2631                 if (!XD_IS_CD(vdp))
2632                         return (ENOTTY);
2633                 if (ddi_copyout(&offset, (void *)arg, sizeof (int), mode))
2634                         return (EFAULT);
2635                 return (0);
2636         }
2637         case DKIOCGMEDIAINFO: {
2638                 struct dk_minfo media_info;
2639 
2640                 media_info.dki_lbsize = vdp->xdf_xdev_secsize;
2641                 media_info.dki_capacity = vdp->xdf_pgeom.g_capacity;
2642                 if (XD_IS_CD(vdp))
2643                         media_info.dki_media_type = DK_CDROM;
2644                 else
2645                         media_info.dki_media_type = DK_FIXED_DISK;
2646 
2647                 if (ddi_copyout(&media_info, (void *)arg,
2648                     sizeof (struct dk_minfo), mode))
2649                         return (EFAULT);
2650                 return (0);
2651         }
2652         case DKIOCINFO: {
2653                 struct dk_cinfo info;
2654 
2655                 /* controller information */
2656                 if (XD_IS_CD(vdp))
2657                         info.dki_ctype = DKC_CDROM;
2658                 else
2659                         info.dki_ctype = DKC_VBD;
2660 
2661                 info.dki_cnum = 0;
2662                 (void) strncpy((char *)(&info.dki_cname), "xdf", 8);
2663 
2664                 /* unit information */
2665                 info.dki_unit = ddi_get_instance(vdp->xdf_dip);
2666                 (void) strncpy((char *)(&info.dki_dname), "xdf", 8);
2667                 info.dki_flags = DKI_FMTVOL;
2668                 info.dki_partition = part;
2669                 info.dki_maxtransfer = maxphys / DEV_BSIZE;
2670                 info.dki_addr = 0;
2671                 info.dki_space = 0;
2672                 info.dki_prio = 0;
2673                 info.dki_vec = 0;
2674 
2675                 if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
2676                         return (EFAULT);
2677                 return (0);
2678         }
2679         case DKIOCSTATE: {
2680                 enum dkio_state mstate;
2681 
2682                 if (ddi_copyin((void *)arg, &mstate,
2683                     sizeof (mstate), mode) != 0)
2684                         return (EFAULT);
2685                 if ((rv = xdf_dkstate(vdp, mstate)) != 0)
2686                         return (rv);
2687                 mstate = vdp->xdf_mstate;
2688                 if (ddi_copyout(&mstate, (void *)arg,
2689                     sizeof (mstate), mode) != 0)
2690                         return (EFAULT);
2691                 return (0);
2692         }
2693         case DKIOCREMOVABLE: {
2694                 int i = BOOLEAN2VOID(XD_IS_RM(vdp));
2695                 if (ddi_copyout(&i, (caddr_t)arg, sizeof (i), mode))
2696                         return (EFAULT);
2697                 return (0);
2698         }
2699         case DKIOCGETWCE: {
2700                 int i = BOOLEAN2VOID(XD_IS_RM(vdp));
2701                 if (ddi_copyout(&i, (void *)arg, sizeof (i), mode))
2702                         return (EFAULT);
2703                 return (0);
2704         }
2705         case DKIOCSETWCE: {
2706                 int i;
2707                 if (ddi_copyin((void *)arg, &i, sizeof (i), mode))
2708                         return (EFAULT);
2709                 vdp->xdf_wce = VOID2BOOLEAN(i);
2710                 return (0);
2711         }
2712         case DKIOCFLUSHWRITECACHE: {
2713                 struct dk_callback *dkc = (struct dk_callback *)arg;
2714 
2715                 if (vdp->xdf_flush_supported) {
2716                         rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
2717                             NULL, 0, 0, (void *)dev);
2718                 } else if (vdp->xdf_feature_barrier &&
2719                     !xdf_barrier_flush_disable) {
2720                         rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
2721                             vdp->xdf_cache_flush_block, xdf_flush_block,
2722                             vdp->xdf_xdev_secsize, (void *)dev);
2723                 } else {
2724                         return (ENOTTY);
2725                 }
2726                 if ((mode & FKIOCTL) && (dkc != NULL) &&
2727                     (dkc->dkc_callback != NULL)) {
2728                         (*dkc->dkc_callback)(dkc->dkc_cookie, rv);
2729                         /* need to return 0 after calling callback */
2730                         rv = 0;
2731                 }
2732                 return (rv);
2733         }
2734         }
2735         /*NOTREACHED*/
2736 }
2737 
2738 static int
2739 xdf_strategy(struct buf *bp)
2740 {
2741         xdf_t   *vdp;
2742         minor_t minor;
2743         diskaddr_t p_blkct, p_blkst;
2744         daddr_t blkno;
2745         ulong_t nblks;
2746         int part;
2747 
2748         minor = getminor(bp->b_edev);
2749         part = XDF_PART(minor);
2750         vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor));
2751 
2752         mutex_enter(&vdp->xdf_dev_lk);
2753         if (!xdf_isopen(vdp, part)) {
2754                 mutex_exit(&vdp->xdf_dev_lk);
2755                 xdf_io_err(bp, ENXIO, 0);
2756                 return (0);
2757         }
2758 
2759         /* We don't allow IO from the oe_change callback thread */
2760         ASSERT(curthread != vdp->xdf_oe_change_thread);
2761 
2762         /* Check for writes to a read only device */
2763         if (!IS_READ(bp) && XD_IS_RO(vdp)) {
2764                 mutex_exit(&vdp->xdf_dev_lk);
2765                 xdf_io_err(bp, EROFS, 0);
2766                 return (0);
2767         }
2768 
2769         /* Check if this I/O is accessing a partition or the entire disk */
2770         if ((long)bp->b_private == XB_SLICE_NONE) {
2771                 /* This I/O is using an absolute offset */
2772                 p_blkct = vdp->xdf_xdev_nblocks;
2773                 p_blkst = 0;
2774         } else {
2775                 /* This I/O is using a partition relative offset */
2776                 mutex_exit(&vdp->xdf_dev_lk);
2777                 if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
2778                     &p_blkst, NULL, NULL, NULL)) {
2779                         xdf_io_err(bp, ENXIO, 0);
2780                         return (0);
2781                 }
2782                 mutex_enter(&vdp->xdf_dev_lk);
2783         }
2784 
2785         /*
2786          * Adjust the real blkno and bcount according to the underline
2787          * physical sector size.
2788          */
2789         blkno = bp->b_blkno / (vdp->xdf_xdev_secsize / XB_BSIZE);
2790 
2791         /* check for a starting block beyond the disk or partition limit */
2792         if (blkno > p_blkct) {
2793                 DPRINTF(IO_DBG, ("xdf@%s: block %lld exceeds VBD size %"PRIu64,
2794                     vdp->xdf_addr, (longlong_t)blkno, (uint64_t)p_blkct));
2795                 mutex_exit(&vdp->xdf_dev_lk);
2796                 xdf_io_err(bp, EINVAL, 0);
2797                 return (0);
2798         }
2799 
2800         /* Legacy: don't set error flag at this case */
2801         if (blkno == p_blkct) {
2802                 mutex_exit(&vdp->xdf_dev_lk);
2803                 bp->b_resid = bp->b_bcount;
2804                 biodone(bp);
2805                 return (0);
2806         }
2807 
2808         /* sanitize the input buf */
2809         bioerror(bp, 0);
2810         bp->b_resid = 0;
2811         bp->av_back = bp->av_forw = NULL;
2812 
2813         /* Adjust for partial transfer, this will result in an error later */
2814         if (vdp->xdf_xdev_secsize != 0 &&
2815             vdp->xdf_xdev_secsize != XB_BSIZE) {
2816                 nblks = bp->b_bcount / vdp->xdf_xdev_secsize;
2817         } else {
2818                 nblks = bp->b_bcount >> XB_BSHIFT;
2819         }
2820 
2821         if ((blkno + nblks) > p_blkct) {
2822                 if (vdp->xdf_xdev_secsize != 0 &&
2823                     vdp->xdf_xdev_secsize != XB_BSIZE) {
2824                         bp->b_resid =
2825                             ((blkno + nblks) - p_blkct) *
2826                             vdp->xdf_xdev_secsize;
2827                 } else {
2828                         bp->b_resid =
2829                             ((blkno + nblks) - p_blkct) <<
2830                             XB_BSHIFT;
2831                 }
2832                 bp->b_bcount -= bp->b_resid;
2833         }
2834 
2835         DPRINTF(IO_DBG, ("xdf@%s: strategy blk %lld len %lu\n",
2836             vdp->xdf_addr, (longlong_t)blkno, (ulong_t)bp->b_bcount));
2837 
2838         /* Fix up the buf struct */
2839         bp->b_flags |= B_BUSY;
2840         bp->b_private = (void *)(uintptr_t)p_blkst;
2841 
2842         xdf_bp_push(vdp, bp);
2843         mutex_exit(&vdp->xdf_dev_lk);
2844         xdf_io_start(vdp);
2845         if (do_polled_io)
2846                 (void) xdf_ring_drain(vdp);
2847         return (0);
2848 }
2849 
2850 /*ARGSUSED*/
2851 static int
2852 xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
2853 {
2854         xdf_t   *vdp;
2855         minor_t minor;
2856         diskaddr_t p_blkcnt;
2857         int part;
2858 
2859         minor = getminor(dev);
2860         if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2861                 return (ENXIO);
2862 
2863         DPRINTF(IO_DBG, ("xdf@%s: read offset 0x%"PRIx64"\n",
2864             vdp->xdf_addr, (int64_t)uiop->uio_offset));
2865 
2866         part = XDF_PART(minor);
2867         if (!xdf_isopen(vdp, part))
2868                 return (ENXIO);
2869 
2870         if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2871             NULL, NULL, NULL, NULL))
2872                 return (ENXIO);
2873 
2874         if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2875                 return (ENOSPC);
2876 
2877         if (U_INVAL(uiop))
2878                 return (EINVAL);
2879 
2880         return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
2881 }
2882 
2883 /*ARGSUSED*/
2884 static int
2885 xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
2886 {
2887         xdf_t *vdp;
2888         minor_t minor;
2889         diskaddr_t p_blkcnt;
2890         int part;
2891 
2892         minor = getminor(dev);
2893         if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2894                 return (ENXIO);
2895 
2896         DPRINTF(IO_DBG, ("xdf@%s: write offset 0x%"PRIx64"\n",
2897             vdp->xdf_addr, (int64_t)uiop->uio_offset));
2898 
2899         part = XDF_PART(minor);
2900         if (!xdf_isopen(vdp, part))
2901                 return (ENXIO);
2902 
2903         if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2904             NULL, NULL, NULL, NULL))
2905                 return (ENXIO);
2906 
2907         if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2908                 return (ENOSPC);
2909 
2910         if (U_INVAL(uiop))
2911                 return (EINVAL);
2912 
2913         return (physio(xdf_strategy, NULL, dev, B_WRITE, xdfmin, uiop));
2914 }
2915 
2916 /*ARGSUSED*/
2917 static int
2918 xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
2919 {
2920         xdf_t   *vdp;
2921         minor_t minor;
2922         struct uio *uiop = aiop->aio_uio;
2923         diskaddr_t p_blkcnt;
2924         int part;
2925 
2926         minor = getminor(dev);
2927         if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2928                 return (ENXIO);
2929 
2930         part = XDF_PART(minor);
2931         if (!xdf_isopen(vdp, part))
2932                 return (ENXIO);
2933 
2934         if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2935             NULL, NULL, NULL, NULL))
2936                 return (ENXIO);
2937 
2938         if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2939                 return (ENOSPC);
2940 
2941         if (U_INVAL(uiop))
2942                 return (EINVAL);
2943 
2944         return (aphysio(xdf_strategy, anocancel, dev, B_READ, xdfmin, aiop));
2945 }
2946 
2947 /*ARGSUSED*/
2948 static int
2949 xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
2950 {
2951         xdf_t *vdp;
2952         minor_t minor;
2953         struct uio *uiop = aiop->aio_uio;
2954         diskaddr_t p_blkcnt;
2955         int part;
2956 
2957         minor = getminor(dev);
2958         if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2959                 return (ENXIO);
2960 
2961         part = XDF_PART(minor);
2962         if (!xdf_isopen(vdp, part))
2963                 return (ENXIO);
2964 
2965         if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2966             NULL, NULL, NULL, NULL))
2967                 return (ENXIO);
2968 
2969         if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2970                 return (ENOSPC);
2971 
2972         if (U_INVAL(uiop))
2973                 return (EINVAL);
2974 
2975         return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, xdfmin, aiop));
2976 }
2977 
2978 static int
2979 xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
2980 {
2981         struct buf dumpbuf, *dbp = &dumpbuf;
2982         xdf_t   *vdp;
2983         minor_t minor;
2984         int err = 0;
2985         int part;
2986         diskaddr_t p_blkcnt, p_blkst;
2987 
2988         minor = getminor(dev);
2989         if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2990                 return (ENXIO);
2991 
2992         DPRINTF(IO_DBG, ("xdf@%s: dump addr (0x%p) blk (%ld) nblks (%d)\n",
2993             vdp->xdf_addr, (void *)addr, blkno, nblk));
2994 
2995         /* We don't allow IO from the oe_change callback thread */
2996         ASSERT(curthread != vdp->xdf_oe_change_thread);
2997 
2998         part = XDF_PART(minor);
2999         if (!xdf_isopen(vdp, part))
3000                 return (ENXIO);
3001 
3002         if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
3003             NULL, NULL, NULL))
3004                 return (ENXIO);
3005 
3006         if ((blkno + nblk) >
3007             (p_blkcnt * (vdp->xdf_xdev_secsize / XB_BSIZE))) {
3008                 cmn_err(CE_WARN, "xdf@%s: block %ld exceeds VBD size %"PRIu64,
3009                     vdp->xdf_addr, (daddr_t)((blkno + nblk) /
3010                     (vdp->xdf_xdev_secsize / XB_BSIZE)), (uint64_t)p_blkcnt);
3011                 return (EINVAL);
3012         }
3013 
3014         bioinit(dbp);
3015         dbp->b_flags = B_BUSY;
3016         dbp->b_un.b_addr = addr;
3017         dbp->b_bcount = nblk << DEV_BSHIFT;
3018         dbp->b_blkno = blkno;
3019         dbp->b_edev = dev;
3020         dbp->b_private = (void *)(uintptr_t)p_blkst;
3021 
3022         mutex_enter(&vdp->xdf_dev_lk);
3023         xdf_bp_push(vdp, dbp);
3024         mutex_exit(&vdp->xdf_dev_lk);
3025         xdf_io_start(vdp);
3026         err = xdf_ring_drain(vdp);
3027         biofini(dbp);
3028         return (err);
3029 }
3030 
3031 /*ARGSUSED*/
3032 static int
3033 xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
3034 {
3035         minor_t minor;
3036         xdf_t   *vdp;
3037         int part;
3038         ulong_t parbit;
3039 
3040         minor = getminor(dev);
3041         if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
3042                 return (ENXIO);
3043 
3044         mutex_enter(&vdp->xdf_dev_lk);
3045         part = XDF_PART(minor);
3046         if (!xdf_isopen(vdp, part)) {
3047                 mutex_exit(&vdp->xdf_dev_lk);
3048                 return (ENXIO);
3049         }
3050         parbit = 1 << part;
3051 
3052         ASSERT((vdp->xdf_vd_open[otyp] & parbit) != 0);
3053         if (otyp == OTYP_LYR) {
3054                 ASSERT(vdp->xdf_vd_lyropen[part] > 0);
3055                 if (--vdp->xdf_vd_lyropen[part] == 0)
3056                         vdp->xdf_vd_open[otyp] &= ~parbit;
3057         } else {
3058                 vdp->xdf_vd_open[otyp] &= ~parbit;
3059         }
3060         vdp->xdf_vd_exclopen &= ~parbit;
3061 
3062         mutex_exit(&vdp->xdf_dev_lk);
3063         return (0);
3064 }
3065 
3066 static int
3067 xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
3068 {
3069         minor_t minor;
3070         xdf_t   *vdp;
3071         int part;
3072         ulong_t parbit;
3073         diskaddr_t p_blkct = 0;
3074         boolean_t firstopen;
3075         boolean_t nodelay;
3076 
3077         minor = getminor(*devp);
3078         if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
3079                 return (ENXIO);
3080 
3081         nodelay = (flag & (FNDELAY | FNONBLOCK));
3082 
3083         DPRINTF(DDI_DBG, ("xdf@%s: opening\n", vdp->xdf_addr));
3084 
3085         /* do cv_wait until connected or failed */
3086         mutex_enter(&vdp->xdf_cb_lk);
3087         mutex_enter(&vdp->xdf_dev_lk);
3088         if (!nodelay && (xdf_connect_locked(vdp, B_TRUE) != XD_READY)) {
3089                 mutex_exit(&vdp->xdf_dev_lk);
3090                 mutex_exit(&vdp->xdf_cb_lk);
3091                 return (ENXIO);
3092         }
3093         mutex_exit(&vdp->xdf_cb_lk);
3094 
3095         if ((flag & FWRITE) && XD_IS_RO(vdp)) {
3096                 mutex_exit(&vdp->xdf_dev_lk);
3097                 return (EROFS);
3098         }
3099 
3100         part = XDF_PART(minor);
3101         parbit = 1 << part;
3102         if ((vdp->xdf_vd_exclopen & parbit) ||
3103             ((flag & FEXCL) && xdf_isopen(vdp, part))) {
3104                 mutex_exit(&vdp->xdf_dev_lk);
3105                 return (EBUSY);
3106         }
3107 
3108         /* are we the first one to open this node? */
3109         firstopen = !xdf_isopen(vdp, -1);
3110 
3111         if (otyp == OTYP_LYR)
3112                 vdp->xdf_vd_lyropen[part]++;
3113 
3114         vdp->xdf_vd_open[otyp] |= parbit;
3115 
3116         if (flag & FEXCL)
3117                 vdp->xdf_vd_exclopen |= parbit;
3118 
3119         mutex_exit(&vdp->xdf_dev_lk);
3120 
3121         /* force a re-validation */
3122         if (firstopen)
3123                 cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
3124 
3125         /* If this is a non-blocking open then we're done */
3126         if (nodelay)
3127                 return (0);
3128 
3129         /*
3130          * This is a blocking open, so we require:
3131          * - that the disk have a valid label on it
3132          * - that the size of the partition that we're opening is non-zero
3133          */
3134         if ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
3135             NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0)) {
3136                 (void) xdf_close(*devp, flag, otyp, credp);
3137                 return (ENXIO);
3138         }
3139 
3140         return (0);
3141 }
3142 
3143 /*ARGSUSED*/
3144 static void
3145 xdf_watch_hp_status_cb(dev_info_t *dip, const char *path, void *arg)
3146 {
3147         xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
3148         cv_broadcast(&vdp->xdf_hp_status_cv);
3149 }
3150 
3151 static int
3152 xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
3153     char *name, caddr_t valuep, int *lengthp)
3154 {
3155         xdf_t   *vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
3156 
3157         /*
3158          * Sanity check that if a dev_t or dip were specified that they
3159          * correspond to this device driver.  On debug kernels we'll
3160          * panic and on non-debug kernels we'll return failure.
3161          */
3162         ASSERT(ddi_driver_major(dip) == xdf_major);
3163         ASSERT((dev == DDI_DEV_T_ANY) || (getmajor(dev) == xdf_major));
3164         if ((ddi_driver_major(dip) != xdf_major) ||
3165             ((dev != DDI_DEV_T_ANY) && (getmajor(dev) != xdf_major)))
3166                 return (DDI_PROP_NOT_FOUND);
3167 
3168         if (vdp == NULL)
3169                 return (ddi_prop_op(dev, dip, prop_op, flags,
3170                     name, valuep, lengthp));
3171 
3172         return (cmlb_prop_op(vdp->xdf_vd_lbl,
3173             dev, dip, prop_op, flags, name, valuep, lengthp,
3174             XDF_PART(getminor(dev)), NULL));
3175 }
3176 
3177 /*ARGSUSED*/
3178 static int
3179 xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
3180 {
3181         int     instance = XDF_INST(getminor((dev_t)arg));
3182         xdf_t   *vbdp;
3183 
3184         switch (cmd) {
3185         case DDI_INFO_DEVT2DEVINFO:
3186                 if ((vbdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL) {
3187                         *rp = NULL;
3188                         return (DDI_FAILURE);
3189                 }
3190                 *rp = vbdp->xdf_dip;
3191                 return (DDI_SUCCESS);
3192 
3193         case DDI_INFO_DEVT2INSTANCE:
3194                 *rp = (void *)(uintptr_t)instance;
3195                 return (DDI_SUCCESS);
3196 
3197         default:
3198                 return (DDI_FAILURE);
3199         }
3200 }
3201 
3202 /*ARGSUSED*/
3203 static int
3204 xdf_resume(dev_info_t *dip)
3205 {
3206         xdf_t   *vdp;
3207         char    *oename;
3208 
3209         if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
3210                 goto err;
3211 
3212         if (xdf_debug & SUSRES_DBG)
3213                 xen_printf("xdf@%s: xdf_resume\n", vdp->xdf_addr);
3214 
3215         mutex_enter(&vdp->xdf_cb_lk);
3216 
3217         if (xvdi_resume(dip) != DDI_SUCCESS) {
3218                 mutex_exit(&vdp->xdf_cb_lk);
3219                 goto err;
3220         }
3221 
3222         if (((oename = xvdi_get_oename(dip)) == NULL) ||
3223             (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
3224             xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)) {
3225                 mutex_exit(&vdp->xdf_cb_lk);
3226                 goto err;
3227         }
3228 
3229         mutex_enter(&vdp->xdf_dev_lk);
3230         ASSERT(vdp->xdf_state != XD_READY);
3231         xdf_set_state(vdp, XD_UNKNOWN);
3232         mutex_exit(&vdp->xdf_dev_lk);
3233 
3234         if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
3235                 mutex_exit(&vdp->xdf_cb_lk);
3236                 goto err;
3237         }
3238 
3239         mutex_exit(&vdp->xdf_cb_lk);
3240 
3241         if (xdf_debug & SUSRES_DBG)
3242                 xen_printf("xdf@%s: xdf_resume: done\n", vdp->xdf_addr);
3243         return (DDI_SUCCESS);
3244 err:
3245         if (xdf_debug & SUSRES_DBG)
3246                 xen_printf("xdf@%s: xdf_resume: fail\n", vdp->xdf_addr);
3247         return (DDI_FAILURE);
3248 }
3249 
3250 /*
3251  * Uses the in-memory devid if one exists.
3252  *
3253  * Create a devid and write it on the first block of the last track of
3254  * the last cylinder.
3255  * Return DDI_SUCCESS or DDI_FAILURE.
3256  */
3257 static int
3258 xdf_devid_fabricate(xdf_t *vdp)
3259 {
3260         ddi_devid_t     devid = vdp->xdf_tgt_devid; /* null if no devid */
3261         struct dk_devid *dkdevidp = NULL; /* devid struct stored on disk */
3262         diskaddr_t      blk;
3263         uint_t          *ip, chksum;
3264         int             i, devid_size;
3265 
3266         if (cmlb_get_devid_block(vdp->xdf_vd_lbl, &blk, NULL) != 0)
3267                 goto err;
3268 
3269         if (devid == NULL && ddi_devid_init(vdp->xdf_dip, DEVID_FAB, 0,
3270             NULL, &devid) != DDI_SUCCESS)
3271                 goto err;
3272 
3273         /* allocate a buffer */
3274         dkdevidp = (struct dk_devid *)kmem_zalloc(NBPSCTR, KM_SLEEP);
3275 
3276         /* Fill in the revision */
3277         dkdevidp->dkd_rev_hi = DK_DEVID_REV_MSB;
3278         dkdevidp->dkd_rev_lo = DK_DEVID_REV_LSB;
3279 
3280         /* Copy in the device id */
3281         devid_size = ddi_devid_sizeof(devid);
3282         if (devid_size > DK_DEVID_SIZE)
3283                 goto err;
3284         bcopy(devid, dkdevidp->dkd_devid, devid_size);
3285 
3286         /* Calculate the chksum */
3287         chksum = 0;
3288         ip = (uint_t *)dkdevidp;
3289         for (i = 0; i < (NBPSCTR / sizeof (int)) - 1; i++)
3290                 chksum ^= ip[i];
3291 
3292         /* Fill in the checksum */
3293         DKD_FORMCHKSUM(chksum, dkdevidp);
3294 
3295         if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, dkdevidp, blk,
3296             NBPSCTR, NULL) != 0)
3297                 goto err;
3298 
3299         kmem_free(dkdevidp, NBPSCTR);
3300 
3301         vdp->xdf_tgt_devid = devid;
3302         return (DDI_SUCCESS);
3303 
3304 err:
3305         if (dkdevidp != NULL)
3306                 kmem_free(dkdevidp, NBPSCTR);
3307         if (devid != NULL && vdp->xdf_tgt_devid == NULL)
3308                 ddi_devid_free(devid);
3309         return (DDI_FAILURE);
3310 }
3311 
3312 /*
3313  * xdf_devid_read() is a local copy of xdfs_devid_read(), modified to use xdf
3314  * functions.
3315  *
3316  * Read a devid from on the first block of the last track of
3317  * the last cylinder.  Make sure what we read is a valid devid.
3318  * Return DDI_SUCCESS or DDI_FAILURE.
3319  */
3320 static int
3321 xdf_devid_read(xdf_t *vdp)
3322 {
3323         diskaddr_t      blk;
3324         struct dk_devid *dkdevidp;
3325         uint_t          *ip, chksum;
3326         int             i;
3327 
3328         if (cmlb_get_devid_block(vdp->xdf_vd_lbl, &blk, NULL) != 0)
3329                 return (DDI_FAILURE);
3330 
3331         dkdevidp = kmem_zalloc(NBPSCTR, KM_SLEEP);
3332         if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, dkdevidp, blk,
3333             NBPSCTR, NULL) != 0)
3334                 goto err;
3335 
3336         /* Validate the revision */
3337         if ((dkdevidp->dkd_rev_hi != DK_DEVID_REV_MSB) ||
3338             (dkdevidp->dkd_rev_lo != DK_DEVID_REV_LSB))
3339                 goto err;
3340 
3341         /* Calculate the checksum */
3342         chksum = 0;
3343         ip = (uint_t *)dkdevidp;
3344         for (i = 0; i < (NBPSCTR / sizeof (int)) - 1; i++)
3345                 chksum ^= ip[i];
3346         if (DKD_GETCHKSUM(dkdevidp) != chksum)
3347                 goto err;
3348 
3349         /* Validate the device id */
3350         if (ddi_devid_valid((ddi_devid_t)dkdevidp->dkd_devid) != DDI_SUCCESS)
3351                 goto err;
3352 
3353         /* keep a copy of the device id */
3354         i = ddi_devid_sizeof((ddi_devid_t)dkdevidp->dkd_devid);
3355         vdp->xdf_tgt_devid = kmem_alloc(i, KM_SLEEP);
3356         bcopy(dkdevidp->dkd_devid, vdp->xdf_tgt_devid, i);
3357         kmem_free(dkdevidp, NBPSCTR);
3358         return (DDI_SUCCESS);
3359 
3360 err:
3361         kmem_free(dkdevidp, NBPSCTR);
3362         return (DDI_FAILURE);
3363 }
3364 
3365 /*
3366  * xdf_devid_setup() is a modified copy of cmdk_devid_setup().
3367  *
3368  * This function creates a devid if we don't already have one, and
3369  * registers it.  If we already have one, we make sure that it can be
3370  * read from the disk, otherwise we write it to the disk ourselves.  If
3371  * we didn't already have a devid, and we create one, we also need to
3372  * register it.
3373  */
3374 void
3375 xdf_devid_setup(xdf_t *vdp)
3376 {
3377         int rc;
3378         boolean_t existed = vdp->xdf_tgt_devid != NULL;
3379 
3380         /* Read devid from the disk, if present */
3381         rc = xdf_devid_read(vdp);
3382 
3383         /* Otherwise write a devid (which we create if necessary) on the disk */
3384         if (rc != DDI_SUCCESS)
3385                 rc = xdf_devid_fabricate(vdp);
3386 
3387         /* If we created a devid or found it on the disk, register it */
3388         if (rc == DDI_SUCCESS && !existed)
3389                 (void) ddi_devid_register(vdp->xdf_dip, vdp->xdf_tgt_devid);
3390 }
3391 
3392 static int
3393 xdf_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3394 {
3395         int                     n, instance = ddi_get_instance(dip);
3396         ddi_iblock_cookie_t     ibc, softibc;
3397         boolean_t               dev_iscd = B_FALSE;
3398         xdf_t                   *vdp;
3399         char                    *oename, *xsname, *str;
3400         clock_t                 timeout;
3401         int                     err = 0;
3402 
3403         if ((n = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_NOTPROM,
3404             "xdf_debug", 0)) != 0)
3405                 xdf_debug = n;
3406 
3407         switch (cmd) {
3408         case DDI_RESUME:
3409                 return (xdf_resume(dip));
3410         case DDI_ATTACH:
3411                 break;
3412         default:
3413                 return (DDI_FAILURE);
3414         }
3415         /* DDI_ATTACH */
3416 
3417         if ((xsname = xvdi_get_xsname(dip)) == NULL ||
3418             (oename = xvdi_get_oename(dip)) == NULL)
3419                 return (DDI_FAILURE);
3420 
3421         /*
3422          * Disable auto-detach.  This is necessary so that we don't get
3423          * detached while we're disconnected from the back end.
3424          */
3425         if ((ddi_prop_update_int(DDI_DEV_T_NONE, dip,
3426             DDI_NO_AUTODETACH, 1) != DDI_PROP_SUCCESS))
3427                 return (DDI_FAILURE);
3428 
3429         /* driver handles kernel-issued IOCTLs */
3430         if (ddi_prop_create(DDI_DEV_T_NONE, dip,
3431             DDI_PROP_CANSLEEP, DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS)
3432                 return (DDI_FAILURE);
3433 
3434         if (ddi_get_iblock_cookie(dip, 0, &ibc) != DDI_SUCCESS)
3435                 return (DDI_FAILURE);
3436 
3437         if (ddi_get_soft_iblock_cookie(dip,
3438             DDI_SOFTINT_LOW, &softibc) != DDI_SUCCESS)
3439                 return (DDI_FAILURE);
3440 
3441         if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
3442                 cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
3443                     ddi_get_name_addr(dip));
3444                 return (DDI_FAILURE);
3445         }
3446         if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
3447                 dev_iscd = B_TRUE;
3448         strfree(str);
3449 
3450         if (ddi_soft_state_zalloc(xdf_ssp, instance) != DDI_SUCCESS)
3451                 return (DDI_FAILURE);
3452 
3453         DPRINTF(DDI_DBG, ("xdf@%s: attaching\n", ddi_get_name_addr(dip)));
3454         vdp = ddi_get_soft_state(xdf_ssp, instance);
3455         ddi_set_driver_private(dip, vdp);
3456         vdp->xdf_dip = dip;
3457         vdp->xdf_addr = ddi_get_name_addr(dip);
3458         vdp->xdf_suspending = B_FALSE;
3459         vdp->xdf_media_req_supported = B_FALSE;
3460         vdp->xdf_peer = INVALID_DOMID;
3461         vdp->xdf_evtchn = INVALID_EVTCHN;
3462         list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
3463             offsetof(v_req_t, v_link));
3464         cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
3465         cv_init(&vdp->xdf_hp_status_cv, NULL, CV_DEFAULT, NULL);
3466         cv_init(&vdp->xdf_mstate_cv, NULL, CV_DEFAULT, NULL);
3467         mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3468         mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3469         mutex_init(&vdp->xdf_iostat_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3470         vdp->xdf_cmlb_reattach = B_TRUE;
3471         if (dev_iscd) {
3472                 vdp->xdf_dinfo |= VDISK_CDROM;
3473                 vdp->xdf_mstate = DKIO_EJECTED;
3474         } else {
3475                 vdp->xdf_mstate = DKIO_NONE;
3476         }
3477 
3478         if ((vdp->xdf_ready_tq = ddi_taskq_create(dip, "xdf_ready_tq",
3479             1, TASKQ_DEFAULTPRI, 0)) == NULL)
3480                 goto errout0;
3481 
3482         if (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
3483             xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)
3484                 goto errout0;
3485 
3486         if (ddi_add_softintr(dip, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
3487             &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
3488                 cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
3489                     ddi_get_name_addr(dip));
3490                 goto errout0;
3491         }
3492 
3493         /*
3494          * Initialize the physical geometry stucture.  Note that currently
3495          * we don't know the size of the backend device so the number
3496          * of blocks on the device will be initialized to zero.  Once
3497          * we connect to the backend device we'll update the physical
3498          * geometry to reflect the real size of the device.
3499          */
3500         xdf_synthetic_pgeom(dip, &vdp->xdf_pgeom);
3501         vdp->xdf_pgeom_fixed = B_FALSE;
3502 
3503         /*
3504          * Allocate the cmlb handle, minor nodes will be created once
3505          * the device is connected with backend.
3506          */
3507         cmlb_alloc_handle(&vdp->xdf_vd_lbl);
3508 
3509         /* We ship with cache-enabled disks */
3510         vdp->xdf_wce = B_TRUE;
3511 
3512         mutex_enter(&vdp->xdf_cb_lk);
3513         /* Watch backend XenbusState change */
3514         if (xvdi_add_event_handler(dip,
3515             XS_OE_STATE, xdf_oe_change, NULL) != DDI_SUCCESS) {
3516                 mutex_exit(&vdp->xdf_cb_lk);
3517                 goto errout0;
3518         }
3519 
3520         if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
3521                 cmn_err(CE_WARN, "xdf@%s: start connection failed",
3522                     ddi_get_name_addr(dip));
3523                 mutex_exit(&vdp->xdf_cb_lk);
3524                 goto errout1;
3525         }
3526 
3527         /* Nothing else to do for CD devices */
3528         if (dev_iscd) {
3529                 mutex_exit(&vdp->xdf_cb_lk);
3530                 goto done;
3531         }
3532 
3533         /*
3534          * In order to do cmlb_validate, we have to wait for the disk to
3535          * acknowledge the attach, so we can query the backend for the disk
3536          * geometry (see xdf_setstate_connected).
3537          *
3538          * We only wait 30 seconds; if this is the root disk, the boot
3539          * will fail, but it would fail anyway if the device never
3540          * connected.  If this is a non-boot disk, that disk will fail
3541          * to connect, but again, it would fail anyway.
3542          */
3543         timeout = ddi_get_lbolt() + drv_usectohz(XDF_STATE_TIMEOUT);
3544         while (vdp->xdf_state != XD_CONNECTED && vdp->xdf_state != XD_READY) {
3545                 if (cv_timedwait(&vdp->xdf_dev_cv, &vdp->xdf_cb_lk,
3546                     timeout) < 0) {
3547                         cmn_err(CE_WARN, "xdf@%s: disk failed to connect",
3548                             ddi_get_name_addr(dip));
3549                         mutex_exit(&vdp->xdf_cb_lk);
3550                         goto errout1;
3551                 }
3552         }
3553         mutex_exit(&vdp->xdf_cb_lk);
3554 
3555         /*
3556          * We call cmlb_validate so that the geometry information in
3557          * vdp->xdf_vd_lbl is correct; this fills out the number of
3558          * alternate cylinders so that we have a place to write the
3559          * devid.
3560          */
3561         if ((err = cmlb_validate(vdp->xdf_vd_lbl, 0, NULL)) != 0) {
3562                 cmn_err(CE_NOTE,
3563                     "xdf@%s: cmlb_validate failed: %d",
3564                     ddi_get_name_addr(dip), err);
3565                 /*
3566                  * We can carry on even if cmlb_validate() returns EINVAL here,
3567                  * as we'll rewrite the disk label anyway.
3568                  */
3569                 if (err != EINVAL)
3570                         goto errout1;
3571         }
3572 
3573         /*
3574          * xdf_devid_setup will only write a devid if one isn't
3575          * already present.  If it fails to find or create one, we
3576          * create one in-memory so that when we label the disk later,
3577          * it will have a devid to use.  This is helpful to deal with
3578          * cases where people use the devids of their disks before
3579          * labelling them; note that this does cause problems if
3580          * people rely on the devids of unlabelled disks to persist
3581          * across reboot.
3582          */
3583         xdf_devid_setup(vdp);
3584         if (vdp->xdf_tgt_devid == NULL) {
3585                 if (ddi_devid_init(vdp->xdf_dip, DEVID_FAB, 0, NULL,
3586                     &vdp->xdf_tgt_devid) != DDI_SUCCESS) {
3587                         cmn_err(CE_WARN,
3588                             "xdf@%s_ attach failed, devid_init failed",
3589                             ddi_get_name_addr(dip));
3590                         goto errout1;
3591                 } else {
3592                         (void) ddi_devid_register(vdp->xdf_dip,
3593                             vdp->xdf_tgt_devid);
3594                 }
3595         }
3596 
3597 done:
3598 #ifdef XPV_HVM_DRIVER
3599         xdf_hvm_add(dip);
3600 
3601         /* Report our version to dom0 */
3602         (void) xenbus_printf(XBT_NULL, "guest/xdf", "version", "%d",
3603             HVMPV_XDF_VERS);
3604 #endif /* XPV_HVM_DRIVER */
3605 
3606         /* Create kstat for iostat(1M) */
3607         if (xdf_kstat_create(dip) != 0) {
3608                 cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
3609                     ddi_get_name_addr(dip));
3610                 goto errout1;
3611         }
3612 
3613         /*
3614          * Don't bother with getting real device identification
3615          * strings (is it even possible?), they are unlikely to
3616          * change often (if at all).
3617          */
3618         (void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, INQUIRY_VENDOR_ID,
3619             "Xen");
3620         (void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, INQUIRY_PRODUCT_ID,
3621             dev_iscd ? "Virtual CD" : "Virtual disk");
3622         (void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, INQUIRY_REVISION_ID,
3623             "1.0");
3624 
3625         ddi_report_dev(dip);
3626         DPRINTF(DDI_DBG, ("xdf@%s: attached\n", vdp->xdf_addr));
3627         return (DDI_SUCCESS);
3628 
3629 errout1:
3630         (void) xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed);
3631         xvdi_remove_event_handler(dip, XS_OE_STATE);
3632 errout0:
3633         if (vdp->xdf_vd_lbl != NULL) {
3634                 cmlb_free_handle(&vdp->xdf_vd_lbl);
3635                 vdp->xdf_vd_lbl = NULL;
3636         }
3637         if (vdp->xdf_softintr_id != NULL)
3638                 ddi_remove_softintr(vdp->xdf_softintr_id);
3639         xvdi_remove_xb_watch_handlers(dip);
3640         if (vdp->xdf_ready_tq != NULL)
3641                 ddi_taskq_destroy(vdp->xdf_ready_tq);
3642         mutex_destroy(&vdp->xdf_cb_lk);
3643         mutex_destroy(&vdp->xdf_dev_lk);
3644         cv_destroy(&vdp->xdf_dev_cv);
3645         cv_destroy(&vdp->xdf_hp_status_cv);
3646         ddi_soft_state_free(xdf_ssp, instance);
3647         ddi_set_driver_private(dip, NULL);
3648         ddi_prop_remove_all(dip);
3649         cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(dip));
3650         return (DDI_FAILURE);
3651 }
3652 
3653 static int
3654 xdf_suspend(dev_info_t *dip)
3655 {
3656         int             instance = ddi_get_instance(dip);
3657         xdf_t           *vdp;
3658 
3659         if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
3660                 return (DDI_FAILURE);
3661 
3662         if (xdf_debug & SUSRES_DBG)
3663                 xen_printf("xdf@%s: xdf_suspend\n", vdp->xdf_addr);
3664 
3665         xvdi_suspend(dip);
3666 
3667         mutex_enter(&vdp->xdf_cb_lk);
3668         mutex_enter(&vdp->xdf_dev_lk);
3669 
3670         vdp->xdf_suspending = B_TRUE;
3671         xdf_ring_destroy(vdp);
3672         xdf_set_state(vdp, XD_SUSPEND);
3673         vdp->xdf_suspending = B_FALSE;
3674 
3675         mutex_exit(&vdp->xdf_dev_lk);
3676         mutex_exit(&vdp->xdf_cb_lk);
3677 
3678         if (xdf_debug & SUSRES_DBG)
3679                 xen_printf("xdf@%s: xdf_suspend: done\n", vdp->xdf_addr);
3680 
3681         return (DDI_SUCCESS);
3682 }
3683 
3684 static int
3685 xdf_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3686 {
3687         xdf_t *vdp;
3688         int instance;
3689 
3690         switch (cmd) {
3691 
3692         case DDI_PM_SUSPEND:
3693                 break;
3694 
3695         case DDI_SUSPEND:
3696                 return (xdf_suspend(dip));
3697 
3698         case DDI_DETACH:
3699                 break;
3700 
3701         default:
3702                 return (DDI_FAILURE);
3703         }
3704 
3705         instance = ddi_get_instance(dip);
3706         DPRINTF(DDI_DBG, ("xdf@%s: detaching\n", ddi_get_name_addr(dip)));
3707         vdp = ddi_get_soft_state(xdf_ssp, instance);
3708 
3709         if (vdp == NULL)
3710                 return (DDI_FAILURE);
3711 
3712         mutex_enter(&vdp->xdf_cb_lk);
3713         xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
3714         if (vdp->xdf_state != XD_CLOSED) {
3715                 mutex_exit(&vdp->xdf_cb_lk);
3716                 return (DDI_FAILURE);
3717         }
3718         mutex_exit(&vdp->xdf_cb_lk);
3719 
3720         ASSERT(!ISDMACBON(vdp));
3721 
3722 #ifdef XPV_HVM_DRIVER
3723         xdf_hvm_rm(dip);
3724 #endif /* XPV_HVM_DRIVER */
3725 
3726         if (vdp->xdf_timeout_id != 0)
3727                 (void) untimeout(vdp->xdf_timeout_id);
3728 
3729         xvdi_remove_event_handler(dip, XS_OE_STATE);
3730         ddi_taskq_destroy(vdp->xdf_ready_tq);
3731 
3732         cmlb_detach(vdp->xdf_vd_lbl, NULL);
3733         cmlb_free_handle(&vdp->xdf_vd_lbl);
3734 
3735         /* we'll support backend running in domU later */
3736 #ifdef  DOMU_BACKEND
3737         (void) xvdi_post_event(dip, XEN_HP_REMOVE);
3738 #endif
3739 
3740         list_destroy(&vdp->xdf_vreq_act);
3741         ddi_prop_remove_all(dip);
3742         xdf_kstat_delete(dip);
3743         ddi_remove_softintr(vdp->xdf_softintr_id);
3744         xvdi_remove_xb_watch_handlers(dip);
3745         ddi_set_driver_private(dip, NULL);
3746         cv_destroy(&vdp->xdf_dev_cv);
3747         mutex_destroy(&vdp->xdf_cb_lk);
3748         mutex_destroy(&vdp->xdf_dev_lk);
3749         if (vdp->xdf_cache_flush_block != NULL)
3750                 kmem_free(vdp->xdf_flush_mem, 2 * vdp->xdf_xdev_secsize);
3751         ddi_soft_state_free(xdf_ssp, instance);
3752         return (DDI_SUCCESS);
3753 }
3754 
3755 /*
3756  * Driver linkage structures.
3757  */
/*
 * Character/block entry points for xdf device nodes.  The initializer is
 * positional; the slot names in the comments follow cb_ops(9S).
 */
static struct cb_ops xdf_cbops = {
        xdf_open,               /* cb_open */
        xdf_close,              /* cb_close */
        xdf_strategy,           /* cb_strategy */
        nodev,                  /* cb_print */
        xdf_dump,               /* cb_dump */
        xdf_read,               /* cb_read */
        xdf_write,              /* cb_write */
        xdf_ioctl,              /* cb_ioctl */
        nodev,                  /* cb_devmap */
        nodev,                  /* cb_mmap */
        nodev,                  /* cb_segmap */
        nochpoll,               /* cb_chpoll */
        xdf_prop_op,            /* cb_prop_op */
        NULL,                   /* cb_str */
        D_MP | D_NEW | D_64BIT, /* cb_flag */
        CB_REV,                 /* cb_rev */
        xdf_aread,              /* cb_aread */
        xdf_awrite              /* cb_awrite */
};
3778 
/*
 * Device operations for the xdf driver: getinfo/attach/detach are
 * implemented by the driver and the cb_ops table above supplies the
 * device-node entry points.  Quiesce is not supported.
 * NOTE(review): this symbol is not static, presumably because it is
 * referenced from another file - confirm before changing linkage.
 */
struct dev_ops xdf_devops = {
        DEVO_REV,               /* devo_rev */
        0,                      /* devo_refcnt */
        xdf_getinfo,            /* devo_getinfo */
        nulldev,                /* devo_identify */
        nulldev,                /* devo_probe */
        xdf_attach,             /* devo_attach */
        xdf_detach,             /* devo_detach */
        nodev,                  /* devo_reset */
        &xdf_cbops,         /* devo_cb_ops */
        NULL,                   /* devo_bus_ops */
        NULL,                   /* devo_power */
        ddi_quiesce_not_supported, /* devo_quiesce */
};
3793 
3794 /*
3795  * Module linkage structures.
3796  */
/* Loadable-driver description tying the module to xdf_devops. */
static struct modldrv modldrv = {
        &mod_driverops,             /* Type of module.  This one is a driver */
        "virtual block driver", /* short description */
        &xdf_devops         /* driver specific ops */
};
3802 
/*
 * Module linkage handed to mod_install()/mod_remove()/mod_info() by the
 * entry points below; it lists modldrv as its only linkage element,
 * followed by the NULL terminator.
 */
static struct modlinkage xdf_modlinkage = {
        MODREV_1, (void *)&modldrv, NULL
};
3806 
3807 /*
3808  * standard module entry points
3809  */
3810 int
3811 _init(void)
3812 {
3813         int rc;
3814 
3815         xdf_major = ddi_name_to_major("xdf");
3816         if (xdf_major == (major_t)-1)
3817                 return (EINVAL);
3818 
3819         if ((rc = ddi_soft_state_init(&xdf_ssp, sizeof (xdf_t), 0)) != 0)
3820                 return (rc);
3821 
3822         xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
3823             sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3824         xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
3825             sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3826 
3827 #ifdef XPV_HVM_DRIVER
3828         xdf_hvm_init();
3829 #endif /* XPV_HVM_DRIVER */
3830 
3831         if ((rc = mod_install(&xdf_modlinkage)) != 0) {
3832 #ifdef XPV_HVM_DRIVER
3833                 xdf_hvm_fini();
3834 #endif /* XPV_HVM_DRIVER */
3835                 kmem_cache_destroy(xdf_vreq_cache);
3836                 kmem_cache_destroy(xdf_gs_cache);
3837                 ddi_soft_state_fini(&xdf_ssp);
3838                 return (rc);
3839         }
3840 
3841         return (rc);
3842 }
3843 
3844 int
3845 _fini(void)
3846 {
3847         int err;
3848         if ((err = mod_remove(&xdf_modlinkage)) != 0)
3849                 return (err);
3850 
3851 #ifdef XPV_HVM_DRIVER
3852         xdf_hvm_fini();
3853 #endif /* XPV_HVM_DRIVER */
3854 
3855         kmem_cache_destroy(xdf_vreq_cache);
3856         kmem_cache_destroy(xdf_gs_cache);
3857         ddi_soft_state_fini(&xdf_ssp);
3858 
3859         return (0);
3860 }
3861 
3862 int
3863 _info(struct modinfo *modinfop)
3864 {
3865         return (mod_info(&xdf_modlinkage, modinfop));
3866 }