1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2012 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
  25  * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com> All rights reserved.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/ksynch.h>
  30 #include <sys/kmem.h>
  31 #include <sys/file.h>
  32 #include <sys/errno.h>
  33 #include <sys/open.h>
  34 #include <sys/buf.h>
  35 #include <sys/uio.h>
  36 #include <sys/aio_req.h>
  37 #include <sys/cred.h>
  38 #include <sys/modctl.h>
  39 #include <sys/cmlb.h>
  40 #include <sys/conf.h>
  41 #include <sys/devops.h>
  42 #include <sys/list.h>
  43 #include <sys/sysmacros.h>
  44 #include <sys/dkio.h>
  45 #include <sys/vtoc.h>
  46 #include <sys/scsi/scsi.h>        /* for DTYPE_DIRECT */
  47 #include <sys/kstat.h>
  48 #include <sys/fs/dv_node.h>
  49 #include <sys/ddi.h>
  50 #include <sys/sunddi.h>
  51 #include <sys/note.h>
  52 #include <sys/blkdev.h>
  53 
/*
 * Minor number layout: every instance owns BD_MAXPART consecutive minor
 * numbers.  BDINST() recovers the instance (minor / BD_MAXPART) and
 * BDPART() the partition within it (minor % BD_MAXPART).
 */
#define BD_MAXPART      64
#define BDINST(dev)     (getminor(dev) / BD_MAXPART)
#define BDPART(dev)     (getminor(dev) % BD_MAXPART)

/* Shorthand names for the driver-private structures. */
typedef struct bd bd_t;
typedef struct bd_xfer_impl bd_xfer_impl_t;
  60 
/*
 * Per-instance soft state.  bd_attach() zero-allocates one of these per
 * instance through ddi_soft_state_zalloc() and bd_detach() frees it.
 */
struct bd {
        void            *d_private;     /* parent's cookie (hdl->h_private) */
        dev_info_t      *d_dip;         /* our devinfo node */
        kmutex_t        d_ocmutex;      /* guards the d_open_* bookkeeping */
        kmutex_t        d_iomutex;      /* also installed as the kstat lock */
        kmutex_t        d_statemutex;
        kcondvar_t      d_statecv;
        enum dkio_state d_state;        /* set to DKIO_NONE at attach */
        cmlb_handle_t   d_cmlbh;        /* label (cmlb) handle */
        unsigned        d_open_lyr[BD_MAXPART]; /* open count */
        uint64_t        d_open_excl;    /* bit mask indexed by partition */
        uint64_t        d_open_reg[OTYPCNT];            /* bit mask */

        uint32_t        d_qsize;        /* taken from bd_drive_t d_qsize */
        uint32_t        d_qactive;
        uint32_t        d_maxxfer;      /* max bytes per transfer */
        uint32_t        d_blkshift;     /* log2 of block size (9 == 512) */
        uint64_t        d_numblks;
        ddi_devid_t     d_devid;        /* from o_devid_init, when provided */

        kmem_cache_t    *d_cache;       /* cache of bd_xfer_impl_t objects */
        list_t          d_runq;         /* list of bd_xfer_impl_t */
        list_t          d_waitq;        /* list of bd_xfer_impl_t */
        kstat_t         *d_ksp;         /* NULL if kstat_create() failed */
        kstat_io_t      *d_kiop;        /* ks_data, or scratch if no kstat */

        boolean_t       d_rdonly;       /* FWRITE opens fail with EROFS */
        boolean_t       d_removable;    /* from bd_drive_t */
        boolean_t       d_hotpluggable; /* from bd_drive_t */
        boolean_t       d_use_dma;      /* parent supplied DMA attributes */

        ddi_dma_attr_t  d_dma;          /* copy of *h_dma, granularity raised */
        bd_ops_t        d_ops;          /* copy of the parent's h_ops */
        bd_handle_t     d_handle;       /* parent data this instance came from */
};
  96 
/*
 * Handle shared with the parent driver.  bd_attach() obtains it through
 * ddi_get_parent_data() and copies h_ops, h_dma and h_private into the
 * soft state.
 */
struct bd_handle {
        bd_ops_t        h_ops;          /* parent's operations vector */
        ddi_dma_attr_t  *h_dma;         /* NULL when the parent does no DMA */
        dev_info_t      *h_parent;
        dev_info_t      *h_child;
        void            *h_private;     /* cookie passed back to h_ops calls */
        bd_t            *h_bd;          /* set to the soft state at attach */
        char            *h_name;
        char            h_addr[20];     /* enough for %X,%X */
};
 107 
/*
 * Private wrapper around the public bd_xfer_t.  Objects live in the
 * per-instance kmem cache (d_cache) and are linked on the instance's
 * wait/run lists through i_linkage.
 */
struct bd_xfer_impl {
        bd_xfer_t       i_public;       /* part handed to the parent driver */
        list_node_t     i_linkage;      /* d_waitq / d_runq linkage */
        bd_t            *i_bd;          /* owning instance (set by the ctor) */
        buf_t           *i_bp;          /* buf this transfer services */
        uint_t          i_num_win;      /* number of transfer windows */
        uint_t          i_cur_win;      /* index of the current window */
        off_t           i_offset;       /* offset of the current window */
        int             (*i_func)(void *, bd_xfer_t *); /* parent op to call */
        uint32_t        i_blkshift;     /* d_blkshift at allocation time */
        size_t          i_len;          /* bytes in the current window */
        size_t          i_resid;        /* starts out as bp->b_bcount */
};
 121 
/*
 * Convenience aliases so that members of the embedded public bd_xfer_t
 * can be reached directly through a bd_xfer_impl_t pointer.
 */
#define i_dmah          i_public.x_dmah
#define i_dmac          i_public.x_dmac
#define i_ndmac         i_public.x_ndmac
#define i_kaddr         i_public.x_kaddr
#define i_nblks         i_public.x_nblks
#define i_blkno         i_public.x_blkno
#define i_flags         i_public.x_flags
 129 
 130 
 131 /*
 132  * Private prototypes.
 133  */
 134 
/* dev_ops entry points */
static int bd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int bd_attach(dev_info_t *, ddi_attach_cmd_t);
static int bd_detach(dev_info_t *, ddi_detach_cmd_t);

/* cb_ops entry points */
static int bd_open(dev_t *, int, int, cred_t *);
static int bd_close(dev_t, int, int, cred_t *);
static int bd_strategy(struct buf *);
static int bd_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static int bd_dump(dev_t, caddr_t, daddr_t, int);
static int bd_read(dev_t, struct uio *, cred_t *);
static int bd_write(dev_t, struct uio *, cred_t *);
static int bd_aread(dev_t, struct aio_req *, cred_t *);
static int bd_awrite(dev_t, struct aio_req *, cred_t *);
static int bd_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
    caddr_t, int *);

/* cmlb target operations, kmem cache callbacks and internal helpers */
static int bd_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
    void *);
static int bd_tg_getinfo(dev_info_t *, int, void *, void *);
static int bd_xfer_ctor(void *, void *, int);
static void bd_xfer_dtor(void *, void *);
static void bd_sched(bd_t *);
static void bd_submit(bd_t *, bd_xfer_impl_t *);
static void bd_runq_exit(bd_xfer_impl_t *, int);
static void bd_update_state(bd_t *);
static int bd_check_state(bd_t *, enum dkio_state *);
static int bd_flush_write_cache(bd_t *, struct dk_callback *);
 162 
/* Target operations vector passed to cmlb_attach() in bd_attach(). */
struct cmlb_tg_ops bd_tg_ops = {
        TG_DK_OPS_VERSION_1,
        bd_tg_rdwr,
        bd_tg_getinfo,
};
 168 
/* Character/block entry points; referenced from bd_dev_ops below. */
static struct cb_ops bd_cb_ops = {
        bd_open,                /* open */
        bd_close,               /* close */
        bd_strategy,            /* strategy */
        nodev,                  /* print */
        bd_dump,                /* dump */
        bd_read,                /* read */
        bd_write,               /* write */
        bd_ioctl,               /* ioctl */
        nodev,                  /* devmap */
        nodev,                  /* mmap */
        nodev,                  /* segmap */
        nochpoll,               /* poll */
        bd_prop_op,             /* cb_prop_op */
        0,                      /* streamtab  */
        D_64BIT | D_MP,         /* Driver compatibility flag */
        CB_REV,                 /* cb_rev */
        bd_aread,               /* async read */
        bd_awrite               /* async write */
};
 189 
/* Device operations; handed to the module framework through modldrv. */
struct dev_ops bd_dev_ops = {
        DEVO_REV,               /* devo_rev, */
        0,                      /* refcnt  */
        bd_getinfo,             /* getinfo */
        nulldev,                /* identify */
        nulldev,                /* probe */
        bd_attach,              /* attach */
        bd_detach,              /* detach */
        nodev,                  /* reset */
        &bd_cb_ops,                 /* driver operations */
        NULL,                   /* bus operations */
        NULL,                   /* power */
        ddi_quiesce_not_needed, /* quiesce */
};
 204 
/* Loadable-driver linkage: description string plus our dev_ops. */
static struct modldrv modldrv = {
        &mod_driverops,
        "Generic Block Device",
        &bd_dev_ops,
};

/* Module linkage used by _init(), _fini() and _info(). */
static struct modlinkage modlinkage = {
        MODREV_1, { &modldrv, NULL }
};
 214 
/* Soft-state anchor; holds one struct bd per attached instance. */
static void *bd_state;
/* Taken as reader by open/close/dump while they use an instance. */
static krwlock_t bd_lock;
 217 
 218 int
 219 _init(void)
 220 {
 221         int     rv;
 222 
 223         rv = ddi_soft_state_init(&bd_state, sizeof (struct bd), 2);
 224         if (rv != DDI_SUCCESS) {
 225                 return (rv);
 226         }
 227         rw_init(&bd_lock, NULL, RW_DRIVER, NULL);
 228         rv = mod_install(&modlinkage);
 229         if (rv != DDI_SUCCESS) {
 230                 rw_destroy(&bd_lock);
 231                 ddi_soft_state_fini(&bd_state);
 232         }
 233         return (rv);
 234 }
 235 
 236 int
 237 _fini(void)
 238 {
 239         int     rv;
 240 
 241         rv = mod_remove(&modlinkage);
 242         if (rv == DDI_SUCCESS) {
 243                 rw_destroy(&bd_lock);
 244                 ddi_soft_state_fini(&bd_state);
 245         }
 246         return (rv);
 247 }
 248 
 249 int
 250 _info(struct modinfo *modinfop)
 251 {
 252         return (mod_info(&modlinkage, modinfop));
 253 }
 254 
 255 static int
 256 bd_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
 257 {
 258         bd_t    *bd;
 259         minor_t inst;
 260 
 261         _NOTE(ARGUNUSED(dip));
 262 
 263         inst = BDINST((dev_t)arg);
 264 
 265         switch (cmd) {
 266         case DDI_INFO_DEVT2DEVINFO:
 267                 bd = ddi_get_soft_state(bd_state, inst);
 268                 if (bd == NULL) {
 269                         return (DDI_FAILURE);
 270                 }
 271                 *resultp = (void *)bd->d_dip;
 272                 break;
 273 
 274         case DDI_INFO_DEVT2INSTANCE:
 275                 *resultp = (void *)(intptr_t)inst;
 276                 break;
 277 
 278         default:
 279                 return (DDI_FAILURE);
 280         }
 281         return (DDI_SUCCESS);
 282 }
 283 
/*
 * attach(9E) entry point.
 *
 * For DDI_ATTACH this allocates the per-instance soft state, copies the
 * parent's handle data (ops vector, private cookie, optional DMA
 * attributes) into it, creates the locks, queues, transfer cache and I/O
 * kstat, queries the parent for drive characteristics, attaches the cmlb
 * label handling and finally publishes the devid and properties.
 * DDI_RESUME is accepted as a no-op; every other command fails.
 *
 * Returns DDI_SUCCESS or DDI_FAILURE.
 */
static int
bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
        int             inst;
        bd_handle_t     hdl;
        bd_t            *bd;
        bd_drive_t      drive;
        int             rv;
        char            name[16];
        char            kcache[32];

        switch (cmd) {
        case DDI_ATTACH:
                break;
        case DDI_RESUME:
                /* We don't do anything native for suspend/resume */
                return (DDI_SUCCESS);
        default:
                return (DDI_FAILURE);
        }

        inst = ddi_get_instance(dip);
        hdl = ddi_get_parent_data(dip);

        /* "<driver><instance>" for messages, "<name>_xfer" for the cache. */
        (void) snprintf(name, sizeof (name), "%s%d",
            ddi_driver_name(dip), ddi_get_instance(dip));
        (void) snprintf(kcache, sizeof (kcache), "%s_xfer", name);

        if (hdl == NULL) {
                cmn_err(CE_WARN, "%s: missing parent data!", name);
                return (DDI_FAILURE);
        }

        if (ddi_soft_state_zalloc(bd_state, inst) != DDI_SUCCESS) {
                cmn_err(CE_WARN, "%s: unable to zalloc soft state!", name);
                return (DDI_FAILURE);
        }
        bd = ddi_get_soft_state(bd_state, inst);

        if (hdl->h_dma) {
                /* Take a private copy; granularity is at least DEV_BSIZE. */
                bd->d_dma = *(hdl->h_dma);
                bd->d_dma.dma_attr_granular =
                    max(DEV_BSIZE, bd->d_dma.dma_attr_granular);
                bd->d_use_dma = B_TRUE;

                /*
                 * NOTE(review): d_maxxfer has not been assigned since the
                 * zalloc above, so the "inconsistent" branch looks
                 * unreachable here -- confirm.
                 */
                if (bd->d_maxxfer &&
                    (bd->d_maxxfer != bd->d_dma.dma_attr_maxxfer)) {
                        cmn_err(CE_WARN,
                            "%s: inconsistent maximum transfer size!",
                            name);
                        /* We force it */
                        bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
                } else {
                        bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
                }
        } else {
                /* No DMA attributes: default to a 1 MB transfer limit. */
                bd->d_use_dma = B_FALSE;
                if (bd->d_maxxfer == 0) {
                        bd->d_maxxfer = 1024 * 1024;
                }
        }
        bd->d_ops = hdl->h_ops;
        bd->d_private = hdl->h_private;
        bd->d_blkshift = 9;  /* 512 bytes, to start */

        /* Round the limit down to a DEV_BSIZE multiple; reject if < 1 block. */
        if (bd->d_maxxfer % DEV_BSIZE) {
                cmn_err(CE_WARN, "%s: maximum transfer misaligned!", name);
                bd->d_maxxfer &= ~(DEV_BSIZE - 1);
        }
        if (bd->d_maxxfer < DEV_BSIZE) {
                cmn_err(CE_WARN, "%s: maximum transfer size too small!", name);
                ddi_soft_state_free(bd_state, inst);
                return (DDI_FAILURE);
        }

        /* Cross-link the devinfo node, parent handle and soft state. */
        bd->d_dip = dip;
        bd->d_handle = hdl;
        hdl->h_bd = bd;
        ddi_set_driver_private(dip, bd);

        mutex_init(&bd->d_iomutex, NULL, MUTEX_DRIVER, NULL);
        mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL);
        mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL);
        cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL);

        list_create(&bd->d_waitq, sizeof (bd_xfer_impl_t),
            offsetof(struct bd_xfer_impl, i_linkage));
        list_create(&bd->d_runq, sizeof (bd_xfer_impl_t),
            offsetof(struct bd_xfer_impl, i_linkage));

        /* The soft state is handed to the ctor/dtor as their private arg. */
        bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8,
            bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0);

        bd->d_ksp = kstat_create(ddi_driver_name(dip), inst, NULL, "disk",
            KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
        if (bd->d_ksp != NULL) {
                bd->d_ksp->ks_lock = &bd->d_iomutex;
                kstat_install(bd->d_ksp);
                bd->d_kiop = bd->d_ksp->ks_data;
        } else {
                /*
                 * Even if we cannot create the kstat, we create a
                 * scratch kstat.  The reason for this is to ensure
                 * that we can update the kstat all of the time,
                 * without adding an extra branch instruction.
                 */
                bd->d_kiop = kmem_zalloc(sizeof (kstat_io_t), KM_SLEEP);
        }

        cmlb_alloc_handle(&bd->d_cmlbh);

        bd->d_state = DKIO_NONE;

        /* Ask the parent for the drive's characteristics. */
        bzero(&drive, sizeof (drive));
        bd->d_ops.o_drive_info(bd->d_private, &drive);
        bd->d_qsize = drive.d_qsize;
        bd->d_removable = drive.d_removable;
        bd->d_hotpluggable = drive.d_hotpluggable;

        /* The drive may only lower the transfer limit, never raise it. */
        if (drive.d_maxxfer && drive.d_maxxfer < bd->d_maxxfer)
                bd->d_maxxfer = drive.d_maxxfer;


        rv = cmlb_attach(dip, &bd_tg_ops, DTYPE_DIRECT,
            bd->d_removable, bd->d_hotpluggable,
            drive.d_lun >= 0 ? DDI_NT_BLOCK_CHAN : DDI_NT_BLOCK,
            CMLB_FAKE_LABEL_ONE_PARTITION, bd->d_cmlbh, 0);
        if (rv != 0) {
                /*
                 * Undo everything created above.
                 * NOTE(review): hdl->h_bd and the driver-private pointer
                 * still reference the soft state freed below -- confirm
                 * nothing dereferences them after a failed attach.
                 */
                cmlb_free_handle(&bd->d_cmlbh);
                kmem_cache_destroy(bd->d_cache);
                mutex_destroy(&bd->d_iomutex);
                mutex_destroy(&bd->d_ocmutex);
                mutex_destroy(&bd->d_statemutex);
                cv_destroy(&bd->d_statecv);
                list_destroy(&bd->d_waitq);
                list_destroy(&bd->d_runq);
                if (bd->d_ksp != NULL) {
                        kstat_delete(bd->d_ksp);
                        bd->d_ksp = NULL;
                } else {
                        kmem_free(bd->d_kiop, sizeof (kstat_io_t));
                }
                ddi_soft_state_free(bd_state, inst);
                return (DDI_FAILURE);
        }

        /* Devid support is optional; a registration failure is only warned. */
        if (bd->d_ops.o_devid_init != NULL) {
                rv = bd->d_ops.o_devid_init(bd->d_private, dip, &bd->d_devid);
                if (rv == DDI_SUCCESS) {
                        if (ddi_devid_register(dip, bd->d_devid) !=
                            DDI_SUCCESS) {
                                cmn_err(CE_WARN,
                                    "%s: unable to register devid", name);
                        }
                }
        }

        /*
         * Add a zero-length attribute to tell the world we support
         * kernel ioctls (for layered drivers).  Also set up properties
         * used by HAL to identify removable media.
         */
        (void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
            DDI_KERNEL_IOCTL, NULL, 0);
        if (bd->d_removable) {
                (void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
                    "removable-media", NULL, 0);
        }
        if (bd->d_hotpluggable) {
                (void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
                    "hotpluggable", NULL, 0);
        }

        ddi_report_dev(dip);

        return (DDI_SUCCESS);
}
 461 
 462 static int
 463 bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 464 {
 465         bd_t    *bd;
 466 
 467         bd = ddi_get_driver_private(dip);
 468 
 469         switch (cmd) {
 470         case DDI_DETACH:
 471                 break;
 472         case DDI_SUSPEND:
 473                 /* We don't suspend, but our parent does */
 474                 return (DDI_SUCCESS);
 475         default:
 476                 return (DDI_FAILURE);
 477         }
 478         if (bd->d_ksp != NULL) {
 479                 kstat_delete(bd->d_ksp);
 480                 bd->d_ksp = NULL;
 481         } else {
 482                 kmem_free(bd->d_kiop, sizeof (kstat_io_t));
 483         }
 484         cmlb_detach(bd->d_cmlbh, 0);
 485         cmlb_free_handle(&bd->d_cmlbh);
 486         if (bd->d_devid)
 487                 ddi_devid_free(bd->d_devid);
 488         kmem_cache_destroy(bd->d_cache);
 489         mutex_destroy(&bd->d_iomutex);
 490         mutex_destroy(&bd->d_ocmutex);
 491         mutex_destroy(&bd->d_statemutex);
 492         cv_destroy(&bd->d_statecv);
 493         list_destroy(&bd->d_waitq);
 494         list_destroy(&bd->d_runq);
 495         ddi_soft_state_free(bd_state, ddi_get_instance(dip));
 496         return (DDI_SUCCESS);
 497 }
 498 
 499 static int
 500 bd_xfer_ctor(void *buf, void *arg, int kmflag)
 501 {
 502         bd_xfer_impl_t  *xi;
 503         bd_t            *bd = arg;
 504         int             (*dcb)(caddr_t);
 505 
 506         if (kmflag == KM_PUSHPAGE || kmflag == KM_SLEEP) {
 507                 dcb = DDI_DMA_SLEEP;
 508         } else {
 509                 dcb = DDI_DMA_DONTWAIT;
 510         }
 511 
 512         xi = buf;
 513         bzero(xi, sizeof (*xi));
 514         xi->i_bd = bd;
 515 
 516         if (bd->d_use_dma) {
 517                 if (ddi_dma_alloc_handle(bd->d_dip, &bd->d_dma, dcb, NULL,
 518                     &xi->i_dmah) != DDI_SUCCESS) {
 519                         return (-1);
 520                 }
 521         }
 522 
 523         return (0);
 524 }
 525 
 526 static void
 527 bd_xfer_dtor(void *buf, void *arg)
 528 {
 529         bd_xfer_impl_t  *xi = buf;
 530 
 531         _NOTE(ARGUNUSED(arg));
 532 
 533         if (xi->i_dmah)
 534                 ddi_dma_free_handle(&xi->i_dmah);
 535         xi->i_dmah = NULL;
 536 }
 537 
 538 static bd_xfer_impl_t *
 539 bd_xfer_alloc(bd_t *bd, struct buf *bp, int (*func)(void *, bd_xfer_t *),
 540     int kmflag)
 541 {
 542         bd_xfer_impl_t          *xi;
 543         int                     rv;
 544         int                     status;
 545         unsigned                dir;
 546         int                     (*cb)(caddr_t);
 547         size_t                  len;
 548         uint32_t                shift;
 549 
 550         if (kmflag == KM_SLEEP) {
 551                 cb = DDI_DMA_SLEEP;
 552         } else {
 553                 cb = DDI_DMA_DONTWAIT;
 554         }
 555 
 556         xi = kmem_cache_alloc(bd->d_cache, kmflag);
 557         if (xi == NULL) {
 558                 bioerror(bp, ENOMEM);
 559                 return (NULL);
 560         }
 561 
 562         ASSERT(bp);
 563 
 564         xi->i_bp = bp;
 565         xi->i_func = func;
 566         xi->i_blkno = bp->b_lblkno;
 567 
 568         if (bp->b_bcount == 0) {
 569                 xi->i_len = 0;
 570                 xi->i_nblks = 0;
 571                 xi->i_kaddr = NULL;
 572                 xi->i_resid = 0;
 573                 xi->i_num_win = 0;
 574                 goto done;
 575         }
 576 
 577         if (bp->b_flags & B_READ) {
 578                 dir = DDI_DMA_READ;
 579                 xi->i_func = bd->d_ops.o_read;
 580         } else {
 581                 dir = DDI_DMA_WRITE;
 582                 xi->i_func = bd->d_ops.o_write;
 583         }
 584 
 585         shift = bd->d_blkshift;
 586         xi->i_blkshift = shift;
 587 
 588         if (!bd->d_use_dma) {
 589                 bp_mapin(bp);
 590                 rv = 0;
 591                 xi->i_offset = 0;
 592                 xi->i_num_win =
 593                     (bp->b_bcount + (bd->d_maxxfer - 1)) / bd->d_maxxfer;
 594                 xi->i_cur_win = 0;
 595                 xi->i_len = min(bp->b_bcount, bd->d_maxxfer);
 596                 xi->i_nblks = xi->i_len >> shift;
 597                 xi->i_kaddr = bp->b_un.b_addr;
 598                 xi->i_resid = bp->b_bcount;
 599         } else {
 600 
 601                 /*
 602                  * We have to use consistent DMA if the address is misaligned.
 603                  */
 604                 if (((bp->b_flags & (B_PAGEIO | B_REMAPPED)) != B_PAGEIO) &&
 605                     ((uintptr_t)bp->b_un.b_addr & 0x7)) {
 606                         dir |= DDI_DMA_CONSISTENT | DDI_DMA_PARTIAL;
 607                 } else {
 608                         dir |= DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
 609                 }
 610 
 611                 status = ddi_dma_buf_bind_handle(xi->i_dmah, bp, dir, cb,
 612                     NULL, &xi->i_dmac, &xi->i_ndmac);
 613                 switch (status) {
 614                 case DDI_DMA_MAPPED:
 615                         xi->i_num_win = 1;
 616                         xi->i_cur_win = 0;
 617                         xi->i_offset = 0;
 618                         xi->i_len = bp->b_bcount;
 619                         xi->i_nblks = xi->i_len >> shift;
 620                         xi->i_resid = bp->b_bcount;
 621                         rv = 0;
 622                         break;
 623                 case DDI_DMA_PARTIAL_MAP:
 624                         xi->i_cur_win = 0;
 625 
 626                         if ((ddi_dma_numwin(xi->i_dmah, &xi->i_num_win) !=
 627                             DDI_SUCCESS) ||
 628                             (ddi_dma_getwin(xi->i_dmah, 0, &xi->i_offset,
 629                             &len, &xi->i_dmac, &xi->i_ndmac) !=
 630                             DDI_SUCCESS) ||
 631                             (P2PHASE(len, shift) != 0)) {
 632                                 (void) ddi_dma_unbind_handle(xi->i_dmah);
 633                                 rv = EFAULT;
 634                                 goto done;
 635                         }
 636                         xi->i_len = len;
 637                         xi->i_nblks = xi->i_len >> shift;
 638                         xi->i_resid = bp->b_bcount;
 639                         rv = 0;
 640                         break;
 641                 case DDI_DMA_NORESOURCES:
 642                         rv = EAGAIN;
 643                         goto done;
 644                 case DDI_DMA_TOOBIG:
 645                         rv = EINVAL;
 646                         goto done;
 647                 case DDI_DMA_NOMAPPING:
 648                 case DDI_DMA_INUSE:
 649                 default:
 650                         rv = EFAULT;
 651                         goto done;
 652                 }
 653         }
 654 
 655 done:
 656         if (rv != 0) {
 657                 kmem_cache_free(bd->d_cache, xi);
 658                 bioerror(bp, rv);
 659                 return (NULL);
 660         }
 661 
 662         return (xi);
 663 }
 664 
 665 static void
 666 bd_xfer_free(bd_xfer_impl_t *xi)
 667 {
 668         if (xi->i_dmah) {
 669                 (void) ddi_dma_unbind_handle(xi->i_dmah);
 670         }
 671         kmem_cache_free(xi->i_bd->d_cache, xi);
 672 }
 673 
 674 static int
 675 bd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
 676 {
 677         dev_t           dev = *devp;
 678         bd_t            *bd;
 679         minor_t         part;
 680         minor_t         inst;
 681         uint64_t        mask;
 682         boolean_t       ndelay;
 683         int             rv;
 684         diskaddr_t      nblks;
 685         diskaddr_t      lba;
 686 
 687         _NOTE(ARGUNUSED(credp));
 688 
 689         part = BDPART(dev);
 690         inst = BDINST(dev);
 691 
 692         if (otyp >= OTYPCNT)
 693                 return (EINVAL);
 694 
 695         ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;
 696 
 697         /*
 698          * Block any DR events from changing the set of registered
 699          * devices while we function.
 700          */
 701         rw_enter(&bd_lock, RW_READER);
 702         if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
 703                 rw_exit(&bd_lock);
 704                 return (ENXIO);
 705         }
 706 
 707         mutex_enter(&bd->d_ocmutex);
 708 
 709         ASSERT(part < 64);
 710         mask = (1U << part);
 711 
 712         bd_update_state(bd);
 713 
 714         if (cmlb_validate(bd->d_cmlbh, 0, 0) != 0) {
 715 
 716                 /* non-blocking opens are allowed to succeed */
 717                 if (!ndelay) {
 718                         rv = ENXIO;
 719                         goto done;
 720                 }
 721         } else if (cmlb_partinfo(bd->d_cmlbh, part, &nblks, &lba,
 722             NULL, NULL, 0) == 0) {
 723 
 724                 /*
 725                  * We read the partinfo, verify valid ranges.  If the
 726                  * partition is invalid, and we aren't blocking or
 727                  * doing a raw access, then fail. (Non-blocking and
 728                  * raw accesses can still succeed to allow a disk with
 729                  * bad partition data to opened by format and fdisk.)
 730                  */
 731                 if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
 732                         rv = ENXIO;
 733                         goto done;
 734                 }
 735         } else if (!ndelay) {
 736                 /*
 737                  * cmlb_partinfo failed -- invalid partition or no
 738                  * disk label.
 739                  */
 740                 rv = ENXIO;
 741                 goto done;
 742         }
 743 
 744         if ((flag & FWRITE) && bd->d_rdonly) {
 745                 rv = EROFS;
 746                 goto done;
 747         }
 748 
 749         if ((bd->d_open_excl) & (mask)) {
 750                 rv = EBUSY;
 751                 goto done;
 752         }
 753         if (flag & FEXCL) {
 754                 if (bd->d_open_lyr[part]) {
 755                         rv = EBUSY;
 756                         goto done;
 757                 }
 758                 for (int i = 0; i < OTYP_LYR; i++) {
 759                         if (bd->d_open_reg[i] & mask) {
 760                                 rv = EBUSY;
 761                                 goto done;
 762                         }
 763                 }
 764         }
 765 
 766         if (otyp == OTYP_LYR) {
 767                 bd->d_open_lyr[part]++;
 768         } else {
 769                 bd->d_open_reg[otyp] |= mask;
 770         }
 771         if (flag & FEXCL) {
 772                 bd->d_open_excl |= mask;
 773         }
 774 
 775         rv = 0;
 776 done:
 777         mutex_exit(&bd->d_ocmutex);
 778         rw_exit(&bd_lock);
 779 
 780         return (rv);
 781 }
 782 
 783 static int
 784 bd_close(dev_t dev, int flag, int otyp, cred_t *credp)
 785 {
 786         bd_t            *bd;
 787         minor_t         inst;
 788         minor_t         part;
 789         uint64_t        mask;
 790         boolean_t       last = B_TRUE;
 791 
 792         _NOTE(ARGUNUSED(flag));
 793         _NOTE(ARGUNUSED(credp));
 794 
 795         part = BDPART(dev);
 796         inst = BDINST(dev);
 797 
 798         ASSERT(part < 64);
 799         mask = (1U << part);
 800 
 801         rw_enter(&bd_lock, RW_READER);
 802 
 803         if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
 804                 rw_exit(&bd_lock);
 805                 return (ENXIO);
 806         }
 807 
 808         mutex_enter(&bd->d_ocmutex);
 809         if (bd->d_open_excl & mask) {
 810                 bd->d_open_excl &= ~mask;
 811         }
 812         if (otyp == OTYP_LYR) {
 813                 bd->d_open_lyr[part]--;
 814         } else {
 815                 bd->d_open_reg[otyp] &= ~mask;
 816         }
 817         for (int i = 0; i < 64; i++) {
 818                 if (bd->d_open_lyr[part]) {
 819                         last = B_FALSE;
 820                 }
 821         }
 822         for (int i = 0; last && (i < OTYP_LYR); i++) {
 823                 if (bd->d_open_reg[i]) {
 824                         last = B_FALSE;
 825                 }
 826         }
 827         mutex_exit(&bd->d_ocmutex);
 828 
 829         if (last) {
 830                 cmlb_invalidate(bd->d_cmlbh, 0);
 831         }
 832         rw_exit(&bd_lock);
 833 
 834         return (0);
 835 }
 836 
 837 static int
 838 bd_dump(dev_t dev, caddr_t caddr, daddr_t blkno, int nblk)
 839 {
 840         minor_t         inst;
 841         minor_t         part;
 842         diskaddr_t      pstart;
 843         diskaddr_t      psize;
 844         bd_t            *bd;
 845         bd_xfer_impl_t  *xi;
 846         buf_t           *bp;
 847         int             rv;
 848 
 849         rw_enter(&bd_lock, RW_READER);
 850 
 851         part = BDPART(dev);
 852         inst = BDINST(dev);
 853 
 854         if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
 855                 rw_exit(&bd_lock);
 856                 return (ENXIO);
 857         }
 858         /*
 859          * do cmlb, but do it synchronously unless we already have the
 860          * partition (which we probably should.)
 861          */
 862         if (cmlb_partinfo(bd->d_cmlbh, part, &psize, &pstart, NULL, NULL,
 863             (void *)1)) {
 864                 rw_exit(&bd_lock);
 865                 return (ENXIO);
 866         }
 867 
 868         if ((blkno + nblk) > psize) {
 869                 rw_exit(&bd_lock);
 870                 return (EINVAL);
 871         }
 872         bp = getrbuf(KM_NOSLEEP);
 873         if (bp == NULL) {
 874                 rw_exit(&bd_lock);
 875                 return (ENOMEM);
 876         }
 877 
 878         bp->b_bcount = nblk << bd->d_blkshift;
 879         bp->b_resid = bp->b_bcount;
 880         bp->b_lblkno = blkno;
 881         bp->b_un.b_addr = caddr;
 882 
 883         xi = bd_xfer_alloc(bd, bp,  bd->d_ops.o_write, KM_NOSLEEP);
 884         if (xi == NULL) {
 885                 rw_exit(&bd_lock);
 886                 freerbuf(bp);
 887                 return (ENOMEM);
 888         }
 889         xi->i_blkno = blkno + pstart;
 890         xi->i_flags = BD_XFER_POLL;
 891         bd_submit(bd, xi);
 892         rw_exit(&bd_lock);
 893 
 894         /*
 895          * Generally, we should have run this entirely synchronously
 896          * at this point and the biowait call should be a no-op.  If
 897          * it didn't happen this way, it's a bug in the underlying
 898          * driver not honoring BD_XFER_POLL.
 899          */
 900         (void) biowait(bp);
 901         rv = geterror(bp);
 902         freerbuf(bp);
 903         return (rv);
 904 }
 905 
 906 void
 907 bd_minphys(struct buf *bp)
 908 {
 909         minor_t inst;
 910         bd_t    *bd;
 911         inst = BDINST(bp->b_edev);
 912 
 913         bd = ddi_get_soft_state(bd_state, inst);
 914 
 915         /*
 916          * In a non-debug kernel, bd_strategy will catch !bd as
 917          * well, and will fail nicely.
 918          */
 919         ASSERT(bd);
 920 
 921         if (bp->b_bcount > bd->d_maxxfer)
 922                 bp->b_bcount = bd->d_maxxfer;
 923 }
 924 
 925 static int
 926 bd_read(dev_t dev, struct uio *uio, cred_t *credp)
 927 {
 928         _NOTE(ARGUNUSED(credp));
 929         return (physio(bd_strategy, NULL, dev, B_READ, bd_minphys, uio));
 930 }
 931 
 932 static int
 933 bd_write(dev_t dev, struct uio *uio, cred_t *credp)
 934 {
 935         _NOTE(ARGUNUSED(credp));
 936         return (physio(bd_strategy, NULL, dev, B_WRITE, bd_minphys, uio));
 937 }
 938 
 939 static int
 940 bd_aread(dev_t dev, struct aio_req *aio, cred_t *credp)
 941 {
 942         _NOTE(ARGUNUSED(credp));
 943         return (aphysio(bd_strategy, anocancel, dev, B_READ, bd_minphys, aio));
 944 }
 945 
 946 static int
 947 bd_awrite(dev_t dev, struct aio_req *aio, cred_t *credp)
 948 {
 949         _NOTE(ARGUNUSED(credp));
 950         return (aphysio(bd_strategy, anocancel, dev, B_WRITE, bd_minphys, aio));
 951 }
 952 
 953 static int
 954 bd_strategy(struct buf *bp)
 955 {
 956         minor_t         inst;
 957         minor_t         part;
 958         bd_t            *bd;
 959         diskaddr_t      p_lba;
 960         diskaddr_t      p_nblks;
 961         diskaddr_t      b_nblks;
 962         bd_xfer_impl_t  *xi;
 963         uint32_t        shift;
 964         int             (*func)(void *, bd_xfer_t *);
 965 
 966         part = BDPART(bp->b_edev);
 967         inst = BDINST(bp->b_edev);
 968 
 969         ASSERT(bp);
 970 
 971         bp->b_resid = bp->b_bcount;
 972 
 973         if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
 974                 bioerror(bp, ENXIO);
 975                 biodone(bp);
 976                 return (0);
 977         }
 978 
 979         if (cmlb_partinfo(bd->d_cmlbh, part, &p_nblks, &p_lba,
 980             NULL, NULL, 0)) {
 981                 bioerror(bp, ENXIO);
 982                 biodone(bp);
 983                 return (0);
 984         }
 985 
 986         shift = bd->d_blkshift;
 987 
 988         if ((P2PHASE(bp->b_bcount, (1U << shift)) != 0) ||
 989             (bp->b_lblkno > p_nblks)) {
 990                 bioerror(bp, ENXIO);
 991                 biodone(bp);
 992                 return (0);
 993         }
 994         b_nblks = bp->b_bcount >> shift;
 995         if ((bp->b_lblkno == p_nblks) || (bp->b_bcount == 0)) {
 996                 biodone(bp);
 997                 return (0);
 998         }
 999 
1000         if ((b_nblks + bp->b_lblkno) > p_nblks) {
1001                 bp->b_resid = ((bp->b_lblkno + b_nblks - p_nblks) << shift);
1002                 bp->b_bcount -= bp->b_resid;
1003         } else {
1004                 bp->b_resid = 0;
1005         }
1006         func = (bp->b_flags & B_READ) ? bd->d_ops.o_read : bd->d_ops.o_write;
1007 
1008         xi = bd_xfer_alloc(bd, bp, func, KM_NOSLEEP);
1009         if (xi == NULL) {
1010                 xi = bd_xfer_alloc(bd, bp, func, KM_PUSHPAGE);
1011         }
1012         if (xi == NULL) {
1013                 /* bd_request_alloc will have done bioerror */
1014                 biodone(bp);
1015                 return (0);
1016         }
1017         xi->i_blkno = bp->b_lblkno + p_lba;
1018 
1019         bd_submit(bd, xi);
1020 
1021         return (0);
1022 }
1023 
1024 static int
1025 bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp)
1026 {
1027         minor_t         inst;
1028         uint16_t        part;
1029         bd_t            *bd;
1030         void            *ptr = (void *)arg;
1031         int             rv;
1032 
1033         part = BDPART(dev);
1034         inst = BDINST(dev);
1035 
1036         if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1037                 return (ENXIO);
1038         }
1039 
1040         rv = cmlb_ioctl(bd->d_cmlbh, dev, cmd, arg, flag, credp, rvalp, 0);
1041         if (rv != ENOTTY)
1042                 return (rv);
1043 
1044         switch (cmd) {
1045         case DKIOCGMEDIAINFO: {
1046                 struct dk_minfo minfo;
1047 
1048                 /* make sure our state information is current */
1049                 bd_update_state(bd);
1050                 bzero(&minfo, sizeof (minfo));
1051                 minfo.dki_media_type = DK_FIXED_DISK;
1052                 minfo.dki_lbsize = (1U << bd->d_blkshift);
1053                 minfo.dki_capacity = bd->d_numblks;
1054                 if (ddi_copyout(&minfo, ptr, sizeof (minfo), flag))  {
1055                         return (EFAULT);
1056                 }
1057                 return (0);
1058         }
1059         case DKIOCINFO: {
1060                 struct dk_cinfo cinfo;
1061                 bzero(&cinfo, sizeof (cinfo));
1062                 cinfo.dki_ctype = DKC_BLKDEV;
1063                 cinfo.dki_cnum = ddi_get_instance(ddi_get_parent(bd->d_dip));
1064                 (void) snprintf(cinfo.dki_cname, sizeof (cinfo.dki_cname),
1065                     "%s", ddi_driver_name(ddi_get_parent(bd->d_dip)));
1066                 (void) snprintf(cinfo.dki_dname, sizeof (cinfo.dki_dname),
1067                     "%s", ddi_driver_name(bd->d_dip));
1068                 cinfo.dki_unit = inst;
1069                 cinfo.dki_flags = DKI_FMTVOL;
1070                 cinfo.dki_partition = part;
1071                 cinfo.dki_maxtransfer = bd->d_maxxfer / DEV_BSIZE;
1072                 cinfo.dki_addr = 0;
1073                 cinfo.dki_slave = 0;
1074                 cinfo.dki_space = 0;
1075                 cinfo.dki_prio = 0;
1076                 cinfo.dki_vec = 0;
1077                 if (ddi_copyout(&cinfo, ptr, sizeof (cinfo), flag))  {
1078                         return (EFAULT);
1079                 }
1080                 return (0);
1081         }
1082         case DKIOCREMOVABLE: {
1083                 int i;
1084                 i = bd->d_removable ? 1 : 0;
1085                 if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1086                         return (EFAULT);
1087                 }
1088                 return (0);
1089         }
1090         case DKIOCHOTPLUGGABLE: {
1091                 int i;
1092                 i = bd->d_hotpluggable ? 1 : 0;
1093                 if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1094                         return (EFAULT);
1095                 }
1096                 return (0);
1097         }
1098         case DKIOCREADONLY: {
1099                 int i;
1100                 i = bd->d_rdonly ? 1 : 0;
1101                 if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1102                         return (EFAULT);
1103                 }
1104                 return (0);
1105         }
1106         case DKIOCSTATE: {
1107                 enum dkio_state state;
1108                 if (ddi_copyin(ptr, &state, sizeof (state), flag)) {
1109                         return (EFAULT);
1110                 }
1111                 if ((rv = bd_check_state(bd, &state)) != 0) {
1112                         return (rv);
1113                 }
1114                 if (ddi_copyout(&state, ptr, sizeof (state), flag)) {
1115                         return (EFAULT);
1116                 }
1117                 return (0);
1118         }
1119         case DKIOCFLUSHWRITECACHE: {
1120                 struct dk_callback *dkc = NULL;
1121 
1122                 if (flag & FKIOCTL)
1123                         dkc = (void *)arg;
1124 
1125                 rv = bd_flush_write_cache(bd, dkc);
1126                 return (rv);
1127         }
1128 
1129         default:
1130                 break;
1131 
1132         }
1133         return (ENOTTY);
1134 }
1135 
1136 static int
1137 bd_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
1138     char *name, caddr_t valuep, int *lengthp)
1139 {
1140         bd_t    *bd;
1141 
1142         bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1143         if (bd == NULL)
1144                 return (ddi_prop_op(dev, dip, prop_op, mod_flags,
1145                     name, valuep, lengthp));
1146 
1147         return (cmlb_prop_op(bd->d_cmlbh, dev, dip, prop_op, mod_flags, name,
1148             valuep, lengthp, BDPART(dev), 0));
1149 }
1150 
1151 
1152 static int
1153 bd_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
1154     size_t length, void *tg_cookie)
1155 {
1156         bd_t            *bd;
1157         buf_t           *bp;
1158         bd_xfer_impl_t  *xi;
1159         int             rv;
1160         int             (*func)(void *, bd_xfer_t *);
1161         int             kmflag;
1162 
1163         /*
1164          * If we are running in polled mode (such as during dump(9e)
1165          * execution), then we cannot sleep for kernel allocations.
1166          */
1167         kmflag = tg_cookie ? KM_NOSLEEP : KM_SLEEP;
1168 
1169         bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1170 
1171         if (P2PHASE(length, (1U << bd->d_blkshift)) != 0) {
1172                 /* We can only transfer whole blocks at a time! */
1173                 return (EINVAL);
1174         }
1175 
1176         if ((bp = getrbuf(kmflag)) == NULL) {
1177                 return (ENOMEM);
1178         }
1179 
1180         switch (cmd) {
1181         case TG_READ:
1182                 bp->b_flags = B_READ;
1183                 func = bd->d_ops.o_read;
1184                 break;
1185         case TG_WRITE:
1186                 bp->b_flags = B_WRITE;
1187                 func = bd->d_ops.o_write;
1188                 break;
1189         default:
1190                 freerbuf(bp);
1191                 return (EINVAL);
1192         }
1193 
1194         bp->b_un.b_addr = bufaddr;
1195         bp->b_bcount = length;
1196         xi = bd_xfer_alloc(bd, bp, func, kmflag);
1197         if (xi == NULL) {
1198                 rv = geterror(bp);
1199                 freerbuf(bp);
1200                 return (rv);
1201         }
1202         xi->i_flags = tg_cookie ? BD_XFER_POLL : 0;
1203         xi->i_blkno = start;
1204         bd_submit(bd, xi);
1205         (void) biowait(bp);
1206         rv = geterror(bp);
1207         freerbuf(bp);
1208 
1209         return (rv);
1210 }
1211 
1212 static int
1213 bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
1214 {
1215         bd_t            *bd;
1216 
1217         _NOTE(ARGUNUSED(tg_cookie));
1218         bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1219 
1220         switch (cmd) {
1221         case TG_GETPHYGEOM:
1222         case TG_GETVIRTGEOM:
1223                 /*
1224                  * We don't have any "geometry" as such, let cmlb
1225                  * fabricate something.
1226                  */
1227                 return (ENOTTY);
1228 
1229         case TG_GETCAPACITY:
1230                 bd_update_state(bd);
1231                 *(diskaddr_t *)arg = bd->d_numblks;
1232                 return (0);
1233 
1234         case TG_GETBLOCKSIZE:
1235                 *(uint32_t *)arg = (1U << bd->d_blkshift);
1236                 return (0);
1237 
1238         case TG_GETATTR:
1239                 /*
1240                  * It turns out that cmlb really doesn't do much for
1241                  * non-writable media, but lets make the information
1242                  * available for it in case it does more in the
1243                  * future.  (The value is currently used for
1244                  * triggering special behavior for CD-ROMs.)
1245                  */
1246                 bd_update_state(bd);
1247                 ((tg_attribute_t *)arg)->media_is_writable =
1248                     bd->d_rdonly ? B_FALSE : B_TRUE;
1249                 return (0);
1250 
1251         default:
1252                 return (EINVAL);
1253         }
1254 }
1255 
1256 
1257 static void
1258 bd_sched(bd_t *bd)
1259 {
1260         bd_xfer_impl_t  *xi;
1261         struct buf      *bp;
1262         int             rv;
1263 
1264         mutex_enter(&bd->d_iomutex);
1265 
1266         while ((bd->d_qactive < bd->d_qsize) &&
1267             ((xi = list_remove_head(&bd->d_waitq)) != NULL)) {
1268                 bd->d_qactive++;
1269                 kstat_waitq_to_runq(bd->d_kiop);
1270                 list_insert_tail(&bd->d_runq, xi);
1271 
1272                 /*
1273                  * Submit the job to the driver.  We drop the I/O mutex
1274                  * so that we can deal with the case where the driver
1275                  * completion routine calls back into us synchronously.
1276                  */
1277 
1278                 mutex_exit(&bd->d_iomutex);
1279 
1280                 rv = xi->i_func(bd->d_private, &xi->i_public);
1281                 if (rv != 0) {
1282                         bp = xi->i_bp;
1283                         bd_xfer_free(xi);
1284                         bioerror(bp, rv);
1285                         biodone(bp);
1286 
1287                         mutex_enter(&bd->d_iomutex);
1288                         bd->d_qactive--;
1289                         kstat_runq_exit(bd->d_kiop);
1290                         list_remove(&bd->d_runq, xi);
1291                 } else {
1292                         mutex_enter(&bd->d_iomutex);
1293                 }
1294         }
1295 
1296         mutex_exit(&bd->d_iomutex);
1297 }
1298 
1299 static void
1300 bd_submit(bd_t *bd, bd_xfer_impl_t *xi)
1301 {
1302         mutex_enter(&bd->d_iomutex);
1303         list_insert_tail(&bd->d_waitq, xi);
1304         kstat_waitq_enter(bd->d_kiop);
1305         mutex_exit(&bd->d_iomutex);
1306 
1307         bd_sched(bd);
1308 }
1309 
1310 static void
1311 bd_runq_exit(bd_xfer_impl_t *xi, int err)
1312 {
1313         bd_t    *bd = xi->i_bd;
1314         buf_t   *bp = xi->i_bp;
1315 
1316         mutex_enter(&bd->d_iomutex);
1317         bd->d_qactive--;
1318         kstat_runq_exit(bd->d_kiop);
1319         list_remove(&bd->d_runq, xi);
1320         mutex_exit(&bd->d_iomutex);
1321 
1322         if (err == 0) {
1323                 if (bp->b_flags & B_READ) {
1324                         bd->d_kiop->reads++;
1325                         bd->d_kiop->nread += (bp->b_bcount - xi->i_resid);
1326                 } else {
1327                         bd->d_kiop->writes++;
1328                         bd->d_kiop->nwritten += (bp->b_bcount - xi->i_resid);
1329                 }
1330         }
1331         bd_sched(bd);
1332 }
1333 
1334 static void
1335 bd_update_state(bd_t *bd)
1336 {
1337         enum    dkio_state      state;
1338         bd_media_t              media;
1339         boolean_t               docmlb = B_FALSE;
1340 
1341         bzero(&media, sizeof (media));
1342 
1343         mutex_enter(&bd->d_statemutex);
1344         if (bd->d_ops.o_media_info(bd->d_private, &media) == 0) {
1345                 if ((1U << bd->d_blkshift) != media.m_blksize) {
1346                         if ((media.m_blksize < 512) ||
1347                             (!ISP2(media.m_blksize)) ||
1348                             (P2PHASE(bd->d_maxxfer, media.m_blksize))) {
1349                                 cmn_err(CE_WARN,
1350                                     "%s%d: Invalid media block size (%d)",
1351                                     ddi_driver_name(bd->d_dip),
1352                                     ddi_get_instance(bd->d_dip),
1353                                     media.m_blksize);
1354                                 /*
1355                                  * We can't use the media, treat it as
1356                                  * not present.
1357                                  */
1358                                 state = DKIO_EJECTED;
1359                                 bd->d_numblks = 0;
1360                         } else {
1361                                 bd->d_blkshift = ddi_ffs(media.m_blksize) - 1;
1362                                 bd->d_numblks = media.m_nblks;
1363                                 bd->d_rdonly = media.m_readonly;
1364                                 state = DKIO_INSERTED;
1365                         }
1366 
1367                         /* Device size changed */
1368                         docmlb = B_TRUE;
1369 
1370                 } else {
1371                         if (bd->d_numblks != media.m_nblks) {
1372                                 /* Device size changed */
1373                                 docmlb = B_TRUE;
1374                         }
1375                         bd->d_numblks = media.m_nblks;
1376                         bd->d_rdonly = media.m_readonly;
1377                         state = DKIO_INSERTED;
1378                 }
1379 
1380         } else {
1381                 bd->d_numblks = 0;
1382                 state = DKIO_EJECTED;
1383         }
1384         if (state != bd->d_state) {
1385                 bd->d_state = state;
1386                 cv_broadcast(&bd->d_statecv);
1387                 docmlb = B_TRUE;
1388         }
1389         mutex_exit(&bd->d_statemutex);
1390 
1391         if (docmlb) {
1392                 if (state == DKIO_INSERTED) {
1393                         (void) cmlb_validate(bd->d_cmlbh, 0, 0);
1394                 } else {
1395                         cmlb_invalidate(bd->d_cmlbh, 0);
1396                 }
1397         }
1398 }
1399 
1400 static int
1401 bd_check_state(bd_t *bd, enum dkio_state *state)
1402 {
1403         clock_t         when;
1404 
1405         for (;;) {
1406 
1407                 bd_update_state(bd);
1408 
1409                 mutex_enter(&bd->d_statemutex);
1410 
1411                 if (bd->d_state != *state) {
1412                         *state = bd->d_state;
1413                         mutex_exit(&bd->d_statemutex);
1414                         break;
1415                 }
1416 
1417                 when = drv_usectohz(1000000);
1418                 if (cv_reltimedwait_sig(&bd->d_statecv, &bd->d_statemutex,
1419                     when, TR_CLOCK_TICK) == 0) {
1420                         mutex_exit(&bd->d_statemutex);
1421                         return (EINTR);
1422                 }
1423 
1424                 mutex_exit(&bd->d_statemutex);
1425         }
1426 
1427         return (0);
1428 }
1429 
1430 static int
1431 bd_flush_write_cache_done(struct buf *bp)
1432 {
1433         struct dk_callback *dc = (void *)bp->b_private;
1434 
1435         (*dc->dkc_callback)(dc->dkc_cookie, geterror(bp));
1436         kmem_free(dc, sizeof (*dc));
1437         freerbuf(bp);
1438         return (0);
1439 }
1440 
1441 static int
1442 bd_flush_write_cache(bd_t *bd, struct dk_callback *dkc)
1443 {
1444         buf_t                   *bp;
1445         struct dk_callback      *dc;
1446         bd_xfer_impl_t          *xi;
1447         int                     rv;
1448 
1449         if (bd->d_ops.o_sync_cache == NULL) {
1450                 return (ENOTSUP);
1451         }
1452         if ((bp = getrbuf(KM_SLEEP)) == NULL) {
1453                 return (ENOMEM);
1454         }
1455         bp->b_resid = 0;
1456         bp->b_bcount = 0;
1457 
1458         xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_sync_cache, KM_SLEEP);
1459         if (xi == NULL) {
1460                 rv = geterror(bp);
1461                 freerbuf(bp);
1462                 return (rv);
1463         }
1464 
1465         /* Make an asynchronous flush, but only if there is a callback */
1466         if (dkc != NULL && dkc->dkc_callback != NULL) {
1467                 /* Make a private copy of the callback structure */
1468                 dc = kmem_alloc(sizeof (*dc), KM_SLEEP);
1469                 *dc = *dkc;
1470                 bp->b_private = dc;
1471                 bp->b_iodone = bd_flush_write_cache_done;
1472 
1473                 bd_submit(bd, xi);
1474                 return (0);
1475         }
1476 
1477         /* In case there is no callback, perform a synchronous flush */
1478         bd_submit(bd, xi);
1479         (void) biowait(bp);
1480         rv = geterror(bp);
1481         freerbuf(bp);
1482 
1483         return (rv);
1484 }
1485 
1486 /*
1487  * Nexus support.
1488  */
1489 int
1490 bd_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
1491     void *arg, void *result)
1492 {
1493         bd_handle_t     hdl;
1494 
1495         switch (ctlop) {
1496         case DDI_CTLOPS_REPORTDEV:
1497                 cmn_err(CE_CONT, "?Block device: %s@%s, %s%d\n",
1498                     ddi_node_name(rdip), ddi_get_name_addr(rdip),
1499                     ddi_driver_name(rdip), ddi_get_instance(rdip));
1500                 return (DDI_SUCCESS);
1501 
1502         case DDI_CTLOPS_INITCHILD:
1503                 hdl = ddi_get_parent_data((dev_info_t *)arg);
1504                 if (hdl == NULL) {
1505                         return (DDI_NOT_WELL_FORMED);
1506                 }
1507                 ddi_set_name_addr((dev_info_t *)arg, hdl->h_addr);
1508                 return (DDI_SUCCESS);
1509 
1510         case DDI_CTLOPS_UNINITCHILD:
1511                 ddi_set_name_addr((dev_info_t *)arg, NULL);
1512                 ndi_prop_remove_all((dev_info_t *)arg);
1513                 return (DDI_SUCCESS);
1514 
1515         default:
1516                 return (ddi_ctlops(dip, rdip, ctlop, arg, result));
1517         }
1518 }
1519 
1520 /*
1521  * Functions for device drivers.
1522  */
1523 bd_handle_t
1524 bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag)
1525 {
1526         bd_handle_t     hdl;
1527 
1528         hdl = kmem_zalloc(sizeof (*hdl), kmflag);
1529         if (hdl != NULL) {
1530                 hdl->h_ops = *ops;
1531                 hdl->h_dma = dma;
1532                 hdl->h_private = private;
1533         }
1534 
1535         return (hdl);
1536 }
1537 
/*
 * Release the memory of a handle obtained from bd_alloc_handle.  Only
 * the handle structure itself is freed here.
 */
void
bd_free_handle(bd_handle_t hdl)
{
        kmem_free(hdl, sizeof (*hdl));
}
1543 
1544 int
1545 bd_attach_handle(dev_info_t *dip, bd_handle_t hdl)
1546 {
1547         dev_info_t      *child;
1548         bd_drive_t      drive;
1549 
1550         /* if drivers don't override this, make it assume none */
1551         drive.d_lun = -1;
1552         hdl->h_ops.o_drive_info(hdl->h_private, &drive);
1553 
1554         hdl->h_parent = dip;
1555         hdl->h_name = "blkdev";
1556 
1557         if (drive.d_lun >= 0) {
1558                 (void) snprintf(hdl->h_addr, sizeof (hdl->h_addr), "%X,%X",
1559                     drive.d_target, drive.d_lun);
1560         } else {
1561                 (void) snprintf(hdl->h_addr, sizeof (hdl->h_addr), "%X",
1562                     drive.d_target);
1563         }
1564         if (ndi_devi_alloc(dip, hdl->h_name, (pnode_t)DEVI_SID_NODEID,
1565             &child) != NDI_SUCCESS) {
1566                 cmn_err(CE_WARN, "%s%d: unable to allocate node %s@%s",
1567                     ddi_driver_name(dip), ddi_get_instance(dip),
1568                     "blkdev", hdl->h_addr);
1569                 return (DDI_FAILURE);
1570         }
1571 
1572         ddi_set_parent_data(child, hdl);
1573         hdl->h_child = child;
1574 
1575         if (ndi_devi_online(child, 0) == NDI_FAILURE) {
1576                 cmn_err(CE_WARN, "%s%d: failed bringing node %s@%s online",
1577                     ddi_driver_name(dip), ddi_get_instance(dip),
1578                     hdl->h_name, hdl->h_addr);
1579                 (void) ndi_devi_free(child);
1580                 return (DDI_FAILURE);
1581         }
1582 
1583         return (DDI_SUCCESS);
1584 }
1585 
1586 int
1587 bd_detach_handle(bd_handle_t hdl)
1588 {
1589         int     circ;
1590         int     rv;
1591         char    *devnm;
1592 
1593         if (hdl->h_child == NULL) {
1594                 return (DDI_SUCCESS);
1595         }
1596         ndi_devi_enter(hdl->h_parent, &circ);
1597         if (i_ddi_node_state(hdl->h_child) < DS_INITIALIZED) {
1598                 rv = ddi_remove_child(hdl->h_child, 0);
1599         } else {
1600                 devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
1601                 (void) ddi_deviname(hdl->h_child, devnm);
1602                 (void) devfs_clean(hdl->h_parent, devnm + 1, DV_CLEAN_FORCE);
1603                 rv = ndi_devi_unconfig_one(hdl->h_parent, devnm + 1, NULL,
1604                     NDI_DEVI_REMOVE | NDI_UNCONFIG);
1605                 kmem_free(devnm, MAXNAMELEN + 1);
1606         }
1607         if (rv == 0) {
1608                 hdl->h_child = NULL;
1609         }
1610 
1611         ndi_devi_exit(hdl->h_parent, circ);
1612         return (rv = NDI_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
1613 }
1614 
1615 void
1616 bd_xfer_done(bd_xfer_t *xfer, int err)
1617 {
1618         bd_xfer_impl_t  *xi = (void *)xfer;
1619         buf_t           *bp = xi->i_bp;
1620         int             rv = DDI_SUCCESS;
1621         bd_t            *bd = xi->i_bd;
1622         size_t          len;
1623 
1624         if (err != 0) {
1625                 bd_runq_exit(xi, err);
1626 
1627                 bp->b_resid += xi->i_resid;
1628                 bd_xfer_free(xi);
1629                 bioerror(bp, err);
1630                 biodone(bp);
1631                 return;
1632         }
1633 
1634         xi->i_cur_win++;
1635         xi->i_resid -= xi->i_len;
1636 
1637         if (xi->i_resid == 0) {
1638                 /* Job completed succcessfully! */
1639                 bd_runq_exit(xi, 0);
1640 
1641                 bd_xfer_free(xi);
1642                 biodone(bp);
1643                 return;
1644         }
1645 
1646         xi->i_blkno += xi->i_nblks;
1647 
1648         if (bd->d_use_dma) {
1649                 /* More transfer still pending... advance to next DMA window. */
1650                 rv = ddi_dma_getwin(xi->i_dmah, xi->i_cur_win,
1651                     &xi->i_offset, &len, &xi->i_dmac, &xi->i_ndmac);
1652         } else {
1653                 /* Advance memory window. */
1654                 xi->i_kaddr += xi->i_len;
1655                 xi->i_offset += xi->i_len;
1656                 len = min(bp->b_bcount - xi->i_offset, bd->d_maxxfer);
1657         }
1658 
1659 
1660         if ((rv != DDI_SUCCESS) ||
1661             (P2PHASE(len, (1U << xi->i_blkshift) != 0))) {
1662                 bd_runq_exit(xi, EFAULT);
1663 
1664                 bp->b_resid += xi->i_resid;
1665                 bd_xfer_free(xi);
1666                 bioerror(bp, EFAULT);
1667                 biodone(bp);
1668                 return;
1669         }
1670         xi->i_len = len;
1671         xi->i_nblks = len >> xi->i_blkshift;
1672 
1673         /* Submit next window to hardware. */
1674         rv = xi->i_func(bd->d_private, &xi->i_public);
1675         if (rv != 0) {
1676                 bd_runq_exit(xi, rv);
1677 
1678                 bp->b_resid += xi->i_resid;
1679                 bd_xfer_free(xi);
1680                 bioerror(bp, rv);
1681                 biodone(bp);
1682         }
1683 }
1684 
1685 void
1686 bd_state_change(bd_handle_t hdl)
1687 {
1688         bd_t            *bd;
1689 
1690         if ((bd = hdl->h_bd) != NULL) {
1691                 bd_update_state(bd);
1692         }
1693 }
1694 
/*
 * Install the blkdev bus operations into a driver's dev_ops.
 *
 * devops - the driver's dev_ops structure; its devo_bus_ops is pointed
 *          at a static bus_ops table whose bus_ctl entry is bd_bus_ctl.
 *          No other field of devops is modified here.
 */
void
bd_mod_init(struct dev_ops *devops)
{
        /*
         * The initializer is positional, so the entries must stay in
         * the order of the struct bus_ops members named in the
         * comments.
         */
        static struct bus_ops bd_bus_ops = {
                BUSO_REV,               /* busops_rev */
                nullbusmap,             /* bus_map */
                NULL,                   /* bus_get_intrspec (OBSOLETE) */
                NULL,                   /* bus_add_intrspec (OBSOLETE) */
                NULL,                   /* bus_remove_intrspec (OBSOLETE) */
                i_ddi_map_fault,        /* bus_map_fault */
                NULL,                   /* bus_dma_map (OBSOLETE) */
                ddi_dma_allochdl,       /* bus_dma_allochdl */
                ddi_dma_freehdl,        /* bus_dma_freehdl */
                ddi_dma_bindhdl,        /* bus_dma_bindhdl */
                ddi_dma_unbindhdl,      /* bus_dma_unbindhdl */
                ddi_dma_flush,          /* bus_dma_flush */
                ddi_dma_win,            /* bus_dma_win */
                ddi_dma_mctl,           /* bus_dma_ctl */
                bd_bus_ctl,             /* bus_ctl */
                ddi_bus_prop_op,        /* bus_prop_op */
                NULL,                   /* bus_get_eventcookie */
                NULL,                   /* bus_add_eventcall */
                NULL,                   /* bus_remove_eventcall */
                NULL,                   /* bus_post_event */
                NULL,                   /* bus_intr_ctl (OBSOLETE) */
                NULL,                   /* bus_config */
                NULL,                   /* bus_unconfig */
                NULL,                   /* bus_fm_init */
                NULL,                   /* bus_fm_fini */
                NULL,                   /* bus_fm_access_enter */
                NULL,                   /* bus_fm_access_exit */
                NULL,                   /* bus_power */
                NULL,                   /* bus_intr_op */
        };

        devops->devo_bus_ops = &bd_bus_ops;

        /*
         * NB: The device driver is free to supply its own
         * character entry device support.
         */
}
1737 
/*
 * Undo bd_mod_init: clear the bus operations pointer in the driver's
 * dev_ops.
 */
void
bd_mod_fini(struct dev_ops *devops)
{
        devops->devo_bus_ops = NULL;
}