1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2012 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
  25  * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com> All rights reserved.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/ksynch.h>
  30 #include <sys/kmem.h>
  31 #include <sys/file.h>
  32 #include <sys/errno.h>
  33 #include <sys/open.h>
  34 #include <sys/buf.h>
  35 #include <sys/uio.h>
  36 #include <sys/aio_req.h>
  37 #include <sys/cred.h>
  38 #include <sys/modctl.h>
  39 #include <sys/cmlb.h>
  40 #include <sys/conf.h>
  41 #include <sys/devops.h>
  42 #include <sys/list.h>
  43 #include <sys/sysmacros.h>
  44 #include <sys/dkio.h>
  45 #include <sys/vtoc.h>
  46 #include <sys/scsi/scsi.h>        /* for DTYPE_DIRECT */
  47 #include <sys/kstat.h>
  48 #include <sys/fs/dv_node.h>
  49 #include <sys/ddi.h>
  50 #include <sys/sunddi.h>
  51 #include <sys/note.h>
  52 #include <sys/blkdev.h>
  53 
/*
 * Minor number encoding: each attached instance reserves a contiguous
 * range of BD_MAXPART minors, one per partition, so the instance and
 * partition numbers are recovered by simple division/modulus.
 */
#define	BD_MAXPART	64
#define	BDINST(dev)	(getminor(dev) / BD_MAXPART)
#define	BDPART(dev)	(getminor(dev) % BD_MAXPART)

typedef struct bd bd_t;
typedef struct bd_xfer_impl bd_xfer_impl_t;
  60 
/*
 * Per-instance soft state for a blkdev child device.  Allocated in
 * bd_attach() and looked up by instance number in the bd_state soft
 * state anchor.
 */
struct bd {
	void		*d_private;	/* parent driver's cookie */
	dev_info_t	*d_dip;
	kmutex_t	d_ocmutex;	/* protects open/close accounting */
	kmutex_t	d_iomutex;	/* protects queues, counts, kstats */
	kmutex_t	d_statemutex;	/* protects d_state/d_statecv */
	kcondvar_t	d_statecv;
	enum dkio_state	d_state;	/* media state for DKIOCSTATE */
	cmlb_handle_t	d_cmlbh;
	unsigned	d_open_lyr[BD_MAXPART]; /* open count */
	uint64_t	d_open_excl;	/* bit mask indexed by partition */
	uint64_t	d_open_reg[OTYPCNT];		/* bit mask */

	uint32_t	d_qsize;	/* parent's advertised queue depth */
	uint32_t	d_qactive;	/* xfers currently on d_runq */
	uint32_t	d_maxxfer;	/* max bytes per transfer */
	uint32_t	d_blkshift;	/* log2 of the block size */
	uint64_t	d_numblks;
	ddi_devid_t	d_devid;

	kmem_cache_t	*d_cache;	/* bd_xfer_impl_t allocations */
	list_t		d_runq;		/* xfers submitted to the parent */
	list_t		d_waitq;	/* xfers waiting for a queue slot */
	kstat_t		*d_ksp;
	kstat_io_t	*d_kiop;

	boolean_t	d_rdonly;
	boolean_t	d_removable;
	boolean_t	d_hotpluggable;
	boolean_t	d_use_dma;	/* parent supplied DMA attributes */

	ddi_dma_attr_t	d_dma;
	bd_ops_t	d_ops;		/* copy of the parent's entry points */
	bd_handle_t	d_handle;
};
  96 
/*
 * Handle shared between a parent (provider) driver and blkdev; the
 * parent creates it before our bd_attach() runs, and bd_attach() finds
 * it in the child's parent-data slot.
 */
struct bd_handle {
	bd_ops_t	h_ops;		/* parent's I/O entry points */
	ddi_dma_attr_t	*h_dma;		/* NULL if the parent doesn't DMA */
	dev_info_t	*h_parent;
	dev_info_t	*h_child;
	void		*h_private;	/* parent's per-device cookie */
	bd_t		*h_bd;		/* back pointer, set at attach */
	char		*h_name;
	char		h_addr[20];	/* enough for %X,%X */
};
 107 
/*
 * Private transfer context.  The public bd_xfer_t handed to the parent
 * driver is embedded first so the two can be cast back and forth.
 */
struct bd_xfer_impl {
	bd_xfer_t	i_public;	/* must remain the first member */
	list_node_t	i_linkage;	/* d_waitq/d_runq linkage */
	bd_t		*i_bd;
	buf_t		*i_bp;
	uint_t		i_num_win;	/* total DMA windows for this buf */
	uint_t		i_cur_win;	/* window currently in flight */
	off_t		i_offset;	/* byte offset of current window */
	int		(*i_func)(void *, bd_xfer_t *);	/* o_read/o_write */
	uint32_t	i_blkshift;
	size_t		i_len;		/* bytes in the current window */
	size_t		i_resid;	/* bytes not yet completed */
};
 121 
/*
 * Convenience aliases for the public bd_xfer_t fields embedded at the
 * head of bd_xfer_impl_t.
 */
#define	i_dmah		i_public.x_dmah
#define	i_dmac		i_public.x_dmac
#define	i_ndmac		i_public.x_ndmac
#define	i_kaddr		i_public.x_kaddr
#define	i_nblks		i_public.x_nblks
#define	i_blkno		i_public.x_blkno
#define	i_flags		i_public.x_flags
 129 
 130 
 131 /*
 132  * Private prototypes.
 133  */
 134 
 135 static int bd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
 136 static int bd_attach(dev_info_t *, ddi_attach_cmd_t);
 137 static int bd_detach(dev_info_t *, ddi_detach_cmd_t);
 138 
 139 static int bd_open(dev_t *, int, int, cred_t *);
 140 static int bd_close(dev_t, int, int, cred_t *);
 141 static int bd_strategy(struct buf *);
 142 static int bd_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
 143 static int bd_dump(dev_t, caddr_t, daddr_t, int);
 144 static int bd_read(dev_t, struct uio *, cred_t *);
 145 static int bd_write(dev_t, struct uio *, cred_t *);
 146 static int bd_aread(dev_t, struct aio_req *, cred_t *);
 147 static int bd_awrite(dev_t, struct aio_req *, cred_t *);
 148 static int bd_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
 149     caddr_t, int *);
 150 
 151 static int bd_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
 152     void *);
 153 static int bd_tg_getinfo(dev_info_t *, int, void *, void *);
 154 static int bd_xfer_ctor(void *, void *, int);
 155 static void bd_xfer_dtor(void *, void *);
 156 static void bd_sched(bd_t *);
 157 static void bd_submit(bd_t *, bd_xfer_impl_t *);
 158 static void bd_runq_exit(bd_xfer_impl_t *, int);
 159 static void bd_update_state(bd_t *);
 160 static int bd_check_state(bd_t *, enum dkio_state *);
 161 static int bd_flush_write_cache(bd_t *, struct dk_callback *);
 162 
/*
 * Target ops vector handed to cmlb so it can read/write media blocks
 * and query geometry while building and validating disk labels.
 */
struct cmlb_tg_ops bd_tg_ops = {
	TG_DK_OPS_VERSION_1,
	bd_tg_rdwr,
	bd_tg_getinfo,
};
 168 
/* Character/block entry points for the blkdev nexus children. */
static struct cb_ops bd_cb_ops = {
	bd_open,		/* open */
	bd_close,		/* close */
	bd_strategy,		/* strategy */
	nodev,			/* print */
	bd_dump,		/* dump */
	bd_read,		/* read */
	bd_write,		/* write */
	bd_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	bd_prop_op,		/* cb_prop_op */
	0,			/* streamtab  */
	D_64BIT | D_MP,		/* Driver compatibility flag */
	CB_REV,			/* cb_rev */
	bd_aread,		/* async read */
	bd_awrite		/* async write */
};
 189 
/* Autoconfiguration entry points for the driver. */
struct dev_ops bd_dev_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt  */
	bd_getinfo,		/* getinfo */
	nulldev,		/* identify */
	nulldev,		/* probe */
	bd_attach,		/* attach */
	bd_detach,		/* detach */
	nodev,			/* reset */
	&bd_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};
 204 
/* Loadable module linkage. */
static struct modldrv modldrv = {
	&mod_driverops,
	"Generic Block Device",
	&bd_dev_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, { &modldrv, NULL }
};
 214 
static void *bd_state;		/* soft state anchor: one bd_t per instance */
static krwlock_t bd_lock;	/* blocks DR against in-progress entry points */
 217 
 218 int
 219 _init(void)
 220 {
 221         int     rv;
 222 
 223         rv = ddi_soft_state_init(&bd_state, sizeof (struct bd), 2);
 224         if (rv != DDI_SUCCESS) {
 225                 return (rv);
 226         }
 227         rw_init(&bd_lock, NULL, RW_DRIVER, NULL);
 228         rv = mod_install(&modlinkage);
 229         if (rv != DDI_SUCCESS) {
 230                 rw_destroy(&bd_lock);
 231                 ddi_soft_state_fini(&bd_state);
 232         }
 233         return (rv);
 234 }
 235 
 236 int
 237 _fini(void)
 238 {
 239         int     rv;
 240 
 241         rv = mod_remove(&modlinkage);
 242         if (rv == DDI_SUCCESS) {
 243                 rw_destroy(&bd_lock);
 244                 ddi_soft_state_fini(&bd_state);
 245         }
 246         return (rv);
 247 }
 248 
 249 int
 250 _info(struct modinfo *modinfop)
 251 {
 252         return (mod_info(&modlinkage, modinfop));
 253 }
 254 
 255 static int
 256 bd_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
 257 {
 258         bd_t    *bd;
 259         minor_t inst;
 260 
 261         _NOTE(ARGUNUSED(dip));
 262 
 263         inst = BDINST((dev_t)arg);
 264 
 265         switch (cmd) {
 266         case DDI_INFO_DEVT2DEVINFO:
 267                 bd = ddi_get_soft_state(bd_state, inst);
 268                 if (bd == NULL) {
 269                         return (DDI_FAILURE);
 270                 }
 271                 *resultp = (void *)bd->d_dip;
 272                 break;
 273 
 274         case DDI_INFO_DEVT2INSTANCE:
 275                 *resultp = (void *)(intptr_t)inst;
 276                 break;
 277 
 278         default:
 279                 return (DDI_FAILURE);
 280         }
 281         return (DDI_SUCCESS);
 282 }
 283 
/*
 * attach(9E) for a blkdev child.  The parent driver has already stashed
 * a bd_handle_t in our parent-data slot; from it we take the DMA
 * attributes, ops vector, and private cookie, then set up locks,
 * queues, the transfer kmem cache, kstats, and finally register with
 * cmlb for labeling.
 */
static int
bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int		inst;
	bd_handle_t	hdl;
	bd_t		*bd;
	bd_drive_t	drive;
	int		rv;
	char		name[16];
	char		kcache[32];

	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
		/* We don't do anything native for suspend/resume */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	inst = ddi_get_instance(dip);
	hdl = ddi_get_parent_data(dip);

	(void) snprintf(name, sizeof (name), "%s%d",
	    ddi_driver_name(dip), ddi_get_instance(dip));
	(void) snprintf(kcache, sizeof (kcache), "%s_xfer", name);

	if (hdl == NULL) {
		cmn_err(CE_WARN, "%s: missing parent data!", name);
		return (DDI_FAILURE);
	}

	if (ddi_soft_state_zalloc(bd_state, inst) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "%s: unable to zalloc soft state!", name);
		return (DDI_FAILURE);
	}
	bd = ddi_get_soft_state(bd_state, inst);

	if (hdl->h_dma) {
		/*
		 * Copy the parent's DMA attributes, but never allow a
		 * granularity smaller than a disk block.
		 */
		bd->d_dma = *(hdl->h_dma);
		bd->d_dma.dma_attr_granular =
		    max(DEV_BSIZE, bd->d_dma.dma_attr_granular);
		bd->d_use_dma = B_TRUE;

		/*
		 * NOTE(review): bd was just zalloc'd, so d_maxxfer is 0
		 * here and this warning branch appears unreachable —
		 * confirm whether d_maxxfer was meant to be seeded from
		 * the handle first.
		 */
		if (bd->d_maxxfer &&
		    (bd->d_maxxfer != bd->d_dma.dma_attr_maxxfer)) {
			cmn_err(CE_WARN,
			    "%s: inconsistent maximum transfer size!",
			    name);
			/* We force it */
			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
		} else {
			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
		}
	} else {
		/* PIO device: default to a 1 MB transfer cap. */
		bd->d_use_dma = B_FALSE;
		if (bd->d_maxxfer == 0) {
			bd->d_maxxfer = 1024 * 1024;
		}
	}
	bd->d_ops = hdl->h_ops;
	bd->d_private = hdl->h_private;
	bd->d_blkshift = 9;	/* 512 bytes, to start */

	/* The transfer cap must be a non-zero multiple of the block size. */
	if (bd->d_maxxfer % DEV_BSIZE) {
		cmn_err(CE_WARN, "%s: maximum transfer misaligned!", name);
		bd->d_maxxfer &= ~(DEV_BSIZE - 1);
	}
	if (bd->d_maxxfer < DEV_BSIZE) {
		cmn_err(CE_WARN, "%s: maximum transfer size too small!", name);
		ddi_soft_state_free(bd_state, inst);
		return (DDI_FAILURE);
	}

	bd->d_dip = dip;
	bd->d_handle = hdl;
	hdl->h_bd = bd;
	ddi_set_driver_private(dip, bd);

	mutex_init(&bd->d_iomutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL);
	cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL);

	list_create(&bd->d_waitq, sizeof (bd_xfer_impl_t),
	    offsetof(struct bd_xfer_impl, i_linkage));
	list_create(&bd->d_runq, sizeof (bd_xfer_impl_t),
	    offsetof(struct bd_xfer_impl, i_linkage));

	/*
	 * Each cached xfer gets its DMA handle pre-allocated by the
	 * constructor, so allocation in the I/O path is cheap.
	 */
	bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8,
	    bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0);

	bd->d_ksp = kstat_create(ddi_driver_name(dip), inst, NULL, "disk",
	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
	if (bd->d_ksp != NULL) {
		bd->d_ksp->ks_lock = &bd->d_iomutex;
		kstat_install(bd->d_ksp);
		bd->d_kiop = bd->d_ksp->ks_data;
	} else {
		/*
		 * Even if we cannot create the kstat, we create a
		 * scratch kstat.  The reason for this is to ensure
		 * that we can update the kstat all of the time,
		 * without adding an extra branch instruction.
		 */
		bd->d_kiop = kmem_zalloc(sizeof (kstat_io_t), KM_SLEEP);
	}

	cmlb_alloc_handle(&bd->d_cmlbh);

	bd->d_state = DKIO_NONE;

	/* Ask the parent what kind of drive this is. */
	bzero(&drive, sizeof (drive));
	bd->d_ops.o_drive_info(bd->d_private, &drive);
	bd->d_qsize = drive.d_qsize;
	bd->d_removable = drive.d_removable;
	bd->d_hotpluggable = drive.d_hotpluggable;

	if (drive.d_maxxfer && drive.d_maxxfer < bd->d_maxxfer)
		bd->d_maxxfer = drive.d_maxxfer;


	rv = cmlb_attach(dip, &bd_tg_ops, DTYPE_DIRECT,
	    bd->d_removable, bd->d_hotpluggable,
	    drive.d_lun >= 0 ? DDI_NT_BLOCK_CHAN : DDI_NT_BLOCK,
	    CMLB_FAKE_LABEL_ONE_PARTITION, bd->d_cmlbh, 0);
	if (rv != 0) {
		/* Tear down everything built above, in reverse order. */
		cmlb_free_handle(&bd->d_cmlbh);
		kmem_cache_destroy(bd->d_cache);
		mutex_destroy(&bd->d_iomutex);
		mutex_destroy(&bd->d_ocmutex);
		mutex_destroy(&bd->d_statemutex);
		cv_destroy(&bd->d_statecv);
		list_destroy(&bd->d_waitq);
		list_destroy(&bd->d_runq);
		if (bd->d_ksp != NULL) {
			kstat_delete(bd->d_ksp);
			bd->d_ksp = NULL;
		} else {
			kmem_free(bd->d_kiop, sizeof (kstat_io_t));
		}
		ddi_soft_state_free(bd_state, inst);
		return (DDI_FAILURE);
	}

	/* Device IDs are optional; failure to register is non-fatal. */
	if (bd->d_ops.o_devid_init != NULL) {
		rv = bd->d_ops.o_devid_init(bd->d_private, dip, &bd->d_devid);
		if (rv == DDI_SUCCESS) {
			if (ddi_devid_register(dip, bd->d_devid) !=
			    DDI_SUCCESS) {
				cmn_err(CE_WARN,
				    "%s: unable to register devid", name);
			}
		}
	}

	/*
	 * Add a zero-length attribute to tell the world we support
	 * kernel ioctls (for layered drivers).  Also set up properties
	 * used by HAL to identify removable media.
	 */
	(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0);
	if (bd->d_removable) {
		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
		    "removable-media", NULL, 0);
	}
	if (bd->d_hotpluggable) {
		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
		    "hotpluggable", NULL, 0);
	}

	ddi_report_dev(dip);

	return (DDI_SUCCESS);
}
 461 
/*
 * detach(9E): undo everything bd_attach() built.  The framework ensures
 * there are no outstanding opens before calling us; there is no check
 * here for in-flight I/O — presumably the parent has quiesced the
 * device first (NOTE(review): confirm against bd_detach_handle callers).
 */
static int
bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	bd_t	*bd;

	bd = ddi_get_driver_private(dip);

	switch (cmd) {
	case DDI_DETACH:
		break;
	case DDI_SUSPEND:
		/* We don't suspend, but our parent does */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
	/* A real kstat was created, or a scratch kiop was zalloc'd. */
	if (bd->d_ksp != NULL) {
		kstat_delete(bd->d_ksp);
		bd->d_ksp = NULL;
	} else {
		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
	}
	cmlb_detach(bd->d_cmlbh, 0);
	cmlb_free_handle(&bd->d_cmlbh);
	if (bd->d_devid)
		ddi_devid_free(bd->d_devid);
	kmem_cache_destroy(bd->d_cache);
	mutex_destroy(&bd->d_iomutex);
	mutex_destroy(&bd->d_ocmutex);
	mutex_destroy(&bd->d_statemutex);
	cv_destroy(&bd->d_statecv);
	list_destroy(&bd->d_waitq);
	list_destroy(&bd->d_runq);
	ddi_soft_state_free(bd_state, ddi_get_instance(dip));
	return (DDI_SUCCESS);
}
 498 
 499 static int
 500 bd_xfer_ctor(void *buf, void *arg, int kmflag)
 501 {
 502         bd_xfer_impl_t  *xi;
 503         bd_t            *bd = arg;
 504         int             (*dcb)(caddr_t);
 505 
 506         if (kmflag == KM_PUSHPAGE || kmflag == KM_SLEEP) {
 507                 dcb = DDI_DMA_SLEEP;
 508         } else {
 509                 dcb = DDI_DMA_DONTWAIT;
 510         }
 511 
 512         xi = buf;
 513         bzero(xi, sizeof (*xi));
 514         xi->i_bd = bd;
 515 
 516         if (bd->d_use_dma) {
 517                 if (ddi_dma_alloc_handle(bd->d_dip, &bd->d_dma, dcb, NULL,
 518                     &xi->i_dmah) != DDI_SUCCESS) {
 519                         return (-1);
 520                 }
 521         }
 522 
 523         return (0);
 524 }
 525 
 526 static void
 527 bd_xfer_dtor(void *buf, void *arg)
 528 {
 529         bd_xfer_impl_t  *xi = buf;
 530 
 531         _NOTE(ARGUNUSED(arg));
 532 
 533         if (xi->i_dmah)
 534                 ddi_dma_free_handle(&xi->i_dmah);
 535         xi->i_dmah = NULL;
 536 }
 537 
 538 static bd_xfer_impl_t *
 539 bd_xfer_alloc(bd_t *bd, struct buf *bp, int (*func)(void *, bd_xfer_t *),
 540     int kmflag)
 541 {
 542         bd_xfer_impl_t          *xi;
 543         int                     rv;
 544         int                     status;
 545         unsigned                dir;
 546         int                     (*cb)(caddr_t);
 547         size_t                  len;
 548         uint32_t                shift;
 549 
 550         if (kmflag == KM_SLEEP) {
 551                 cb = DDI_DMA_SLEEP;
 552         } else {
 553                 cb = DDI_DMA_DONTWAIT;
 554         }
 555 
 556         xi = kmem_cache_alloc(bd->d_cache, kmflag);
 557         if (xi == NULL) {
 558                 bioerror(bp, ENOMEM);
 559                 return (NULL);
 560         }
 561 
 562         ASSERT(bp);
 563 
 564         xi->i_bp = bp;
 565         xi->i_func = func;
 566         xi->i_blkno = bp->b_lblkno;
 567 
 568         if (bp->b_bcount == 0) {
 569                 xi->i_len = 0;
 570                 xi->i_nblks = 0;
 571                 xi->i_kaddr = NULL;
 572                 xi->i_resid = 0;
 573                 xi->i_num_win = 0;
 574                 goto done;
 575         }
 576 
 577         if (bp->b_flags & B_READ) {
 578                 dir = DDI_DMA_READ;
 579                 xi->i_func = bd->d_ops.o_read;
 580         } else {
 581                 dir = DDI_DMA_WRITE;
 582                 xi->i_func = bd->d_ops.o_write;
 583         }
 584 
 585         shift = bd->d_blkshift;
 586         xi->i_blkshift = shift;
 587 
 588         if (!bd->d_use_dma) {
 589                 bp_mapin(bp);
 590                 rv = 0;
 591                 xi->i_offset = 0;
 592                 xi->i_num_win =
 593                     (bp->b_bcount + (bd->d_maxxfer - 1)) / bd->d_maxxfer;
 594                 xi->i_cur_win = 0;
 595                 xi->i_len = min(bp->b_bcount, bd->d_maxxfer);
 596                 xi->i_nblks = xi->i_len >> shift;
 597                 xi->i_kaddr = bp->b_un.b_addr;
 598                 xi->i_resid = bp->b_bcount;
 599         } else {
 600 
 601                 /*
 602                  * We have to use consistent DMA if the address is misaligned.
 603                  */
 604                 if (((bp->b_flags & (B_PAGEIO | B_REMAPPED)) != B_PAGEIO) &&
 605                     ((uintptr_t)bp->b_un.b_addr & 0x7)) {
 606                         dir |= DDI_DMA_CONSISTENT | DDI_DMA_PARTIAL;
 607                 } else {
 608                         dir |= DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
 609                 }
 610 
 611                 status = ddi_dma_buf_bind_handle(xi->i_dmah, bp, dir, cb,
 612                     NULL, &xi->i_dmac, &xi->i_ndmac);
 613                 switch (status) {
 614                 case DDI_DMA_MAPPED:
 615                         xi->i_num_win = 1;
 616                         xi->i_cur_win = 0;
 617                         xi->i_offset = 0;
 618                         xi->i_len = bp->b_bcount;
 619                         xi->i_nblks = xi->i_len >> shift;
 620                         xi->i_resid = bp->b_bcount;
 621                         rv = 0;
 622                         break;
 623                 case DDI_DMA_PARTIAL_MAP:
 624                         xi->i_cur_win = 0;
 625 
 626                         if ((ddi_dma_numwin(xi->i_dmah, &xi->i_num_win) !=
 627                             DDI_SUCCESS) ||
 628                             (ddi_dma_getwin(xi->i_dmah, 0, &xi->i_offset,
 629                             &len, &xi->i_dmac, &xi->i_ndmac) !=
 630                             DDI_SUCCESS) ||
 631                             (P2PHASE(len, shift) != 0)) {
 632                                 (void) ddi_dma_unbind_handle(xi->i_dmah);
 633                                 rv = EFAULT;
 634                                 goto done;
 635                         }
 636                         xi->i_len = len;
 637                         xi->i_nblks = xi->i_len >> shift;
 638                         xi->i_resid = bp->b_bcount;
 639                         rv = 0;
 640                         break;
 641                 case DDI_DMA_NORESOURCES:
 642                         rv = EAGAIN;
 643                         goto done;
 644                 case DDI_DMA_TOOBIG:
 645                         rv = EINVAL;
 646                         goto done;
 647                 case DDI_DMA_NOMAPPING:
 648                 case DDI_DMA_INUSE:
 649                 default:
 650                         rv = EFAULT;
 651                         goto done;
 652                 }
 653         }
 654 
 655 done:
 656         if (rv != 0) {
 657                 kmem_cache_free(bd->d_cache, xi);
 658                 bioerror(bp, rv);
 659                 return (NULL);
 660         }
 661 
 662         return (xi);
 663 }
 664 
 665 static void
 666 bd_xfer_free(bd_xfer_impl_t *xi)
 667 {
 668         if (xi->i_dmah) {
 669                 (void) ddi_dma_unbind_handle(xi->i_dmah);
 670         }
 671         kmem_cache_free(xi->i_bd->d_cache, xi);
 672 }
 673 
 674 static int
 675 bd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
 676 {
 677         dev_t           dev = *devp;
 678         bd_t            *bd;
 679         minor_t         part;
 680         minor_t         inst;
 681         uint64_t        mask;
 682         boolean_t       ndelay;
 683         int             rv;
 684         diskaddr_t      nblks;
 685         diskaddr_t      lba;
 686 
 687         _NOTE(ARGUNUSED(credp));
 688 
 689         part = BDPART(dev);
 690         inst = BDINST(dev);
 691 
 692         if (otyp >= OTYPCNT)
 693                 return (EINVAL);
 694 
 695         ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;
 696 
 697         /*
 698          * Block any DR events from changing the set of registered
 699          * devices while we function.
 700          */
 701         rw_enter(&bd_lock, RW_READER);
 702         if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
 703                 rw_exit(&bd_lock);
 704                 return (ENXIO);
 705         }
 706 
 707         mutex_enter(&bd->d_ocmutex);
 708 
 709         ASSERT(part < 64);
 710         mask = (1U << part);
 711 
 712         bd_update_state(bd);
 713 
 714         if (cmlb_validate(bd->d_cmlbh, 0, 0) != 0) {
 715 
 716                 /* non-blocking opens are allowed to succeed */
 717                 if (!ndelay) {
 718                         rv = ENXIO;
 719                         goto done;
 720                 }
 721         } else if (cmlb_partinfo(bd->d_cmlbh, part, &nblks, &lba,
 722             NULL, NULL, 0) == 0) {
 723 
 724                 /*
 725                  * We read the partinfo, verify valid ranges.  If the
 726                  * partition is invalid, and we aren't blocking or
 727                  * doing a raw access, then fail. (Non-blocking and
 728                  * raw accesses can still succeed to allow a disk with
 729                  * bad partition data to opened by format and fdisk.)
 730                  */
 731                 if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
 732                         rv = ENXIO;
 733                         goto done;
 734                 }
 735         } else if (!ndelay) {
 736                 /*
 737                  * cmlb_partinfo failed -- invalid partition or no
 738                  * disk label.
 739                  */
 740                 rv = ENXIO;
 741                 goto done;
 742         }
 743 
 744         if ((flag & FWRITE) && bd->d_rdonly) {
 745                 rv = EROFS;
 746                 goto done;
 747         }
 748 
 749         if ((bd->d_open_excl) & (mask)) {
 750                 rv = EBUSY;
 751                 goto done;
 752         }
 753         if (flag & FEXCL) {
 754                 if (bd->d_open_lyr[part]) {
 755                         rv = EBUSY;
 756                         goto done;
 757                 }
 758                 for (int i = 0; i < OTYP_LYR; i++) {
 759                         if (bd->d_open_reg[i] & mask) {
 760                                 rv = EBUSY;
 761                                 goto done;
 762                         }
 763                 }
 764         }
 765 
 766         if (otyp == OTYP_LYR) {
 767                 bd->d_open_lyr[part]++;
 768         } else {
 769                 bd->d_open_reg[otyp] |= mask;
 770         }
 771         if (flag & FEXCL) {
 772                 bd->d_open_excl |= mask;
 773         }
 774 
 775         rv = 0;
 776 done:
 777         mutex_exit(&bd->d_ocmutex);
 778         rw_exit(&bd_lock);
 779 
 780         return (rv);
 781 }
 782 
 783 static int
 784 bd_close(dev_t dev, int flag, int otyp, cred_t *credp)
 785 {
 786         bd_t            *bd;
 787         minor_t         inst;
 788         minor_t         part;
 789         uint64_t        mask;
 790         boolean_t       last = B_TRUE;
 791 
 792         _NOTE(ARGUNUSED(flag));
 793         _NOTE(ARGUNUSED(credp));
 794 
 795         part = BDPART(dev);
 796         inst = BDINST(dev);
 797 
 798         ASSERT(part < 64);
 799         mask = (1U << part);
 800 
 801         rw_enter(&bd_lock, RW_READER);
 802 
 803         if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
 804                 rw_exit(&bd_lock);
 805                 return (ENXIO);
 806         }
 807 
 808         mutex_enter(&bd->d_ocmutex);
 809         if (bd->d_open_excl & mask) {
 810                 bd->d_open_excl &= ~mask;
 811         }
 812         if (otyp == OTYP_LYR) {
 813                 bd->d_open_lyr[part]--;
 814         } else {
 815                 bd->d_open_reg[otyp] &= ~mask;
 816         }
 817         for (int i = 0; i < 64; i++) {
 818                 if (bd->d_open_lyr[part]) {
 819                         last = B_FALSE;
 820                 }
 821         }
 822         for (int i = 0; last && (i < OTYP_LYR); i++) {
 823                 if (bd->d_open_reg[i]) {
 824                         last = B_FALSE;
 825                 }
 826         }
 827         mutex_exit(&bd->d_ocmutex);
 828 
 829         if (last) {
 830                 cmlb_invalidate(bd->d_cmlbh, 0);
 831         }
 832         rw_exit(&bd_lock);
 833 
 834         return (0);
 835 }
 836 
/*
 * dump(9E): write crash dump data synchronously.  Builds a raw buf and
 * a polled (BD_XFER_POLL) transfer; the parent driver is required to
 * complete polled transfers before returning, so the biowait() below
 * should never actually block.
 */
static int
bd_dump(dev_t dev, caddr_t caddr, daddr_t blkno, int nblk)
{
	minor_t		inst;
	minor_t		part;
	diskaddr_t	pstart;
	diskaddr_t	psize;
	bd_t		*bd;
	bd_xfer_impl_t	*xi;
	buf_t		*bp;
	int		rv;

	rw_enter(&bd_lock, RW_READER);

	part = BDPART(dev);
	inst = BDINST(dev);

	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
		rw_exit(&bd_lock);
		return (ENXIO);
	}
	/*
	 * do cmlb, but do it synchronously unless we already have the
	 * partition (which we probably should.)
	 */
	if (cmlb_partinfo(bd->d_cmlbh, part, &psize, &pstart, NULL, NULL,
	    (void *)1)) {
		rw_exit(&bd_lock);
		return (ENXIO);
	}

	/* Refuse dumps that would run past the end of the partition. */
	if ((blkno + nblk) > psize) {
		rw_exit(&bd_lock);
		return (EINVAL);
	}
	/* We're in panic context; nothing here may sleep. */
	bp = getrbuf(KM_NOSLEEP);
	if (bp == NULL) {
		rw_exit(&bd_lock);
		return (ENOMEM);
	}

	bp->b_bcount = nblk << bd->d_blkshift;
	bp->b_resid = bp->b_bcount;
	bp->b_lblkno = blkno;
	bp->b_un.b_addr = caddr;

	xi = bd_xfer_alloc(bd, bp,  bd->d_ops.o_write, KM_NOSLEEP);
	if (xi == NULL) {
		rw_exit(&bd_lock);
		freerbuf(bp);
		return (ENOMEM);
	}
	/* Translate the partition-relative block to an absolute one. */
	xi->i_blkno = blkno + pstart;
	xi->i_flags = BD_XFER_POLL;
	bd_submit(bd, xi);
	rw_exit(&bd_lock);

	/*
	 * Generally, we should have run this entirely synchronously
	 * at this point and the biowait call should be a no-op.  If
	 * it didn't happen this way, it's a bug in the underlying
	 * driver not honoring BD_XFER_POLL.
	 */
	(void) biowait(bp);
	rv = geterror(bp);
	freerbuf(bp);
	return (rv);
}
 905 
 906 void
 907 bd_minphys(struct buf *bp)
 908 {
 909         minor_t inst;
 910         bd_t    *bd;
 911         inst = BDINST(bp->b_edev);
 912 
 913         bd = ddi_get_soft_state(bd_state, inst);
 914 
 915         /*
 916          * In a non-debug kernel, bd_strategy will catch !bd as
 917          * well, and will fail nicely.
 918          */
 919         ASSERT(bd);
 920 
 921         if (bp->b_bcount > bd->d_maxxfer)
 922                 bp->b_bcount = bd->d_maxxfer;
 923 }
 924 
/*
 * read(9E): raw (character device) read via physio, with transfer
 * sizes bounded by bd_minphys.
 */
static int
bd_read(dev_t dev, struct uio *uio, cred_t *credp)
{
	_NOTE(ARGUNUSED(credp));
	return (physio(bd_strategy, NULL, dev, B_READ, bd_minphys, uio));
}
 931 
/*
 * write(9E): raw (character device) write via physio, with transfer
 * sizes bounded by bd_minphys.
 */
static int
bd_write(dev_t dev, struct uio *uio, cred_t *credp)
{
	_NOTE(ARGUNUSED(credp));
	return (physio(bd_strategy, NULL, dev, B_WRITE, bd_minphys, uio));
}
 938 
/*
 * aread(9E): asynchronous raw read via aphysio; cancellation is not
 * supported (anocancel).
 */
static int
bd_aread(dev_t dev, struct aio_req *aio, cred_t *credp)
{
	_NOTE(ARGUNUSED(credp));
	return (aphysio(bd_strategy, anocancel, dev, B_READ, bd_minphys, aio));
}
 945 
/*
 * awrite(9E): asynchronous raw write via aphysio; cancellation is not
 * supported (anocancel).
 */
static int
bd_awrite(dev_t dev, struct aio_req *aio, cred_t *credp)
{
	_NOTE(ARGUNUSED(credp));
	return (aphysio(bd_strategy, anocancel, dev, B_WRITE, bd_minphys, aio));
}
 952 
/*
 * strategy(9E): validate the request against the partition map, clip
 * transfers that extend past the end of the partition, then queue the
 * transfer for the underlying driver.  All failures are reported via
 * bioerror()/biodone(); the return value is always 0 per convention.
 */
static int
bd_strategy(struct buf *bp)
{
	minor_t		inst;
	minor_t		part;
	bd_t		*bd;
	diskaddr_t	p_lba;
	diskaddr_t	p_nblks;
	diskaddr_t	b_nblks;
	bd_xfer_impl_t	*xi;
	uint32_t	shift;
	int		(*func)(void *, bd_xfer_t *);

	part = BDPART(bp->b_edev);
	inst = BDINST(bp->b_edev);

	ASSERT(bp);

	bp->b_resid = bp->b_bcount;

	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	/* Look up the partition's start LBA and size. */
	if (cmlb_partinfo(bd->d_cmlbh, part, &p_nblks, &p_lba,
	    NULL, NULL, 0)) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	shift = bd->d_blkshift;

	/*
	 * Reject requests that are not a whole number of blocks, or
	 * that start beyond the end of the partition.
	 */
	if ((P2PHASE(bp->b_bcount, (1U << shift)) != 0) ||
	    (bp->b_lblkno > p_nblks)) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}
	b_nblks = bp->b_bcount >> shift;
	/* Zero-length I/O, or a request starting exactly at EOF: done. */
	if ((bp->b_lblkno == p_nblks) || (bp->b_bcount == 0)) {
		biodone(bp);
		return (0);
	}

	/* Clip a transfer that extends past the end of the partition. */
	if ((b_nblks + bp->b_lblkno) > p_nblks) {
		bp->b_resid = ((bp->b_lblkno + b_nblks - p_nblks) << shift);
		bp->b_bcount -= bp->b_resid;
	} else {
		bp->b_resid = 0;
	}
	func = (bp->b_flags & B_READ) ? bd->d_ops.o_read : bd->d_ops.o_write;

	/* Try a cheap allocation first, then fall back to KM_PUSHPAGE. */
	xi = bd_xfer_alloc(bd, bp, func, KM_NOSLEEP);
	if (xi == NULL) {
		xi = bd_xfer_alloc(bd, bp, func, KM_PUSHPAGE);
	}
	if (xi == NULL) {
		/* bd_request_alloc will have done bioerror */
		biodone(bp);
		return (0);
	}
	/* Convert partition-relative block number to an absolute LBA. */
	xi->i_blkno = bp->b_lblkno + p_lba;

	bd_submit(bd, xi);

	return (0);
}
1023 
/*
 * ioctl(9E): first offer the command to cmlb (label/partition ioctls);
 * anything cmlb declines with ENOTTY is handled here, and unrecognized
 * commands are passed to the parent driver's o_ioctl entry point, if
 * one was supplied.
 */
static int
bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp)
{
	minor_t		inst;
	uint16_t	part;
	bd_t		*bd;
	void		*ptr = (void *)arg;
	int		rv;

	part = BDPART(dev);
	inst = BDINST(dev);

	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
		return (ENXIO);
	}

	rv = cmlb_ioctl(bd->d_cmlbh, dev, cmd, arg, flag, credp, rvalp, 0);
	if (rv != ENOTTY)
		return (rv);

	switch (cmd) {
	case DKIOCGMEDIAINFO: {
		struct dk_minfo minfo;

		/* make sure our state information is current */
		bd_update_state(bd);
		bzero(&minfo, sizeof (minfo));
		minfo.dki_media_type = DK_FIXED_DISK;
		minfo.dki_lbsize = (1U << bd->d_blkshift);
		minfo.dki_capacity = bd->d_numblks;
		if (ddi_copyout(&minfo, ptr, sizeof (minfo), flag))  {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCINFO: {
		/* Controller info is synthesized from the parent nexus. */
		struct dk_cinfo cinfo;
		bzero(&cinfo, sizeof (cinfo));
		cinfo.dki_ctype = DKC_BLKDEV;
		cinfo.dki_cnum = ddi_get_instance(ddi_get_parent(bd->d_dip));
		(void) snprintf(cinfo.dki_cname, sizeof (cinfo.dki_cname),
		    "%s", ddi_driver_name(ddi_get_parent(bd->d_dip)));
		(void) snprintf(cinfo.dki_dname, sizeof (cinfo.dki_dname),
		    "%s", ddi_driver_name(bd->d_dip));
		cinfo.dki_unit = inst;
		cinfo.dki_flags = DKI_FMTVOL;
		cinfo.dki_partition = part;
		cinfo.dki_maxtransfer = bd->d_maxxfer / DEV_BSIZE;
		cinfo.dki_addr = 0;
		cinfo.dki_slave = 0;
		cinfo.dki_space = 0;
		cinfo.dki_prio = 0;
		cinfo.dki_vec = 0;
		if (ddi_copyout(&cinfo, ptr, sizeof (cinfo), flag))  {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCREMOVABLE: {
		int i;
		i = bd->d_removable ? 1 : 0;
		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCHOTPLUGGABLE: {
		int i;
		i = bd->d_hotpluggable ? 1 : 0;
		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCREADONLY: {
		int i;
		i = bd->d_rdonly ? 1 : 0;
		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCSTATE: {
		/* Wait for the media state to differ from the passed-in one. */
		enum dkio_state state;
		if (ddi_copyin(ptr, &state, sizeof (state), flag)) {
			return (EFAULT);
		}
		if ((rv = bd_check_state(bd, &state)) != 0) {
			return (rv);
		}
		if (ddi_copyout(&state, ptr, sizeof (state), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCFLUSHWRITECACHE: {
		struct dk_callback *dkc = NULL;

		/* A callback structure is only valid for in-kernel callers. */
		if (flag & FKIOCTL)
			dkc = (void *)arg;

		rv = bd_flush_write_cache(bd, dkc);
		return (rv);
	}

	default:
		if (bd->d_ops.o_ioctl != NULL) {
			rv = bd->d_ops.o_ioctl(dev, cmd, arg, flag, credp,
			    rvalp);
		} else {
			/* Unsupported ioctl ==> return ENOTTY. */
			rv = ENOTTY;
		}
		/* Fall out of the switch and return rv below. */
	}
	return (rv);
}
1141 
1142 static int
1143 bd_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
1144     char *name, caddr_t valuep, int *lengthp)
1145 {
1146         bd_t    *bd;
1147 
1148         bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1149         if (bd == NULL)
1150                 return (ddi_prop_op(dev, dip, prop_op, mod_flags,
1151                     name, valuep, lengthp));
1152 
1153         return (cmlb_prop_op(bd->d_cmlbh, dev, dip, prop_op, mod_flags, name,
1154             valuep, lengthp, BDPART(dev), 0));
1155 }
1156 
1157 
1158 static int
1159 bd_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
1160     size_t length, void *tg_cookie)
1161 {
1162         bd_t            *bd;
1163         buf_t           *bp;
1164         bd_xfer_impl_t  *xi;
1165         int             rv;
1166         int             (*func)(void *, bd_xfer_t *);
1167         int             kmflag;
1168 
1169         /*
1170          * If we are running in polled mode (such as during dump(9e)
1171          * execution), then we cannot sleep for kernel allocations.
1172          */
1173         kmflag = tg_cookie ? KM_NOSLEEP : KM_SLEEP;
1174 
1175         bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1176 
1177         if (P2PHASE(length, (1U << bd->d_blkshift)) != 0) {
1178                 /* We can only transfer whole blocks at a time! */
1179                 return (EINVAL);
1180         }
1181 
1182         if ((bp = getrbuf(kmflag)) == NULL) {
1183                 return (ENOMEM);
1184         }
1185 
1186         switch (cmd) {
1187         case TG_READ:
1188                 bp->b_flags = B_READ;
1189                 func = bd->d_ops.o_read;
1190                 break;
1191         case TG_WRITE:
1192                 bp->b_flags = B_WRITE;
1193                 func = bd->d_ops.o_write;
1194                 break;
1195         default:
1196                 freerbuf(bp);
1197                 return (EINVAL);
1198         }
1199 
1200         bp->b_un.b_addr = bufaddr;
1201         bp->b_bcount = length;
1202         xi = bd_xfer_alloc(bd, bp, func, kmflag);
1203         if (xi == NULL) {
1204                 rv = geterror(bp);
1205                 freerbuf(bp);
1206                 return (rv);
1207         }
1208         xi->i_flags = tg_cookie ? BD_XFER_POLL : 0;
1209         xi->i_blkno = start;
1210         bd_submit(bd, xi);
1211         (void) biowait(bp);
1212         rv = geterror(bp);
1213         freerbuf(bp);
1214 
1215         return (rv);
1216 }
1217 
/*
 * cmlb tg_getinfo callback: report capacity, block size, and media
 * attributes.  Geometry requests are declined (ENOTTY) so that cmlb
 * fabricates a synthetic geometry itself.
 *
 * NOTE(review): bd from ddi_get_soft_state() is not checked for NULL;
 * presumably cmlb only invokes this after attach completes — confirm.
 */
static int
bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
{
	bd_t		*bd;

	_NOTE(ARGUNUSED(tg_cookie));
	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));

	switch (cmd) {
	case TG_GETPHYGEOM:
	case TG_GETVIRTGEOM:
		/*
		 * We don't have any "geometry" as such, let cmlb
		 * fabricate something.
		 */
		return (ENOTTY);

	case TG_GETCAPACITY:
		/* Refresh cached media state before reporting capacity. */
		bd_update_state(bd);
		*(diskaddr_t *)arg = bd->d_numblks;
		return (0);

	case TG_GETBLOCKSIZE:
		*(uint32_t *)arg = (1U << bd->d_blkshift);
		return (0);

	case TG_GETATTR:
		/*
		 * It turns out that cmlb really doesn't do much for
		 * non-writable media, but lets make the information
		 * available for it in case it does more in the
		 * future.  (The value is currently used for
		 * triggering special behavior for CD-ROMs.)
		 */
		bd_update_state(bd);
		((tg_attribute_t *)arg)->media_is_writable =
		    bd->d_rdonly ? B_FALSE : B_TRUE;
		return (0);

	default:
		return (EINVAL);
	}
}
1261 
1262 
1263 static void
1264 bd_sched(bd_t *bd)
1265 {
1266         bd_xfer_impl_t  *xi;
1267         struct buf      *bp;
1268         int             rv;
1269 
1270         mutex_enter(&bd->d_iomutex);
1271 
1272         while ((bd->d_qactive < bd->d_qsize) &&
1273             ((xi = list_remove_head(&bd->d_waitq)) != NULL)) {
1274                 bd->d_qactive++;
1275                 kstat_waitq_to_runq(bd->d_kiop);
1276                 list_insert_tail(&bd->d_runq, xi);
1277 
1278                 /*
1279                  * Submit the job to the driver.  We drop the I/O mutex
1280                  * so that we can deal with the case where the driver
1281                  * completion routine calls back into us synchronously.
1282                  */
1283 
1284                 mutex_exit(&bd->d_iomutex);
1285 
1286                 rv = xi->i_func(bd->d_private, &xi->i_public);
1287                 if (rv != 0) {
1288                         bp = xi->i_bp;
1289                         bd_xfer_free(xi);
1290                         bioerror(bp, rv);
1291                         biodone(bp);
1292 
1293                         mutex_enter(&bd->d_iomutex);
1294                         bd->d_qactive--;
1295                         kstat_runq_exit(bd->d_kiop);
1296                         list_remove(&bd->d_runq, xi);
1297                 } else {
1298                         mutex_enter(&bd->d_iomutex);
1299                 }
1300         }
1301 
1302         mutex_exit(&bd->d_iomutex);
1303 }
1304 
/*
 * Queue a transfer on the wait queue and kick the scheduler; the
 * actual hand-off to the driver happens in bd_sched().
 */
static void
bd_submit(bd_t *bd, bd_xfer_impl_t *xi)
{
	mutex_enter(&bd->d_iomutex);
	list_insert_tail(&bd->d_waitq, xi);
	kstat_waitq_enter(bd->d_kiop);
	mutex_exit(&bd->d_iomutex);

	bd_sched(bd);
}
1315 
1316 static void
1317 bd_runq_exit(bd_xfer_impl_t *xi, int err)
1318 {
1319         bd_t    *bd = xi->i_bd;
1320         buf_t   *bp = xi->i_bp;
1321 
1322         mutex_enter(&bd->d_iomutex);
1323         bd->d_qactive--;
1324         kstat_runq_exit(bd->d_kiop);
1325         list_remove(&bd->d_runq, xi);
1326         mutex_exit(&bd->d_iomutex);
1327 
1328         if (err == 0) {
1329                 if (bp->b_flags & B_READ) {
1330                         bd->d_kiop->reads++;
1331                         bd->d_kiop->nread += (bp->b_bcount - xi->i_resid);
1332                 } else {
1333                         bd->d_kiop->writes++;
1334                         bd->d_kiop->nwritten += (bp->b_bcount - xi->i_resid);
1335                 }
1336         }
1337         bd_sched(bd);
1338 }
1339 
/*
 * Refresh cached media state (block size, capacity, read-only flag)
 * from the driver's o_media_info entry point, broadcast a state change
 * to DKIOCSTATE waiters, and (re)validate or invalidate the cmlb label
 * when the media changed size or presence.  The cmlb calls are made
 * after dropping d_statemutex to avoid lock-ordering issues.
 */
static void
bd_update_state(bd_t *bd)
{
	enum	dkio_state	state;
	bd_media_t		media;
	boolean_t		docmlb = B_FALSE;

	bzero(&media, sizeof (media));

	mutex_enter(&bd->d_statemutex);
	if (bd->d_ops.o_media_info(bd->d_private, &media) == 0) {
		if ((1U << bd->d_blkshift) != media.m_blksize) {
			/*
			 * Block size changed.  Sanity-check it: at least
			 * 512 bytes, a power of two, and evenly dividing
			 * the maximum transfer size.
			 */
			if ((media.m_blksize < 512) ||
			    (!ISP2(media.m_blksize)) ||
			    (P2PHASE(bd->d_maxxfer, media.m_blksize))) {
				cmn_err(CE_WARN,
				    "%s%d: Invalid media block size (%d)",
				    ddi_driver_name(bd->d_dip),
				    ddi_get_instance(bd->d_dip),
				    media.m_blksize);
				/*
				 * We can't use the media, treat it as
				 * not present.
				 */
				state = DKIO_EJECTED;
				bd->d_numblks = 0;
			} else {
				bd->d_blkshift = ddi_ffs(media.m_blksize) - 1;
				bd->d_numblks = media.m_nblks;
				bd->d_rdonly = media.m_readonly;
				state = DKIO_INSERTED;
			}

			/* Device size changed */
			docmlb = B_TRUE;

		} else {
			if (bd->d_numblks != media.m_nblks) {
				/* Device size changed */
				docmlb = B_TRUE;
			}
			bd->d_numblks = media.m_nblks;
			bd->d_rdonly = media.m_readonly;
			state = DKIO_INSERTED;
		}

	} else {
		/* Driver reports no usable media. */
		bd->d_numblks = 0;
		state = DKIO_EJECTED;
	}
	if (state != bd->d_state) {
		/* Wake anyone blocked in bd_check_state(). */
		bd->d_state = state;
		cv_broadcast(&bd->d_statecv);
		docmlb = B_TRUE;
	}
	mutex_exit(&bd->d_statemutex);

	if (docmlb) {
		if (state == DKIO_INSERTED) {
			(void) cmlb_validate(bd->d_cmlbh, 0, 0);
		} else {
			cmlb_invalidate(bd->d_cmlbh, 0);
		}
	}
}
1405 
/*
 * Support for DKIOCSTATE: block until the device's media state differs
 * from *state, then return the new state through *state.  Polls the
 * driver once per second while waiting; returns EINTR if the wait is
 * interrupted by a signal.
 */
static int
bd_check_state(bd_t *bd, enum dkio_state *state)
{
	clock_t		when;

	for (;;) {

		/* Poll the driver so d_state reflects reality. */
		bd_update_state(bd);

		mutex_enter(&bd->d_statemutex);

		if (bd->d_state != *state) {
			*state = bd->d_state;
			mutex_exit(&bd->d_statemutex);
			break;
		}

		/* Wait up to one second for a state-change broadcast. */
		when = drv_usectohz(1000000);
		if (cv_reltimedwait_sig(&bd->d_statecv, &bd->d_statemutex,
		    when, TR_CLOCK_TICK) == 0) {
			/* Zero return means a signal interrupted the wait. */
			mutex_exit(&bd->d_statemutex);
			return (EINTR);
		}

		mutex_exit(&bd->d_statemutex);
	}

	return (0);
}
1435 
1436 static int
1437 bd_flush_write_cache_done(struct buf *bp)
1438 {
1439         struct dk_callback *dc = (void *)bp->b_private;
1440 
1441         (*dc->dkc_callback)(dc->dkc_cookie, geterror(bp));
1442         kmem_free(dc, sizeof (*dc));
1443         freerbuf(bp);
1444         return (0);
1445 }
1446 
/*
 * Issue a write-cache flush via the driver's o_sync_cache entry point.
 * If the caller supplied a dk_callback, the flush is asynchronous and
 * the callback fires from bd_flush_write_cache_done(); otherwise the
 * flush is performed synchronously and its status is returned.
 */
static int
bd_flush_write_cache(bd_t *bd, struct dk_callback *dkc)
{
	buf_t			*bp;
	struct dk_callback	*dc;
	bd_xfer_impl_t		*xi;
	int			rv;

	if (bd->d_ops.o_sync_cache == NULL) {
		/* Driver has no cache-flush support. */
		return (ENOTSUP);
	}
	/* NOTE(review): getrbuf(KM_SLEEP) should not fail; check is defensive. */
	if ((bp = getrbuf(KM_SLEEP)) == NULL) {
		return (ENOMEM);
	}
	/* A flush carries no data payload. */
	bp->b_resid = 0;
	bp->b_bcount = 0;

	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_sync_cache, KM_SLEEP);
	if (xi == NULL) {
		rv = geterror(bp);
		freerbuf(bp);
		return (rv);
	}

	/* Make an asynchronous flush, but only if there is a callback */
	if (dkc != NULL && dkc->dkc_callback != NULL) {
		/* Make a private copy of the callback structure */
		dc = kmem_alloc(sizeof (*dc), KM_SLEEP);
		*dc = *dkc;
		bp->b_private = dc;
		bp->b_iodone = bd_flush_write_cache_done;

		bd_submit(bd, xi);
		return (0);
	}

	/* In case there is no callback, perform a synchronous flush */
	bd_submit(bd, xi);
	(void) biowait(bp);
	rv = geterror(bp);
	freerbuf(bp);

	return (rv);
}
1491 
1492 /*
1493  * Nexus support.
1494  */
1495 int
1496 bd_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
1497     void *arg, void *result)
1498 {
1499         bd_handle_t     hdl;
1500 
1501         switch (ctlop) {
1502         case DDI_CTLOPS_REPORTDEV:
1503                 cmn_err(CE_CONT, "?Block device: %s@%s, %s%d\n",
1504                     ddi_node_name(rdip), ddi_get_name_addr(rdip),
1505                     ddi_driver_name(rdip), ddi_get_instance(rdip));
1506                 return (DDI_SUCCESS);
1507 
1508         case DDI_CTLOPS_INITCHILD:
1509                 hdl = ddi_get_parent_data((dev_info_t *)arg);
1510                 if (hdl == NULL) {
1511                         return (DDI_NOT_WELL_FORMED);
1512                 }
1513                 ddi_set_name_addr((dev_info_t *)arg, hdl->h_addr);
1514                 return (DDI_SUCCESS);
1515 
1516         case DDI_CTLOPS_UNINITCHILD:
1517                 ddi_set_name_addr((dev_info_t *)arg, NULL);
1518                 ndi_prop_remove_all((dev_info_t *)arg);
1519                 return (DDI_SUCCESS);
1520 
1521         default:
1522                 return (ddi_ctlops(dip, rdip, ctlop, arg, result));
1523         }
1524 }
1525 
1526 /*
1527  * Functions for device drivers.
1528  */
/*
 * Allocate a blkdev handle for a parent driver.  The driver's bd_ops
 * vector is copied into the handle according to its declared version;
 * unsupported versions are rejected with a warning and NULL is
 * returned.  May return NULL if kmflag is KM_NOSLEEP and memory is
 * unavailable.
 */
bd_handle_t
bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag)
{
	bd_handle_t	hdl;

	hdl = kmem_zalloc(sizeof (*hdl), kmflag);
	if (hdl == NULL)
		return (NULL);

	/*
	 * Cheesy versioning handling.  We've only appended members into
	 * bd_ops as we grew from v0 to v1.  Since we zalloc hdl, the
	 * ioctl ops will be NULL anyway.  So for the old version, we
	 * copy over only the v0 elements.
	 */
	switch (ops->o_version) {
	case BD_OPS_VERSION_0:
		/* Don't copy the last pointer in the structure. */
		bcopy(ops, &hdl->h_ops, sizeof (*ops) - sizeof (void *));
		break;
	case BD_OPS_VERSION_1:
		hdl->h_ops = *ops;
		break;
	default:
		kmem_free(hdl, sizeof (*hdl));
		cmn_err(CE_WARN, "Unsupported blkdev ops version %d.\n",
		    ops->o_version);
		return (NULL);
		/* NOTREACHED */
	}
	hdl->h_dma = dma;
	hdl->h_private = private;

	return (hdl);
}
1564 
/*
 * Release a handle previously obtained from bd_alloc_handle().  The
 * caller is responsible for detaching it first (bd_detach_handle).
 */
void
bd_free_handle(bd_handle_t hdl)
{
	kmem_free(hdl, sizeof (*hdl));
}
1570 
/*
 * Create and online a child "blkdev" devinfo node under the parent
 * driver's dip.  The child's unit address is derived from the target
 * (and LUN, if the driver supplies one) reported by o_drive_info.
 * Returns DDI_SUCCESS or DDI_FAILURE.
 */
int
bd_attach_handle(dev_info_t *dip, bd_handle_t hdl)
{
	dev_info_t	*child;
	bd_drive_t	drive;

	/* if drivers don't override this, make it assume none */
	drive.d_lun = -1;
	hdl->h_ops.o_drive_info(hdl->h_private, &drive);

	hdl->h_parent = dip;
	hdl->h_name = "blkdev";

	/* Unit address is "target,lun" when a LUN was reported. */
	if (drive.d_lun >= 0) {
		(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr), "%X,%X",
		    drive.d_target, drive.d_lun);
	} else {
		(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr), "%X",
		    drive.d_target);
	}
	if (ndi_devi_alloc(dip, hdl->h_name, (pnode_t)DEVI_SID_NODEID,
	    &child) != NDI_SUCCESS) {
		cmn_err(CE_WARN, "%s%d: unable to allocate node %s@%s",
		    ddi_driver_name(dip), ddi_get_instance(dip),
		    "blkdev", hdl->h_addr);
		return (DDI_FAILURE);
	}

	/* Let bd_bus_ctl() find the handle at INITCHILD time. */
	ddi_set_parent_data(child, hdl);
	hdl->h_child = child;

	if (ndi_devi_online(child, 0) == NDI_FAILURE) {
		cmn_err(CE_WARN, "%s%d: failed bringing node %s@%s online",
		    ddi_driver_name(dip), ddi_get_instance(dip),
		    hdl->h_name, hdl->h_addr);
		(void) ndi_devi_free(child);
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}
1612 
1613 int
1614 bd_detach_handle(bd_handle_t hdl)
1615 {
1616         int     circ;
1617         int     rv;
1618         char    *devnm;
1619 
1620         if (hdl->h_child == NULL) {
1621                 return (DDI_SUCCESS);
1622         }
1623         ndi_devi_enter(hdl->h_parent, &circ);
1624         if (i_ddi_node_state(hdl->h_child) < DS_INITIALIZED) {
1625                 rv = ddi_remove_child(hdl->h_child, 0);
1626         } else {
1627                 devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
1628                 (void) ddi_deviname(hdl->h_child, devnm);
1629                 (void) devfs_clean(hdl->h_parent, devnm + 1, DV_CLEAN_FORCE);
1630                 rv = ndi_devi_unconfig_one(hdl->h_parent, devnm + 1, NULL,
1631                     NDI_DEVI_REMOVE | NDI_UNCONFIG);
1632                 kmem_free(devnm, MAXNAMELEN + 1);
1633         }
1634         if (rv == 0) {
1635                 hdl->h_child = NULL;
1636         }
1637 
1638         ndi_devi_exit(hdl->h_parent, circ);
1639         return (rv = NDI_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
1640 }
1641 
1642 void
1643 bd_xfer_done(bd_xfer_t *xfer, int err)
1644 {
1645         bd_xfer_impl_t  *xi = (void *)xfer;
1646         buf_t           *bp = xi->i_bp;
1647         int             rv = DDI_SUCCESS;
1648         bd_t            *bd = xi->i_bd;
1649         size_t          len;
1650 
1651         if (err != 0) {
1652                 bd_runq_exit(xi, err);
1653 
1654                 bp->b_resid += xi->i_resid;
1655                 bd_xfer_free(xi);
1656                 bioerror(bp, err);
1657                 biodone(bp);
1658                 return;
1659         }
1660 
1661         xi->i_cur_win++;
1662         xi->i_resid -= xi->i_len;
1663 
1664         if (xi->i_resid == 0) {
1665                 /* Job completed succcessfully! */
1666                 bd_runq_exit(xi, 0);
1667 
1668                 bd_xfer_free(xi);
1669                 biodone(bp);
1670                 return;
1671         }
1672 
1673         xi->i_blkno += xi->i_nblks;
1674 
1675         if (bd->d_use_dma) {
1676                 /* More transfer still pending... advance to next DMA window. */
1677                 rv = ddi_dma_getwin(xi->i_dmah, xi->i_cur_win,
1678                     &xi->i_offset, &len, &xi->i_dmac, &xi->i_ndmac);
1679         } else {
1680                 /* Advance memory window. */
1681                 xi->i_kaddr += xi->i_len;
1682                 xi->i_offset += xi->i_len;
1683                 len = min(bp->b_bcount - xi->i_offset, bd->d_maxxfer);
1684         }
1685 
1686 
1687         if ((rv != DDI_SUCCESS) ||
1688             (P2PHASE(len, (1U << xi->i_blkshift) != 0))) {
1689                 bd_runq_exit(xi, EFAULT);
1690 
1691                 bp->b_resid += xi->i_resid;
1692                 bd_xfer_free(xi);
1693                 bioerror(bp, EFAULT);
1694                 biodone(bp);
1695                 return;
1696         }
1697         xi->i_len = len;
1698         xi->i_nblks = len >> xi->i_blkshift;
1699 
1700         /* Submit next window to hardware. */
1701         rv = xi->i_func(bd->d_private, &xi->i_public);
1702         if (rv != 0) {
1703                 bd_runq_exit(xi, rv);
1704 
1705                 bp->b_resid += xi->i_resid;
1706                 bd_xfer_free(xi);
1707                 bioerror(bp, rv);
1708                 biodone(bp);
1709         }
1710 }
1711 
1712 void
1713 bd_state_change(bd_handle_t hdl)
1714 {
1715         bd_t            *bd;
1716 
1717         if ((bd = hdl->h_bd) != NULL) {
1718                 bd_update_state(bd);
1719         }
1720 }
1721 
/*
 * Install the blkdev nexus bus_ops vector into a parent driver's
 * dev_ops at _init() time.  Only bus_ctl is blkdev-specific; the DMA
 * entry points are the generic DDI passthroughs.
 */
void
bd_mod_init(struct dev_ops *devops)
{
	static struct bus_ops bd_bus_ops = {
		BUSO_REV,		/* busops_rev */
		nullbusmap,		/* bus_map */
		NULL,			/* bus_get_intrspec (OBSOLETE) */
		NULL,			/* bus_add_intrspec (OBSOLETE) */
		NULL,			/* bus_remove_intrspec (OBSOLETE) */
		i_ddi_map_fault,	/* bus_map_fault */
		NULL,			/* bus_dma_map (OBSOLETE) */
		ddi_dma_allochdl,	/* bus_dma_allochdl */
		ddi_dma_freehdl,	/* bus_dma_freehdl */
		ddi_dma_bindhdl,	/* bus_dma_bindhdl */
		ddi_dma_unbindhdl,	/* bus_dma_unbindhdl */
		ddi_dma_flush,		/* bus_dma_flush */
		ddi_dma_win,		/* bus_dma_win */
		ddi_dma_mctl,		/* bus_dma_ctl */
		bd_bus_ctl,		/* bus_ctl */
		ddi_bus_prop_op,	/* bus_prop_op */
		NULL,			/* bus_get_eventcookie */
		NULL,			/* bus_add_eventcall */
		NULL,			/* bus_remove_eventcall */
		NULL,			/* bus_post_event */
		NULL,			/* bus_intr_ctl (OBSOLETE) */
		NULL,			/* bus_config */
		NULL,			/* bus_unconfig */
		NULL,			/* bus_fm_init */
		NULL,			/* bus_fm_fini */
		NULL,			/* bus_fm_access_enter */
		NULL,			/* bus_fm_access_exit */
		NULL,			/* bus_power */
		NULL,			/* bus_intr_op */
	};

	devops->devo_bus_ops = &bd_bus_ops;

	/*
	 * NB: The device driver is free to supply its own
	 * character entry device support.
	 */
}
1764 
/*
 * Undo bd_mod_init() at _fini() time by clearing the bus_ops pointer.
 */
void
bd_mod_fini(struct dev_ops *devops)
{
	devops->devo_bus_ops = NULL;
}