/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2017 Joyent, Inc.
 */


#include <sys/errno.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/kmem.h>
#include <sys/ddi.h>
#include <sys/stat.h>
#include <sys/sunddi.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/modctl.h>
#include <sys/ddi_impldefs.h>
#include <sys/sysmacros.h>
#include <sys/ddidevmap.h>
#include <sys/policy.h>

#include <sys/vmsystm.h>
#include <vm/hat_i86.h>
#include <vm/hat_pte.h>
#include <vm/seg_kmem.h>
#include <vm/seg_mf.h>

#include <xen/io/blkif_impl.h>
#include <xen/io/blk_common.h>
#include <xen/io/xpvtap.h>


static int xpvtap_open(dev_t *devp, int flag, int otyp, cred_t *cred);
static int xpvtap_close(dev_t devp, int flag, int otyp, cred_t *cred);
static int xpvtap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
    cred_t *cred, int *rval);
static int xpvtap_devmap(dev_t dev, devmap_cookie_t dhp, offset_t off,
    size_t len, size_t *maplen, uint_t model);
static int xpvtap_segmap(dev_t dev, off_t off, struct as *asp, caddr_t *addrp,
    off_t len, unsigned int prot, unsigned int maxprot, unsigned int flags,
    cred_t *cred_p);
static int xpvtap_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp);

static struct cb_ops xpvtap_cb_ops = {
        xpvtap_open,            /* cb_open */
        xpvtap_close,           /* cb_close */
        nodev,                  /* cb_strategy */
        nodev,                  /* cb_print */
        nodev,                  /* cb_dump */
        nodev,                  /* cb_read */
        nodev,                  /* cb_write */
        xpvtap_ioctl,           /* cb_ioctl */
        xpvtap_devmap,          /* cb_devmap */
        nodev,                  /* cb_mmap */
        xpvtap_segmap,          /* cb_segmap */
        xpvtap_chpoll,          /* cb_chpoll */
        ddi_prop_op,            /* cb_prop_op */
        NULL,                   /* cb_stream */
        D_NEW | D_MP | D_64BIT | D_DEVMAP,      /* cb_flag */
        CB_REV
};

static int xpvtap_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
    void **result);
static int xpvtap_attach(dev_info_t *devi, ddi_attach_cmd_t cmd);
static int xpvtap_detach(dev_info_t *devi, ddi_detach_cmd_t cmd);

static struct dev_ops xpvtap_dev_ops = {
        DEVO_REV,               /* devo_rev */
        0,                      /* devo_refcnt */
        xpvtap_getinfo,         /* devo_getinfo */
        nulldev,                /* devo_identify */
        nulldev,                /* devo_probe */
        xpvtap_attach,          /* devo_attach */
        xpvtap_detach,          /* devo_detach */
        nodev,                  /* devo_reset */
        &xpvtap_cb_ops,         /* devo_cb_ops */
        NULL,                   /* devo_bus_ops */
        NULL                    /* power */
};


static struct modldrv xpvtap_modldrv = {
        &mod_driverops,         /* Type of module.  This one is a driver */
        "xpvtap driver",        /* Name of the module. */
        &xpvtap_dev_ops,        /* driver ops */
};

static struct modlinkage xpvtap_modlinkage = {
        MODREV_1,
        (void *) &xpvtap_modldrv,
        NULL
};


void *xpvtap_statep;


static xpvtap_state_t *xpvtap_drv_init(int instance);
static void xpvtap_drv_fini(xpvtap_state_t *state);
static uint_t xpvtap_intr(caddr_t arg);

typedef void (*xpvtap_rs_cleanup_t)(xpvtap_state_t *state, uint_t rs);
static void xpvtap_rs_init(uint_t min_val, uint_t max_val,
    xpvtap_rs_hdl_t *handle);
static void xpvtap_rs_fini(xpvtap_rs_hdl_t *handle);
static int xpvtap_rs_alloc(xpvtap_rs_hdl_t handle, uint_t *rs);
static void xpvtap_rs_free(xpvtap_rs_hdl_t handle, uint_t rs);
static void xpvtap_rs_flush(xpvtap_rs_hdl_t handle,
    xpvtap_rs_cleanup_t callback, void *arg);

static int xpvtap_segmf_register(xpvtap_state_t *state);
static void xpvtap_segmf_unregister(struct as *as, void *arg, uint_t event);

static int xpvtap_user_init(xpvtap_state_t *state);
static void xpvtap_user_fini(xpvtap_state_t *state);
static int xpvtap_user_ring_init(xpvtap_state_t *state);
static void xpvtap_user_ring_fini(xpvtap_state_t *state);
static int xpvtap_user_thread_init(xpvtap_state_t *state);
static void xpvtap_user_thread_fini(xpvtap_state_t *state);
static void xpvtap_user_thread_start(caddr_t arg);
static void xpvtap_user_thread_stop(xpvtap_state_t *state);
static void xpvtap_user_thread(void *arg);

static void xpvtap_user_app_stop(caddr_t arg);

static int xpvtap_user_request_map(xpvtap_state_t *state, blkif_request_t *req,
    uint_t *uid);
static int xpvtap_user_request_push(xpvtap_state_t *state,
    blkif_request_t *req, uint_t uid);
static int xpvtap_user_response_get(xpvtap_state_t *state,
    blkif_response_t *resp, uint_t *uid);
static void xpvtap_user_request_unmap(xpvtap_state_t *state, uint_t uid);


/*
 * _init()
 */
int
_init(void)
{
        int e;

        e = ddi_soft_state_init(&xpvtap_statep, sizeof (xpvtap_state_t), 1);
        if (e != 0) {
                return (e);
        }

        e = mod_install(&xpvtap_modlinkage);
        if (e != 0) {
                ddi_soft_state_fini(&xpvtap_statep);
                return (e);
        }

        return (0);
}


/*
 * _info()
 */
int
_info(struct modinfo *modinfop)
{
        return (mod_info(&xpvtap_modlinkage, modinfop));
}


/*
 * _fini()
 */
int
_fini(void)
{
        int e;

        e = mod_remove(&xpvtap_modlinkage);
        if (e != 0) {
                return (e);
        }

        ddi_soft_state_fini(&xpvtap_statep);

        return (0);
}


/*
 * xpvtap_attach()
 */
static int
xpvtap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
        blk_ringinit_args_t args;
        xpvtap_state_t *state;
        int instance;
        int e;


        switch (cmd) {
        case DDI_ATTACH:
                break;

        case DDI_RESUME:
                return (DDI_SUCCESS);

        default:
                return (DDI_FAILURE);
        }

        /* initialize our state info */
        instance = ddi_get_instance(dip);
        state = xpvtap_drv_init(instance);
        if (state == NULL) {
                return (DDI_FAILURE);
        }
        state->bt_dip = dip;

        /* Initialize the guest ring */
        args.ar_dip = state->bt_dip;
        args.ar_intr = xpvtap_intr;
        args.ar_intr_arg = (caddr_t)state;
        args.ar_ringup = xpvtap_user_thread_start;
        args.ar_ringup_arg = (caddr_t)state;
        args.ar_ringdown = xpvtap_user_app_stop;
        args.ar_ringdown_arg = (caddr_t)state;
        e = blk_ring_init(&args, &state->bt_guest_ring);
        if (e != DDI_SUCCESS) {
                goto attachfail_ringinit;
        }

        /* create the minor node (for ioctl/mmap) */
        e = ddi_create_minor_node(dip, "xpvtap", S_IFCHR, instance,
            DDI_PSEUDO, 0);
        if (e != DDI_SUCCESS) {
                goto attachfail_minor_node;
        }

        /* Report that driver was loaded */
        ddi_report_dev(dip);

        return (DDI_SUCCESS);

attachfail_minor_node:
        blk_ring_fini(&state->bt_guest_ring);
attachfail_ringinit:
        xpvtap_drv_fini(state);
        return (DDI_FAILURE);
}


/*
 * xpvtap_detach()
 */
static int
xpvtap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
        xpvtap_state_t *state;
        int instance;


        instance = ddi_get_instance(dip);
        state = ddi_get_soft_state(xpvtap_statep, instance);
        if (state == NULL) {
                return (DDI_FAILURE);
        }

        switch (cmd) {
        case DDI_DETACH:
                break;

        case DDI_SUSPEND:
        default:
                return (DDI_FAILURE);
        }

        xpvtap_user_thread_stop(state);
        blk_ring_fini(&state->bt_guest_ring);
        xpvtap_drv_fini(state);
        ddi_remove_minor_node(dip, NULL);

        return (DDI_SUCCESS);
}


/*
 * xpvtap_getinfo()
 */
/*ARGSUSED*/
static int
xpvtap_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
        xpvtap_state_t *state;
        int instance;
        dev_t dev;
        int e;


        dev = (dev_t)arg;
        instance = getminor(dev);

        switch (cmd) {
        case DDI_INFO_DEVT2DEVINFO:
                state = ddi_get_soft_state(xpvtap_statep, instance);
                if (state == NULL) {
                        return (DDI_FAILURE);
                }
                *result = (void *)state->bt_dip;
                e = DDI_SUCCESS;
                break;

        case DDI_INFO_DEVT2INSTANCE:
                *result = (void *)(uintptr_t)instance;
                e = DDI_SUCCESS;
                break;

        default:
                e = DDI_FAILURE;
                break;
        }

        return (e);
}


/*
 * xpvtap_open()
 */
/*ARGSUSED*/
static int
xpvtap_open(dev_t *devp, int flag, int otyp, cred_t *cred)
{
        xpvtap_state_t *state;
        int instance;


        if (secpolicy_xvm_control(cred)) {
                return (EPERM);
        }

        instance = getminor(*devp);
        state = ddi_get_soft_state(xpvtap_statep, instance);
        if (state == NULL) {
                return (ENXIO);
        }

        /* we should only be opened once */
        mutex_enter(&state->bt_open.bo_mutex);
        if (state->bt_open.bo_opened) {
                mutex_exit(&state->bt_open.bo_mutex);
                return (EBUSY);
        }
        state->bt_open.bo_opened = B_TRUE;
        mutex_exit(&state->bt_open.bo_mutex);

        /*
         * save the app's address space. We need it for mapping/unmapping
         * grefs since we will be doing that in a separate kernel thread.
         */
        state->bt_map.um_as = curproc->p_as;

        return (0);
}


/*
 * xpvtap_close()
 */
/*ARGSUSED*/
static int
xpvtap_close(dev_t devp, int flag, int otyp, cred_t *cred)
{
        xpvtap_state_t *state;
        int instance;


        instance = getminor(devp);
        state = ddi_get_soft_state(xpvtap_statep, instance);
        if (state == NULL) {
                return (ENXIO);
        }

        /*
         * wake thread so it can cleanup and wait for it to exit so we can
         * be sure it's not in the middle of processing a request/response.
         */
        mutex_enter(&state->bt_thread.ut_mutex);
        state->bt_thread.ut_wake = B_TRUE;
        state->bt_thread.ut_exit = B_TRUE;
        cv_signal(&state->bt_thread.ut_wake_cv);
        if (!state->bt_thread.ut_exit_done) {
                cv_wait(&state->bt_thread.ut_exit_done_cv,
                    &state->bt_thread.ut_mutex);
        }
        ASSERT(state->bt_thread.ut_exit_done);
        mutex_exit(&state->bt_thread.ut_mutex);

        state->bt_map.um_as = NULL;
        state->bt_map.um_guest_pages = NULL;

        /*
         * when the ring is brought down, a userland hotplug script is run
         * which tries to bring the userland app down. xpvtap_user_app_stop()
         * waits a while for the app to exit; notify any thread waiting there
         * that the app has closed the driver.
         */
        mutex_enter(&state->bt_open.bo_mutex);
        ASSERT(state->bt_open.bo_opened);
        state->bt_open.bo_opened = B_FALSE;
        cv_signal(&state->bt_open.bo_exit_cv);
        mutex_exit(&state->bt_open.bo_mutex);

        return (0);
}


/*
 * xpvtap_ioctl()
 */
/*ARGSUSED*/
static int
xpvtap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred,
    int *rval)
{
        xpvtap_state_t *state;
        int instance;


        if (secpolicy_xvm_control(cred)) {
                return (EPERM);
        }

        instance = getminor(dev);
        if (instance == -1) {
                return (EBADF);
        }

        state = ddi_get_soft_state(xpvtap_statep, instance);
        if (state == NULL) {
                return (EBADF);
        }

        switch (cmd) {
        case XPVTAP_IOCTL_RESP_PUSH:
                /*
                 * wake thread, thread handles guest requests and user app
                 * responses.
                 */
                mutex_enter(&state->bt_thread.ut_mutex);
                state->bt_thread.ut_wake = B_TRUE;
                cv_signal(&state->bt_thread.ut_wake_cv);
                mutex_exit(&state->bt_thread.ut_mutex);
                break;

        default:
                cmn_err(CE_WARN, "ioctl(%d) not supported\n", cmd);
                return (ENXIO);
        }

        return (0);
}


/*
 * xpvtap_segmap()
 */
/*ARGSUSED*/
static int
xpvtap_segmap(dev_t dev, off_t off, struct as *asp, caddr_t *addrp,
    off_t len, unsigned int prot, unsigned int maxprot, unsigned int flags,
    cred_t *cred_p)
{
        struct segmf_crargs a;
        xpvtap_state_t *state;
        int instance;
        int e;


        if (secpolicy_xvm_control(cred_p)) {
                return (EPERM);
        }

        instance = getminor(dev);
        state = ddi_get_soft_state(xpvtap_statep, instance);
        if (state == NULL) {
                return (EBADF);
        }

        /* the user app should be doing a MAP_SHARED mapping */
        if ((flags & MAP_TYPE) != MAP_SHARED) {
                return (EINVAL);
        }

        /*
         * if this is the user ring (offset = 0), devmap it (which ends up in
         * xpvtap_devmap). devmap will alloc and map the ring into the
         * app's VA space.
         */
        if (off == 0) {
                e = devmap_setup(dev, (offset_t)off, asp, addrp, (size_t)len,
                    prot, maxprot, flags, cred_p);
                return (e);
        }

        /* this should be the mmap for the gref pages (offset = PAGESIZE) */
        if (off != PAGESIZE) {
                return (EINVAL);
        }

        /* make sure we get the size we're expecting */
        if (len != XPVTAP_GREF_BUFSIZE) {
                return (EINVAL);
        }

        /*
         * reserve user app VA space for the gref pages and use segmf to
         * manage the backing store for the physical memory. segmf will
         * map in/out the grefs and fault them in/out.
         */
        ASSERT(asp == state->bt_map.um_as);
        as_rangelock(asp);
        if ((flags & MAP_FIXED) == 0) {
                map_addr(addrp, len, 0, 0, flags);
                if (*addrp == NULL) {
                        as_rangeunlock(asp);
                        return (ENOMEM);
                }
        } else {
                /* User specified address */
                (void) as_unmap(asp, *addrp, len);
        }
        a.dev = dev;
        a.prot = (uchar_t)prot;
        a.maxprot = (uchar_t)maxprot;
        e = as_map(asp, *addrp, len, segmf_create, &a);
        if (e != 0) {
                as_rangeunlock(asp);
                return (e);
        }
        as_rangeunlock(asp);

        /*
         * Stash the user app's base address and the size of the gref
         * area.
         */
        state->bt_map.um_guest_pages = (caddr_t)*addrp;
        state->bt_map.um_guest_size = (size_t)len;

        /* register an as callback so we can cleanup when the app goes away */
        e = as_add_callback(asp, xpvtap_segmf_unregister, state,
            AS_UNMAP_EVENT, *addrp, len, KM_SLEEP);
        if (e != 0) {
                (void) as_unmap(asp, *addrp, len);
                return (EINVAL);
        }

        /* wake thread to see if there are requests already queued up */
        mutex_enter(&state->bt_thread.ut_mutex);
        state->bt_thread.ut_wake = B_TRUE;
        cv_signal(&state->bt_thread.ut_wake_cv);
        mutex_exit(&state->bt_thread.ut_mutex);

        return (0);
}


/*
 * xpvtap_devmap()
 */
/*ARGSUSED*/
static int
xpvtap_devmap(dev_t dev, devmap_cookie_t dhp, offset_t off, size_t len,
    size_t *maplen, uint_t model)
{
        xpvtap_user_ring_t *usring;
        xpvtap_state_t *state;
        int instance;
        int e;


        instance = getminor(dev);
        state = ddi_get_soft_state(xpvtap_statep, instance);
        if (state == NULL) {
                return (EBADF);
        }

        /* we should only get here if the offset was == 0 */
        if (off != 0) {
                return (EINVAL);
        }

        /* we should only be mapping in one page */
        if (len != PAGESIZE) {
                return (EINVAL);
        }

        /*
         * we already allocated the user ring during driver attach, all we
         * need to do is map it into the user app's VA.
         */
        usring = &state->bt_user_ring;
        e = devmap_umem_setup(dhp, state->bt_dip, NULL, usring->ur_cookie, 0,
            PAGESIZE, PROT_ALL, DEVMAP_DEFAULTS, NULL);
        if (e < 0) {
                return (e);
        }

        /* return the size to complete the devmap */
        *maplen = PAGESIZE;

        return (0);
}


/*
 * xpvtap_chpoll()
 */
static int
xpvtap_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
        xpvtap_user_ring_t *usring;
        xpvtap_state_t *state;
        int instance;


        instance = getminor(dev);
        if (instance == -1) {
                return (EBADF);
        }
        state = ddi_get_soft_state(xpvtap_statep, instance);
        if (state == NULL) {
                return (EBADF);
        }

        if (((events & (POLLIN | POLLRDNORM)) == 0) && !anyyet) {
                return (EINVAL);
        }

        /*
         * if we pushed requests on the user ring since the last poll, wakeup
         * the user app
         */
        *reventsp = 0;
        usring = &state->bt_user_ring;
        if (usring->ur_prod_polled != usring->ur_ring.req_prod_pvt) {

                /*
                 * XXX - is this faster here or xpvtap_user_request_push??
                 * prelim data says here.  Because less membars or because
                 * user thread will spin in poll requests before getting to
                 * responses?
                 */
                RING_PUSH_REQUESTS(&usring->ur_ring);

                usring->ur_prod_polled = usring->ur_ring.sring->req_prod;
                *reventsp = POLLIN | POLLRDNORM;
        }

        if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
                *phpp = &state->bt_pollhead;
        }

        return (0);
}
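

/*
 * The expected userland flow, as a minimal sketch (hypothetical app code,
 * not part of this driver): the app mmap()s the shared ring at offset 0
 * and the gref buffer at offset PAGESIZE, poll()s for POLLIN to learn
 * that requests were pushed, and issues XPVTAP_IOCTL_RESP_PUSH once it
 * has queued its responses on the ring:
 *
 *      ring = mmap(NULL, PAGESIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
 *          fd, 0);
 *      grefs = mmap(NULL, XPVTAP_GREF_BUFSIZE, PROT_READ | PROT_WRITE,
 *          MAP_SHARED, fd, PAGESIZE);
 *      for (;;) {
 *              struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *              (void) poll(&pfd, 1, -1);
 *              ... consume requests, queue responses on the ring ...
 *              (void) ioctl(fd, XPVTAP_IOCTL_RESP_PUSH);
 *      }
 */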


/*
 * xpvtap_drv_init()
 */
static xpvtap_state_t *
xpvtap_drv_init(int instance)
{
        xpvtap_state_t *state;
        int e;


        e = ddi_soft_state_zalloc(xpvtap_statep, instance);
        if (e != DDI_SUCCESS) {
                return (NULL);
        }
        state = ddi_get_soft_state(xpvtap_statep, instance);
        if (state == NULL) {
                goto drvinitfail_get_soft_state;
        }

        state->bt_instance = instance;
        mutex_init(&state->bt_open.bo_mutex, NULL, MUTEX_DRIVER, NULL);
        cv_init(&state->bt_open.bo_exit_cv, NULL, CV_DRIVER, NULL);
        state->bt_open.bo_opened = B_FALSE;
        state->bt_map.um_registered = B_FALSE;

        /* initialize user ring, thread, mapping state */
        e = xpvtap_user_init(state);
        if (e != DDI_SUCCESS) {
                goto drvinitfail_userinit;
        }

        return (state);

drvinitfail_userinit:
        cv_destroy(&state->bt_open.bo_exit_cv);
        mutex_destroy(&state->bt_open.bo_mutex);
drvinitfail_get_soft_state:
        (void) ddi_soft_state_free(xpvtap_statep, instance);
        return (NULL);
}


/*
 * xpvtap_drv_fini()
 */
static void
xpvtap_drv_fini(xpvtap_state_t *state)
{
        xpvtap_user_fini(state);
        cv_destroy(&state->bt_open.bo_exit_cv);
        mutex_destroy(&state->bt_open.bo_mutex);
        (void) ddi_soft_state_free(xpvtap_statep, state->bt_instance);
}


/*
 * xpvtap_intr()
 *    this routine will be called when we have a request on the guest ring.
 */
static uint_t
xpvtap_intr(caddr_t arg)
{
        xpvtap_state_t *state;


        state = (xpvtap_state_t *)arg;

        /* wake thread, thread handles guest requests and user app responses */
        mutex_enter(&state->bt_thread.ut_mutex);
        state->bt_thread.ut_wake = B_TRUE;
        cv_signal(&state->bt_thread.ut_wake_cv);
        mutex_exit(&state->bt_thread.ut_mutex);

        return (DDI_INTR_CLAIMED);
}


/*
 * xpvtap_segmf_register()
 */
static int
xpvtap_segmf_register(xpvtap_state_t *state)
{
        struct seg *seg;
        uint64_t pte_ma;
        struct as *as;
        caddr_t uaddr;
        uint_t pgcnt;
        int i;


        as = state->bt_map.um_as;
        pgcnt = btopr(state->bt_map.um_guest_size);
        uaddr = state->bt_map.um_guest_pages;

        if (pgcnt == 0) {
                return (DDI_FAILURE);
        }

        AS_LOCK_ENTER(as, RW_READER);

        seg = as_findseg(as, state->bt_map.um_guest_pages, 0);
        if ((seg == NULL) || ((uaddr + state->bt_map.um_guest_size) >
            (seg->s_base + seg->s_size))) {
                AS_LOCK_EXIT(as);
                return (DDI_FAILURE);
        }

        /*
         * lock down the htables so the HAT can't steal them. Register the
         * PTE MA's for each gref page with seg_mf so we can do user space
         * gref mappings.
         */
        for (i = 0; i < pgcnt; i++) {
                hat_prepare_mapping(as->a_hat, uaddr, &pte_ma);
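                /*
                 * loading a dummy pfn with HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK
                 * pins a PTE at a stable location; segmf can then rewrite
                 * that PTE (via its machine address, pte_ma) to point at the
                 * granted frames when requests are mapped.
                 */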
                hat_devload(as->a_hat, uaddr, PAGESIZE, (pfn_t)0,
                    PROT_READ | PROT_WRITE | PROT_USER | HAT_UNORDERED_OK,
                    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
                hat_release_mapping(as->a_hat, uaddr);
                segmf_add_gref_pte(seg, uaddr, pte_ma);
                uaddr += PAGESIZE;
        }

        state->bt_map.um_registered = B_TRUE;

        AS_LOCK_EXIT(as);

        return (DDI_SUCCESS);
}


/*
 * xpvtap_segmf_unregister()
 *    as_callback routine
 */
/*ARGSUSED*/
static void
xpvtap_segmf_unregister(struct as *as, void *arg, uint_t event)
{
        xpvtap_state_t *state;
        caddr_t uaddr;
        uint_t pgcnt;
        int i;


        state = (xpvtap_state_t *)arg;
        if (!state->bt_map.um_registered) {
                /* remove the callback (which is this routine) */
                (void) as_delete_callback(as, arg);
                return;
        }

        pgcnt = btopr(state->bt_map.um_guest_size);
        uaddr = state->bt_map.um_guest_pages;

        /* unmap any outstanding req's grefs */
        xpvtap_rs_flush(state->bt_map.um_rs, xpvtap_user_request_unmap, state);

        /* Unlock the gref pages */
        for (i = 0; i < pgcnt; i++) {
                AS_LOCK_ENTER(as, RW_WRITER);
                hat_prepare_mapping(as->a_hat, uaddr, NULL);
                hat_unload(as->a_hat, uaddr, PAGESIZE, HAT_UNLOAD_UNLOCK);
                hat_release_mapping(as->a_hat, uaddr);
                AS_LOCK_EXIT(as);
                uaddr += PAGESIZE;
        }

        /* remove the callback (which is this routine) */
        (void) as_delete_callback(as, arg);

        state->bt_map.um_registered = B_FALSE;
}


/*
 * xpvtap_user_init()
 */
static int
xpvtap_user_init(xpvtap_state_t *state)
{
        xpvtap_user_map_t *map;
        int e;


        map = &state->bt_map;

        /* Setup the ring between the driver and user app */
        e = xpvtap_user_ring_init(state);
        if (e != DDI_SUCCESS) {
                return (DDI_FAILURE);
        }

        /*
         * the user ring can handle BLKIF_RING_SIZE outstanding requests. This
         * is the same number of requests as the guest ring. Initialize the
         * state we use to track request IDs to the user app. These IDs will
         * also identify which group of gref pages correspond with the
         * request.
         */
        xpvtap_rs_init(0, (BLKIF_RING_SIZE - 1), &map->um_rs);

        /*
         * allocate the space to store a copy of each outstanding request. We
         * will need to reference the ID and the number of segments when we
         * get the response from the user app.
         */
        map->um_outstanding_reqs = kmem_zalloc(
            sizeof (*map->um_outstanding_reqs) * BLKIF_RING_SIZE,
            KM_SLEEP);

        /*
         * initialize the thread we use to process guest requests and user
         * responses.
         */
        e = xpvtap_user_thread_init(state);
        if (e != DDI_SUCCESS) {
                goto userinitfail_user_thread_init;
        }

        return (DDI_SUCCESS);

userinitfail_user_thread_init:
        xpvtap_rs_fini(&map->um_rs);
        kmem_free(map->um_outstanding_reqs,
            sizeof (*map->um_outstanding_reqs) * BLKIF_RING_SIZE);
        xpvtap_user_ring_fini(state);
        return (DDI_FAILURE);
}


/*
 * xpvtap_user_ring_init()
 */
static int
xpvtap_user_ring_init(xpvtap_state_t *state)
{
        xpvtap_user_ring_t *usring;


        usring = &state->bt_user_ring;

        /* allocate and initialize the page for the shared user ring */
        usring->ur_sring = (blkif_sring_t *)ddi_umem_alloc(PAGESIZE,
            DDI_UMEM_SLEEP, &usring->ur_cookie);
        SHARED_RING_INIT(usring->ur_sring);
        FRONT_RING_INIT(&usring->ur_ring, usring->ur_sring, PAGESIZE);
        usring->ur_prod_polled = 0;

        return (DDI_SUCCESS);
}


/*
 * xpvtap_user_thread_init()
 */
static int
xpvtap_user_thread_init(xpvtap_state_t *state)
{
        xpvtap_user_thread_t *thread;
        char taskqname[32];


        thread = &state->bt_thread;

        mutex_init(&thread->ut_mutex, NULL, MUTEX_DRIVER, NULL);
        cv_init(&thread->ut_wake_cv, NULL, CV_DRIVER, NULL);
        cv_init(&thread->ut_exit_done_cv, NULL, CV_DRIVER, NULL);
        thread->ut_wake = B_FALSE;
        thread->ut_exit = B_FALSE;
        thread->ut_exit_done = B_TRUE;

        /* create but don't start the user thread */
        (void) sprintf(taskqname, "xpvtap_%d", state->bt_instance);
        thread->ut_taskq = ddi_taskq_create(state->bt_dip, taskqname, 1,
            TASKQ_DEFAULTPRI, 0);
        if (thread->ut_taskq == NULL) {
                goto userinitthrfail_taskq_create;
        }

        return (DDI_SUCCESS);

userinitthrfail_taskq_create:
        cv_destroy(&thread->ut_exit_done_cv);
        cv_destroy(&thread->ut_wake_cv);
        mutex_destroy(&thread->ut_mutex);

        return (DDI_FAILURE);
}


/*
 * xpvtap_user_thread_start()
 */
static void
xpvtap_user_thread_start(caddr_t arg)
{
        xpvtap_user_thread_t *thread;
        xpvtap_state_t *state;
        int e;


        state = (xpvtap_state_t *)arg;
        thread = &state->bt_thread;

        /* start the user thread */
        thread->ut_exit_done = B_FALSE;
        e = ddi_taskq_dispatch(thread->ut_taskq, xpvtap_user_thread, state,
            DDI_SLEEP);
        if (e != DDI_SUCCESS) {
                thread->ut_exit_done = B_TRUE;
                cmn_err(CE_WARN, "Unable to start user thread\n");
        }
}


/*
 * xpvtap_user_thread_stop()
 */
static void
xpvtap_user_thread_stop(xpvtap_state_t *state)
{
        /* wake thread so it can exit */
        mutex_enter(&state->bt_thread.ut_mutex);
        state->bt_thread.ut_wake = B_TRUE;
        state->bt_thread.ut_exit = B_TRUE;
        cv_signal(&state->bt_thread.ut_wake_cv);
        if (!state->bt_thread.ut_exit_done) {
                cv_wait(&state->bt_thread.ut_exit_done_cv,
                    &state->bt_thread.ut_mutex);
        }
        mutex_exit(&state->bt_thread.ut_mutex);
        ASSERT(state->bt_thread.ut_exit_done);
}


/*
 * xpvtap_user_fini()
 */
static void
xpvtap_user_fini(xpvtap_state_t *state)
{
        xpvtap_user_map_t *map;


        map = &state->bt_map;

        xpvtap_user_thread_fini(state);
        xpvtap_rs_fini(&map->um_rs);
        kmem_free(map->um_outstanding_reqs,
            sizeof (*map->um_outstanding_reqs) * BLKIF_RING_SIZE);
        xpvtap_user_ring_fini(state);
}


/*
 * xpvtap_user_ring_fini()
 */
static void
xpvtap_user_ring_fini(xpvtap_state_t *state)
{
        ddi_umem_free(state->bt_user_ring.ur_cookie);
}


/*
 * xpvtap_user_thread_fini()
 */
static void
xpvtap_user_thread_fini(xpvtap_state_t *state)
{
        ddi_taskq_destroy(state->bt_thread.ut_taskq);
        cv_destroy(&state->bt_thread.ut_exit_done_cv);
        cv_destroy(&state->bt_thread.ut_wake_cv);
        mutex_destroy(&state->bt_thread.ut_mutex);
}


/*
 * xpvtap_user_thread()
 */
static void
xpvtap_user_thread(void *arg)
{
        xpvtap_user_thread_t *thread;
        blkif_response_t resp;
        xpvtap_state_t *state;
        blkif_request_t req;
        boolean_t b;
        uint_t uid;
        int e;


        state = (xpvtap_state_t *)arg;
        thread = &state->bt_thread;

xpvtap_thread_start:
        /* See if we are supposed to exit */
        mutex_enter(&thread->ut_mutex);
        if (thread->ut_exit) {
                thread->ut_exit_done = B_TRUE;
                cv_signal(&state->bt_thread.ut_exit_done_cv);
                mutex_exit(&thread->ut_mutex);
                return;
        }

        /*
         * if we aren't supposed to be awake, wait until someone wakes us.
         * when we wake up, check for a kill or someone telling us to exit.
         */
        if (!thread->ut_wake) {
                e = cv_wait_sig(&thread->ut_wake_cv, &thread->ut_mutex);
                if ((e == 0) || (thread->ut_exit)) {
                        thread->ut_exit = B_TRUE;
                        mutex_exit(&thread->ut_mutex);
                        goto xpvtap_thread_start;
                }
        }

        /* if someone didn't wake us, go back to the start of the thread */
        if (!thread->ut_wake) {
                mutex_exit(&thread->ut_mutex);
                goto xpvtap_thread_start;
        }

        /* we are awake */
        thread->ut_wake = B_FALSE;
        mutex_exit(&thread->ut_mutex);

        /* process requests from the guest */
        do {
                /*
                 * check for requests from the guest. if we don't have any,
                 * break out of the loop.
                 */
                e = blk_ring_request_get(state->bt_guest_ring, &req);
                if (e == B_FALSE) {
                        break;
                }

                /* we got a request, map the grefs into the user app's VA */
                e = xpvtap_user_request_map(state, &req, &uid);
                if (e != DDI_SUCCESS) {
                        /*
                         * If we couldn't map the request (e.g. user app hasn't
                         * opened the device yet), requeue it and try again
                         * later
                         */
                        blk_ring_request_requeue(state->bt_guest_ring);
                        break;
                }

                /* push the request to the user app */
                e = xpvtap_user_request_push(state, &req, uid);
                if (e != DDI_SUCCESS) {
                        resp.id = req.id;
                        resp.operation = req.operation;
                        resp.status = BLKIF_RSP_ERROR;
                        blk_ring_response_put(state->bt_guest_ring, &resp);
                }
        } while (!thread->ut_exit);

        /* process responses from the user app */
        do {
                /*
                 * check for responses from the user app. if we don't have any,
                 * break out of the loop.
                 */
                b = xpvtap_user_response_get(state, &resp, &uid);
                if (b != B_TRUE) {
                        break;
                }

                /*
                 * if we got a response, unmap the grefs from the matching
                 * request.
                 */
                xpvtap_user_request_unmap(state, uid);

                /* push the response to the guest */
                blk_ring_response_put(state->bt_guest_ring, &resp);
        } while (!thread->ut_exit);

        goto xpvtap_thread_start;
}


/*
 * xpvtap_user_request_map()
 */
static int
xpvtap_user_request_map(xpvtap_state_t *state, blkif_request_t *req,
    uint_t *uid)
{
        grant_ref_t gref[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct seg *seg;
        struct as *as;
        domid_t domid;
        caddr_t uaddr;
        uint_t flags;
        int i;
        int e;


        domid = xvdi_get_oeid(state->bt_dip);

        as = state->bt_map.um_as;
        if ((as == NULL) || (state->bt_map.um_guest_pages == NULL)) {
                return (DDI_FAILURE);
        }

        /* has to happen after segmap returns */
        if (!state->bt_map.um_registered) {
                /* register the pte's with segmf */
                e = xpvtap_segmf_register(state);
                if (e != DDI_SUCCESS) {
                        return (DDI_FAILURE);
                }
        }

        /* alloc an ID for the user ring */
        e = xpvtap_rs_alloc(state->bt_map.um_rs, uid);
        if (e != DDI_SUCCESS) {
                return (DDI_FAILURE);
        }

        /* if we don't have any segments to map, we're done */
        if ((req->operation == BLKIF_OP_WRITE_BARRIER) ||
            (req->operation == BLKIF_OP_FLUSH_DISKCACHE) ||
            (req->nr_segments == 0)) {
                return (DDI_SUCCESS);
        }

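        /*
         * each uid indexes a fixed per-request window within the gref
         * buffer (a slot of BLKIF_MAX_SEGMENTS_PER_REQUEST pages), so the
         * uid alone determines where this request's grefs are mapped.
         */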
        /* get the app's gref address */
        uaddr = XPVTAP_GREF_REQADDR(state->bt_map.um_guest_pages, *uid);

        AS_LOCK_ENTER(as, RW_READER);
        seg = as_findseg(as, state->bt_map.um_guest_pages, 0);
        if ((seg == NULL) || ((uaddr + mmu_ptob(req->nr_segments)) >
            (seg->s_base + seg->s_size))) {
                AS_LOCK_EXIT(as);
                return (DDI_FAILURE);
        }

        /* if we are reading from disk, we are writing into memory */
        flags = 0;
        if (req->operation == BLKIF_OP_READ) {
                flags |= SEGMF_GREF_WR;
        }

        /* Load the grefs into seg_mf */
        for (i = 0; i < req->nr_segments; i++) {
                gref[i] = req->seg[i].gref;
        }
        (void) segmf_add_grefs(seg, uaddr, flags, gref, req->nr_segments,
            domid);

        AS_LOCK_EXIT(as);

        return (DDI_SUCCESS);
}


/*
 * xpvtap_user_request_push()
 */
static int
xpvtap_user_request_push(xpvtap_state_t *state, blkif_request_t *req,
    uint_t uid)
{
        blkif_request_t *outstanding_req;
        blkif_front_ring_t *uring;
        blkif_request_t *target;
        xpvtap_user_map_t *map;


        uring = &state->bt_user_ring.ur_ring;
        map = &state->bt_map;

        target = RING_GET_REQUEST(uring, uring->req_prod_pvt);

        /*
         * Save a copy of the request from the frontend; it is used for ID
         * mapping and unmap on response/cleanup.
         */
        outstanding_req = &map->um_outstanding_reqs[uid];
        bcopy(req, outstanding_req, sizeof (*outstanding_req));

        /* put the request on the user ring */
        bcopy(req, target, sizeof (*req));
        target->id = (uint64_t)uid;
        uring->req_prod_pvt++;

        pollwakeup(&state->bt_pollhead, POLLIN | POLLRDNORM);

        return (DDI_SUCCESS);
}

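/*
 * xpvtap_user_request_unmap()
 */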
static void
xpvtap_user_request_unmap(xpvtap_state_t *state, uint_t uid)
{
        blkif_request_t *req;
        struct seg *seg;
        struct as *as;
        caddr_t uaddr;
        int e;


        as = state->bt_map.um_as;
        if (as == NULL) {
                return;
        }

        /* get a copy of the original request */
        req = &state->bt_map.um_outstanding_reqs[uid];

        /* unmap the grefs for this request */
        if ((req->operation != BLKIF_OP_WRITE_BARRIER) &&
            (req->operation != BLKIF_OP_FLUSH_DISKCACHE) &&
            (req->nr_segments != 0)) {
                uaddr = XPVTAP_GREF_REQADDR(state->bt_map.um_guest_pages, uid);
                AS_LOCK_ENTER(as, RW_READER);
                seg = as_findseg(as, state->bt_map.um_guest_pages, 0);
                if ((seg == NULL) || ((uaddr + mmu_ptob(req->nr_segments)) >
                    (seg->s_base + seg->s_size))) {
                        AS_LOCK_EXIT(as);
                        xpvtap_rs_free(state->bt_map.um_rs, uid);
                        return;
                }

                e = segmf_release_grefs(seg, uaddr, req->nr_segments);
                if (e != 0) {
                        cmn_err(CE_WARN, "unable to release grefs");
                }

                AS_LOCK_EXIT(as);
        }

        /* free up the user ring id */
        xpvtap_rs_free(state->bt_map.um_rs, uid);
}

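/*
 * xpvtap_user_response_get()
 */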
static int
xpvtap_user_response_get(xpvtap_state_t *state, blkif_response_t *resp,
    uint_t *uid)
{
        blkif_front_ring_t *uring;
        blkif_response_t *target;


        uring = &state->bt_user_ring.ur_ring;

        if (!RING_HAS_UNCONSUMED_RESPONSES(uring)) {
                return (B_FALSE);
        }

        target = RING_GET_RESPONSE(uring, uring->rsp_cons);
        if (target == NULL) {
                return (B_FALSE);
        }

        /* copy out the user app response */
        bcopy(target, resp, sizeof (*resp));
        uring->rsp_cons++;

        /* restore the guest's id from the original request */
        *uid = (uint_t)resp->id;
        resp->id = state->bt_map.um_outstanding_reqs[*uid].id;

        return (B_TRUE);
}


/*
 * xpvtap_user_app_stop()
 */
static void
xpvtap_user_app_stop(caddr_t arg)
{
        xpvtap_state_t *state;
        clock_t rc;

        state = (xpvtap_state_t *)arg;

        /*
         * Give the app 10 secs to exit. If it doesn't exit, it's not a serious
         * problem, we just won't auto-detach the driver.
         */
        mutex_enter(&state->bt_open.bo_mutex);
        if (state->bt_open.bo_opened) {
                rc = cv_reltimedwait(&state->bt_open.bo_exit_cv,
                    &state->bt_open.bo_mutex, drv_usectohz(10000000),
                    TR_CLOCK_TICK);
                if (rc <= 0) {
                        cmn_err(CE_NOTE, "!user process still has driver open, "
                            "deferring detach\n");
                }
        }
        mutex_exit(&state->bt_open.bo_mutex);
}


/*
 * xpvtap_rs_init()
 *    Initialize the resource structure. init() returns a handle to be used
 *    for the rest of the resource functions. This code is written assuming
 *    that min_val will be close to 0. Therefore, we will allocate the free
 *    buffer only taking max_val into account.
 */
static void
xpvtap_rs_init(uint_t min_val, uint_t max_val, xpvtap_rs_hdl_t *handle)
{
        xpvtap_rs_t *rstruct;
        uint_t array_size;
        uint_t index;


        ASSERT(handle != NULL);
        ASSERT(min_val < max_val);

        /* alloc space for resource structure */
        rstruct = kmem_alloc(sizeof (xpvtap_rs_t), KM_SLEEP);

        /*
         * alloc space for the free bitmap (8 bytes per uint64_t). The range
         * is inclusive of max_val, so max_val + 1 bits are needed; round up
         * to a whole 64-bit word.
         */
        rstruct->rs_free_size = (((max_val + 1) + 63) >> 6) * 8;
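        /*
         * e.g. a max_val of 31 yields a single 8-byte word tracking IDs 0
         * through 31, while a max_val of 64 needs two words (65 bits).
         */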
        rstruct->rs_free = kmem_alloc(rstruct->rs_free_size, KM_SLEEP);

        /* Initialize resource structure */
        rstruct->rs_min = min_val;
        rstruct->rs_last = min_val;
        rstruct->rs_max = max_val;
        mutex_init(&rstruct->rs_mutex, NULL, MUTEX_DRIVER, NULL);
        rstruct->rs_flushing = B_FALSE;

        /* Mark all resources as free */
        array_size = rstruct->rs_free_size >> 3;
        for (index = 0; index < array_size; index++) {
                rstruct->rs_free[index] = (uint64_t)0xFFFFFFFFFFFFFFFF;
        }

        /* setup handle which is returned from this function */
        *handle = rstruct;
}


/*
 * xpvtap_rs_fini()
 *    Frees up the space allocated in init().  Notice that a pointer to the
 *    handle is used for the parameter.  fini() will set the handle to NULL
 *    before returning.
 */
static void
xpvtap_rs_fini(xpvtap_rs_hdl_t *handle)
{
        xpvtap_rs_t *rstruct;


        ASSERT(handle != NULL);

        rstruct = (xpvtap_rs_t *)*handle;

        mutex_destroy(&rstruct->rs_mutex);
        kmem_free(rstruct->rs_free, rstruct->rs_free_size);
        kmem_free(rstruct, sizeof (xpvtap_rs_t));

        /* set handle to null.  This helps catch bugs. */
        *handle = NULL;
}


/*
 * xpvtap_rs_alloc()
 *    alloc a resource. If alloc fails, we are out of resources.
 */
static int
xpvtap_rs_alloc(xpvtap_rs_hdl_t handle, uint_t *resource)
{
        xpvtap_rs_t *rstruct;
        uint_t array_idx;
        uint64_t free;
        uint_t index;
        uint_t last;
        uint_t min;
        uint_t max;


        ASSERT(handle != NULL);
        ASSERT(resource != NULL);

        rstruct = (xpvtap_rs_t *)handle;

        mutex_enter(&rstruct->rs_mutex);
        min = rstruct->rs_min;
        max = rstruct->rs_max;

        /*
         * Find a free resource. This will return out of the loop once it finds
         * a free resource. There are a total of 'max'-'min'+1 resources.
         * Performs a round robin allocation.
         */
        for (index = min; index <= max; index++) {

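                /*
                 * each uint64_t word in rs_free tracks 64 resources: the
                 * upper bits of rs_last select the word and the low 6 bits
                 * select the bit within it. A set bit means the resource
                 * is free.
                 */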
                array_idx = rstruct->rs_last >> 6;
                free = rstruct->rs_free[array_idx];
                last = rstruct->rs_last & 0x3F;

                /* if the next resource to check is free */
                if ((free & ((uint64_t)1 << last)) != 0) {
                        /* we are using this resource */
                        *resource = rstruct->rs_last;

                        /* take it out of the free list */
                        rstruct->rs_free[array_idx] &= ~((uint64_t)1 << last);

                        /*
                         * increment the last count so we start checking the
                         * next resource on the next alloc().  Note the rollover
                         * at 'max'+1.
                         */
                        rstruct->rs_last++;
                        if (rstruct->rs_last > max) {
                                rstruct->rs_last = rstruct->rs_min;
                        }

                        /* unlock the resource structure */
                        mutex_exit(&rstruct->rs_mutex);

                        return (DDI_SUCCESS);
                }

                /*
                 * This resource is not free, let's go to the next one. Note
                 * the rollover at 'max'.
                 */
                rstruct->rs_last++;
                if (rstruct->rs_last > max) {
                        rstruct->rs_last = rstruct->rs_min;
                }
        }

        mutex_exit(&rstruct->rs_mutex);

        return (DDI_FAILURE);
}


/*
 * xpvtap_rs_free()
 *    Free the previously alloc'd resource.  Once a resource has been free'd,
 *    it can be used again when alloc is called.
 */
static void
xpvtap_rs_free(xpvtap_rs_hdl_t handle, uint_t resource)
{
        xpvtap_rs_t *rstruct;
        uint_t array_idx;
        uint_t offset;


        ASSERT(handle != NULL);

        rstruct = (xpvtap_rs_t *)handle;
        ASSERT(resource >= rstruct->rs_min);
        ASSERT(resource <= rstruct->rs_max);

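        /*
         * rs_flushing is only set by xpvtap_rs_flush(), which already holds
         * rs_mutex while it runs its cleanup callbacks. Skipping the mutex
         * here avoids recursive entry when this routine is called from a
         * flush callback.
         */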
        if (!rstruct->rs_flushing) {
                mutex_enter(&rstruct->rs_mutex);
        }

        /* Put the resource back in the free list */
        array_idx = resource >> 6;
        offset = resource & 0x3F;
        rstruct->rs_free[array_idx] |= ((uint64_t)1 << offset);

        if (!rstruct->rs_flushing) {
                mutex_exit(&rstruct->rs_mutex);
        }
}


/*
 * xpvtap_rs_flush()
 */
static void
xpvtap_rs_flush(xpvtap_rs_hdl_t handle, xpvtap_rs_cleanup_t callback,
    void *arg)
{
        xpvtap_rs_t *rstruct;
        uint_t array_idx;
        uint64_t free;
        uint_t index;
        uint_t last;
        uint_t min;
        uint_t max;


        ASSERT(handle != NULL);

        rstruct = (xpvtap_rs_t *)handle;

        mutex_enter(&rstruct->rs_mutex);
        min = rstruct->rs_min;
        max = rstruct->rs_max;

        rstruct->rs_flushing = B_TRUE;

        /*
         * for all resources that are not free, call the callback routine to
         * clean them up.
         */
        for (index = min; index <= max; index++) {

                array_idx = rstruct->rs_last >> 6;
                free = rstruct->rs_free[array_idx];
                last = rstruct->rs_last & 0x3F;

                /* if the next resource to check is not free */
                if ((free & ((uint64_t)1 << last)) == 0) {
                        /* call the callback to cleanup */
                        (*callback)(arg, rstruct->rs_last);

                        /* put it back in the free list */
                        rstruct->rs_free[array_idx] |= ((uint64_t)1 << last);
                }

                /* go to the next one. Note the rollover at 'max' */
                rstruct->rs_last++;
                if (rstruct->rs_last > max) {
                        rstruct->rs_last = rstruct->rs_min;
                }
        }

        rstruct->rs_flushing = B_FALSE;
        mutex_exit(&rstruct->rs_mutex);
}