1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2016 Joyent, Inc.
  14  */
  15 
  16 /*
  17  * Support for the eventfd facility, a Linux-borne facility for user-generated
  18  * file descriptor-based events.
  19  */
  20 
  21 #include <sys/ddi.h>
  22 #include <sys/sunddi.h>
  23 #include <sys/eventfd.h>
  24 #include <sys/conf.h>
  25 #include <sys/vmem.h>
  26 #include <sys/sysmacros.h>
  27 #include <sys/filio.h>
  28 #include <sys/stat.h>
  29 #include <sys/file.h>
  30 
  31 struct eventfd_state;
  32 typedef struct eventfd_state eventfd_state_t;
  33 
  34 struct eventfd_state {
  35         kmutex_t efd_lock;                      /* lock protecting state */
  36         boolean_t efd_semaphore;                /* boolean: sema. semantics */
  37         kcondvar_t efd_cv;                      /* condvar */
  38         pollhead_t efd_pollhd;                  /* poll head */
  39         uint64_t efd_value;                     /* value */
  40         size_t efd_bwriters;                    /* count of blocked writers */
  41         eventfd_state_t *efd_next;              /* next state on global list */
  42 };
  43 
/*
 * Internal global variables.
 *
 * eventfd_lock is held by open, close, attach and detach while they allocate
 * or free minor numbers and soft state and while they edit the global list of
 * per-instance state.
 */
static kmutex_t         eventfd_lock;           /* lock protecting the globals below */
static dev_info_t       *eventfd_devi;          /* device info; set in attach, cleared in detach */
static vmem_t           *eventfd_minor;         /* arena from which per-open minors are allocated */
static void             *eventfd_softstate;     /* soft state handle, indexed by minor number */
static eventfd_state_t  *eventfd_state;         /* head of list of state, linked by efd_next */
  52 
  53 /*ARGSUSED*/
  54 static int
  55 eventfd_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
  56 {
  57         eventfd_state_t *state;
  58         major_t major = getemajor(*devp);
  59         minor_t minor = getminor(*devp);
  60 
  61         if (minor != EVENTFDMNRN_EVENTFD)
  62                 return (ENXIO);
  63 
  64         mutex_enter(&eventfd_lock);
  65 
  66         minor = (minor_t)(uintptr_t)vmem_alloc(eventfd_minor, 1,
  67             VM_BESTFIT | VM_SLEEP);
  68 
  69         if (ddi_soft_state_zalloc(eventfd_softstate, minor) != DDI_SUCCESS) {
  70                 vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1);
  71                 mutex_exit(&eventfd_lock);
  72                 return (NULL);
  73         }
  74 
  75         state = ddi_get_soft_state(eventfd_softstate, minor);
  76         *devp = makedevice(major, minor);
  77 
  78         state->efd_next = eventfd_state;
  79         eventfd_state = state;
  80 
  81         mutex_exit(&eventfd_lock);
  82 
  83         return (0);
  84 }
  85 
  86 /*ARGSUSED*/
  87 static int
  88 eventfd_read(dev_t dev, uio_t *uio, cred_t *cr)
  89 {
  90         eventfd_state_t *state;
  91         minor_t minor = getminor(dev);
  92         uint64_t val, oval;
  93         int err;
  94 
  95         if (uio->uio_resid < sizeof (val))
  96                 return (EINVAL);
  97 
  98         state = ddi_get_soft_state(eventfd_softstate, minor);
  99 
 100         mutex_enter(&state->efd_lock);
 101 
 102         while (state->efd_value == 0) {
 103                 if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
 104                         mutex_exit(&state->efd_lock);
 105                         return (EAGAIN);
 106                 }
 107 
 108                 if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
 109                         mutex_exit(&state->efd_lock);
 110                         return (EINTR);
 111                 }
 112         }
 113 
 114         /*
 115          * We have a non-zero value and we own the lock; our behavior now
 116          * depends on whether or not EFD_SEMAPHORE was set when the eventfd
 117          * was created.
 118          */
 119         val = oval = state->efd_value;
 120 
 121         if (state->efd_semaphore) {
 122                 state->efd_value--;
 123                 val = 1;
 124         } else {
 125                 state->efd_value = 0;
 126         }
 127 
 128         err = uiomove(&val, sizeof (val), UIO_READ, uio);
 129 
 130         /*
 131          * Wake any writers blocked on this eventfd as this read operation may
 132          * have created adequate capacity for their values.
 133          */
 134         if (state->efd_bwriters != 0) {
 135                 cv_broadcast(&state->efd_cv);
 136         }
 137         mutex_exit(&state->efd_lock);
 138 
 139         /*
 140          * It is necessary to emit POLLOUT events only when the eventfd
 141          * transitions from EVENTFD_VALMAX to a lower value.  At all other
 142          * times, it is already considered writable by poll.
 143          */
 144         if (oval == EVENTFD_VALMAX) {
 145                 pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT);
 146         }
 147 
 148         return (err);
 149 }
 150 
 151 /*ARGSUSED*/
 152 static int
 153 eventfd_write(dev_t dev, struct uio *uio, cred_t *credp)
 154 {
 155         eventfd_state_t *state;
 156         minor_t minor = getminor(dev);
 157         uint64_t val, oval;
 158         int err;
 159 
 160         if (uio->uio_resid < sizeof (val))
 161                 return (EINVAL);
 162 
 163         if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0)
 164                 return (err);
 165 
 166         if (val > EVENTFD_VALMAX)
 167                 return (EINVAL);
 168 
 169         state = ddi_get_soft_state(eventfd_softstate, minor);
 170 
 171         mutex_enter(&state->efd_lock);
 172 
 173         while (val > EVENTFD_VALMAX - state->efd_value) {
 174                 if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
 175                         mutex_exit(&state->efd_lock);
 176                         return (EAGAIN);
 177                 }
 178 
 179                 state->efd_bwriters++;
 180                 if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
 181                         state->efd_bwriters--;
 182                         mutex_exit(&state->efd_lock);
 183                         return (EINTR);
 184                 }
 185                 state->efd_bwriters--;
 186         }
 187 
 188         /*
 189          * We now know that we can add the value without overflowing.
 190          */
 191         state->efd_value = (oval = state->efd_value) + val;
 192 
 193         /*
 194          * If the value was previously "empty", notify blocked readers that
 195          * data is available.
 196          */
 197         if (oval == 0) {
 198                 cv_broadcast(&state->efd_cv);
 199         }
 200         mutex_exit(&state->efd_lock);
 201 
 202         /*
 203          * Notify pollers as well if the eventfd is now readable.
 204          */
 205         if (oval == 0) {
 206                 pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN);
 207         }
 208 
 209         return (0);
 210 }
 211 
 212 /*ARGSUSED*/
 213 static int
 214 eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
 215     struct pollhead **phpp)
 216 {
 217         eventfd_state_t *state;
 218         minor_t minor = getminor(dev);
 219         short revents = 0;
 220 
 221         state = ddi_get_soft_state(eventfd_softstate, minor);
 222 
 223         mutex_enter(&state->efd_lock);
 224 
 225         if (state->efd_value > 0)
 226                 revents |= POLLRDNORM | POLLIN;
 227 
 228         if (state->efd_value < EVENTFD_VALMAX)
 229                 revents |= POLLWRNORM | POLLOUT;
 230 
 231         if (!(*reventsp = revents & events) && !anyyet)
 232                 *phpp = &state->efd_pollhd;
 233 
 234         mutex_exit(&state->efd_lock);
 235 
 236         return (0);
 237 }
 238 
 239 /*ARGSUSED*/
 240 static int
 241 eventfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
 242 {
 243         eventfd_state_t *state;
 244         minor_t minor = getminor(dev);
 245 
 246         state = ddi_get_soft_state(eventfd_softstate, minor);
 247 
 248         switch (cmd) {
 249         case EVENTFDIOC_SEMAPHORE: {
 250                 mutex_enter(&state->efd_lock);
 251                 state->efd_semaphore ^= 1;
 252                 mutex_exit(&state->efd_lock);
 253 
 254                 return (0);
 255         }
 256 
 257         default:
 258                 break;
 259         }
 260 
 261         return (ENOTTY);
 262 }
 263 
 264 /*ARGSUSED*/
 265 static int
 266 eventfd_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
 267 {
 268         eventfd_state_t *state, **sp;
 269         minor_t minor = getminor(dev);
 270 
 271         state = ddi_get_soft_state(eventfd_softstate, minor);
 272 
 273         if (state->efd_pollhd.ph_list != NULL) {
 274                 pollwakeup(&state->efd_pollhd, POLLERR);
 275                 pollhead_clean(&state->efd_pollhd);
 276         }
 277 
 278         mutex_enter(&eventfd_lock);
 279 
 280         /*
 281          * Remove our state from our global list.
 282          */
 283         for (sp = &eventfd_state; *sp != state; sp = &((*sp)->efd_next))
 284                 VERIFY(*sp != NULL);
 285 
 286         *sp = (*sp)->efd_next;
 287 
 288         ddi_soft_state_free(eventfd_softstate, minor);
 289         vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1);
 290 
 291         mutex_exit(&eventfd_lock);
 292 
 293         return (0);
 294 }
 295 
 296 static int
 297 eventfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
 298 {
 299         switch (cmd) {
 300         case DDI_ATTACH:
 301                 break;
 302 
 303         case DDI_RESUME:
 304                 return (DDI_SUCCESS);
 305 
 306         default:
 307                 return (DDI_FAILURE);
 308         }
 309 
 310         mutex_enter(&eventfd_lock);
 311 
 312         if (ddi_soft_state_init(&eventfd_softstate,
 313             sizeof (eventfd_state_t), 0) != 0) {
 314                 cmn_err(CE_NOTE, "/dev/eventfd failed to create soft state");
 315                 mutex_exit(&eventfd_lock);
 316                 return (DDI_FAILURE);
 317         }
 318 
 319         if (ddi_create_minor_node(devi, "eventfd", S_IFCHR,
 320             EVENTFDMNRN_EVENTFD, DDI_PSEUDO, NULL) == DDI_FAILURE) {
 321                 cmn_err(CE_NOTE, "/dev/eventfd couldn't create minor node");
 322                 ddi_soft_state_fini(&eventfd_softstate);
 323                 mutex_exit(&eventfd_lock);
 324                 return (DDI_FAILURE);
 325         }
 326 
 327         ddi_report_dev(devi);
 328         eventfd_devi = devi;
 329 
 330         eventfd_minor = vmem_create("eventfd_minor", (void *)EVENTFDMNRN_CLONE,
 331             UINT32_MAX - EVENTFDMNRN_CLONE, 1, NULL, NULL, NULL, 0,
 332             VM_SLEEP | VMC_IDENTIFIER);
 333 
 334         mutex_exit(&eventfd_lock);
 335 
 336         return (DDI_SUCCESS);
 337 }
 338 
 339 /*ARGSUSED*/
 340 static int
 341 eventfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 342 {
 343         switch (cmd) {
 344         case DDI_DETACH:
 345                 break;
 346 
 347         case DDI_SUSPEND:
 348                 return (DDI_SUCCESS);
 349 
 350         default:
 351                 return (DDI_FAILURE);
 352         }
 353 
 354         mutex_enter(&eventfd_lock);
 355         vmem_destroy(eventfd_minor);
 356 
 357         ddi_remove_minor_node(eventfd_devi, NULL);
 358         eventfd_devi = NULL;
 359 
 360         ddi_soft_state_fini(&eventfd_softstate);
 361         mutex_exit(&eventfd_lock);
 362 
 363         return (DDI_SUCCESS);
 364 }
 365 
 366 /*ARGSUSED*/
 367 static int
 368 eventfd_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
 369 {
 370         int error;
 371 
 372         switch (infocmd) {
 373         case DDI_INFO_DEVT2DEVINFO:
 374                 *result = (void *)eventfd_devi;
 375                 error = DDI_SUCCESS;
 376                 break;
 377         case DDI_INFO_DEVT2INSTANCE:
 378                 *result = (void *)0;
 379                 error = DDI_SUCCESS;
 380                 break;
 381         default:
 382                 error = DDI_FAILURE;
 383         }
 384         return (error);
 385 }
 386 
/*
 * Character/block entry points for the eventfd device.  The driver supplies
 * open, close, read, write, ioctl and poll; the remaining slots are filled
 * with nulldev or nodev.
 */
static struct cb_ops eventfd_cb_ops = {
        eventfd_open,           /* open */
        eventfd_close,          /* close */
        nulldev,                /* strategy */
        nulldev,                /* print */
        nodev,                  /* dump */
        eventfd_read,           /* read */
        eventfd_write,          /* write */
        eventfd_ioctl,          /* ioctl */
        nodev,                  /* devmap */
        nodev,                  /* mmap */
        nodev,                  /* segmap */
        eventfd_poll,           /* poll */
        ddi_prop_op,            /* cb_prop_op */
        0,                      /* streamtab */
        D_NEW | D_MP            /* Driver compatibility flag */
};
 404 
/*
 * Device operations for the eventfd driver: the info, attach and detach
 * routines defined in this file, plus a pointer to the cb_ops table of
 * entry points.
 */
static struct dev_ops eventfd_ops = {
        DEVO_REV,               /* devo_rev */
        0,                      /* refcnt */
        eventfd_info,           /* get_dev_info */
        nulldev,                /* identify */
        nulldev,                /* probe */
        eventfd_attach,         /* attach */
        eventfd_detach,         /* detach */
        nodev,                  /* reset */
        &eventfd_cb_ops,    /* driver operations */
        NULL,                   /* bus operations */
        nodev,                  /* dev power */
        ddi_quiesce_not_needed, /* quiesce */
};
 419 
/*
 * Loadable driver linkage: ties the module name to the eventfd dev_ops.
 */
static struct modldrv modldrv = {
        &mod_driverops,             /* module type (this is a pseudo driver) */
        "eventfd support",      /* name of module */
        &eventfd_ops,               /* driver ops */
};
 425 
/*
 * Module linkage passed to mod_install(), mod_info() and mod_remove() by
 * _init(), _info() and _fini() respectively; its only linkage entry is
 * modldrv, followed by a NULL terminator.
 */
static struct modlinkage modlinkage = {
        MODREV_1,
        (void *)&modldrv,
        NULL
};
 431 
 432 int
 433 _init(void)
 434 {
 435         return (mod_install(&modlinkage));
 436 }
 437 
 438 int
 439 _info(struct modinfo *modinfop)
 440 {
 441         return (mod_info(&modlinkage, modinfop));
 442 }
 443 
 444 int
 445 _fini(void)
 446 {
 447         return (mod_remove(&modlinkage));
 448 }