1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2017 Joyent, Inc.
  14  */
  15 
  16 /*
  17  * Support for the eventfd facility, a Linux-borne facility for user-generated
  18  * file descriptor-based events.
  19  */
  20 
  21 #include <sys/ddi.h>
  22 #include <sys/sunddi.h>
  23 #include <sys/eventfd.h>
  24 #include <sys/conf.h>
  25 #include <sys/vmem.h>
  26 #include <sys/sysmacros.h>
  27 #include <sys/filio.h>
  28 #include <sys/stat.h>
  29 #include <sys/file.h>
  30 
  31 struct eventfd_state;
  32 typedef struct eventfd_state eventfd_state_t;
  33 
  34 struct eventfd_state {
  35         kmutex_t efd_lock;                      /* lock protecting state */
  36         boolean_t efd_semaphore;                /* boolean: sema. semantics */
  37         kcondvar_t efd_cv;                      /* condvar */
  38         pollhead_t efd_pollhd;                  /* poll head */
  39         uint64_t efd_value;                     /* value */
  40         size_t efd_bwriters;                    /* count of blocked writers */
  41         eventfd_state_t *efd_next;              /* next state on global list */
  42 };
  43 
  44 /*
  45  * Internal global variables.
  46  */
  47 static kmutex_t         eventfd_lock;           /* lock protecting state */
  48 static dev_info_t       *eventfd_devi;          /* device info */
  49 static vmem_t           *eventfd_minor;         /* minor number arena */
  50 static void             *eventfd_softstate;     /* softstate pointer */
  51 static eventfd_state_t  *eventfd_state;         /* global list of state */
  52 
  53 /*ARGSUSED*/
  54 static int
  55 eventfd_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
  56 {
  57         eventfd_state_t *state;
  58         major_t major = getemajor(*devp);
  59         minor_t minor = getminor(*devp);
  60 
  61         if (minor != EVENTFDMNRN_EVENTFD)
  62                 return (ENXIO);
  63 
  64         mutex_enter(&eventfd_lock);
  65 
  66         minor = (minor_t)(uintptr_t)vmem_alloc(eventfd_minor, 1,
  67             VM_BESTFIT | VM_SLEEP);
  68 
  69         if (ddi_soft_state_zalloc(eventfd_softstate, minor) != DDI_SUCCESS) {
  70                 vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1);
  71                 mutex_exit(&eventfd_lock);
  72                 return (NULL);
  73         }
  74 
  75         state = ddi_get_soft_state(eventfd_softstate, minor);
  76         *devp = makedevice(major, minor);
  77 
  78         state->efd_next = eventfd_state;
  79         eventfd_state = state;
  80 
  81         mutex_exit(&eventfd_lock);
  82 
  83         return (0);
  84 }
  85 
  86 /*ARGSUSED*/
  87 static int
  88 eventfd_read(dev_t dev, uio_t *uio, cred_t *cr)
  89 {
  90         eventfd_state_t *state;
  91         minor_t minor = getminor(dev);
  92         uint64_t val, oval;
  93         int err;
  94 
  95         if (uio->uio_resid < sizeof (val))
  96                 return (EINVAL);
  97 
  98         state = ddi_get_soft_state(eventfd_softstate, minor);
  99 
 100         mutex_enter(&state->efd_lock);
 101 
 102         while (state->efd_value == 0) {
 103                 if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
 104                         mutex_exit(&state->efd_lock);
 105                         return (EAGAIN);
 106                 }
 107 
 108                 if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
 109                         mutex_exit(&state->efd_lock);
 110                         return (EINTR);
 111                 }
 112         }
 113 
 114         /*
 115          * We have a non-zero value and we own the lock; our behavior now
 116          * depends on whether or not EFD_SEMAPHORE was set when the eventfd
 117          * was created.
 118          */
 119         val = oval = state->efd_value;
 120 
 121         if (state->efd_semaphore) {
 122                 state->efd_value--;
 123                 val = 1;
 124         } else {
 125                 state->efd_value = 0;
 126         }
 127 
 128         err = uiomove(&val, sizeof (val), UIO_READ, uio);
 129 
 130         /*
 131          * Wake any writers blocked on this eventfd as this read operation may
 132          * have created adequate capacity for their values.
 133          */
 134         if (state->efd_bwriters != 0) {
 135                 cv_broadcast(&state->efd_cv);
 136         }
 137         mutex_exit(&state->efd_lock);
 138 
 139         /*
 140          * It is necessary to emit POLLOUT events only when the eventfd
 141          * transitions from EVENTFD_VALMAX to a lower value.  At all other
 142          * times, it is already considered writable by poll.
 143          */
 144         if (oval == EVENTFD_VALMAX) {
 145                 pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT);
 146         }
 147 
 148         return (err);
 149 }
 150 
 151 /*ARGSUSED*/
 152 static int
 153 eventfd_write(dev_t dev, struct uio *uio, cred_t *credp)
 154 {
 155         eventfd_state_t *state;
 156         minor_t minor = getminor(dev);
 157         uint64_t val, oval;
 158         int err;
 159 
 160         if (uio->uio_resid < sizeof (val))
 161                 return (EINVAL);
 162 
 163         if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0)
 164                 return (err);
 165 
 166         if (val > EVENTFD_VALMAX)
 167                 return (EINVAL);
 168 
 169         state = ddi_get_soft_state(eventfd_softstate, minor);
 170 
 171         mutex_enter(&state->efd_lock);
 172 
 173         while (val > EVENTFD_VALMAX - state->efd_value) {
 174                 if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
 175                         mutex_exit(&state->efd_lock);
 176                         return (EAGAIN);
 177                 }
 178 
 179                 state->efd_bwriters++;
 180                 if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
 181                         state->efd_bwriters--;
 182                         mutex_exit(&state->efd_lock);
 183                         return (EINTR);
 184                 }
 185                 state->efd_bwriters--;
 186         }
 187 
 188         /*
 189          * We now know that we can add the value without overflowing.
 190          */
 191         state->efd_value = (oval = state->efd_value) + val;
 192 
 193         /*
 194          * If the value was previously "empty", notify blocked readers that
 195          * data is available.
 196          */
 197         if (oval == 0) {
 198                 cv_broadcast(&state->efd_cv);
 199         }
 200         mutex_exit(&state->efd_lock);
 201 
 202         /*
 203          * Notify pollers as well if the eventfd is now readable.
 204          */
 205         if (oval == 0) {
 206                 pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN);
 207         }
 208 
 209         return (0);
 210 }
 211 
 212 /*ARGSUSED*/
 213 static int
 214 eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
 215     struct pollhead **phpp)
 216 {
 217         eventfd_state_t *state;
 218         minor_t minor = getminor(dev);
 219         short revents = 0;
 220 
 221         state = ddi_get_soft_state(eventfd_softstate, minor);
 222 
 223         mutex_enter(&state->efd_lock);
 224 
 225         if (state->efd_value > 0)
 226                 revents |= POLLRDNORM | POLLIN;
 227 
 228         if (state->efd_value < EVENTFD_VALMAX)
 229                 revents |= POLLWRNORM | POLLOUT;
 230 
 231         *reventsp = revents & events;
 232         if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
 233                 *phpp = &state->efd_pollhd;
 234         }
 235 
 236         mutex_exit(&state->efd_lock);
 237 
 238         return (0);
 239 }
 240 
 241 /*ARGSUSED*/
 242 static int
 243 eventfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
 244 {
 245         eventfd_state_t *state;
 246         minor_t minor = getminor(dev);
 247 
 248         state = ddi_get_soft_state(eventfd_softstate, minor);
 249 
 250         switch (cmd) {
 251         case EVENTFDIOC_SEMAPHORE: {
 252                 mutex_enter(&state->efd_lock);
 253                 state->efd_semaphore ^= 1;
 254                 mutex_exit(&state->efd_lock);
 255 
 256                 return (0);
 257         }
 258 
 259         default:
 260                 break;
 261         }
 262 
 263         return (ENOTTY);
 264 }
 265 
 266 /*ARGSUSED*/
 267 static int
 268 eventfd_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
 269 {
 270         eventfd_state_t *state, **sp;
 271         minor_t minor = getminor(dev);
 272 
 273         state = ddi_get_soft_state(eventfd_softstate, minor);
 274 
 275         if (state->efd_pollhd.ph_list != NULL) {
 276                 pollwakeup(&state->efd_pollhd, POLLERR);
 277                 pollhead_clean(&state->efd_pollhd);
 278         }
 279 
 280         mutex_enter(&eventfd_lock);
 281 
 282         /*
 283          * Remove our state from our global list.
 284          */
 285         for (sp = &eventfd_state; *sp != state; sp = &((*sp)->efd_next))
 286                 VERIFY(*sp != NULL);
 287 
 288         *sp = (*sp)->efd_next;
 289 
 290         ddi_soft_state_free(eventfd_softstate, minor);
 291         vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1);
 292 
 293         mutex_exit(&eventfd_lock);
 294 
 295         return (0);
 296 }
 297 
 298 static int
 299 eventfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
 300 {
 301         switch (cmd) {
 302         case DDI_ATTACH:
 303                 break;
 304 
 305         case DDI_RESUME:
 306                 return (DDI_SUCCESS);
 307 
 308         default:
 309                 return (DDI_FAILURE);
 310         }
 311 
 312         mutex_enter(&eventfd_lock);
 313 
 314         if (ddi_soft_state_init(&eventfd_softstate,
 315             sizeof (eventfd_state_t), 0) != 0) {
 316                 cmn_err(CE_NOTE, "/dev/eventfd failed to create soft state");
 317                 mutex_exit(&eventfd_lock);
 318                 return (DDI_FAILURE);
 319         }
 320 
 321         if (ddi_create_minor_node(devi, "eventfd", S_IFCHR,
 322             EVENTFDMNRN_EVENTFD, DDI_PSEUDO, NULL) == DDI_FAILURE) {
 323                 cmn_err(CE_NOTE, "/dev/eventfd couldn't create minor node");
 324                 ddi_soft_state_fini(&eventfd_softstate);
 325                 mutex_exit(&eventfd_lock);
 326                 return (DDI_FAILURE);
 327         }
 328 
 329         ddi_report_dev(devi);
 330         eventfd_devi = devi;
 331 
 332         eventfd_minor = vmem_create("eventfd_minor", (void *)EVENTFDMNRN_CLONE,
 333             UINT32_MAX - EVENTFDMNRN_CLONE, 1, NULL, NULL, NULL, 0,
 334             VM_SLEEP | VMC_IDENTIFIER);
 335 
 336         mutex_exit(&eventfd_lock);
 337 
 338         return (DDI_SUCCESS);
 339 }
 340 
 341 /*ARGSUSED*/
 342 static int
 343 eventfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 344 {
 345         switch (cmd) {
 346         case DDI_DETACH:
 347                 break;
 348 
 349         case DDI_SUSPEND:
 350                 return (DDI_SUCCESS);
 351 
 352         default:
 353                 return (DDI_FAILURE);
 354         }
 355 
 356         mutex_enter(&eventfd_lock);
 357         vmem_destroy(eventfd_minor);
 358 
 359         ddi_remove_minor_node(eventfd_devi, NULL);
 360         eventfd_devi = NULL;
 361 
 362         ddi_soft_state_fini(&eventfd_softstate);
 363         mutex_exit(&eventfd_lock);
 364 
 365         return (DDI_SUCCESS);
 366 }
 367 
 368 /*ARGSUSED*/
 369 static int
 370 eventfd_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
 371 {
 372         int error;
 373 
 374         switch (infocmd) {
 375         case DDI_INFO_DEVT2DEVINFO:
 376                 *result = (void *)eventfd_devi;
 377                 error = DDI_SUCCESS;
 378                 break;
 379         case DDI_INFO_DEVT2INSTANCE:
 380                 *result = (void *)0;
 381                 error = DDI_SUCCESS;
 382                 break;
 383         default:
 384                 error = DDI_FAILURE;
 385         }
 386         return (error);
 387 }
 388 
 389 static struct cb_ops eventfd_cb_ops = {
 390         eventfd_open,           /* open */
 391         eventfd_close,          /* close */
 392         nulldev,                /* strategy */
 393         nulldev,                /* print */
 394         nodev,                  /* dump */
 395         eventfd_read,           /* read */
 396         eventfd_write,          /* write */
 397         eventfd_ioctl,          /* ioctl */
 398         nodev,                  /* devmap */
 399         nodev,                  /* mmap */
 400         nodev,                  /* segmap */
 401         eventfd_poll,           /* poll */
 402         ddi_prop_op,            /* cb_prop_op */
 403         0,                      /* streamtab  */
 404         D_NEW | D_MP            /* Driver compatibility flag */
 405 };
 406 
 407 static struct dev_ops eventfd_ops = {
 408         DEVO_REV,               /* devo_rev */
 409         0,                      /* refcnt */
 410         eventfd_info,           /* get_dev_info */
 411         nulldev,                /* identify */
 412         nulldev,                /* probe */
 413         eventfd_attach,         /* attach */
 414         eventfd_detach,         /* detach */
 415         nodev,                  /* reset */
 416         &eventfd_cb_ops,    /* driver operations */
 417         NULL,                   /* bus operations */
 418         nodev,                  /* dev power */
 419         ddi_quiesce_not_needed, /* quiesce */
 420 };
 421 
 422 static struct modldrv modldrv = {
 423         &mod_driverops,             /* module type (this is a pseudo driver) */
 424         "eventfd support",      /* name of module */
 425         &eventfd_ops,               /* driver ops */
 426 };
 427 
 428 static struct modlinkage modlinkage = {
 429         MODREV_1,
 430         (void *)&modldrv,
 431         NULL
 432 };
 433 
 434 int
 435 _init(void)
 436 {
 437         return (mod_install(&modlinkage));
 438 }
 439 
 440 int
 441 _info(struct modinfo *modinfop)
 442 {
 443         return (mod_info(&modlinkage, modinfop));
 444 }
 445 
 446 int
 447 _fini(void)
 448 {
 449         return (mod_remove(&modlinkage));
 450 }