Print this page
    
OS-5538 eventfd wrongly blocks writers in semaphore mode
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/io/eventfd.c
          +++ new/usr/src/uts/common/io/eventfd.c
   1    1  /*
   2    2   * This file and its contents are supplied under the terms of the
  
    | 
      ↓ open down ↓ | 
    2 lines elided | 
    
      ↑ open up ↑ | 
  
   3    3   * Common Development and Distribution License ("CDDL"), version 1.0.
   4    4   * You may only use this file in accordance with the terms of version
   5    5   * 1.0 of the CDDL.
   6    6   *
   7    7   * A full copy of the text of the CDDL should have accompanied this
   8    8   * source.  A copy of the CDDL is also available via the Internet at
   9    9   * http://www.illumos.org/license/CDDL.
  10   10   */
  11   11  
  12   12  /*
  13      - * Copyright (c) 2015 Joyent, Inc.  All rights reserved.
       13 + * Copyright 2016 Joyent, Inc.
  14   14   */
  15   15  
  16   16  /*
  17   17   * Support for the eventfd facility, a Linux-borne facility for user-generated
  18   18   * file descriptor-based events.
  19   19   */
  20   20  
  21   21  #include <sys/ddi.h>
  22   22  #include <sys/sunddi.h>
  23   23  #include <sys/eventfd.h>
  24   24  #include <sys/conf.h>
  25   25  #include <sys/vmem.h>
  26   26  #include <sys/sysmacros.h>
  27   27  #include <sys/filio.h>
  28   28  #include <sys/stat.h>
  29   29  #include <sys/file.h>
  
    | 
      ↓ open down ↓ | 
    6 lines elided | 
    
      ↑ open up ↑ | 
  
  30   30  
  31   31  struct eventfd_state;
  32   32  typedef struct eventfd_state eventfd_state_t;
  33   33  
           /*
            * Per-open eventfd state.  One of these is allocated as zeroed soft
            * state, keyed by minor number, by each successful eventfd_open() and
            * is linked onto the global eventfd_state list through efd_next.
            *
            * efd_lock is held whenever efd_value, efd_semaphore or efd_bwriters
            * is read or modified by the read, write, poll and ioctl entry points.
            * efd_cv is shared by blocked readers (waiting for efd_value to become
            * non-zero) and blocked writers (waiting for enough room below
            * EVENTFD_VALMAX); efd_bwriters counts only the writers currently
            * waiting on it.
            */
  34   34  struct eventfd_state {
  35   35          kmutex_t efd_lock;                      /* lock protecting state */
  36   36          boolean_t efd_semaphore;                /* boolean: sema. semantics */
  37   37          kcondvar_t efd_cv;                      /* condvar */
  38   38          pollhead_t efd_pollhd;                  /* poll head */
  39   39          uint64_t efd_value;                     /* value */
       40 +        size_t efd_bwriters;                    /* count of blocked writers */
  40   41          eventfd_state_t *efd_next;              /* next state on global list */
  41   42  };
  42   43  
  43   44  /*
  44   45   * Internal global variables.
            *
            * eventfd_lock is held across minor number allocation and release,
            * soft state allocation and release, and every update of the
            * eventfd_state list (see eventfd_open() and eventfd_close()), as well
            * as across the set-up and tear-down done in eventfd_attach() and
            * eventfd_detach().  It is distinct from the per-instance efd_lock.
  45   46   */
  46   47  static kmutex_t         eventfd_lock;           /* lock protecting state */
  47   48  static dev_info_t       *eventfd_devi;          /* device info */
  48   49  static vmem_t           *eventfd_minor;         /* minor number arena */
  49   50  static void             *eventfd_softstate;     /* softstate pointer */
  50   51  static eventfd_state_t  *eventfd_state;         /* global list of state */
  51   52  
           /*
            * Open entry point (see eventfd_cb_ops).  Only the EVENTFDMNRN_EVENTFD
            * minor may be opened; any other minor fails with ENXIO.  Each
            * successful open allocates a fresh minor number from the eventfd_minor
            * arena, allocates zeroed soft state for it, rewrites *devp to refer to
            * the new minor, and links the new state onto the head of the global
            * eventfd_state list.
            *
            * Returns 0 on success, or ENXIO if the minor is wrong or the soft
            * state could not be allocated.
            */
  52   53  /*ARGSUSED*/
  53   54  static int
  54   55  eventfd_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
  55   56  {
  56   57          eventfd_state_t *state;
  57   58          major_t major = getemajor(*devp);
  58   59          minor_t minor = getminor(*devp);
  59   60  
  60   61          if (minor != EVENTFDMNRN_EVENTFD)
  61   62                  return (ENXIO);
  62   63  
  63   64          mutex_enter(&eventfd_lock);
  64   65  
  65   66          minor = (minor_t)(uintptr_t)vmem_alloc(eventfd_minor, 1,
  66   67              VM_BESTFIT | VM_SLEEP);
  67   68  
  68   69          if (ddi_soft_state_zalloc(eventfd_softstate, minor) != DDI_SUCCESS) {
  69   70                  vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1);
  70   71                  mutex_exit(&eventfd_lock);
                           /*
                            * This path must report a real errno rather than zero:
                            * no state exists for this open and *devp has not been
                            * rewritten, so the caller must not see success.
                            */
  71   72                  return (ENXIO);
  72   73          }
  73   74  
  74   75          state = ddi_get_soft_state(eventfd_softstate, minor);
  75   76          *devp = makedevice(major, minor);
  76   77  
                   /* Push the new instance onto the head of the global list. */
  77   78          state->efd_next = eventfd_state;
  78   79          eventfd_state = state;
  79   80  
  80   81          mutex_exit(&eventfd_lock);
  81   82  
  82   83          return (0);
  83   84  }
  84   85  
           /*
            * Read entry point (see eventfd_cb_ops).  The caller must offer at least
            * sizeof (uint64_t) bytes, otherwise EINVAL is returned.  While the
            * counter is zero the caller either fails with EAGAIN (FNDELAY or
            * FNONBLOCK set in uio_fmode) or sleeps on efd_cv, returning EINTR if
            * cv_wait_sig_swap() returns zero.
            *
            * Once the counter is non-zero: in semaphore mode the counter is
            * decremented and the value 1 is copied out; otherwise the counter is
            * reset to zero and its previous value is copied out.  The return value
            * is whatever uiomove() returns.
            */
  85   86  /*ARGSUSED*/
  86   87  static int
  87   88  eventfd_read(dev_t dev, uio_t *uio, cred_t *cr)
  88   89  {
  89   90          eventfd_state_t *state;
  90   91          minor_t minor = getminor(dev);
  91   92          uint64_t val, oval;
  92   93          int err;
  93   94  
  94   95          if (uio->uio_resid < sizeof (val))
  95   96                  return (EINVAL);
  96   97  
  97   98          state = ddi_get_soft_state(eventfd_softstate, minor);
  98   99  
  99  100          mutex_enter(&state->efd_lock);
 100  101  
 101  102          while (state->efd_value == 0) {
 102  103                  if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
 103  104                          mutex_exit(&state->efd_lock);
 104  105                          return (EAGAIN);
 105  106                  }
 106  107  
 107  108                  if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
 108  109                          mutex_exit(&state->efd_lock);
 109  110                          return (EINTR);
 110  111                  }
 111  112          }
 112  113  
 113  114          /*
 114  115           * We have a non-zero value and we own the lock; our behavior now
 115  116           * depends on whether or not EFD_SEMAPHORE was set when the eventfd
 116  117           * was created.
 117  118           */
 118  119          val = oval = state->efd_value;
  
    | 
      ↓ open down ↓ | 
    69 lines elided | 
    
      ↑ open up ↑ | 
  
 119  120  
 120  121          if (state->efd_semaphore) {
 121  122                  state->efd_value--;
 122  123                  val = 1;
 123  124          } else {
 124  125                  state->efd_value = 0;
 125  126          }
 126  127  
                   /*
                    * NOTE(review): the counter has already been consumed above and
                    * is not restored if uiomove() fails.
                    */
 127  128          err = uiomove(&val, sizeof (val), UIO_READ, uio);
 128  129  
      130 +        /*
      131 +         * Wake any writers blocked on this eventfd as this read operation may
      132 +         * have created adequate capacity for their values.
      133 +         */
      134 +        if (state->efd_bwriters != 0) {
      135 +                cv_broadcast(&state->efd_cv);
      136 +        }
 129  137          mutex_exit(&state->efd_lock);
 130  138  
      139 +        /*
      140 +         * It is necessary to emit POLLOUT events only when the eventfd
      141 +         * transitions from EVENTFD_VALMAX to a lower value.  At all other
      142 +         * times, it is already considered writable by poll.
      143 +         */
 131  144          if (oval == EVENTFD_VALMAX) {
 132      -                cv_broadcast(&state->efd_cv);
 133  145                  pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT);
 134  146          }
 135  147  
 136  148          return (err);
 137  149  }
 138  150  
           /*
            * Write entry point (see eventfd_cb_ops).  Copies a uint64_t in from the
            * caller and adds it to the counter.  EINVAL is returned if fewer than
            * sizeof (uint64_t) bytes are offered or if the value exceeds
            * EVENTFD_VALMAX; a uiomove() failure is returned as-is.
            *
            * While adding the value would push the counter past EVENTFD_VALMAX,
            * the caller either fails with EAGAIN (FNDELAY or FNONBLOCK set in
            * uio_fmode) or sleeps on efd_cv, returning EINTR if cv_wait_sig_swap()
            * returns zero.  Returns 0 once the value has been added.
            */
 139  151  /*ARGSUSED*/
 140  152  static int
 141  153  eventfd_write(dev_t dev, struct uio *uio, cred_t *credp)
 142  154  {
 143  155          eventfd_state_t *state;
 144  156          minor_t minor = getminor(dev);
 145  157          uint64_t val, oval;
 146  158          int err;
 147  159  
 148  160          if (uio->uio_resid < sizeof (val))
 149  161                  return (EINVAL);
 150  162  
 151  163          if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0)
 152  164                  return (err);
 153  165  
 154  166          if (val > EVENTFD_VALMAX)
 155  167                  return (EINVAL);
 156  168  
  
    | 
      ↓ open down ↓ | 
    14 lines elided | 
    
      ↑ open up ↑ | 
  
 157  169          state = ddi_get_soft_state(eventfd_softstate, minor);
 158  170  
 159  171          mutex_enter(&state->efd_lock);
 160  172  
                   /*
                    * The capacity check is phrased as a subtraction so that the sum
                    * is never formed until it is known to fit.
                    */
 161  173          while (val > EVENTFD_VALMAX - state->efd_value) {
 162  174                  if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
 163  175                          mutex_exit(&state->efd_lock);
 164  176                          return (EAGAIN);
 165  177                  }
 166  178  
                           /*
                            * efd_bwriters is raised only for the duration of the wait;
                            * eventfd_read() consults it to decide whether to broadcast.
                            */
      179 +                state->efd_bwriters++;
 167  180                  if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
      181 +                        state->efd_bwriters--;
 168  182                          mutex_exit(&state->efd_lock);
 169  183                          return (EINTR);
 170  184                  }
      185 +                state->efd_bwriters--;
 171  186          }
 172  187  
 173  188          /*
 174  189           * We now know that we can add the value without overflowing.
 175  190           */
 176  191          state->efd_value = (oval = state->efd_value) + val;
 177  192  
      193 +        /*
      194 +         * If the value was previously "empty", notify blocked readers that
      195 +         * data is available.
      196 +         */
      197 +        if (oval == 0) {
      198 +                cv_broadcast(&state->efd_cv);
      199 +        }
 178  200          mutex_exit(&state->efd_lock);
 179  201  
      202 +        /*
      203 +         * Notify pollers as well if the eventfd is now readable.
      204 +         */
 180  205          if (oval == 0) {
 181      -                cv_broadcast(&state->efd_cv);
 182  206                  pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN);
 183  207          }
 184  208  
 185  209          return (0);
 186  210  }
 187  211  
           /*
            * Poll entry point (see eventfd_cb_ops).  Under efd_lock, reports
            * POLLIN/POLLRDNORM when the counter is non-zero and POLLOUT/POLLWRNORM
            * when the counter is below EVENTFD_VALMAX, masked by the requested
            * events.  Always returns 0.
            */
 188  212  /*ARGSUSED*/
 189  213  static int
 190  214  eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
 191  215      struct pollhead **phpp)
 192  216  {
 193  217          eventfd_state_t *state;
 194  218          minor_t minor = getminor(dev);
 195  219          short revents = 0;
 196  220  
 197  221          state = ddi_get_soft_state(eventfd_softstate, minor);
 198  222  
 199  223          mutex_enter(&state->efd_lock);
 200  224  
 201  225          if (state->efd_value > 0)
 202  226                  revents |= POLLRDNORM | POLLIN;
 203  227  
 204  228          if (state->efd_value < EVENTFD_VALMAX)
 205  229                  revents |= POLLWRNORM | POLLOUT;
 206  230  
                   /*
                    * Hand back our pollhead only when none of the requested events
                    * are pending and anyyet is zero.
                    */
 207  231          if (!(*reventsp = revents & events) && !anyyet)
 208  232                  *phpp = &state->efd_pollhd;
 209  233  
 210  234          mutex_exit(&state->efd_lock);
 211  235  
 212  236          return (0);
 213  237  }
 214  238  
           /*
            * Ioctl entry point (see eventfd_cb_ops).  EVENTFDIOC_SEMAPHORE is the
            * only command handled; it returns 0.  Every other command fails with
            * ENOTTY.
            */
 215  239  /*ARGSUSED*/
 216  240  static int
 217  241  eventfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
 218  242  {
 219  243          eventfd_state_t *state;
 220  244          minor_t minor = getminor(dev);
 221  245  
 222  246          state = ddi_get_soft_state(eventfd_softstate, minor);
 223  247  
 224  248          switch (cmd) {
 225  249          case EVENTFDIOC_SEMAPHORE: {
 226  250                  mutex_enter(&state->efd_lock);
                           /*
                            * This flips the current setting rather than setting it to
                            * a given value; arg is not consulted.
                            */
 227  251                  state->efd_semaphore ^= 1;
 228  252                  mutex_exit(&state->efd_lock);
 229  253  
 230  254                  return (0);
 231  255          }
 232  256  
 233  257          default:
 234  258                  break;
 235  259          }
 236  260  
 237  261          return (ENOTTY);
 238  262  }
 239  263  
           /*
            * Close entry point (see eventfd_cb_ops).  Wakes (with POLLERR) and
            * cleans up any pollers still registered on the instance, then, under
            * eventfd_lock, unlinks the state from the global list and releases its
            * soft state and minor number.  Always returns 0.
            */
 240  264  /*ARGSUSED*/
 241  265  static int
 242  266  eventfd_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
 243  267  {
 244  268          eventfd_state_t *state, **sp;
 245  269          minor_t minor = getminor(dev);
 246  270  
 247  271          state = ddi_get_soft_state(eventfd_softstate, minor);
 248  272  
 249  273          if (state->efd_pollhd.ph_list != NULL) {
 250  274                  pollwakeup(&state->efd_pollhd, POLLERR);
 251  275                  pollhead_clean(&state->efd_pollhd);
 252  276          }
 253  277  
 254  278          mutex_enter(&eventfd_lock);
 255  279  
 256  280          /*
 257  281           * Remove our state from our global list.
                    * The VERIFY fires if the end of the list is reached without
                    * finding this state.
 258  282           */
 259  283          for (sp = &eventfd_state; *sp != state; sp = &((*sp)->efd_next))
 260  284                  VERIFY(*sp != NULL);
 261  285  
 262  286          *sp = (*sp)->efd_next;
 263  287  
 264  288          ddi_soft_state_free(eventfd_softstate, minor);
 265  289          vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1);
 266  290  
 267  291          mutex_exit(&eventfd_lock);
 268  292  
 269  293          return (0);
 270  294  }
 271  295  
           /*
            * Attach entry point (see eventfd_ops).  DDI_RESUME succeeds without
            * doing anything and any command other than DDI_ATTACH fails.
            *
            * For DDI_ATTACH, under eventfd_lock: initialize the soft state anchor,
            * create the "eventfd" character minor node (EVENTFDMNRN_EVENTFD),
            * record devi in eventfd_devi and create the arena from which per-open
            * minor numbers are allocated.  The soft state anchor is torn down again
            * if the minor node cannot be created.
            *
            * Returns DDI_SUCCESS or DDI_FAILURE.
            */
 272  296  static int
 273  297  eventfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
 274  298  {
 275  299          switch (cmd) {
 276  300          case DDI_ATTACH:
 277  301                  break;
 278  302  
 279  303          case DDI_RESUME:
 280  304                  return (DDI_SUCCESS);
 281  305  
 282  306          default:
 283  307                  return (DDI_FAILURE);
 284  308          }
 285  309  
 286  310          mutex_enter(&eventfd_lock);
 287  311  
 288  312          if (ddi_soft_state_init(&eventfd_softstate,
 289  313              sizeof (eventfd_state_t), 0) != 0) {
 290  314                  cmn_err(CE_NOTE, "/dev/eventfd failed to create soft state");
 291  315                  mutex_exit(&eventfd_lock);
 292  316                  return (DDI_FAILURE);
 293  317          }
 294  318  
 295  319          if (ddi_create_minor_node(devi, "eventfd", S_IFCHR,
 296  320              EVENTFDMNRN_EVENTFD, DDI_PSEUDO, NULL) == DDI_FAILURE) {
 297  321                  cmn_err(CE_NOTE, "/dev/eventfd couldn't create minor node");
 298  322                  ddi_soft_state_fini(&eventfd_softstate);
 299  323                  mutex_exit(&eventfd_lock);
 300  324                  return (DDI_FAILURE);
 301  325          }
 302  326  
 303  327          ddi_report_dev(devi);
 304  328          eventfd_devi = devi;
 305  329  
                   /*
                    * Arena of minor numbers starting at EVENTFDMNRN_CLONE;
                    * eventfd_open() allocates from it one number at a time.
                    */
 306  330          eventfd_minor = vmem_create("eventfd_minor", (void *)EVENTFDMNRN_CLONE,
 307  331              UINT32_MAX - EVENTFDMNRN_CLONE, 1, NULL, NULL, NULL, 0,
 308  332              VM_SLEEP | VMC_IDENTIFIER);
 309  333  
 310  334          mutex_exit(&eventfd_lock);
 311  335  
 312  336          return (DDI_SUCCESS);
 313  337  }
 314  338  
           /*
            * Detach entry point (see eventfd_ops).  DDI_SUSPEND succeeds without
            * doing anything and any command other than DDI_DETACH fails.
            *
            * For DDI_DETACH, under eventfd_lock, undo what eventfd_attach() set up:
            * destroy the minor number arena, remove the minor node, clear
            * eventfd_devi and tear down the soft state anchor.  Returns
            * DDI_SUCCESS in that case.
            */
 315  339  /*ARGSUSED*/
 316  340  static int
 317  341  eventfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 318  342  {
 319  343          switch (cmd) {
 320  344          case DDI_DETACH:
 321  345                  break;
 322  346  
 323  347          case DDI_SUSPEND:
 324  348                  return (DDI_SUCCESS);
 325  349  
 326  350          default:
 327  351                  return (DDI_FAILURE);
 328  352          }
 329  353  
 330  354          mutex_enter(&eventfd_lock);
 331  355          vmem_destroy(eventfd_minor);
 332  356  
 333  357          ddi_remove_minor_node(eventfd_devi, NULL);
 334  358          eventfd_devi = NULL;
 335  359  
 336  360          ddi_soft_state_fini(&eventfd_softstate);
 337  361          mutex_exit(&eventfd_lock);
 338  362  
 339  363          return (DDI_SUCCESS);
 340  364  }
 341  365  
           /*
            * Device info entry point (see eventfd_ops).  DDI_INFO_DEVT2DEVINFO
            * stores the saved eventfd_devi in *result and DDI_INFO_DEVT2INSTANCE
            * stores instance 0; both return DDI_SUCCESS.  Any other request
            * returns DDI_FAILURE and leaves *result untouched.
            */
 342  366  /*ARGSUSED*/
 343  367  static int
 344  368  eventfd_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
 345  369  {
 346  370          int error;
 347  371  
 348  372          switch (infocmd) {
 349  373          case DDI_INFO_DEVT2DEVINFO:
 350  374                  *result = (void *)eventfd_devi;
 351  375                  error = DDI_SUCCESS;
 352  376                  break;
 353  377          case DDI_INFO_DEVT2INSTANCE:
 354  378                  *result = (void *)0;
 355  379                  error = DDI_SUCCESS;
 356  380                  break;
 357  381          default:
 358  382                  error = DDI_FAILURE;
 359  383          }
 360  384          return (error);
 361  385  }
 362  386  
           /*
            * Character device entry points.  Only open, close, read, write, ioctl
            * and poll are implemented by this driver; the remaining slots are
            * filled with nulldev/nodev.
            */
 363  387  static struct cb_ops eventfd_cb_ops = {
 364  388          eventfd_open,           /* open */
 365  389          eventfd_close,          /* close */
 366  390          nulldev,                /* strategy */
 367  391          nulldev,                /* print */
 368  392          nodev,                  /* dump */
 369  393          eventfd_read,           /* read */
 370  394          eventfd_write,          /* write */
 371  395          eventfd_ioctl,          /* ioctl */
 372  396          nodev,                  /* devmap */
 373  397          nodev,                  /* mmap */
 374  398          nodev,                  /* segmap */
 375  399          eventfd_poll,           /* poll */
 376  400          ddi_prop_op,            /* cb_prop_op */
 377  401          0,                      /* streamtab  */
 378  402          D_NEW | D_MP            /* Driver compatibility flag */
 379  403  };
 380  404  
           /*
            * Device operations: wires in the info, attach and detach routines and
            * points at eventfd_cb_ops for the character device entry points.
            */
 381  405  static struct dev_ops eventfd_ops = {
 382  406          DEVO_REV,               /* devo_rev */
 383  407          0,                      /* refcnt */
 384  408          eventfd_info,           /* get_dev_info */
 385  409          nulldev,                /* identify */
 386  410          nulldev,                /* probe */
 387  411          eventfd_attach,         /* attach */
 388  412          eventfd_detach,         /* detach */
 389  413          nodev,                  /* reset */
 390  414          &eventfd_cb_ops,        /* driver operations */
 391  415          NULL,                   /* bus operations */
 392  416          nodev,                  /* dev power */
 393  417          ddi_quiesce_not_needed, /* quiesce */
 394  418  };
 395  419  
           /*
            * Driver module description, referencing eventfd_ops.
            */
 396  420  static struct modldrv modldrv = {
 397  421          &mod_driverops,         /* module type (this is a pseudo driver) */
 398  422          "eventfd support",      /* name of module */
 399  423          &eventfd_ops,           /* driver ops */
 400  424  };
 401  425  
           /*
            * Module linkage passed to mod_install(), mod_info() and mod_remove() by
            * _init(), _info() and _fini() respectively.
            */
 402  426  static struct modlinkage modlinkage = {
 403  427          MODREV_1,
 404  428          (void *)&modldrv,
 405  429          NULL
 406  430  };
 407  431  
           /*
            * Module load entry point: hands modlinkage to mod_install() and returns
            * its result unchanged.
            */
 408  432  int
 409  433  _init(void)
 410  434  {
 411  435          return (mod_install(&modlinkage));
 412  436  }
 413  437  
           /*
            * Module information entry point: passes modlinkage and the caller's
            * modinfop to mod_info() and returns its result unchanged.
            */
 414  438  int
 415  439  _info(struct modinfo *modinfop)
 416  440  {
 417  441          return (mod_info(&modlinkage, modinfop));
 418  442  }
 419  443  
           /*
            * Module unload entry point: hands modlinkage to mod_remove() and returns
            * its result unchanged.
            */
 420  444  int
 421  445  _fini(void)
 422  446  {
 423  447          return (mod_remove(&modlinkage));
 424  448  }
  
    | 
      ↓ open down ↓ | 
    233 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX