Print this page
OS-5538 eventfd wrongly blocks writers in semaphore mode
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>
        
*** 8,18 ****
   * source.  A copy of the CDDL is also available via the Internet at
   * http://www.illumos.org/license/CDDL.
   */
  
  /*
!  * Copyright (c) 2015 Joyent, Inc.  All rights reserved.
   */
  
  /*
   * Support for the eventfd facility, a Linux-borne facility for user-generated
   * file descriptor-based events.
--- 8,18 ----
   * source.  A copy of the CDDL is also available via the Internet at
   * http://www.illumos.org/license/CDDL.
   */
  
  /*
!  * Copyright 2016 Joyent, Inc.
   */
  
  /*
   * Support for the eventfd facility, a Linux-borne facility for user-generated
   * file descriptor-based events.
*** 35,44 ****
--- 35,45 ----
          kmutex_t efd_lock;                      /* lock protecting state */
          boolean_t efd_semaphore;                /* boolean: sema. semantics */
          kcondvar_t efd_cv;                      /* condvar */
          pollhead_t efd_pollhd;                  /* poll head */
          uint64_t efd_value;                     /* value */
+         size_t efd_bwriters;                    /* count of blocked writers */
          eventfd_state_t *efd_next;              /* next state on global list */
  };
  
  /*
   * Internal global variables.
*** 124,137 ****
                  state->efd_value = 0;
          }
  
          err = uiomove(&val, sizeof (val), UIO_READ, uio);
  
          mutex_exit(&state->efd_lock);
  
          if (oval == EVENTFD_VALMAX) {
-                 cv_broadcast(&state->efd_cv);
                  pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT);
          }
  
          return (err);
  }
--- 125,149 ----
                  state->efd_value = 0;
          }
  
          err = uiomove(&val, sizeof (val), UIO_READ, uio);
  
+         /*
+          * Wake any writers blocked on this eventfd as this read operation may
+          * have created adequate capacity for their values.
+          */
+         if (state->efd_bwriters != 0) {
+                 cv_broadcast(&state->efd_cv);
+         }
          mutex_exit(&state->efd_lock);
  
+         /*
+          * It is necessary to emit POLLOUT events only when the eventfd
+          * transitions from EVENTFD_VALMAX to a lower value.  At all other
+          * times, it is already considered writable by poll.
+          */
          if (oval == EVENTFD_VALMAX) {
                  pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT);
          }
  
          return (err);
  }
*** 162,186 ****
                  if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
                          mutex_exit(&state->efd_lock);
                          return (EAGAIN);
                  }
  
                  if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
                          mutex_exit(&state->efd_lock);
                          return (EINTR);
                  }
          }
  
          /*
           * We now know that we can add the value without overflowing.
           */
          state->efd_value = (oval = state->efd_value) + val;
  
          mutex_exit(&state->efd_lock);
  
          if (oval == 0) {
-                 cv_broadcast(&state->efd_cv);
                  pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN);
          }
  
          return (0);
  }
--- 174,210 ----
                  if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
                          mutex_exit(&state->efd_lock);
                          return (EAGAIN);
                  }
  
+                 state->efd_bwriters++;
                  if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
+                         state->efd_bwriters--;
                          mutex_exit(&state->efd_lock);
                          return (EINTR);
                  }
+                 state->efd_bwriters--;
          }
  
          /*
           * We now know that we can add the value without overflowing.
           */
          state->efd_value = (oval = state->efd_value) + val;
  
+         /*
+          * If the value was previously "empty", notify blocked readers that
+          * data is available.
+          */
+         if (oval == 0) {
+                 cv_broadcast(&state->efd_cv);
+         }
          mutex_exit(&state->efd_lock);
  
+         /*
+          * Notify pollers as well if the eventfd is now readable.
+          */
          if (oval == 0) {
                  pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN);
          }
  
          return (0);
  }