Print this page
8634 epoll fails to wake on certain edge-triggered conditions
8635 epoll should not emit POLLNVAL
8636 recursive epoll should emit EPOLLRDNORM
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Toomas Soome <tsoome@me.com>
Reviewed by: Igor Kozhukhov <igor@dilos.org>

@@ -23,11 +23,11 @@
  * Use is subject to license terms.
  */
 
 /*
  * Copyright (c) 2012 by Delphix. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2017 Joyent, Inc.
  */
 
 #include <sys/types.h>
 #include <sys/devops.h>
 #include <sys/conf.h>

@@ -364,26 +364,41 @@
                                 }
                         }
 
                         if (fp != pdp->pd_fp) {
                                 /*
-                                 * user is polling on a cached fd which was
-                                 * closed and then reused. Unfortunately
-                                 * there is no good way to inform user.
-                                 * If the file struct is also reused, we
-                                 * may not be able to detect the fd reuse
-                                 * at all.  As long as this does not
-                                 * cause system failure and/or memory leak,
-                                 * we will play along. Man page states if
-                                 * user does not clean up closed fds, polling
+                                 * The user is polling on a cached fd which was
+                                 * closed and then reused.  Unfortunately there
+                                 * is no good way to communicate this fact to
+                                 * the consumer.
+                                 *
+                                 * If the file struct is also reused, we may
+                                 * not be able to detect the fd reuse at all.
+                                 * As long as this does not cause system
+                                 * failure and/or memory leaks, we will play
+                                 * along.  The man page states that if the user
+                                 * does not clean up closed fds, polling
                                  * results will be indeterministic.
                                  *
-                                 * XXX - perhaps log the detection of fd
-                                 *       reuse?
+                                 * XXX: perhaps log the detection of fd reuse?
                                  */
                                 pdp->pd_fp = fp;
+
+                                /*
+                                 * When this situation has been detected, it's
+                                 * likely that any existing pollhead is
+                                 * ill-suited to perform proper wake-ups.
+                                 *
+                                 * Clean up the old entry under the expectation
+                                 * that a valid one will be provided as part of
+                                 * the later VOP_POLL.
+                                 */
+                                if (pdp->pd_php != NULL) {
+                                        pollhead_delete(pdp->pd_php, pdp);
+                                        pdp->pd_php = NULL;
                         }
+                        }
                         /*
                          * XXX - pollrelock() logic needs to know which
                          * which pollcache lock to grab. It'd be a
                          * cleaner solution if we could pass pcp as
                          * an arguement in VOP_POLL interface instead

@@ -394,10 +409,31 @@
                          * the tradeoff later.
                          */
                         curthread->t_pollcache = pcp;
                         error = VOP_POLL(fp->f_vnode, pdp->pd_events, 0,
                             &revent, &php, NULL);
+
+                        /*
+                         * Recheck edge-triggered descriptors which lack a
+                         * pollhead.  While this check is performed when an fd
+                         * is added to the pollcache in dpwrite(), subsequent
+                         * descriptor manipulation could cause a different
+                         * resource to be present now.
+                         */
+                        if ((pdp->pd_events & POLLET) && error == 0 &&
+                            pdp->pd_php == NULL && php == NULL && revent != 0) {
+                                short levent = 0;
+
+                                /*
+                                 * The same POLLET-only VOP_POLL is used in an
+                                 * attempt to coax a pollhead from older
+                                 * driver logic.
+                                 */
+                                error = VOP_POLL(fp->f_vnode, POLLET,
+                                    0, &levent, &php, NULL);
+                        }
+
                         curthread->t_pollcache = NULL;
                         releasef(fd);
                         if (error != 0) {
                                 break;
                         }

@@ -429,10 +465,20 @@
 
                                         ASSERT(epoll != NULL);
                                         ep->data.u64 = pdp->pd_epolldata;
 
                                         /*
+                                         * Since POLLNVAL is a legal event for
+                                         * VOP_POLL handlers to emit, it must
+                                         * be translated epoll-legal.
+                                         */
+                                        if (revent & POLLNVAL) {
+                                                revent &= ~POLLNVAL;
+                                                revent |= POLLERR;
+                                        }
+
+                                        /*
                                          * If any of the event bits are set for
                                          * which poll and epoll representations
                                          * differ, swizzle in the native epoll
                                          * values.
                                          */

@@ -486,23 +532,16 @@
                                                 fdcnt++;
                                                 break;
                                         }
                                 }
 
+                                /* Handle special polling modes. */
+                                if (pdp->pd_events & POLLONESHOT) {
                                 /*
-                                 * If POLLET is set, clear the bit in the
-                                 * bitmap -- which effectively latches the
-                                 * edge on a pollwakeup() from the driver.
+                                         * If POLLONESHOT is set, perform the
+                                         * implicit POLLREMOVE.
                                  */
-                                if (pdp->pd_events & POLLET)
-                                        BT_CLEAR(pcp->pc_bitmap, fd);
-
-                                /*
-                                 * If POLLONESHOT is set, perform the implicit
-                                 * POLLREMOVE.
-                                 */
-                                if (pdp->pd_events & POLLONESHOT) {
                                         pdp->pd_fp = NULL;
                                         pdp->pd_events = 0;
 
                                         if (pdp->pd_php != NULL) {
                                                 pollhead_delete(pdp->pd_php,

@@ -509,12 +548,34 @@
                                                     pdp);
                                                 pdp->pd_php = NULL;
                                         }
 
                                         BT_CLEAR(pcp->pc_bitmap, fd);
+                                } else if (pdp->pd_events & POLLET) {
+                                        /*
+                                         * Wire up the pollhead which should
+                                         * have been provided.  Edge-triggered
+                                         * polling cannot function properly
+                                         * with drivers which do not emit one.
+                                         */
+                                        if (php != NULL &&
+                                            pdp->pd_php == NULL) {
+                                                pollhead_insert(php, pdp);
+                                                pdp->pd_php = php;
                                 }
 
+                                        /*
+                                         * If the driver has emitted a pollhead,
+                                         * clear the bit in the bitmap which
+                                         * effectively latches the edge on a
+                                         * pollwakeup() from the driver.
+                                         */
+                                        if (pdp->pd_php != NULL) {
+                                                BT_CLEAR(pcp->pc_bitmap, fd);
+                                        }
+                                }
+
                                 fdcnt++;
                         } else if (php != NULL) {
                                 /*
                                  * We clear a bit or cache a poll fd if
                                  * the driver returns a poll head ptr,

@@ -915,10 +976,36 @@
                          * the tradeoff later.
                          */
                         curthread->t_pollcache = pcp;
                         error = VOP_POLL(fp->f_vnode, pfdp->events, 0,
                             &pfdp->revents, &php, NULL);
+
+                        /*
+                         * Edge-triggered polling requires a pollhead in order
+                         * to initiate wake-ups properly.  Drivers which are
+                         * savvy to POLLET presence, which should include
+                         * everything in-gate, will always emit one, regardless
+                         * of revent status.  Older drivers which only emit a
+                         * pollhead if 'revents == 0' are given a second chance
+                         * here via a second VOP_POLL, with only POLLET set in
+                         * the events of interest.  These circumstances should
+                         * induce any cacheable drivers to emit a pollhead for
+                         * wake-ups.
+                         *
+                         * Drivers which never emit a pollhead will simply
+                         * disobey the exectation of edge-triggered behavior.
+                         * This includes recursive epoll which, even on Linux,
+                         * yields its events in a level-triggered fashion only.
+                         */
+                        if ((pdp->pd_events & POLLET) && error == 0 &&
+                            php == NULL) {
+                                short levent = 0;
+
+                                error = VOP_POLL(fp->f_vnode, POLLET, 0,
+                                    &levent, &php, NULL);
+                        }
+
                         curthread->t_pollcache = NULL;
                         /*
                          * We always set the bit when this fd is cached;
                          * this forces the first DP_POLL to poll this fd.
                          * Real performance gain comes from subsequent

@@ -1456,14 +1543,26 @@
         if (res == PSE_SUCCESS) {
                 nfds_t          nfds = 1;
                 int             fdcnt = 0;
                 pollstate_t     *ps = curthread->t_pollstate;
 
+                /*
+                 * Recursive polling will only emit certain events.  Skip a
+                 * scan of the pollcache if those events are not of interest.
+                 */
+                if (events & (POLLIN|POLLRDNORM)) {
                 rc = dp_pcache_poll(dpep, NULL, pcp, nfds, &fdcnt);
-                if (rc == 0) {
-                        *reventsp = (fdcnt > 0) ? POLLIN : 0;
+                } else {
+                        rc = 0;
+                        fdcnt = 0;
                 }
+
+                if (rc == 0 && fdcnt > 0) {
+                        *reventsp = POLLIN|POLLRDNORM;
+                } else {
+                        *reventsp = 0;
+                }
                 pcachelink_assoc(pcp, ps->ps_pc_stack[0]);
                 pollstate_exit(pcp);
         } else {
                 switch (res) {
                 case PSE_FAIL_DEPTH: