Print this page
8634 epoll fails to wake on certain edge-triggered conditions
8635 epoll should not emit POLLNVAL
8636 recursive epoll should emit EPOLLRDNORM
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Toomas Soome <tsoome@me.com>
Reviewed by: Igor Kozhukhov <igor@dilos.org>
@@ -23,11 +23,11 @@
* Use is subject to license terms.
*/
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2017 Joyent, Inc.
*/
#include <sys/types.h>
#include <sys/devops.h>
#include <sys/conf.h>
@@ -364,26 +364,41 @@
}
}
if (fp != pdp->pd_fp) {
/*
- * user is polling on a cached fd which was
- * closed and then reused. Unfortunately
- * there is no good way to inform user.
- * If the file struct is also reused, we
- * may not be able to detect the fd reuse
- * at all. As long as this does not
- * cause system failure and/or memory leak,
- * we will play along. Man page states if
- * user does not clean up closed fds, polling
+ * The user is polling on a cached fd which was
+ * closed and then reused. Unfortunately there
+ * is no good way to communicate this fact to
+ * the consumer.
+ *
+ * If the file struct is also reused, we may
+ * not be able to detect the fd reuse at all.
+ * As long as this does not cause system
+ * failure and/or memory leaks, we will play
+ * along. The man page states that if the user
+ * does not clean up closed fds, polling
* results will be indeterministic.
*
- * XXX - perhaps log the detection of fd
- * reuse?
+ * XXX: perhaps log the detection of fd reuse?
*/
pdp->pd_fp = fp;
+
+ /*
+ * When this situation has been detected, it's
+ * likely that any existing pollhead is
+ * ill-suited to perform proper wake-ups.
+ *
+ * Clean up the old entry under the expectation
+ * that a valid one will be provided as part of
+ * the later VOP_POLL.
+ */
+ if (pdp->pd_php != NULL) {
+ pollhead_delete(pdp->pd_php, pdp);
+ pdp->pd_php = NULL;
}
+ }
/*
* XXX - pollrelock() logic needs to know which
* which pollcache lock to grab. It'd be a
* cleaner solution if we could pass pcp as
* an arguement in VOP_POLL interface instead
@@ -394,10 +409,31 @@
* the tradeoff later.
*/
curthread->t_pollcache = pcp;
error = VOP_POLL(fp->f_vnode, pdp->pd_events, 0,
&revent, &php, NULL);
+
+ /*
+ * Recheck edge-triggered descriptors which lack a
+ * pollhead. While this check is performed when an fd
+ * is added to the pollcache in dpwrite(), subsequent
+ * descriptor manipulation could cause a different
+ * resource to be present now.
+ */
+ if ((pdp->pd_events & POLLET) && error == 0 &&
+ pdp->pd_php == NULL && php == NULL && revent != 0) {
+ short levent = 0;
+
+ /*
+ * The same POLLET-only VOP_POLL is used in an
+ * attempt to coax a pollhead from older
+ * driver logic.
+ */
+ error = VOP_POLL(fp->f_vnode, POLLET,
+ 0, &levent, &php, NULL);
+ }
+
curthread->t_pollcache = NULL;
releasef(fd);
if (error != 0) {
break;
}
@@ -429,10 +465,20 @@
ASSERT(epoll != NULL);
ep->data.u64 = pdp->pd_epolldata;
/*
+ * Since POLLNVAL is a legal event for
+ * VOP_POLL handlers to emit, it must
+ * be translated epoll-legal.
+ */
+ if (revent & POLLNVAL) {
+ revent &= ~POLLNVAL;
+ revent |= POLLERR;
+ }
+
+ /*
* If any of the event bits are set for
* which poll and epoll representations
* differ, swizzle in the native epoll
* values.
*/
@@ -486,23 +532,16 @@
fdcnt++;
break;
}
}
+ /* Handle special polling modes. */
+ if (pdp->pd_events & POLLONESHOT) {
/*
- * If POLLET is set, clear the bit in the
- * bitmap -- which effectively latches the
- * edge on a pollwakeup() from the driver.
+ * If POLLONESHOT is set, perform the
+ * implicit POLLREMOVE.
*/
- if (pdp->pd_events & POLLET)
- BT_CLEAR(pcp->pc_bitmap, fd);
-
- /*
- * If POLLONESHOT is set, perform the implicit
- * POLLREMOVE.
- */
- if (pdp->pd_events & POLLONESHOT) {
pdp->pd_fp = NULL;
pdp->pd_events = 0;
if (pdp->pd_php != NULL) {
pollhead_delete(pdp->pd_php,
@@ -509,12 +548,34 @@
pdp);
pdp->pd_php = NULL;
}
BT_CLEAR(pcp->pc_bitmap, fd);
+ } else if (pdp->pd_events & POLLET) {
+ /*
+ * Wire up the pollhead which should
+ * have been provided. Edge-triggered
+ * polling cannot function properly
+ * with drivers which do not emit one.
+ */
+ if (php != NULL &&
+ pdp->pd_php == NULL) {
+ pollhead_insert(php, pdp);
+ pdp->pd_php = php;
}
+ /*
+ * If the driver has emitted a pollhead,
+ * clear the bit in the bitmap which
+ * effectively latches the edge on a
+ * pollwakeup() from the driver.
+ */
+ if (pdp->pd_php != NULL) {
+ BT_CLEAR(pcp->pc_bitmap, fd);
+ }
+ }
+
fdcnt++;
} else if (php != NULL) {
/*
* We clear a bit or cache a poll fd if
* the driver returns a poll head ptr,
@@ -915,10 +976,36 @@
* the tradeoff later.
*/
curthread->t_pollcache = pcp;
error = VOP_POLL(fp->f_vnode, pfdp->events, 0,
&pfdp->revents, &php, NULL);
+
+ /*
+ * Edge-triggered polling requires a pollhead in order
+ * to initiate wake-ups properly. Drivers which are
+ * savvy to POLLET presence, which should include
+ * everything in-gate, will always emit one, regardless
+ * of revent status. Older drivers which only emit a
+ * pollhead if 'revents == 0' are given a second chance
+ * here via a second VOP_POLL, with only POLLET set in
+ * the events of interest. These circumstances should
+ * induce any cacheable drivers to emit a pollhead for
+ * wake-ups.
+ *
+ * Drivers which never emit a pollhead will simply
+ * disobey the exectation of edge-triggered behavior.
+ * This includes recursive epoll which, even on Linux,
+ * yields its events in a level-triggered fashion only.
+ */
+ if ((pdp->pd_events & POLLET) && error == 0 &&
+ php == NULL) {
+ short levent = 0;
+
+ error = VOP_POLL(fp->f_vnode, POLLET, 0,
+ &levent, &php, NULL);
+ }
+
curthread->t_pollcache = NULL;
/*
* We always set the bit when this fd is cached;
* this forces the first DP_POLL to poll this fd.
* Real performance gain comes from subsequent
@@ -1456,14 +1543,26 @@
if (res == PSE_SUCCESS) {
nfds_t nfds = 1;
int fdcnt = 0;
pollstate_t *ps = curthread->t_pollstate;
+ /*
+ * Recursive polling will only emit certain events. Skip a
+ * scan of the pollcache if those events are not of interest.
+ */
+ if (events & (POLLIN|POLLRDNORM)) {
rc = dp_pcache_poll(dpep, NULL, pcp, nfds, &fdcnt);
- if (rc == 0) {
- *reventsp = (fdcnt > 0) ? POLLIN : 0;
+ } else {
+ rc = 0;
+ fdcnt = 0;
}
+
+ if (rc == 0 && fdcnt > 0) {
+ *reventsp = POLLIN|POLLRDNORM;
+ } else {
+ *reventsp = 0;
+ }
pcachelink_assoc(pcp, ps->ps_pc_stack[0]);
pollstate_exit(pcp);
} else {
switch (res) {
case PSE_FAIL_DEPTH: