Print this page
8634 epoll fails to wake on certain edge-triggered conditions
8635 epoll should not emit POLLNVAL
8636 recursive epoll should emit EPOLLRDNORM
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Toomas Soome <tsoome@me.com>
Reviewed by: Igor Kozhukhov <igor@dilos.org>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/io/devpoll.c
          +++ new/usr/src/uts/common/io/devpoll.c
↓ open down ↓ 17 lines elided ↑ open up ↑
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
  25   25  
  26   26  /*
  27   27   * Copyright (c) 2012 by Delphix. All rights reserved.
  28      - * Copyright 2016 Joyent, Inc.
       28 + * Copyright 2017 Joyent, Inc.
  29   29   */
  30   30  
  31   31  #include <sys/types.h>
  32   32  #include <sys/devops.h>
  33   33  #include <sys/conf.h>
  34   34  #include <sys/modctl.h>
  35   35  #include <sys/sunddi.h>
  36   36  #include <sys/stat.h>
  37   37  #include <sys/poll_impl.h>
  38   38  #include <sys/errno.h>
↓ open down ↓ 320 lines elided ↑ open up ↑
 359  359                                                  pdp->pd_php = NULL;
 360  360                                          }
 361  361  
 362  362                                          BT_CLEAR(pcp->pc_bitmap, fd);
 363  363                                          continue;
 364  364                                  }
 365  365                          }
 366  366  
 367  367                          if (fp != pdp->pd_fp) {
 368  368                                  /*
 369      -                                 * user is polling on a cached fd which was
 370      -                                 * closed and then reused. Unfortunately
 371      -                                 * there is no good way to inform user.
 372      -                                 * If the file struct is also reused, we
 373      -                                 * may not be able to detect the fd reuse
 374      -                                 * at all.  As long as this does not
 375      -                                 * cause system failure and/or memory leak,
 376      -                                 * we will play along. Man page states if
 377      -                                 * user does not clean up closed fds, polling
      369 +                                 * The user is polling on a cached fd which was
      370 +                                 * closed and then reused.  Unfortunately there
      371 +                                 * is no good way to communicate this fact to
      372 +                                 * the consumer.
      373 +                                 *
      374 +                                 * If the file struct is also reused, we may
      375 +                                 * not be able to detect the fd reuse at all.
      376 +                                 * As long as this does not cause system
      377 +                                 * failure and/or memory leaks, we will play
      378 +                                 * along.  The man page states that if the user
      379 +                                 * does not clean up closed fds, polling
 378  380                                   * results will be indeterministic.
 379  381                                   *
 380      -                                 * XXX - perhaps log the detection of fd
 381      -                                 *       reuse?
      382 +                                 * XXX: perhaps log the detection of fd reuse?
 382  383                                   */
 383  384                                  pdp->pd_fp = fp;
      385 +
      386 +                                /*
      387 +                                 * When this situation has been detected, it's
      388 +                                 * likely that any existing pollhead is
      389 +                                 * ill-suited to perform proper wake-ups.
      390 +                                 *
      391 +                                 * Clean up the old entry under the expectation
      392 +                                 * that a valid one will be provided as part of
      393 +                                 * the later VOP_POLL.
      394 +                                 */
      395 +                                if (pdp->pd_php != NULL) {
      396 +                                        pollhead_delete(pdp->pd_php, pdp);
      397 +                                        pdp->pd_php = NULL;
      398 +                                }
 384  399                          }
 385  400                          /*
 386  401                           * XXX - pollrelock() logic needs to know which
 387  402                           * which pollcache lock to grab. It'd be a
 388  403                           * cleaner solution if we could pass pcp as
 389  404                           * an arguement in VOP_POLL interface instead
 390  405                           * of implicitly passing it using thread_t
 391  406                           * struct. On the other hand, changing VOP_POLL
 392  407                           * interface will require all driver/file system
 393  408                           * poll routine to change. May want to revisit
 394  409                           * the tradeoff later.
 395  410                           */
 396  411                          curthread->t_pollcache = pcp;
 397  412                          error = VOP_POLL(fp->f_vnode, pdp->pd_events, 0,
 398  413                              &revent, &php, NULL);
      414 +
      415 +                        /*
      416 +                         * Recheck edge-triggered descriptors which lack a
      417 +                         * pollhead.  While this check is performed when an fd
      418 +                         * is added to the pollcache in dpwrite(), subsequent
      419 +                         * descriptor manipulation could cause a different
      420 +                         * resource to be present now.
      421 +                         */
      422 +                        if ((pdp->pd_events & POLLET) && error == 0 &&
      423 +                            pdp->pd_php == NULL && php == NULL && revent != 0) {
      424 +                                short levent = 0;
      425 +
      426 +                                /*
      427 +                                 * The same POLLET-only VOP_POLL is used in an
      428 +                                 * attempt to coax a pollhead from older
      429 +                                 * driver logic.
      430 +                                 */
      431 +                                error = VOP_POLL(fp->f_vnode, POLLET,
      432 +                                    0, &levent, &php, NULL);
      433 +                        }
      434 +
 399  435                          curthread->t_pollcache = NULL;
 400  436                          releasef(fd);
 401  437                          if (error != 0) {
 402  438                                  break;
 403  439                          }
 404  440  
 405  441                          /*
 406  442                           * layered devices (e.g. console driver)
 407  443                           * may change the vnode and thus the pollhead
 408  444                           * pointer out from underneath us.
↓ open down ↓ 15 lines elided ↑ open up ↑
 424  460                                          pfdp[fdcnt].fd = fd;
 425  461                                          pfdp[fdcnt].events = pdp->pd_events;
 426  462                                          pfdp[fdcnt].revents = revent;
 427  463                                  } else if (epoll != NULL) {
 428  464                                          epoll_event_t *ep = &epoll[fdcnt];
 429  465  
 430  466                                          ASSERT(epoll != NULL);
 431  467                                          ep->data.u64 = pdp->pd_epolldata;
 432  468  
 433  469                                          /*
      470 +                                         * Since POLLNVAL is a legal event for
      471 +                                         * VOP_POLL handlers to emit, it must
      472 +                                         * be translated epoll-legal.
      473 +                                         */
      474 +                                        if (revent & POLLNVAL) {
      475 +                                                revent &= ~POLLNVAL;
      476 +                                                revent |= POLLERR;
      477 +                                        }
      478 +
      479 +                                        /*
 434  480                                           * If any of the event bits are set for
 435  481                                           * which poll and epoll representations
 436  482                                           * differ, swizzle in the native epoll
 437  483                                           * values.
 438  484                                           */
 439  485                                          if (revent & mask) {
 440  486                                                  ep->events = (revent & ~mask) |
 441  487                                                      ((revent & POLLRDHUP) ?
 442  488                                                      EPOLLRDHUP : 0) |
 443  489                                                      ((revent & POLLWRBAND) ?
↓ open down ↓ 37 lines elided ↑ open up ↑
 481  527                                              != 0) {
 482  528                                                  ps->ps_flags &=
 483  529                                                      ~POLLSTATE_ULFAIL;
 484  530                                                  continue;
 485  531                                          } else {
 486  532                                                  fdcnt++;
 487  533                                                  break;
 488  534                                          }
 489  535                                  }
 490  536  
 491      -                                /*
 492      -                                 * If POLLET is set, clear the bit in the
 493      -                                 * bitmap -- which effectively latches the
 494      -                                 * edge on a pollwakeup() from the driver.
 495      -                                 */
 496      -                                if (pdp->pd_events & POLLET)
 497      -                                        BT_CLEAR(pcp->pc_bitmap, fd);
 498      -
 499      -                                /*
 500      -                                 * If POLLONESHOT is set, perform the implicit
 501      -                                 * POLLREMOVE.
 502      -                                 */
      537 +                                /* Handle special polling modes. */
 503  538                                  if (pdp->pd_events & POLLONESHOT) {
      539 +                                        /*
      540 +                                         * If POLLONESHOT is set, perform the
      541 +                                         * implicit POLLREMOVE.
      542 +                                         */
 504  543                                          pdp->pd_fp = NULL;
 505  544                                          pdp->pd_events = 0;
 506  545  
 507  546                                          if (pdp->pd_php != NULL) {
 508  547                                                  pollhead_delete(pdp->pd_php,
 509  548                                                      pdp);
 510  549                                                  pdp->pd_php = NULL;
 511  550                                          }
 512  551  
 513  552                                          BT_CLEAR(pcp->pc_bitmap, fd);
      553 +                                } else if (pdp->pd_events & POLLET) {
      554 +                                        /*
      555 +                                         * Wire up the pollhead which should
      556 +                                         * have been provided.  Edge-triggered
      557 +                                         * polling cannot function properly
      558 +                                         * with drivers which do not emit one.
      559 +                                         */
      560 +                                        if (php != NULL &&
      561 +                                            pdp->pd_php == NULL) {
      562 +                                                pollhead_insert(php, pdp);
      563 +                                                pdp->pd_php = php;
      564 +                                        }
      565 +
      566 +                                        /*
      567 +                                         * If the driver has emitted a pollhead,
      568 +                                         * clear the bit in the bitmap which
      569 +                                         * effectively latches the edge on a
      570 +                                         * pollwakeup() from the driver.
      571 +                                         */
      572 +                                        if (pdp->pd_php != NULL) {
      573 +                                                BT_CLEAR(pcp->pc_bitmap, fd);
      574 +                                        }
 514  575                                  }
 515  576  
 516  577                                  fdcnt++;
 517  578                          } else if (php != NULL) {
 518  579                                  /*
 519  580                                   * We clear a bit or cache a poll fd if
 520  581                                   * the driver returns a poll head ptr,
 521  582                                   * which is expected in the case of 0
 522  583                                   * revents. Some buggy driver may return
 523  584                                   * NULL php pointer with 0 revents. In
↓ open down ↓ 386 lines elided ↑ open up ↑
 910  971                           * an arguement in VOP_POLL interface instead
 911  972                           * of implicitly passing it using thread_t
 912  973                           * struct. On the other hand, changing VOP_POLL
 913  974                           * interface will require all driver/file system
 914  975                           * poll routine to change. May want to revisit
 915  976                           * the tradeoff later.
 916  977                           */
 917  978                          curthread->t_pollcache = pcp;
 918  979                          error = VOP_POLL(fp->f_vnode, pfdp->events, 0,
 919  980                              &pfdp->revents, &php, NULL);
      981 +
      982 +                        /*
      983 +                         * Edge-triggered polling requires a pollhead in order
      984 +                         * to initiate wake-ups properly.  Drivers which are
      985 +                         * savvy to POLLET presence, which should include
      986 +                         * everything in-gate, will always emit one, regardless
      987 +                         * of revent status.  Older drivers which only emit a
      988 +                         * pollhead if 'revents == 0' are given a second chance
      989 +                         * here via a second VOP_POLL, with only POLLET set in
      990 +                         * the events of interest.  These circumstances should
      991 +                         * induce any cacheable drivers to emit a pollhead for
      992 +                         * wake-ups.
      993 +                         *
      994 +                         * Drivers which never emit a pollhead will simply
      995 +                         * disobey the exectation of edge-triggered behavior.
      996 +                         * This includes recursive epoll which, even on Linux,
      997 +                         * yields its events in a level-triggered fashion only.
      998 +                         */
      999 +                        if ((pdp->pd_events & POLLET) && error == 0 &&
     1000 +                            php == NULL) {
     1001 +                                short levent = 0;
     1002 +
     1003 +                                error = VOP_POLL(fp->f_vnode, POLLET, 0,
     1004 +                                    &levent, &php, NULL);
     1005 +                        }
     1006 +
 920 1007                          curthread->t_pollcache = NULL;
 921 1008                          /*
 922 1009                           * We always set the bit when this fd is cached;
 923 1010                           * this forces the first DP_POLL to poll this fd.
 924 1011                           * Real performance gain comes from subsequent
 925 1012                           * DP_POLL.  We also attempt a pollhead_insert();
 926 1013                           * if it's not possible, we'll do it in dpioctl().
 927 1014                           */
 928 1015                          BT_SET(pcp->pc_bitmap, fd);
 929 1016                          if (error != 0) {
↓ open down ↓ 521 lines elided ↑ open up ↑
1451 1538                  pcp = dpep->dpe_pcache;
1452 1539                  mutex_exit(&dpep->dpe_lock);
1453 1540          }
1454 1541  
1455 1542          res = pollstate_enter(pcp);
1456 1543          if (res == PSE_SUCCESS) {
1457 1544                  nfds_t          nfds = 1;
1458 1545                  int             fdcnt = 0;
1459 1546                  pollstate_t     *ps = curthread->t_pollstate;
1460 1547  
1461      -                rc = dp_pcache_poll(dpep, NULL, pcp, nfds, &fdcnt);
1462      -                if (rc == 0) {
1463      -                        *reventsp = (fdcnt > 0) ? POLLIN : 0;
     1548 +                /*
     1549 +                 * Recursive polling will only emit certain events.  Skip a
     1550 +                 * scan of the pollcache if those events are not of interest.
     1551 +                 */
     1552 +                if (events & (POLLIN|POLLRDNORM)) {
     1553 +                        rc = dp_pcache_poll(dpep, NULL, pcp, nfds, &fdcnt);
     1554 +                } else {
     1555 +                        rc = 0;
     1556 +                        fdcnt = 0;
1464 1557                  }
     1558 +
     1559 +                if (rc == 0 && fdcnt > 0) {
     1560 +                        *reventsp = POLLIN|POLLRDNORM;
     1561 +                } else {
     1562 +                        *reventsp = 0;
     1563 +                }
1465 1564                  pcachelink_assoc(pcp, ps->ps_pc_stack[0]);
1466 1565                  pollstate_exit(pcp);
1467 1566          } else {
1468 1567                  switch (res) {
1469 1568                  case PSE_FAIL_DEPTH:
1470 1569                          rc = EINVAL;
1471 1570                          break;
1472 1571                  case PSE_FAIL_LOOP:
1473 1572                  case PSE_FAIL_DEADLOCK:
1474 1573                          rc = ELOOP;
↓ open down ↓ 254 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX