OS-5566 ppoll timeout calculation can overflow
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Alex Wilson <alex.wilson@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
OS-4656 nested epoll does not mimic Linux behavior
Reviewed by: Bryan Cantrill <bryan@joyent.com>
OS-5162 poll/select yield improper EINTR when nfds and timeout are 0
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
OS-4830 lxbrand convert select/poll to IKE
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
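Background on OS-5566: the deadline computed in poll_common() is a signed 64-bit hrtime_t count of nanoseconds, so any timeout of HRTIME_MAX / NANOSEC seconds or more (roughly 292 years) overflows the old "(hrtime_t)tsp->tv_sec * NANOSEC" product. A minimal userland sketch of how such a timeout can reach this code through pselect(), which (per the comment in this file) is built on pollsys(); this is an illustration under that assumption, not part of the change:

#include <sys/select.h>
#include <limits.h>
#include <unistd.h>

int
main(void)
{
        /* A huge but valid timeout, intended to mean "wait forever". */
        struct timespec ts = { LONG_MAX, 0 };
        fd_set rfds;

        FD_ZERO(&rfds);
        FD_SET(STDIN_FILENO, &rfds);
        /*
         * Before this change the wrapped deadline could land at or near the
         * current time, so the call might return after roughly one clock
         * tick; with the fix the timeout is treated as indefinite.
         */
        return (pselect(STDIN_FILENO + 1, &rfds, NULL, NULL, &ts, NULL));
}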
        
*** 27,37 ****
  /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  /*        All Rights Reserved   */
  
  /*
   * Copyright (c) 2012 by Delphix. All rights reserved.
!  * Copyright 2015, Joyent, Inc.
   */
  
  /*
   * Portions of this source code were derived from Berkeley 4.3 BSD
   * under license from the Regents of the University of California.
--- 27,37 ----
  /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  /*        All Rights Reserved   */
  
  /*
   * Copyright (c) 2012 by Delphix. All rights reserved.
!  * Copyright 2016, Joyent, Inc.
   */
  
  /*
   * Portions of this source code were derived from Berkeley 4.3 BSD
   * under license from the Regents of the University of California.
*** 315,381 ****
                  pollrelock(state);
          }
          return (0);
  }
  
! static int
! poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
  {
          kthread_t *t = curthread;
-         klwp_t *lwp = ttolwp(t);
-         proc_t *p = ttoproc(t);
-         int fdcnt = 0;
-         int i;
          hrtime_t deadline; /* hrtime value when we want to return */
          pollfd_t *pollfdp;
-         pollstate_t *ps;
          pollcache_t *pcp;
          int error = 0;
-         nfds_t old_nfds;
          int cacheindex = 0;     /* which cache set is used */
  
          /*
           * Determine the precise future time of the requested timeout, if any.
           */
          if (tsp == NULL) {
                  deadline = -1;
          } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
                  deadline = 0;
          } else {
                  /* They must wait at least a tick. */
!                 deadline = ((hrtime_t)tsp->tv_sec * NANOSEC) + tsp->tv_nsec;
!                 deadline = MAX(deadline, nsec_per_tick);
!                 deadline += gethrtime();
!         }
  
          /*
!          * Reset our signal mask, if requested.
           */
!         if (ksetp != NULL) {
!                 mutex_enter(&p->p_lock);
!                 schedctl_finish_sigblock(t);
!                 lwp->lwp_sigoldmask = t->t_hold;
!                 t->t_hold = *ksetp;
!                 t->t_flag |= T_TOMASK;
!                 /*
!                  * Call cv_reltimedwait_sig() just to check for signals.
!                  * We will return immediately with either 0 or -1.
!                  */
!                 if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
!                     TR_CLOCK_TICK)) {
!                         mutex_exit(&p->p_lock);
!                         error = EINTR;
!                         goto pollout;
                  }
-                 mutex_exit(&p->p_lock);
          }
  
          /*
!          * Check to see if this guy just wants to use poll() as a timeout.
           * If yes then bypass all the other stuff and make him sleep.
           */
          if (nfds == 0) {
                  /*
                   * Sleep until we have passed the requested future
                   * time or until interrupted by a signal.
                   * Do not check for signals if we do not want to wait.
                   */
--- 315,420 ----
                  pollrelock(state);
          }
          return (0);
  }
  
! int
! poll_copyin(pollstate_t *ps, pollfd_t *fds, nfds_t nfds)
  {
+         pollfd_t *pollfdp;
+         nfds_t old_nfds;
+ 
+         /*
+          * NOTE: for performance, buffers are saved across poll() calls.
+          * The theory is that if a process polls heavily, it tends to poll
+          * on the same set of descriptors.  Therefore, we only reallocate
+          * buffers when nfds changes.  There is no hysteresis control,
+          * because there is no data to suggest that this is necessary;
+          * the penalty of reallocating is not *that* great in any event.
+          */
+         old_nfds = ps->ps_nfds;
+         if (nfds != old_nfds) {
+                 kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t));
+                 pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
+                 ps->ps_pollfd = pollfdp;
+                 ps->ps_nfds = nfds;
+         }
+ 
+         pollfdp = ps->ps_pollfd;
+         if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) {
+                 return (EFAULT);
+         }
+ 
+         if (fds == NULL) {
+                 /*
+                  * If the process has page 0 mapped, then the copyin() above
+                  * will succeed even if fds is NULL.  However, our cached
+                  * poll lists are keyed by the address of the passed-in fds
+                  * structure, and we use the value NULL to indicate an unused
+                  * poll cache list entry.  As such, we elect not to support
+                  * NULL as a valid (user) memory address and fail the poll()
+                  * call.
+                  */
+                 return (EFAULT);
+         }
+         return (0);
+ }
+ 
+ int
+ poll_common(pollstate_t *ps, pollfd_t *fds, nfds_t nfds, timespec_t *tsp,
+     int *fdcnt)
+ {
          kthread_t *t = curthread;
          hrtime_t deadline; /* hrtime value when we want to return */
          pollfd_t *pollfdp;
          pollcache_t *pcp;
          int error = 0;
          int cacheindex = 0;     /* which cache set is used */
  
          /*
           * Determine the precise future time of the requested timeout, if any.
           */
          if (tsp == NULL) {
                  deadline = -1;
          } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
                  deadline = 0;
+         } else if (tsp->tv_sec >= HRTIME_MAX/NANOSEC) {
+                 /* Use an indefinite timeout if tv_sec would cause overflow */
+                 deadline = -1;
          } else {
+                 /*
+                  * The above check, when combined with the protections offered
+                  * by itimerspecfix (ensuring that neither field is negative
+                  * and that tv_nsec represents less than a whole second), will
+                  * prevent overflow during the conversion from timespec_t to
+                  * uhrtime_t.
+                  */
+                 uhrtime_t utime = tsp->tv_sec * NANOSEC;
+                 utime += tsp->tv_nsec;
+ 
                  /* They must wait at least a tick. */
!                 utime = MAX(utime, nsec_per_tick);
  
                  /*
!                  * Since utime has an upper bound of HRTIME_MAX, adding the
!                  * gethrtime() result cannot incur an overflow as the unsigned
!                  * type has an adequate bound.
                   */
!                 utime += (uhrtime_t)gethrtime();
!                 if (utime > HRTIME_MAX) {
!                         deadline = -1;
!                 } else {
!                         deadline = (hrtime_t)utime;
                  }
          }
  
          /*
!          * Check to see if the caller just wants to use poll() as a timeout.
           * If yes then bypass all the other stuff and make him sleep.
           */
          if (nfds == 0) {
+                 *fdcnt = 0;
                  /*
                   * Sleep until we have passed the requested future
                   * time or until interrupted by a signal.
                   * Do not check for signals if we do not want to wait.
                   */
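The new deadline logic above can be exercised in isolation. Below is a standalone sketch that mirrors it, assuming hrtime_t and uhrtime_t correspond to int64_t and uint64_t, HRTIME_MAX to INT64_MAX, and NANOSEC to 1000000000LL; poll_deadline() is a hypothetical helper written only for illustration, not part of the patch:

#include <stdint.h>

#define NANOSEC         1000000000LL
#define HRTIME_MAX      INT64_MAX

/*
 * Returns -1 for "wait forever", 0 for "do not wait", or an absolute
 * deadline in nanoseconds, following the same steps as poll_common().
 */
static int64_t
poll_deadline(int64_t tv_sec, int64_t tv_nsec, int64_t now,
    int64_t nsec_per_tick)
{
        uint64_t utime;

        if (tv_sec == 0 && tv_nsec == 0)
                return (0);
        if (tv_sec >= HRTIME_MAX / NANOSEC)
                return (-1);            /* conversion would overflow */
        /*
         * tv_sec is bounded above and tv_nsec is less than one second
         * (as itimerspecfix guarantees), so this sum stays below
         * HRTIME_MAX.
         */
        utime = (uint64_t)tv_sec * NANOSEC + tv_nsec;
        if (utime < (uint64_t)nsec_per_tick)
                utime = nsec_per_tick;  /* wait at least one tick */
        /*
         * utime and now are each at most HRTIME_MAX, so the unsigned sum
         * cannot wrap; it may still exceed HRTIME_MAX, in which case the
         * timeout is treated as indefinite.
         */
        utime += (uint64_t)now;
        return (utime > HRTIME_MAX ? -1 : (int64_t)utime);
}

For example, poll_deadline(LONG_MAX, 0, now, nsec_per_tick) yields -1 (indefinite) rather than a wrapped deadline near or before "now".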
*** 383,454 ****
                          mutex_enter(&t->t_delay_lock);
                          while ((error = cv_timedwait_sig_hrtime(&t->t_delay_cv,
                              &t->t_delay_lock, deadline)) > 0)
                                  continue;
                          mutex_exit(&t->t_delay_lock);
!                         error = (error == 0) ? EINTR : 0;
                  }
!                 goto pollout;
          }
  
!         if (nfds > p->p_fno_ctl) {
!                 mutex_enter(&p->p_lock);
!                 (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
!                     p->p_rctls, p, RCA_SAFE);
!                 mutex_exit(&p->p_lock);
!                 error = EINVAL;
!                 goto pollout;
!         }
! 
!         /*
!          * Need to allocate memory for pollstate before anything because
!          * the mutex and cv are created in this space
!          */
!         ps = pollstate_create();
! 
!         if (ps->ps_pcache == NULL)
!                 ps->ps_pcache = pcache_alloc();
!         pcp = ps->ps_pcache;
! 
!         /*
!          * NOTE: for performance, buffers are saved across poll() calls.
!          * The theory is that if a process polls heavily, it tends to poll
!          * on the same set of descriptors.  Therefore, we only reallocate
!          * buffers when nfds changes.  There is no hysteresis control,
!          * because there is no data to suggest that this is necessary;
!          * the penalty of reallocating is not *that* great in any event.
!          */
!         old_nfds = ps->ps_nfds;
!         if (nfds != old_nfds) {
! 
!                 kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t));
!                 pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
!                 ps->ps_pollfd = pollfdp;
!                 ps->ps_nfds = nfds;
!         }
! 
          pollfdp = ps->ps_pollfd;
!         if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) {
!                 error = EFAULT;
!                 goto pollout;
!         }
  
-         if (fds == NULL) {
                  /*
-                  * If the process has page 0 mapped, then the copyin() above
-                  * will succeed even if fds is NULL.  However, our cached
-                  * poll lists are keyed by the address of the passed-in fds
-                  * structure, and we use the value NULL to indicate an unused
-                  * poll cache list entry.  As such, we elect not to support
-                  * NULL as a valid (user) memory address and fail the poll()
-                  * call.
-                  */
-                 error = EINVAL;
-                 goto pollout;
-         }
- 
-         /*
           * If this thread polls for the first time, allocate ALL poll
           * cache data structures and cache the poll fd list. This
           * allocation is delayed till now because lwp's polling 0 fd
           * (i.e. using poll as timeout()) don't need this memory.
           */
--- 422,441 ----
                          mutex_enter(&t->t_delay_lock);
                          while ((error = cv_timedwait_sig_hrtime(&t->t_delay_cv,
                              &t->t_delay_lock, deadline)) > 0)
                                  continue;
                          mutex_exit(&t->t_delay_lock);
!                         return ((error == 0) ? EINTR : 0);
                  }
!                 return (0);
          }
  
!         VERIFY(ps != NULL);
          pollfdp = ps->ps_pollfd;
!         VERIFY(pollfdp != NULL);
  
          /*
           * If this thread polls for the first time, allocate ALL poll
           * cache data structures and cache the poll fd list. This
           * allocation is delayed till now because lwp's polling 0 fd
           * (i.e. using poll as timeout()) don't need this memory.
           */
*** 458,471 ****
          if (pcp->pc_bitmap == NULL) {
                  pcache_create(pcp, nfds);
                  /*
                   * poll and cache this poll fd list in ps_pcacheset[0].
                   */
!                 error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex);
!                 if (fdcnt || error) {
                          mutex_exit(&ps->ps_lock);
!                         goto pollout;
                  }
          } else {
                  pollcacheset_t  *pcset = ps->ps_pcacheset;
  
                  /*
--- 445,458 ----
          if (pcp->pc_bitmap == NULL) {
                  pcache_create(pcp, nfds);
                  /*
                   * poll and cache this poll fd list in ps_pcacheset[0].
                   */
!                 error = pcacheset_cache_list(ps, fds, fdcnt, cacheindex);
!                 if (error || *fdcnt) {
                          mutex_exit(&ps->ps_lock);
!                         return (error);
                  }
          } else {
                  pollcacheset_t  *pcset = ps->ps_pcacheset;
  
                  /*
*** 486,500 ****
                                   * list and previously cached one.
                                   * If there is an error during resolve(),
                                   * the callee will guarantee the consistency
                                   * of cached poll list and cache content.
                                   */
!                                 error = pcacheset_resolve(ps, nfds, &fdcnt,
                                      cacheindex);
                                  if (error) {
                                          mutex_exit(&ps->ps_lock);
!                                         goto pollout;
                                  }
                                  break;
                          }
  
                          /*
--- 473,487 ----
                                   * list and previously cached one.
                                   * If there is an error during resolve(),
                                   * the callee will guarantee the consistency
                                   * of cached poll list and cache content.
                                   */
!                                 error = pcacheset_resolve(ps, nfds, fdcnt,
                                      cacheindex);
                                  if (error) {
                                          mutex_exit(&ps->ps_lock);
!                                         return (error);
                                  }
                                  break;
                          }
  
                          /*
*** 507,521 ****
                          if (pcset[cacheindex].pcs_usradr == NULL) {
                                  /*
                                   * found an unused entry. Use it to cache
                                   * this poll list.
                                   */
!                                 error = pcacheset_cache_list(ps, fds, &fdcnt,
                                      cacheindex);
!                                 if (fdcnt || error) {
                                          mutex_exit(&ps->ps_lock);
!                                         goto pollout;
                                  }
                                  break;
                          }
                  }
                  if (cacheindex == ps->ps_nsets) {
--- 494,508 ----
                          if (pcset[cacheindex].pcs_usradr == NULL) {
                                  /*
                                   * found an unused entry. Use it to cache
                                   * this poll list.
                                   */
!                                 error = pcacheset_cache_list(ps, fds, fdcnt,
                                      cacheindex);
!                                 if (error || *fdcnt) {
                                          mutex_exit(&ps->ps_lock);
!                                         return (error);
                                  }
                                  break;
                          }
                  }
                  if (cacheindex == ps->ps_nsets) {
*** 525,538 ****
                           */
                          pollstats.polllistmiss.value.ui64++;
                          cacheindex = pcacheset_replace(ps);
                          ASSERT(cacheindex < ps->ps_nsets);
                          pcset[cacheindex].pcs_usradr = (uintptr_t)fds;
!                         error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex);
                          if (error) {
                                  mutex_exit(&ps->ps_lock);
!                                 goto pollout;
                          }
                  }
          }
  
          /*
--- 512,525 ----
                           */
                          pollstats.polllistmiss.value.ui64++;
                          cacheindex = pcacheset_replace(ps);
                          ASSERT(cacheindex < ps->ps_nsets);
                          pcset[cacheindex].pcs_usradr = (uintptr_t)fds;
!                         error = pcacheset_resolve(ps, nfds, fdcnt, cacheindex);
                          if (error) {
                                  mutex_exit(&ps->ps_lock);
!                                 return (error);
                          }
                  }
          }
  
          /*
*** 546,557 ****
           * the pcache before updating poll bitmap.
           */
          mutex_enter(&pcp->pc_lock);
          for (;;) {
                  pcp->pc_flag = 0;
!                 error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex);
!                 if (fdcnt || error) {
                          mutex_exit(&pcp->pc_lock);
                          mutex_exit(&ps->ps_lock);
                          break;
                  }
  
--- 533,544 ----
           * the pcache before updating poll bitmap.
           */
          mutex_enter(&pcp->pc_lock);
          for (;;) {
                  pcp->pc_flag = 0;
!                 error = pcache_poll(pollfdp, ps, nfds, fdcnt, cacheindex);
!                 if (error || *fdcnt) {
                          mutex_exit(&pcp->pc_lock);
                          mutex_exit(&ps->ps_lock);
                          break;
                  }
  
*** 593,609 ****
                   */
                  mutex_enter(&ps->ps_lock);
                  mutex_enter(&pcp->pc_lock);
          }
  
  pollout:
          /*
!          * If we changed the signal mask but we received
!          * no signal then restore the signal mask.
!          * Otherwise psig() will deal with the signal mask.
           */
!         if (ksetp != NULL) {
                  mutex_enter(&p->p_lock);
                  if (lwp->lwp_cursig == 0) {
                          t->t_hold = lwp->lwp_sigoldmask;
                          t->t_flag &= ~T_TOMASK;
                  }
--- 580,699 ----
                   */
                  mutex_enter(&ps->ps_lock);
                  mutex_enter(&pcp->pc_lock);
          }
  
+         return (error);
+ }
+ 
+ /*
+  * This is the system call trap that poll(),
+  * select() and pselect() are built upon.
+  * It is a private interface between libc and the kernel.
+  */
+ int
+ pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp)
+ {
+         kthread_t *t = curthread;
+         klwp_t *lwp = ttolwp(t);
+         proc_t *p = ttoproc(t);
+         timespec_t ts;
+         timespec_t *tsp;
+         k_sigset_t kset;
+         pollstate_t *ps = NULL;
+         pollfd_t *pollfdp = NULL;
+         int error = 0, fdcnt = 0;
+ 
+         /*
+          * Copy in timeout
+          */
+         if (timeoutp == NULL) {
+                 tsp = NULL;
+         } else {
+                 if (get_udatamodel() == DATAMODEL_NATIVE) {
+                         if (copyin(timeoutp, &ts, sizeof (ts)))
+                                 return (set_errno(EFAULT));
+                 } else {
+                         timespec32_t ts32;
+ 
+                         if (copyin(timeoutp, &ts32, sizeof (ts32)))
+                                 return (set_errno(EFAULT));
+                         TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
+                 }
+ 
+                 if (itimerspecfix(&ts))
+                         return (set_errno(EINVAL));
+                 tsp = &ts;
+         }
+ 
+         /*
+          * Copy in and reset signal mask, if requested.
+          */
+         if (setp != NULL) {
+                 sigset_t set;
+ 
+                 if (copyin(setp, &set, sizeof (set)))
+                         return (set_errno(EFAULT));
+                 sigutok(&set, &kset);
+ 
+                 mutex_enter(&p->p_lock);
+                 schedctl_finish_sigblock(t);
+                 lwp->lwp_sigoldmask = t->t_hold;
+                 t->t_hold = kset;
+                 t->t_flag |= T_TOMASK;
+                 /*
+                  * Call cv_reltimedwait_sig() just to check for signals.
+                  * We will return immediately with either 0 or -1.
+                  */
+                 if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
+                     TR_CLOCK_TICK)) {
+                         mutex_exit(&p->p_lock);
+                         error = EINTR;
+                         goto pollout;
+                 }
+                 mutex_exit(&p->p_lock);
+         }
+ 
+         /*
+          * Initialize pollstate and copy in pollfd data if present.
+          * If nfds == 0, we will skip all of the copying and check steps and
+          * proceed directly into poll_common to process the supplied timeout.
+          */
+         if (nfds != 0) {
+                 if (nfds > p->p_fno_ctl) {
+                         mutex_enter(&p->p_lock);
+                         (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
+                             p->p_rctls, p, RCA_SAFE);
+                         mutex_exit(&p->p_lock);
+                         error = EINVAL;
+                         goto pollout;
+                 }
+ 
+                 /*
+                  * Need to allocate memory for pollstate before anything
+                  * because the mutex and cv are created in this space
+                  */
+                 ps = pollstate_create();
+                 if (ps->ps_pcache == NULL)
+                         ps->ps_pcache = pcache_alloc();
+ 
+                 if ((error = poll_copyin(ps, fds, nfds)) != 0)
+                         goto pollout;
+                 pollfdp = ps->ps_pollfd;
+         }
+ 
+         /*
+          * Perform the actual poll.
+          */
+         error = poll_common(ps, fds, nfds, tsp, &fdcnt);
+ 
  pollout:
          /*
!          * If we changed the signal mask but we received no signal then restore
!          * the signal mask.  Otherwise psig() will deal with the signal mask.
           */
!         if (setp != NULL) {
                  mutex_enter(&p->p_lock);
                  if (lwp->lwp_cursig == 0) {
                          t->t_hold = lwp->lwp_sigoldmask;
                          t->t_flag &= ~T_TOMASK;
                  }
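The sigmask handling above (saving t_hold, installing the caller's set, and probing for already-pending signals before the wait) is what gives pselect() its race-free behavior. A userland sketch of the pattern that relies on it, using only standard POSIX calls; illustrative only, not part of the change:

#include <signal.h>
#include <sys/select.h>
#include <unistd.h>

static volatile sig_atomic_t got_sigchld;

static void
on_sigchld(int sig)
{
        got_sigchld = 1;
}

int
main(void)
{
        struct sigaction sa = { 0 };
        sigset_t blocked, orig;
        fd_set rfds;

        sa.sa_handler = on_sigchld;
        (void) sigaction(SIGCHLD, &sa, NULL);

        /* Keep SIGCHLD blocked except while actually waiting. */
        (void) sigemptyset(&blocked);
        (void) sigaddset(&blocked, SIGCHLD);
        (void) sigprocmask(SIG_BLOCK, &blocked, &orig);

        for (;;) {
                if (got_sigchld) {
                        got_sigchld = 0;
                        /* reap children here ... */
                }
                FD_ZERO(&rfds);
                FD_SET(STDIN_FILENO, &rfds);
                /*
                 * pselect() installs 'orig' for the duration of the wait and
                 * restores the old mask afterwards (the same swap pollsys()
                 * performs on t_hold above), so a SIGCHLD arriving between
                 * the flag check and the wait is delivered inside pselect()
                 * rather than being missed.
                 */
                if (pselect(STDIN_FILENO + 1, &rfds, NULL, NULL, NULL,
                    &orig) == -1) {
                        continue;       /* EINTR: recheck the flag */
                }
                /* ... handle readable stdin ... */
        }
}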
*** 610,633 ****
                  mutex_exit(&p->p_lock);
          }
  
          if (error)
                  return (set_errno(error));
- 
          /*
           * Copy out the events and return the fdcnt to the user.
           */
!         if (nfds != 0 &&
!             copyout(pollfdp, fds, nfds * sizeof (pollfd_t)))
                  return (set_errno(EFAULT));
  
  #ifdef DEBUG
          /*
           * Another sanity check:
           */
          if (fdcnt) {
!                 int     reventcnt = 0;
  
                  for (i = 0; i < nfds; i++) {
                          if (pollfdp[i].fd < 0) {
                                  ASSERT(pollfdp[i].revents == 0);
                                  continue;
--- 700,721 ----
                  mutex_exit(&p->p_lock);
          }
  
          if (error)
                  return (set_errno(error));
          /*
           * Copy out the events and return the fdcnt to the user.
           */
!         if (nfds != 0 && copyout(pollfdp, fds, nfds * sizeof (pollfd_t)))
                  return (set_errno(EFAULT));
  
  #ifdef DEBUG
          /*
           * Another sanity check:
           */
          if (fdcnt) {
!                 int i, reventcnt = 0;
  
                  for (i = 0; i < nfds; i++) {
                          if (pollfdp[i].fd < 0) {
                                  ASSERT(pollfdp[i].revents == 0);
                                  continue;
*** 636,700 ****
                                  reventcnt++;
                          }
                  }
                  ASSERT(fdcnt == reventcnt);
          } else {
                  for (i = 0; i < nfds; i++) {
                          ASSERT(pollfdp[i].revents == 0);
                  }
          }
  #endif  /* DEBUG */
  
          return (fdcnt);
  }
  
- /*
-  * This is the system call trap that poll(),
-  * select() and pselect() are built upon.
-  * It is a private interface between libc and the kernel.
-  */
- int
- pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp)
- {
-         timespec_t ts;
-         timespec_t *tsp;
-         sigset_t set;
-         k_sigset_t kset;
-         k_sigset_t *ksetp;
-         model_t datamodel = get_udatamodel();
- 
-         if (timeoutp == NULL)
-                 tsp = NULL;
-         else {
-                 if (datamodel == DATAMODEL_NATIVE) {
-                         if (copyin(timeoutp, &ts, sizeof (ts)))
-                                 return (set_errno(EFAULT));
-                 } else {
-                         timespec32_t ts32;
- 
-                         if (copyin(timeoutp, &ts32, sizeof (ts32)))
-                                 return (set_errno(EFAULT));
-                         TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
-                 }
- 
-                 if (itimerspecfix(&ts))
-                         return (set_errno(EINVAL));
-                 tsp = &ts;
-         }
- 
-         if (setp == NULL)
-                 ksetp = NULL;
-         else {
-                 if (copyin(setp, &set, sizeof (set)))
-                         return (set_errno(EFAULT));
-                 sigutok(&set, &kset);
-                 ksetp = &kset;
-         }
- 
-         return (poll_common(fds, nfds, tsp, ksetp));
- }
- 
  /*
   * Clean up any state left around by poll(2). Called when a thread exits.
   */
  void
  pollcleanup()
--- 724,744 ----
                                  reventcnt++;
                          }
                  }
                  ASSERT(fdcnt == reventcnt);
          } else {
+                 int i;
+ 
                  for (i = 0; i < nfds; i++) {
                          ASSERT(pollfdp[i].revents == 0);
                  }
          }
  #endif  /* DEBUG */
  
          return (fdcnt);
  }
  
  /*
   * Clean up any state left around by poll(2). Called when a thread exits.
   */
  void
  pollcleanup()