/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2017 Joyent, Inc.
 */

/*
 * Support for the signalfd facility, a Linux-borne facility for
 * file descriptor-based synchronous signal consumption.
 *
 * As described on the signalfd(3C) man page, the general idea behind these
 * file descriptors is that they can be used to synchronously consume signals
 * via the read(2) syscall.  While that capability already exists with the
 * sigwaitinfo(3C) function, signalfd holds an advantage since it is file
 * descriptor based: It is able to use the event facilities (poll(2),
 * /dev/poll, event ports) to notify interested parties when consumable
 * signals arrive.
 *
 * The signalfd lifecycle begins when a process opens /dev/signalfd.  A minor
 * number is allocated for it along with an associated signalfd_state_t
 * struct, which is where the mask of desired signals resides.
 *
 * Reading from the signalfd is straightforward and mimics the kernel behavior
 * for sigtimedwait().  Signals continue to live on either the proc's p_sig or
 * the thread's t_sig member.  During a read operation, those which match the
 * mask are consumed so they are no longer pending.
 *
 * The poll side is more complex.  Every time a signal is delivered, all of the
 * signalfds on the process need to be examined in order to pollwake threads
 * waiting for signal arrival.
 *
 * When a thread polling on a signalfd requires a pollhead, several steps must
 * be taken to safely ensure the proper result.  A sigfd_proc_state_t is
 * created for the calling process if it does not yet exist.  It holds a list
 * of sigfd_poll_waiter_t structures, each of which associates a pollhead with
 * a signalfd_state_t entry.  That list is walked to find a
 * sigfd_poll_waiter_t matching the signalfd_state_t which corresponds to the
 * polled resource.  If one is found, it is reused.  Otherwise a new one is
 * created, incrementing the refcount on the signalfd_state_t, and it is added
 * to the list.
 *
 * The complications imposed by fork(2) are why the pollhead is stored in the
 * associated sigfd_poll_waiter_t instead of directly in the signalfd_state_t.
 * More than one process can hold a reference to the signalfd at a time but
 * arriving signals should wake only process-local pollers.  Additionally,
 * signalfd_close is called only when the last referencing fd is closed, so
 * earlier releases by other referencing threads are not visible to it.  This
 * necessitates reference counting on the signalfd_state_t so it is able to
 * persist after close until all poll references have been cleansed.  Doing so
 * ensures that blocked pollers which hold references to the signalfd_state_t
 * will be able to do clean-up after the descriptor itself has been closed.
 *
 * When a signal arrives in a process polling on signalfd, signalfd_pollwake_cb
 * is called via the pointer in sigfd_proc_state_t.  It will walk over the
 * sigfd_poll_waiter_t entries present in the list, searching for any
 * associated with a signalfd_state_t with a matching signal mask.  The
 * approach of keeping the poller list in p_sigfd was chosen because a process
 * is likely to use few signalfds relative to its total file descriptors.  It
 * reduces the work required for each received signal.
 *
 * When matching sigfd_poll_waiter_t entries are encountered in the poller list
 * during signalfd_pollwake_cb, they are dispatched into signalfd_wakeq to
 * perform the pollwake.  This is due to a lock ordering conflict between
 * signalfd_poll and signalfd_pollwake_cb.  The former acquires
 * pollcache_t`pc_lock before proc_t`p_lock.  The latter (via sigtoproc)
 * reverses the order.  Deferring the pollwake into a taskq means it can be
 * performed without proc_t`p_lock held, avoiding the deadlock.
 *
 * The sigfd_list is self-cleaning; as signalfd_pollwake_cb is called, the list
 * will clear out on its own.  Any remaining per-process state will be cleaned
 * up by the exit helper (signalfd_exit_helper).
 *
 * The structures associated with signalfd state are designed to operate
 * correctly across fork, but there is one caveat that applies.  Using
 * fork-shared signalfd descriptors in conjunction with fork-shared caching
 * poll descriptors (such as /dev/poll or event ports) will result in missed
 * poll wake-ups.  This is caused by the pollhead identity of signalfd
 * descriptors being dependent on the process they are polled from.  Because
 * it has a thread-local cache, poll(2) is unaffected by this limitation.
 *
 * Lock ordering:
 *
 * 1. signalfd_lock
 * 2. signalfd_state_t`sfd_lock
 *
 * 1. proc_t`p_lock (to walk p_sigfd)
 * 2. signalfd_state_t`sfd_lock
 * 2a. signalfd_lock (after sfd_lock is dropped, when sfd_count falls to 0)
 */
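
/*
 * As a concrete illustration of the lifecycle above, a minimal userland
 * sketch might look like the following.  It assumes direct use of the device
 * node and the SIGNALFDIOC_MASK ioctl handled below; in practice a consumer
 * would normally reach this driver through the signalfd(3C) routine, which
 * wraps these steps.
 *
 *	sigset_t mask;
 *	signalfd_siginfo_t ssi;
 *	int fd;
 *
 *	sigemptyset(&mask);
 *	sigaddset(&mask, SIGINT);
 *	// Block normal delivery so the signal stays pending for the fd.
 *	sigprocmask(SIG_BLOCK, &mask, NULL);
 *
 *	fd = open("/dev/signalfd", O_RDONLY);
 *	ioctl(fd, SIGNALFDIOC_MASK, &mask);
 *
 *	// Blocks until a signal in the mask is pending, then consumes it.
 *	read(fd, &ssi, sizeof (ssi));
 */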

#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/signalfd.h>
#include <sys/conf.h>
#include <sys/sysmacros.h>
#include <sys/filio.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <sys/schedctl.h>
#include <sys/id_space.h>
#include <sys/sdt.h>
#include <sys/disp.h>
#include <sys/taskq_impl.h>

typedef struct signalfd_state signalfd_state_t;

struct signalfd_state {
        list_node_t     sfd_list;               /* node in global list */
        kmutex_t        sfd_lock;               /* protects fields below */
        uint_t          sfd_count;              /* ref count */
        boolean_t       sfd_valid;              /* valid while open */
        k_sigset_t      sfd_set;                /* signals for this fd */
};

typedef struct sigfd_poll_waiter {
        list_node_t             spw_list;
        signalfd_state_t        *spw_state;
        pollhead_t              spw_pollhd;
        taskq_ent_t             spw_taskent;
        short                   spw_pollev;
} sigfd_poll_waiter_t;
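
/*
 * A note on sigfd_poll_waiter_t lifetime: entries are created (and their hold
 * on the signalfd_state_t taken) in signalfd_wake_list_add on behalf of
 * signalfd_poll.  They are destroyed in one of three places: in
 * signalfd_wake_task once a dispatched pollwake completes, in signalfd_close
 * when the descriptor is closed, or in signalfd_wake_list_cleanup at process
 * exit.  In each case the hold on the state is released first.
 */
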
 130 
 131 /*
 132  * Protects global state in signalfd_devi, signalfd_minor, signalfd_softstate,
 133  * and signalfd_state (including sfd_list field of members)
 134  */
 135 static kmutex_t         signalfd_lock;
 136 static dev_info_t       *signalfd_devi;         /* device info */
 137 static id_space_t       *signalfd_minor;        /* minor number arena */
 138 static void             *signalfd_softstate;    /* softstate pointer */
 139 static list_t           signalfd_state;         /* global list of state */
 140 static taskq_t          *signalfd_wakeq;        /* pollwake event taskq */
 141 
 142 
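/*
 * Take an additional hold on a signalfd_state_t.  The state must still be
 * valid (not yet invalidated by close) and sfd_lock must be held by the
 * caller.
 */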
static void
signalfd_state_enter_locked(signalfd_state_t *state)
{
        ASSERT(MUTEX_HELD(&state->sfd_lock));
        ASSERT(state->sfd_count > 0);
        VERIFY(state->sfd_valid == B_TRUE);

        state->sfd_count++;
}

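/*
 * Release a hold on a signalfd_state_t.  When the last hold is dropped, the
 * state is unlinked from the global signalfd_state list and freed.  Only the
 * close path passes force_invalidate (with signalfd_lock already held),
 * clearing sfd_valid so that lingering poll waiters see POLLERR rather than
 * a stale descriptor.
 */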
static void
signalfd_state_release(signalfd_state_t *state, boolean_t force_invalidate)
{
        mutex_enter(&state->sfd_lock);

        if (force_invalidate) {
                state->sfd_valid = B_FALSE;
        }

        ASSERT(state->sfd_count > 0);
        if (state->sfd_count == 1) {
                VERIFY(state->sfd_valid == B_FALSE);
                mutex_exit(&state->sfd_lock);
                if (force_invalidate) {
                        /*
                         * The invalidation performed in signalfd_close is done
                         * while signalfd_lock is held.
                         */
                        ASSERT(MUTEX_HELD(&signalfd_lock));
                        list_remove(&signalfd_state, state);
                } else {
                        ASSERT(MUTEX_NOT_HELD(&signalfd_lock));
                        mutex_enter(&signalfd_lock);
                        list_remove(&signalfd_state, state);
                        mutex_exit(&signalfd_lock);
                }
                kmem_free(state, sizeof (*state));
                return;
        }
        state->sfd_count--;
        mutex_exit(&state->sfd_lock);
}

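/*
 * Find the sigfd_poll_waiter_t associating this process with the given
 * signalfd_state_t, creating one (and taking a hold on the state) if it does
 * not already exist.  Called from signalfd_poll with p_lock held.
 */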
static sigfd_poll_waiter_t *
signalfd_wake_list_add(sigfd_proc_state_t *pstate, signalfd_state_t *state)
{
        list_t *lst = &pstate->sigfd_list;
        sigfd_poll_waiter_t *pw;

        for (pw = list_head(lst); pw != NULL; pw = list_next(lst, pw)) {
                if (pw->spw_state == state)
                        break;
        }

        if (pw == NULL) {
                pw = kmem_zalloc(sizeof (*pw), KM_SLEEP);

                mutex_enter(&state->sfd_lock);
                signalfd_state_enter_locked(state);
                pw->spw_state = state;
                mutex_exit(&state->sfd_lock);
                list_insert_head(lst, pw);
        }
        return (pw);
}

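/*
 * Remove the sigfd_poll_waiter_t (if any) pairing this process with the given
 * signalfd_state_t, dropping its hold on the state.  The entry is returned to
 * the caller, which is responsible for issuing any final pollwakeup and
 * freeing it.
 */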
static sigfd_poll_waiter_t *
signalfd_wake_list_rm(sigfd_proc_state_t *pstate, signalfd_state_t *state)
{
        list_t *lst = &pstate->sigfd_list;
        sigfd_poll_waiter_t *pw;

        for (pw = list_head(lst); pw != NULL; pw = list_next(lst, pw)) {
                if (pw->spw_state == state) {
                        break;
                }
        }

        if (pw != NULL) {
                list_remove(lst, pw);
                pw->spw_state = NULL;
                signalfd_state_release(state, B_FALSE);
        }

        return (pw);
}

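/*
 * Tear down all of the per-process signalfd poll state, waking (with POLLERR)
 * any pollers still blocked on the associated pollheads.  Called with p_lock
 * held, either from signalfd_close once the last waiter is removed or from
 * signalfd_exit_helper at process exit.
 */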
static void
signalfd_wake_list_cleanup(proc_t *p)
{
        sigfd_proc_state_t *pstate = p->p_sigfd;
        sigfd_poll_waiter_t *pw;
        list_t *lst;

        ASSERT(MUTEX_HELD(&p->p_lock));
        ASSERT(pstate != NULL);

        lst = &pstate->sigfd_list;
        while ((pw = list_remove_head(lst)) != NULL) {
                signalfd_state_t *state = pw->spw_state;

                pw->spw_state = NULL;
                signalfd_state_release(state, B_FALSE);

                pollwakeup(&pw->spw_pollhd, POLLERR);
                pollhead_clean(&pw->spw_pollhd);
                kmem_free(pw, sizeof (*pw));
        }
        list_destroy(lst);

        p->p_sigfd = NULL;
        kmem_free(pstate, sizeof (*pstate));
}

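/*
 * Installed as the sigfd_exit_helper hook during attach; invoked when a
 * process with signalfd poll state exits, so that state does not outlive it.
 */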
static void
signalfd_exit_helper(void)
{
        proc_t *p = curproc;

        mutex_enter(&p->p_lock);
        signalfd_wake_list_cleanup(p);
        mutex_exit(&p->p_lock);
}

/*
 * Perform pollwake for a sigfd_poll_waiter_t entry.
 * Thanks to the strict and conflicting lock orders required for signalfd_poll
 * (pc_lock before p_lock) and signalfd_pollwake_cb (p_lock before pc_lock),
 * this is relegated to a taskq to avoid deadlock.
 */
static void
signalfd_wake_task(void *arg)
{
        sigfd_poll_waiter_t *pw = arg;
        signalfd_state_t *state = pw->spw_state;

        pw->spw_state = NULL;
        signalfd_state_release(state, B_FALSE);
        pollwakeup(&pw->spw_pollhd, pw->spw_pollev);
        pollhead_clean(&pw->spw_pollhd);
        kmem_free(pw, sizeof (*pw));
}

/*
 * Called every time a signal is delivered to the process so that we can
 * see if any signal stream needs a pollwakeup.  We maintain a list of
 * signal state elements so that we don't have to look at every file descriptor
 * on the process.  If necessary, a further optimization would be to maintain a
 * signal set mask that is a union of all of the sets in the list so that
 * we don't even traverse the list if the signal is not in one of the elements.
 * However, since the list is likely to be very short, this is not currently
 * being done.  A more complex data structure might also be used, but it is
 * unclear what that would be since each signal set needs to be checked for a
 * match.
 */
static void
signalfd_pollwake_cb(void *arg0, int sig)
{
        proc_t *p = (proc_t *)arg0;
        sigfd_proc_state_t *pstate = (sigfd_proc_state_t *)p->p_sigfd;
        list_t *lst;
        sigfd_poll_waiter_t *pw;

        ASSERT(MUTEX_HELD(&p->p_lock));
        ASSERT(pstate != NULL);

        lst = &pstate->sigfd_list;
        pw = list_head(lst);
        while (pw != NULL) {
                signalfd_state_t *state = pw->spw_state;
                sigfd_poll_waiter_t *next;

                mutex_enter(&state->sfd_lock);
                if (!state->sfd_valid) {
                        pw->spw_pollev = POLLERR;
                } else if (sigismember(&state->sfd_set, sig)) {
                        pw->spw_pollev = POLLRDNORM | POLLIN;
                } else {
                        mutex_exit(&state->sfd_lock);
                        pw = list_next(lst, pw);
                        continue;
                }
                mutex_exit(&state->sfd_lock);

                /*
                 * Pull the sigfd_poll_waiter_t out of the list and dispatch it
                 * to perform a pollwake.  This cannot be done synchronously
                 * since signalfd_poll and signalfd_pollwake_cb have
                 * conflicting lock orders which can deadlock.
                 */
                next = list_next(lst, pw);
                list_remove(lst, pw);
                taskq_dispatch_ent(signalfd_wakeq, signalfd_wake_task, pw, 0,
                    &pw->spw_taskent);
                pw = next;
        }
}

_NOTE(ARGSUSED(1))
static int
signalfd_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
{
        signalfd_state_t *state, **sstate;
        major_t major = getemajor(*devp);
        minor_t minor = getminor(*devp);

        if (minor != SIGNALFDMNRN_SIGNALFD)
                return (ENXIO);

        mutex_enter(&signalfd_lock);

        minor = (minor_t)id_allocff(signalfd_minor);
        if (ddi_soft_state_zalloc(signalfd_softstate, minor) != DDI_SUCCESS) {
                id_free(signalfd_minor, minor);
                mutex_exit(&signalfd_lock);
                return (ENODEV);
        }

        state = kmem_zalloc(sizeof (*state), KM_SLEEP);
        state->sfd_valid = B_TRUE;
        state->sfd_count = 1;
        list_insert_head(&signalfd_state, (void *)state);

        sstate = ddi_get_soft_state(signalfd_softstate, minor);
        *sstate = state;
        *devp = makedevice(major, minor);

        mutex_exit(&signalfd_lock);

        return (0);
}

/*
 * Consume one signal from our set in a manner similar to sigtimedwait().
 * The block parameter is used to control whether we wait for a signal or
 * return immediately if no signal is pending.  We use the thread's t_sigwait
 * member in the same way that it is used by sigtimedwait.
 *
 * Return 0 if we successfully consumed a signal or an errno if not.
 */
static int
consume_signal(k_sigset_t set, uio_t *uio, boolean_t block)
{
        k_sigset_t oldmask;
        kthread_t *t = curthread;
        klwp_t *lwp = ttolwp(t);
        proc_t *p = ttoproc(t);
        timespec_t now;
        timespec_t *rqtp = NULL;        /* null means blocking */
        int timecheck = 0;
        int ret = 0;
        k_siginfo_t info, *infop;
        signalfd_siginfo_t ssi, *ssp = &ssi;

        if (block == B_FALSE) {
                timecheck = timechanged;
                gethrestime(&now);
                rqtp = &now;        /* non-blocking check for pending signals */
        }

        t->t_sigwait = set;

        mutex_enter(&p->p_lock);
        /*
         * Set the thread's signal mask to unmask those signals in the
         * specified set.
         */
        schedctl_finish_sigblock(t);
        oldmask = t->t_hold;
        sigdiffset(&t->t_hold, &t->t_sigwait);

        /*
         * Based on rqtp, wait indefinitely until we take a signal in our set
         * or return immediately if there are no signals pending from our set.
         */
        while ((ret = cv_waituntil_sig(&t->t_delay_cv, &p->p_lock, rqtp,
            timecheck)) > 0)
                continue;

        /* Restore thread's signal mask to its previous value. */
        t->t_hold = oldmask;
        t->t_sig_check = 1;  /* so post_syscall sees new t_hold mask */

        if (ret == -1) {
                /* no signals pending */
                mutex_exit(&p->p_lock);
                sigemptyset(&t->t_sigwait);
                return (EAGAIN);
        }

        /* Don't bother with signal if it is not in request set. */
        if (lwp->lwp_cursig == 0 ||
            !sigismember(&t->t_sigwait, lwp->lwp_cursig)) {
                mutex_exit(&p->p_lock);
                /*
                 * lwp_cursig is zero if pokelwps() awakened cv_wait_sig().
                 * This happens if some other thread in this process called
                 * forkall() or exit().
                 */
                sigemptyset(&t->t_sigwait);
                return (EINTR);
        }

        if (lwp->lwp_curinfo) {
                infop = &lwp->lwp_curinfo->sq_info;
        } else {
                infop = &info;
                bzero(infop, sizeof (info));
                infop->si_signo = lwp->lwp_cursig;
                infop->si_code = SI_NOINFO;
        }

        lwp->lwp_ru.nsignals++;

        DTRACE_PROC2(signal__clear, int, ret, ksiginfo_t *, infop);
        lwp->lwp_cursig = 0;
        lwp->lwp_extsig = 0;
        mutex_exit(&p->p_lock);

        /* Convert k_siginfo into external, datamodel independent, struct. */
        bzero(ssp, sizeof (*ssp));
        ssp->ssi_signo = infop->si_signo;
        ssp->ssi_errno = infop->si_errno;
        ssp->ssi_code = infop->si_code;
        ssp->ssi_pid = infop->si_pid;
        ssp->ssi_uid = infop->si_uid;
        ssp->ssi_fd = infop->si_fd;
        ssp->ssi_band = infop->si_band;
        ssp->ssi_trapno = infop->si_trapno;
        ssp->ssi_status = infop->si_status;
        ssp->ssi_utime = infop->si_utime;
        ssp->ssi_stime = infop->si_stime;
        ssp->ssi_addr = (uint64_t)(intptr_t)infop->si_addr;

        ret = uiomove(ssp, sizeof (*ssp), UIO_READ, uio);

        if (lwp->lwp_curinfo) {
                siginfofree(lwp->lwp_curinfo);
                lwp->lwp_curinfo = NULL;
        }
        sigemptyset(&t->t_sigwait);
        return (ret);
}

/*
 * This is similar to sigtimedwait.  Based on the fd mode we may wait until a
 * signal within our specified set is posted.  We consume as many available
 * signals within our set as we can.
 */
_NOTE(ARGSUSED(2))
static int
signalfd_read(dev_t dev, uio_t *uio, cred_t *cr)
{
        signalfd_state_t *state, **sstate;
        minor_t minor = getminor(dev);
        boolean_t block = B_TRUE;
        k_sigset_t set;
        boolean_t got_one = B_FALSE;
        int res;

        if (uio->uio_resid < sizeof (signalfd_siginfo_t))
                return (EINVAL);

        sstate = ddi_get_soft_state(signalfd_softstate, minor);
        state = *sstate;

        if (uio->uio_fmode & (FNDELAY|FNONBLOCK))
                block = B_FALSE;

        mutex_enter(&state->sfd_lock);
        set = state->sfd_set;
        mutex_exit(&state->sfd_lock);

        if (sigisempty(&set))
                return (set_errno(EINVAL));

        do {
                res = consume_signal(set, uio, block);

                if (res == 0) {
                        /*
                         * After consuming one signal, do not block while
                         * trying to consume more.
                         */
                        got_one = B_TRUE;
                        block = B_FALSE;

                        /*
                         * Refresh the matching signal set in case it was
                         * updated during the wait.
                         */
                        mutex_enter(&state->sfd_lock);
                        set = state->sfd_set;
                        mutex_exit(&state->sfd_lock);
                        if (sigisempty(&set))
                                break;
                }
        } while (res == 0 && uio->uio_resid >= sizeof (signalfd_siginfo_t));

        if (got_one)
                res = 0;

        return (res);
}

/*
 * If k_sigset_t's were a single word, we would do:
 *      return (((p->p_sig | t->t_sig) & set) & fillset);
 */
static int
signalfd_sig_pending(proc_t *p, kthread_t *t, k_sigset_t set)
{
        return (((p->p_sig.__sigbits[0] | t->t_sig.__sigbits[0]) &
            set.__sigbits[0]) |
            ((p->p_sig.__sigbits[1] | t->t_sig.__sigbits[1]) &
            set.__sigbits[1]) |
            (((p->p_sig.__sigbits[2] | t->t_sig.__sigbits[2]) &
            set.__sigbits[2]) & FILLSET2));
}

static int
signalfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
        signalfd_state_t *state, **sstate;
        minor_t minor = getminor(dev);
        kthread_t *t = curthread;
        proc_t *p = ttoproc(t);
        short revents = 0;

        sstate = ddi_get_soft_state(signalfd_softstate, minor);
        state = *sstate;

        mutex_enter(&state->sfd_lock);

        if (signalfd_sig_pending(p, t, state->sfd_set) != 0)
                revents |= POLLRDNORM | POLLIN;

        mutex_exit(&state->sfd_lock);

        *reventsp = revents & events;
        if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
                sigfd_proc_state_t *pstate;
                sigfd_poll_waiter_t *pw;

                /*
                 * Enable pollwakeup handling.
                 */
                mutex_enter(&p->p_lock);
                if ((pstate = (sigfd_proc_state_t *)p->p_sigfd) == NULL) {
                        mutex_exit(&p->p_lock);
                        pstate = kmem_zalloc(sizeof (*pstate), KM_SLEEP);
                        list_create(&pstate->sigfd_list,
                            sizeof (sigfd_poll_waiter_t),
                            offsetof(sigfd_poll_waiter_t, spw_list));
                        pstate->sigfd_pollwake_cb = signalfd_pollwake_cb;

                        /* Check again, after blocking for the alloc. */
                        mutex_enter(&p->p_lock);
                        if (p->p_sigfd == NULL) {
                                p->p_sigfd = pstate;
                        } else {
                                /* someone beat us to it */
                                list_destroy(&pstate->sigfd_list);
                                kmem_free(pstate, sizeof (*pstate));
                                pstate = p->p_sigfd;
                        }
                }

                pw = signalfd_wake_list_add(pstate, state);
                *phpp = &pw->spw_pollhd;
                mutex_exit(&p->p_lock);
        }

        return (0);
}

_NOTE(ARGSUSED(4))
static int
signalfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
{
        signalfd_state_t *state, **sstate;
        minor_t minor = getminor(dev);
        sigset_t mask;

        sstate = ddi_get_soft_state(signalfd_softstate, minor);
        state = *sstate;

        switch (cmd) {
        case SIGNALFDIOC_MASK:
                if (ddi_copyin((caddr_t)arg, (caddr_t)&mask, sizeof (sigset_t),
                    md) != 0)
                        return (set_errno(EFAULT));

                mutex_enter(&state->sfd_lock);
                sigutok(&mask, &state->sfd_set);
                mutex_exit(&state->sfd_lock);

                return (0);

        default:
                break;
        }

        return (ENOTTY);
}

_NOTE(ARGSUSED(1))
static int
signalfd_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
{
        signalfd_state_t *state, **sstate;
        sigfd_poll_waiter_t *pw = NULL;
        minor_t minor = getminor(dev);
        proc_t *p = curproc;

        sstate = ddi_get_soft_state(signalfd_softstate, minor);
        state = *sstate;

        /* Make sure state is removed from this proc's pollwake list. */
        mutex_enter(&p->p_lock);
        if (p->p_sigfd != NULL) {
                sigfd_proc_state_t *pstate = p->p_sigfd;

                pw = signalfd_wake_list_rm(pstate, state);
                if (list_is_empty(&pstate->sigfd_list)) {
                        signalfd_wake_list_cleanup(p);
                }
        }
        mutex_exit(&p->p_lock);

        if (pw != NULL) {
                pollwakeup(&pw->spw_pollhd, POLLERR);
                pollhead_clean(&pw->spw_pollhd);
                kmem_free(pw, sizeof (*pw));
        }

        mutex_enter(&signalfd_lock);

        *sstate = NULL;
        ddi_soft_state_free(signalfd_softstate, minor);
        id_free(signalfd_minor, minor);

        signalfd_state_release(state, B_TRUE);

        mutex_exit(&signalfd_lock);

        return (0);
}

static int
signalfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
        if (cmd != DDI_ATTACH || signalfd_devi != NULL)
                return (DDI_FAILURE);

        mutex_enter(&signalfd_lock);

        signalfd_minor = id_space_create("signalfd_minor", 1, L_MAXMIN32 + 1);
        if (signalfd_minor == NULL) {
                cmn_err(CE_WARN, "signalfd couldn't create id space");
                mutex_exit(&signalfd_lock);
                return (DDI_FAILURE);
        }

        if (ddi_soft_state_init(&signalfd_softstate,
            sizeof (signalfd_state_t *), 0) != 0) {
                cmn_err(CE_WARN, "signalfd failed to create soft state");
                id_space_destroy(signalfd_minor);
                mutex_exit(&signalfd_lock);
                return (DDI_FAILURE);
        }

        if (ddi_create_minor_node(devi, "signalfd", S_IFCHR,
            SIGNALFDMNRN_SIGNALFD, DDI_PSEUDO, NULL) == DDI_FAILURE) {
                cmn_err(CE_NOTE, "/dev/signalfd couldn't create minor node");
                ddi_soft_state_fini(&signalfd_softstate);
                id_space_destroy(signalfd_minor);
                mutex_exit(&signalfd_lock);
                return (DDI_FAILURE);
        }

        ddi_report_dev(devi);
        signalfd_devi = devi;

        sigfd_exit_helper = signalfd_exit_helper;

        list_create(&signalfd_state, sizeof (signalfd_state_t),
            offsetof(signalfd_state_t, sfd_list));

        signalfd_wakeq = taskq_create("signalfd_wake", 1, minclsyspri,
            0, INT_MAX, TASKQ_PREPOPULATE);

        mutex_exit(&signalfd_lock);

        return (DDI_SUCCESS);
}

_NOTE(ARGSUSED(0))
static int
signalfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
        switch (cmd) {
        case DDI_DETACH:
                break;

        default:
                return (DDI_FAILURE);
        }

        mutex_enter(&signalfd_lock);

        if (!list_is_empty(&signalfd_state)) {
                /*
                 * There are dangling poll waiters holding signalfd_state_t
                 * entries on the global list.  Detach is not possible until
                 * they purge themselves.
                 */
                mutex_exit(&signalfd_lock);
                return (DDI_FAILURE);
        }
        list_destroy(&signalfd_state);

        /*
         * With no remaining entries in the signalfd_state list, the wake taskq
         * should be empty with no possibility for new entries.
         */
        taskq_destroy(signalfd_wakeq);

        id_space_destroy(signalfd_minor);

        ddi_remove_minor_node(signalfd_devi, NULL);
        signalfd_devi = NULL;
        sigfd_exit_helper = NULL;

        ddi_soft_state_fini(&signalfd_softstate);
        mutex_exit(&signalfd_lock);

        return (DDI_SUCCESS);
}

_NOTE(ARGSUSED(0))
static int
signalfd_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
        int error;

        switch (infocmd) {
        case DDI_INFO_DEVT2DEVINFO:
                *result = (void *)signalfd_devi;
                error = DDI_SUCCESS;
                break;
        case DDI_INFO_DEVT2INSTANCE:
                *result = (void *)0;
                error = DDI_SUCCESS;
                break;
        default:
                error = DDI_FAILURE;
        }
        return (error);
}

static struct cb_ops signalfd_cb_ops = {
        signalfd_open,          /* open */
        signalfd_close,         /* close */
        nulldev,                /* strategy */
        nulldev,                /* print */
        nodev,                  /* dump */
        signalfd_read,          /* read */
        nodev,                  /* write */
        signalfd_ioctl,         /* ioctl */
        nodev,                  /* devmap */
        nodev,                  /* mmap */
        nodev,                  /* segmap */
        signalfd_poll,          /* poll */
        ddi_prop_op,            /* cb_prop_op */
        0,                      /* streamtab */
        D_NEW | D_MP            /* Driver compatibility flag */
};

static struct dev_ops signalfd_ops = {
        DEVO_REV,               /* devo_rev */
        0,                      /* refcnt */
        signalfd_info,          /* get_dev_info */
        nulldev,                /* identify */
        nulldev,                /* probe */
        signalfd_attach,        /* attach */
        signalfd_detach,        /* detach */
        nodev,                  /* reset */
        &signalfd_cb_ops,       /* driver operations */
        NULL,                   /* bus operations */
        nodev,                  /* dev power */
        ddi_quiesce_not_needed, /* quiesce */
};

static struct modldrv modldrv = {
        &mod_driverops,         /* module type (this is a pseudo driver) */
        "signalfd support",     /* name of module */
        &signalfd_ops,          /* driver ops */
};

static struct modlinkage modlinkage = {
        MODREV_1,
        (void *)&modldrv,
        NULL
};

int
_init(void)
{
        return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
        return (mod_remove(&modlinkage));
}