1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2016 Joyent, Inc.
  14  */
  15 
  16 /*
  17  * Support for the signalfd facility, a Linux-borne facility for
  18  * file descriptor-based synchronous signal consumption.
  19  *
  20  * As described on the signalfd(3C) man page, the general idea behind these
  21  * file descriptors is that they can be used to synchronously consume signals
  22  * via the read(2) syscall.  While that capability already exists with the
  23  * sigwaitinfo(3C) function, signalfd holds an advantage since it is file
 * descriptor based: It is able to use the event facilities (poll(2), /dev/poll,
  25  * event ports) to notify interested parties when consumable signals arrive.
  26  *
 * The signalfd lifecycle begins when a process opens /dev/signalfd.  A minor
  28  * will be allocated for them along with an associated signalfd_state_t struct.
  29  * It is there where the mask of desired signals resides.
  30  *
  31  * Reading from the signalfd is straightforward and mimics the kernel behavior
  32  * for sigtimedwait().  Signals continue to live on either the proc's p_sig, or
  33  * thread's t_sig, member.  During a read operation, those which match the mask
  34  * are consumed so they are no longer pending.
  35  *
  36  * The poll side is more complex.  Every time a signal is delivered, all of the
  37  * signalfds on the process need to be examined in order to pollwake threads
  38  * waiting for signal arrival.
  39  *
  40  * When a thread polling on a signalfd requires a pollhead, several steps must
  41  * be taken to safely ensure the proper result.  A sigfd_proc_state_t is
  42  * created for the calling process if it does not yet exist.  It is there where
  43  * a list of sigfd_poll_waiter_t structures reside which associate pollheads to
  44  * signalfd_state_t entries.  The sigfd_proc_state_t list is walked to find a
  45  * sigfd_poll_waiter_t matching the signalfd_state_t which corresponds to the
  46  * polled resource.  If one is found, it is reused.  Otherwise a new one is
  47  * created, incrementing the refcount on the signalfd_state_t, and it is added
  48  * to the sigfd_poll_waiter_t list.
  49  *
  50  * The complications imposed by fork(2) are why the pollhead is stored in the
  51  * associated sigfd_poll_waiter_t instead of directly in the signalfd_state_t.
  52  * More than one process can hold a reference to the signalfd at a time but
  53  * arriving signals should wake only process-local pollers.  Additionally,
  54  * signalfd_close is called only when the last referencing fd is closed, hiding
 * occurrences of preceding threads which released their references.  This
  56  * necessitates reference counting on the signalfd_state_t so it is able to
  57  * persist after close until all poll references have been cleansed.  Doing so
  58  * ensures that blocked pollers which hold references to the signalfd_state_t
  59  * will be able to do clean-up after the descriptor itself has been closed.
  60  *
  61  * When a signal arrives in a process polling on signalfd, signalfd_pollwake_cb
  62  * is called via the pointer in sigfd_proc_state_t.  It will walk over the
  63  * sigfd_poll_waiter_t entries present in the list, searching for any
  64  * associated with a signalfd_state_t with a matching signal mask.  The
  65  * approach of keeping the poller list in p_sigfd was chosen because a process
  66  * is likely to use few signalfds relative to its total file descriptors.  It
  67  * reduces the work required for each received signal.
  68  *
  69  * When matching sigfd_poll_waiter_t entries are encountered in the poller list
  70  * during signalfd_pollwake_cb, they are dispatched into signalfd_wakeq to
  71  * perform the pollwake.  This is due to a lock ordering conflict between
  72  * signalfd_poll and signalfd_pollwake_cb.  The former acquires
  73  * pollcache_t`pc_lock before proc_t`p_lock.  The latter (via sigtoproc)
 * reverses the order.  Deferring the pollwake into a taskq means it can be
  75  * performed without proc_t`p_lock held, avoiding the deadlock.
  76  *
  77  * The sigfd_list is self-cleaning; as signalfd_pollwake_cb is called, the list
  78  * will clear out on its own.  Any remaining per-process state which remains
  79  * will be cleaned up by the exit helper (signalfd_exit_helper).
  80  *
  81  * The structures associated with signalfd state are designed to operate
  82  * correctly across fork, but there is one caveat that applies.  Using
 * fork-shared signalfd descriptors in conjunction with fork-shared caching poll
  84  * descriptors (such as /dev/poll or event ports) will result in missed poll
  85  * wake-ups.  This is caused by the pollhead identity of signalfd descriptors
  86  * being dependent on the process they are polled from.  Because it has a
  87  * thread-local cache, poll(2) is unaffected by this limitation.
  88  *
  89  * Lock ordering:
  90  *
  91  * 1. signalfd_lock
  92  * 2. signalfd_state_t`sfd_lock
  93  *
  94  * 1. proc_t`p_lock (to walk p_sigfd)
  95  * 2. signalfd_state_t`sfd_lock
  96  * 2a. signalfd_lock (after sfd_lock is dropped, when sfd_count falls to 0)
  97  */
  98 
  99 #include <sys/ddi.h>
 100 #include <sys/sunddi.h>
 101 #include <sys/signalfd.h>
 102 #include <sys/conf.h>
 103 #include <sys/sysmacros.h>
 104 #include <sys/filio.h>
 105 #include <sys/stat.h>
 106 #include <sys/file.h>
 107 #include <sys/schedctl.h>
 108 #include <sys/id_space.h>
 109 #include <sys/sdt.h>
 110 #include <sys/brand.h>
 111 #include <sys/disp.h>
 112 #include <sys/taskq_impl.h>
 113 
typedef struct signalfd_state signalfd_state_t;

/*
 * Per-open-descriptor state.  Reference counted (sfd_count) so the structure
 * can persist after signalfd_close until all poll waiters holding references
 * have released them.
 */
struct signalfd_state {
	list_node_t	sfd_list;		/* node in global list */
	kmutex_t	sfd_lock;		/* protects fields below */
	uint_t		sfd_count;		/* ref count */
	boolean_t	sfd_valid;		/* valid while open */
	k_sigset_t	sfd_set;		/* signals for this fd */
};
 123 
/*
 * Associates a pollhead with a signalfd_state_t for one polling process.
 * Entries live on the process's sigfd_proc_state_t list and hold a reference
 * on spw_state.
 */
typedef struct sigfd_poll_waiter {
	list_node_t		spw_list;	/* node in proc's sigfd_list */
	signalfd_state_t	*spw_state;	/* referenced signalfd state */
	pollhead_t		spw_pollhd;	/* per-process pollhead */
	taskq_ent_t		spw_taskent;	/* entry for deferred pollwake */
	short			spw_pollev;	/* events for deferred pollwake */
} sigfd_poll_waiter_t;
 131 
/*
 * Protects global state in signalfd_devi, signalfd_minor, signalfd_softstate,
 * and signalfd_state (including sfd_list field of members).  The wake taskq
 * (signalfd_wakeq) is created at attach time and destroyed at detach.
 */
static kmutex_t		signalfd_lock;
static dev_info_t	*signalfd_devi;		/* device info */
static id_space_t	*signalfd_minor;	/* minor number arena */
static void		*signalfd_softstate;	/* softstate pointer */
static list_t		signalfd_state;		/* global list of state */
static taskq_t		*signalfd_wakeq;	/* pollwake event taskq */
 142 
 143 
 144 static void
 145 signalfd_state_enter_locked(signalfd_state_t *state)
 146 {
 147         ASSERT(MUTEX_HELD(&state->sfd_lock));
 148         ASSERT(state->sfd_count > 0);
 149         VERIFY(state->sfd_valid == B_TRUE);
 150 
 151         state->sfd_count++;
 152 }
 153 
/*
 * Release one hold on a signalfd_state_t, first invalidating it if
 * force_invalidate is set (the close path).  Dropping the final hold
 * unlinks the state from the global list and frees it.
 */
static void
signalfd_state_release(signalfd_state_t *state, boolean_t force_invalidate)
{
	mutex_enter(&state->sfd_lock);

	if (force_invalidate) {
		state->sfd_valid = B_FALSE;
	}

	ASSERT(state->sfd_count > 0);
	if (state->sfd_count == 1) {
		/*
		 * Final reference: the state must have been invalidated (via
		 * close) before it can be torn down.
		 */
		VERIFY(state->sfd_valid == B_FALSE);
		mutex_exit(&state->sfd_lock);
		if (force_invalidate) {
			/*
			 * The invalidation performed in signalfd_close is done
			 * while signalfd_lock is held.
			 */
			ASSERT(MUTEX_HELD(&signalfd_lock));
			list_remove(&signalfd_state, state);
		} else {
			/*
			 * Per the documented lock order, signalfd_lock may
			 * only be taken here after sfd_lock was dropped.
			 */
			ASSERT(MUTEX_NOT_HELD(&signalfd_lock));
			mutex_enter(&signalfd_lock);
			list_remove(&signalfd_state, state);
			mutex_exit(&signalfd_lock);
		}
		kmem_free(state, sizeof (*state));
		return;
	}
	state->sfd_count--;
	mutex_exit(&state->sfd_lock);
}
 186 
 187 static sigfd_poll_waiter_t *
 188 signalfd_wake_list_add(sigfd_proc_state_t *pstate, signalfd_state_t *state)
 189 {
 190         list_t *lst = &pstate->sigfd_list;
 191         sigfd_poll_waiter_t *pw;
 192 
 193         for (pw = list_head(lst); pw != NULL; pw = list_next(lst, pw)) {
 194                 if (pw->spw_state == state)
 195                         break;
 196         }
 197 
 198         if (pw == NULL) {
 199                 pw = kmem_zalloc(sizeof (*pw), KM_SLEEP);
 200 
 201                 mutex_enter(&state->sfd_lock);
 202                 signalfd_state_enter_locked(state);
 203                 pw->spw_state = state;
 204                 mutex_exit(&state->sfd_lock);
 205                 list_insert_head(lst, pw);
 206         }
 207         return (pw);
 208 }
 209 
 210 static sigfd_poll_waiter_t *
 211 signalfd_wake_list_rm(sigfd_proc_state_t *pstate, signalfd_state_t *state)
 212 {
 213         list_t *lst = &pstate->sigfd_list;
 214         sigfd_poll_waiter_t *pw;
 215 
 216         for (pw = list_head(lst); pw != NULL; pw = list_next(lst, pw)) {
 217                 if (pw->spw_state == state) {
 218                         break;
 219                 }
 220         }
 221 
 222         if (pw != NULL) {
 223                 list_remove(lst, pw);
 224                 pw->spw_state = NULL;
 225                 signalfd_state_release(state, B_FALSE);
 226         }
 227 
 228         return (pw);
 229 }
 230 
/*
 * Tear down all per-process signalfd poll state for 'p': each outstanding
 * waiter is woken with POLLERR, its hold on the associated signalfd_state_t
 * is dropped, and the sigfd_proc_state_t itself is freed.  Caller must hold
 * p->p_lock.
 */
static void
signalfd_wake_list_cleanup(proc_t *p)
{
	sigfd_proc_state_t *pstate = p->p_sigfd;
	sigfd_poll_waiter_t *pw;
	list_t *lst;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(pstate != NULL);

	lst = &pstate->sigfd_list;
	while ((pw = list_remove_head(lst)) != NULL) {
		signalfd_state_t *state = pw->spw_state;

		pw->spw_state = NULL;
		signalfd_state_release(state, B_FALSE);

		/* Notify any blocked pollers that the resource is gone. */
		pollwakeup(&pw->spw_pollhd, POLLERR);
		pollhead_clean(&pw->spw_pollhd);
		kmem_free(pw, sizeof (*pw));
	}
	list_destroy(lst);

	p->p_sigfd = NULL;
	kmem_free(pstate, sizeof (*pstate));
}
 257 
 258 static void
 259 signalfd_exit_helper(void)
 260 {
 261         proc_t *p = curproc;
 262 
 263         mutex_enter(&p->p_lock);
 264         signalfd_wake_list_cleanup(p);
 265         mutex_exit(&p->p_lock);
 266 }
 267 
 268 /*
 269  * Perform pollwake for a sigfd_poll_waiter_t entry.
 270  * Thanks to the strict and conflicting lock orders required for signalfd_poll
 271  * (pc_lock before p_lock) and signalfd_pollwake_cb (p_lock before pc_lock),
 272  * this is relegated to a taskq to avoid deadlock.
 273  */
static void
signalfd_wake_task(void *arg)
{
	sigfd_poll_waiter_t *pw = arg;
	signalfd_state_t *state = pw->spw_state;

	/* Drop the hold this waiter had on the signalfd state. */
	pw->spw_state = NULL;
	signalfd_state_release(state, B_FALSE);
	/*
	 * Deliver the event recorded by signalfd_pollwake_cb, then free the
	 * waiter (it was already unlinked from the per-process list).
	 */
	pollwakeup(&pw->spw_pollhd, pw->spw_pollev);
	pollhead_clean(&pw->spw_pollhd);
	kmem_free(pw, sizeof (*pw));
}
 286 
 287 /*
 288  * Called every time a signal is delivered to the process so that we can
 289  * see if any signal stream needs a pollwakeup. We maintain a list of
 290  * signal state elements so that we don't have to look at every file descriptor
 291  * on the process. If necessary, a further optimization would be to maintain a
 292  * signal set mask that is a union of all of the sets in the list so that
 293  * we don't even traverse the list if the signal is not in one of the elements.
 294  * However, since the list is likely to be very short, this is not currently
 295  * being done. A more complex data structure might also be used, but it is
 296  * unclear what that would be since each signal set needs to be checked for a
 297  * match.
 298  */
static void
signalfd_pollwake_cb(void *arg0, int sig)
{
	proc_t *p = (proc_t *)arg0;
	sigfd_proc_state_t *pstate = (sigfd_proc_state_t *)p->p_sigfd;
	list_t *lst;
	sigfd_poll_waiter_t *pw;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(pstate != NULL);

	lst = &pstate->sigfd_list;
	pw = list_head(lst);
	while (pw != NULL) {
		signalfd_state_t *state = pw->spw_state;
		sigfd_poll_waiter_t *next;

		mutex_enter(&state->sfd_lock);
		if (!state->sfd_valid) {
			/* Underlying descriptor was closed: report an error. */
			pw->spw_pollev = POLLERR;
		} else if (sigismember(&state->sfd_set, sig)) {
			/* Signal is in this fd's mask: it is now readable. */
			pw->spw_pollev = POLLRDNORM | POLLIN;
		} else {
			/* Not of interest to this waiter; leave it in place. */
			mutex_exit(&state->sfd_lock);
			pw = list_next(lst, pw);
			continue;
		}
		mutex_exit(&state->sfd_lock);

		/*
		 * Pull the sigfd_poll_waiter_t out of the list and dispatch it
		 * to perform a pollwake.  This cannot be done synchronously
		 * since signalfd_poll and signalfd_pollwake_cb have
		 * conflicting lock orders which can deadlock.
		 */
		next = list_next(lst, pw);
		list_remove(lst, pw);
		taskq_dispatch_ent(signalfd_wakeq, signalfd_wake_task, pw, 0,
		    &pw->spw_taskent);
		pw = next;
	}
}
 341 
_NOTE(ARGSUSED(1))
static int
signalfd_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
{
	signalfd_state_t *state, **sstate;
	major_t major = getemajor(*devp);
	minor_t minor = getminor(*devp);

	/* Opens are only permitted through the well-known control minor. */
	if (minor != SIGNALFDMNRN_SIGNALFD)
		return (ENXIO);

	mutex_enter(&signalfd_lock);

	/* Allocate a private minor (and its soft state) for this open. */
	minor = (minor_t)id_allocff(signalfd_minor);
	if (ddi_soft_state_zalloc(signalfd_softstate, minor) != DDI_SUCCESS) {
		id_free(signalfd_minor, minor);
		mutex_exit(&signalfd_lock);
		return (ENODEV);
	}

	/* Initial reference is held by the open descriptor itself. */
	state = kmem_zalloc(sizeof (*state), KM_SLEEP);
	state->sfd_valid = B_TRUE;
	state->sfd_count = 1;
	list_insert_head(&signalfd_state, (void *)state);

	/* Clone the caller's dev_t onto the newly allocated minor. */
	sstate = ddi_get_soft_state(signalfd_softstate, minor);
	*sstate = state;
	*devp = makedevice(major, minor);

	mutex_exit(&signalfd_lock);

	return (0);
}
 375 
 376 /*
 377  * Consume one signal from our set in a manner similar to sigtimedwait().
 378  * The block parameter is used to control whether we wait for a signal or
 379  * return immediately if no signal is pending. We use the thread's t_sigwait
 380  * member in the same way that it is used by sigtimedwait.
 381  *
 382  * Return 0 if we successfully consumed a signal or an errno if not.
 383  */
static int
consume_signal(k_sigset_t set, uio_t *uio, boolean_t block)
{
	k_sigset_t oldmask;
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	timespec_t now;
	timespec_t *rqtp = NULL;	/* null means blocking */
	int timecheck = 0;
	int ret = 0;
	k_siginfo_t info, *infop;
	signalfd_siginfo_t ssi, *ssp = &ssi;

	if (block == B_FALSE) {
		timecheck = timechanged;
		gethrestime(&now);
		rqtp = &now;	/* non-blocking check for pending signals */
	}

	t->t_sigwait = set;

	mutex_enter(&p->p_lock);
	/*
	 * set the thread's signal mask to unmask those signals in the
	 * specified set.
	 */
	schedctl_finish_sigblock(t);
	oldmask = t->t_hold;
	sigdiffset(&t->t_hold, &t->t_sigwait);

	/*
	 * Based on rqtp, wait indefinitely until we take a signal in our set
	 * or return immediately if there are no signals pending from our set.
	 */
	while ((ret = cv_waituntil_sig(&t->t_delay_cv, &p->p_lock, rqtp,
	    timecheck)) > 0)
		continue;

	/* Restore thread's signal mask to its previous value. */
	t->t_hold = oldmask;
	t->t_sig_check = 1;  /* so post_syscall sees new t_hold mask */

	if (ret == -1) {
		/* no signals pending */
		mutex_exit(&p->p_lock);
		sigemptyset(&t->t_sigwait);
		return (EAGAIN);	/* no signals pending */
	}

	/* Don't bother with signal if it is not in request set. */
	if (lwp->lwp_cursig == 0 ||
	    !sigismember(&t->t_sigwait, lwp->lwp_cursig)) {
		mutex_exit(&p->p_lock);
		/*
		 * lwp_cursig is zero if pokelwps() awakened cv_wait_sig().
		 * This happens if some other thread in this process called
		 * forkall() or exit().
		 */
		sigemptyset(&t->t_sigwait);
		return (EINTR);
	}

	/* Use queued siginfo when available, else synthesize a minimal one. */
	if (lwp->lwp_curinfo) {
		infop = &lwp->lwp_curinfo->sq_info;
	} else {
		infop = &info;
		bzero(infop, sizeof (info));
		infop->si_signo = lwp->lwp_cursig;
		infop->si_code = SI_NOINFO;
	}

	lwp->lwp_ru.nsignals++;

	/* The signal is consumed here: clear it from the lwp. */
	DTRACE_PROC2(signal__clear, int, ret, ksiginfo_t *, infop);
	lwp->lwp_cursig = 0;
	lwp->lwp_extsig = 0;
	mutex_exit(&p->p_lock);

	/* Give a branded process a chance to rewrite the siginfo. */
	if (PROC_IS_BRANDED(p) && BROP(p)->b_sigfd_translate)
		BROP(p)->b_sigfd_translate(infop);

	/* Convert k_siginfo into external, datamodel independent, struct. */
	bzero(ssp, sizeof (*ssp));
	ssp->ssi_signo = infop->si_signo;
	ssp->ssi_errno = infop->si_errno;
	ssp->ssi_code = infop->si_code;
	ssp->ssi_pid = infop->si_pid;
	ssp->ssi_uid = infop->si_uid;
	ssp->ssi_fd = infop->si_fd;
	ssp->ssi_band = infop->si_band;
	ssp->ssi_trapno = infop->si_trapno;
	ssp->ssi_status = infop->si_status;
	ssp->ssi_utime = infop->si_utime;
	ssp->ssi_stime = infop->si_stime;
	ssp->ssi_addr = (uint64_t)(intptr_t)infop->si_addr;

	/* Copy the record out; 0 on success, else an errno from uiomove. */
	ret = uiomove(ssp, sizeof (*ssp), UIO_READ, uio);

	if (lwp->lwp_curinfo) {
		siginfofree(lwp->lwp_curinfo);
		lwp->lwp_curinfo = NULL;
	}
	sigemptyset(&t->t_sigwait);
	return (ret);
}
 490 
 491 /*
 492  * This is similar to sigtimedwait. Based on the fd mode we may wait until a
 493  * signal within our specified set is posted. We consume as many available
 494  * signals within our set as we can.
 495  */
_NOTE(ARGSUSED(2))
static int
signalfd_read(dev_t dev, uio_t *uio, cred_t *cr)
{
	signalfd_state_t *state, **sstate;
	minor_t minor = getminor(dev);
	boolean_t block = B_TRUE;
	k_sigset_t set;
	boolean_t got_one = B_FALSE;
	int res;

	/* The buffer must fit at least one signalfd_siginfo_t record. */
	if (uio->uio_resid < sizeof (signalfd_siginfo_t))
		return (EINVAL);

	sstate = ddi_get_soft_state(signalfd_softstate, minor);
	state = *sstate;

	if (uio->uio_fmode & (FNDELAY|FNONBLOCK))
		block = B_FALSE;

	/* Snapshot the mask; it can be changed via SIGNALFDIOC_MASK. */
	mutex_enter(&state->sfd_lock);
	set = state->sfd_set;
	mutex_exit(&state->sfd_lock);

	if (sigisempty(&set))
		return (set_errno(EINVAL));

	do  {
		res = consume_signal(set, uio, block);

		if (res == 0) {
			/*
			 * After consuming one signal, do not block while
			 * trying to consume more.
			 */
			got_one = B_TRUE;
			block = B_FALSE;

			/*
			 * Refresh the matching signal set in case it was
			 * updated during the wait.
			 */
			mutex_enter(&state->sfd_lock);
			set = state->sfd_set;
			mutex_exit(&state->sfd_lock);
			if (sigisempty(&set))
				break;
		}
	} while (res == 0 && uio->uio_resid >= sizeof (signalfd_siginfo_t));

	/* A later failure still counts as success if one signal was read. */
	if (got_one)
		res = 0;

	return (res);
}
 551 
/*
 * Non-zero iff any signal in 'set' is pending at the process (p_sig) or
 * thread (t_sig) level.
 *
 * If ksigset_t's were a single word, we would do:
 *      return (((p->p_sig | t->t_sig) & set) & fillset);
 */
static int
signalfd_sig_pending(proc_t *p, kthread_t *t, k_sigset_t set)
{
	/* Only the last word is masked (FILLSET2), mirroring fillset. */
	return (((p->p_sig.__sigbits[0] | t->t_sig.__sigbits[0]) &
	    set.__sigbits[0]) |
	    ((p->p_sig.__sigbits[1] | t->t_sig.__sigbits[1]) &
	    set.__sigbits[1]) |
	    (((p->p_sig.__sigbits[2] | t->t_sig.__sigbits[2]) &
	    set.__sigbits[2]) & FILLSET2));
}
 566 
_NOTE(ARGSUSED(4))
static int
signalfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	signalfd_state_t *state, **sstate;
	minor_t minor = getminor(dev);
	kthread_t *t = curthread;
	proc_t *p = ttoproc(t);
	short revents = 0;

	sstate = ddi_get_soft_state(signalfd_softstate, minor);
	state = *sstate;

	mutex_enter(&state->sfd_lock);

	/* Readable when a signal in this fd's mask is already pending. */
	if (signalfd_sig_pending(p, t, state->sfd_set) != 0)
		revents |= POLLRDNORM | POLLIN;

	mutex_exit(&state->sfd_lock);

	if (!(*reventsp = revents & events) && !anyyet) {
		sigfd_proc_state_t *pstate;
		sigfd_poll_waiter_t *pw;

		/*
		 * Enable pollwakeup handling.
		 */
		mutex_enter(&p->p_lock);
		if ((pstate = (sigfd_proc_state_t *)p->p_sigfd) == NULL) {

			/* Drop p_lock across the sleeping allocation. */
			mutex_exit(&p->p_lock);
			pstate = kmem_zalloc(sizeof (*pstate), KM_SLEEP);
			list_create(&pstate->sigfd_list,
			    sizeof (sigfd_poll_waiter_t),
			    offsetof(sigfd_poll_waiter_t, spw_list));
			pstate->sigfd_pollwake_cb = signalfd_pollwake_cb;

			/* Check again, after blocking for the alloc. */
			mutex_enter(&p->p_lock);
			if (p->p_sigfd == NULL) {
				p->p_sigfd = pstate;
			} else {
				/* someone beat us to it */
				list_destroy(&pstate->sigfd_list);
				kmem_free(pstate, sizeof (*pstate));
				pstate = p->p_sigfd;
			}
		}

		/* Find or create this process's waiter for the state. */
		pw = signalfd_wake_list_add(pstate, state);
		*phpp = &pw->spw_pollhd;
		mutex_exit(&p->p_lock);
	}

	return (0);
}
 624 
 625 _NOTE(ARGSUSED(4))
 626 static int
 627 signalfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
 628 {
 629         signalfd_state_t *state, **sstate;
 630         minor_t minor = getminor(dev);
 631         sigset_t mask;
 632 
 633         sstate = ddi_get_soft_state(signalfd_softstate, minor);
 634         state = *sstate;
 635 
 636         switch (cmd) {
 637         case SIGNALFDIOC_MASK:
 638                 if (ddi_copyin((caddr_t)arg, (caddr_t)&mask, sizeof (sigset_t),
 639                     md) != 0)
 640                         return (set_errno(EFAULT));
 641 
 642                 mutex_enter(&state->sfd_lock);
 643                 sigutok(&mask, &state->sfd_set);
 644                 mutex_exit(&state->sfd_lock);
 645 
 646                 return (0);
 647 
 648         default:
 649                 break;
 650         }
 651 
 652         return (ENOTTY);
 653 }
 654 
_NOTE(ARGSUSED(1))
static int
signalfd_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
{
	signalfd_state_t *state, **sstate;
	sigfd_poll_waiter_t *pw = NULL;
	minor_t minor = getminor(dev);
	proc_t *p = curproc;

	sstate = ddi_get_soft_state(signalfd_softstate, minor);
	state = *sstate;

	/* Make sure state is removed from this proc's pollwake list. */
	mutex_enter(&p->p_lock);
	if (p->p_sigfd != NULL) {
		sigfd_proc_state_t *pstate = p->p_sigfd;

		pw = signalfd_wake_list_rm(pstate, state);
		/* Drop the per-process state when its last waiter is gone. */
		if (list_is_empty(&pstate->sigfd_list)) {
			signalfd_wake_list_cleanup(p);
		}
	}
	mutex_exit(&p->p_lock);

	/* Wake any poller on the removed waiter, outside of p_lock. */
	if (pw != NULL) {
		pollwakeup(&pw->spw_pollhd, POLLERR);
		pollhead_clean(&pw->spw_pollhd);
		kmem_free(pw, sizeof (*pw));
	}

	mutex_enter(&signalfd_lock);

	/* Retire the minor and drop the descriptor's hold on the state. */
	*sstate = NULL;
	ddi_soft_state_free(signalfd_softstate, minor);
	id_free(signalfd_minor, minor);

	signalfd_state_release(state, B_TRUE);

	mutex_exit(&signalfd_lock);

	return (0);
}
 697 
/*
 * Driver attach: set up the minor id space, softstate, control minor node,
 * global state list, wake taskq, and the process-exit helper hook.  Each
 * failure path unwinds the resources created before it.
 */
static int
signalfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	if (cmd != DDI_ATTACH || signalfd_devi != NULL)
		return (DDI_FAILURE);

	mutex_enter(&signalfd_lock);

	signalfd_minor = id_space_create("signalfd_minor", 1, L_MAXMIN32 + 1);
	if (signalfd_minor == NULL) {
		cmn_err(CE_WARN, "signalfd couldn't create id space");
		mutex_exit(&signalfd_lock);
		return (DDI_FAILURE);
	}

	if (ddi_soft_state_init(&signalfd_softstate,
	    sizeof (signalfd_state_t *), 0) != 0) {
		cmn_err(CE_WARN, "signalfd failed to create soft state");
		id_space_destroy(signalfd_minor);
		mutex_exit(&signalfd_lock);
		return (DDI_FAILURE);
	}

	if (ddi_create_minor_node(devi, "signalfd", S_IFCHR,
	    SIGNALFDMNRN_SIGNALFD, DDI_PSEUDO, NULL) == DDI_FAILURE) {
		cmn_err(CE_NOTE, "/dev/signalfd couldn't create minor node");
		ddi_soft_state_fini(&signalfd_softstate);
		id_space_destroy(signalfd_minor);
		mutex_exit(&signalfd_lock);
		return (DDI_FAILURE);
	}

	ddi_report_dev(devi);
	signalfd_devi = devi;

	/* Hook process exit so lingering poll state is cleaned up. */
	sigfd_exit_helper = signalfd_exit_helper;

	list_create(&signalfd_state, sizeof (signalfd_state_t),
	    offsetof(signalfd_state_t, sfd_list));

	/* Single-threaded taskq used to defer pollwakeups (see comments). */
	signalfd_wakeq = taskq_create("signalfd_wake", 1, minclsyspri,
	    0, INT_MAX, TASKQ_PREPOPULATE);

	mutex_exit(&signalfd_lock);

	return (DDI_SUCCESS);
}
 745 
_NOTE(ARGSUSED(0))
static int
signalfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	switch (cmd) {
	case DDI_DETACH:
		break;

	default:
		return (DDI_FAILURE);
	}

	mutex_enter(&signalfd_lock);

	if (!list_is_empty(&signalfd_state)) {
		/*
		 * There are dangling poll waiters holding signalfd_state_t
		 * entries on the global list.  Detach is not possible until
		 * they purge themselves.
		 */
		mutex_exit(&signalfd_lock);
		return (DDI_FAILURE);
	}
	list_destroy(&signalfd_state);

	/*
	 * With no remaining entries in the signalfd_state list, the wake taskq
	 * should be empty with no possibility for new entries.
	 */
	taskq_destroy(signalfd_wakeq);

	id_space_destroy(signalfd_minor);

	/* Undo the remaining attach-time setup. */
	ddi_remove_minor_node(signalfd_devi, NULL);
	signalfd_devi = NULL;
	sigfd_exit_helper = NULL;

	ddi_soft_state_fini(&signalfd_softstate);
	mutex_exit(&signalfd_lock);

	return (DDI_SUCCESS);
}
 788 
 789 _NOTE(ARGSUSED(0))
 790 static int
 791 signalfd_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
 792 {
 793         int error;
 794 
 795         switch (infocmd) {
 796         case DDI_INFO_DEVT2DEVINFO:
 797                 *result = (void *)signalfd_devi;
 798                 error = DDI_SUCCESS;
 799                 break;
 800         case DDI_INFO_DEVT2INSTANCE:
 801                 *result = (void *)0;
 802                 error = DDI_SUCCESS;
 803                 break;
 804         default:
 805                 error = DDI_FAILURE;
 806         }
 807         return (error);
 808 }
 809 
/* Character-device entry points for /dev/signalfd. */
static struct cb_ops signalfd_cb_ops = {
	signalfd_open,		/* open */
	signalfd_close,		/* close */
	nulldev,		/* strategy */
	nulldev,		/* print */
	nodev,			/* dump */
	signalfd_read,		/* read */
	nodev,			/* write */
	signalfd_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	signalfd_poll,		/* poll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* streamtab  */
	D_NEW | D_MP		/* Driver compatibility flag */
};
 827 
/* Device operations vector for the signalfd pseudo-driver. */
static struct dev_ops signalfd_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	signalfd_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	signalfd_attach,	/* attach */
	signalfd_detach,	/* detach */
	nodev,			/* reset */
	&signalfd_cb_ops,	/* driver operations */
	NULL,			/* bus operations */
	nodev,			/* dev power */
	ddi_quiesce_not_needed,	/* quiesce */
};
 842 
/* Module linkage: register this pseudo-driver with the module framework. */
static struct modldrv modldrv = {
	&mod_driverops,		/* module type (this is a pseudo driver) */
	"signalfd support",	/* name of module */
	&signalfd_ops,		/* driver ops */
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};
 854 
/* Module load entry point. */
int
_init(void)
{
	return (mod_install(&modlinkage));
}
 860 
/* Module information entry point. */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
 866 
/* Module unload entry point. */
int
_fini(void)
{
	return (mod_remove(&modlinkage));
}