/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2015 Joyent, Inc.  All rights reserved.
 * Copyright (c) 2015 The MathWorks, Inc.  All rights reserved.
 */

/*
 * Support for the inotify facility, a Linux-borne facility for asynchronous
 * notification of certain events on specified files or directories.  Our
 * implementation broadly leverages the file event monitoring facility, and
 * would actually be quite straightforward were it not for a very serious
 * blunder in the inotify interface:  in addition to allowing for one to be
 * notified on events on a particular file or directory, inotify also allows
 * for one to be notified on certain events on files _within_ a watched
 * directory -- even though those events have absolutely nothing to do with
 * the directory itself.  This leads to all sorts of madness because file
 * operations are (of course) not undertaken on paths but rather on open
 * files -- and the relationships between open files and the paths that resolve
 * to those files are neither static nor isomorphic.  We implement this
 * concept by having _child watches_ when directories are watched with events
 * in IN_CHILD_EVENTS.  We add child watches when a watch on a directory is
 * first added, and we modify those child watches dynamically as files are
 * created, deleted, moved into or moved out of the specified directory.  This
 * mechanism works well, absent hard links.  Hard links, unfortunately, break
 * this rather badly, and the user is warned that watches on directories that
 * have multiple directory entries referring to the same file may behave
 * unexpectedly.
 */
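
/*
 * For orientation, this is roughly how a consumer exercises the interface
 * that this driver emulates (a minimal sketch of the Linux-side API, which
 * is implemented on top of the ioctls that this driver provides; error
 * handling is elided):
 *
 *	#include <sys/inotify.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	void
 *	watch_dir(const char *dir)
 *	{
 *		char buf[4096];
 *		ssize_t len;
 *		int fd = inotify_init();
 *
 *		(void) inotify_add_watch(fd, dir, IN_CREATE | IN_DELETE);
 *
 *		while ((len = read(fd, buf, sizeof (buf))) > 0) {
 *			char *p = buf;
 *
 *			while (p < buf + len) {
 *				struct inotify_event *ev =
 *				    (struct inotify_event *)p;
 *
 *				if (ev->len != 0)
 *					(void) printf("0x%x on %s\n",
 *					    ev->mask, ev->name);
 *
 *				p += sizeof (*ev) + ev->len;
 *			}
 *		}
 *	}
 */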

#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/inotify.h>
#include <sys/fem.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vfs_opreg.h>
#include <sys/vmem.h>
#include <sys/avl.h>
#include <sys/sysmacros.h>
#include <sys/cyclic.h>
#include <sys/filio.h>

struct inotify_state;
struct inotify_kevent;

typedef struct inotify_watch inotify_watch_t;
typedef struct inotify_state inotify_state_t;
typedef struct inotify_kevent inotify_kevent_t;

struct inotify_watch {
        kmutex_t inw_lock;                      /* lock protecting ref count */
        int inw_refcnt;                         /* reference count */
        uint8_t inw_zombie:1;                   /* boolean: is zombie */
        uint8_t inw_fired:1;                    /* boolean: fired one-shot */
        uint8_t inw_active:1;                   /* boolean: watch is active */
        uint8_t inw_orphaned:1;                 /* boolean: orphaned */
        kcondvar_t inw_cv;                      /* condvar for zombifier */
        uint32_t inw_mask;                      /* mask of watch */
        int32_t inw_wd;                         /* watch descriptor */
        vnode_t *inw_vp;                        /* underlying vnode */
        inotify_watch_t *inw_parent;            /* parent, if a child */
        avl_node_t inw_byvp;                    /* watches by vnode */
        avl_node_t inw_bywd;                    /* watches by descriptor */
        avl_tree_t inw_children;                /* children, if a parent */
        char *inw_name;                         /* name, if a child */
        list_node_t inw_orphan;                 /* orphan list */
        cred_t *inw_cred;                       /* cred, if orphaned */
        inotify_state_t *inw_state;             /* corresponding state */
};

struct inotify_kevent {
        inotify_kevent_t *ine_next;             /* next event in queue */
        struct inotify_event ine_event;         /* event (variable size) */
};

#define INOTIFY_EVENT_LENGTH(ev) \
        (sizeof (inotify_kevent_t) + (ev)->ine_event.len)
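
/*
 * Each queued event is thus a single allocation:  the queue linkage and the
 * fixed part of struct inotify_event, followed directly by the name, if any.
 * inotify_watch_event() rounds any name up to a multiple of
 * sizeof (struct inotify_event).  A worked example, assuming the usual
 * 16-byte struct inotify_event:  an IN_CREATE event for "a.txt" has
 * len = roundup(strlen("a.txt") + 1, 16) = 16, so the record seen by
 * read(2) is 16 + 16 = 32 bytes; the kernel allocation is that plus the
 * ine_next linkage, which is exactly INOTIFY_EVENT_LENGTH().
 */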

struct inotify_state {
        kmutex_t ins_lock;                      /* lock protecting state */
        avl_tree_t ins_byvp;                    /* watches by vnode */
        avl_tree_t ins_bywd;                    /* watches by descriptor */
        vmem_t *ins_wds;                        /* watch identifier arena */
        int ins_maxwatches;                     /* maximum number of watches */
        int ins_maxevents;                      /* maximum number of events */
        int ins_nevents;                        /* current # of events */
        int32_t ins_size;                       /* total size of events */
        inotify_kevent_t *ins_head;             /* head of event queue */
        inotify_kevent_t *ins_tail;             /* tail of event queue */
        pollhead_t ins_pollhd;                  /* poll head */
        kcondvar_t ins_cv;                      /* condvar for reading */
        list_t ins_orphans;                     /* orphan list */
        ddi_periodic_t ins_cleaner;             /* cyclic for cleaning */
        inotify_watch_t *ins_zombies;           /* zombie watch list */
        cred_t *ins_cred;                       /* creator's credentials */
        inotify_state_t *ins_next;              /* next state on global list */
};

/*
 * Tunables (exported read-only in lx-branded zones via /proc).
 */
int     inotify_maxwatches = 8192;              /* max watches per instance */
int     inotify_maxevents = 16384;              /* max events */
int     inotify_maxinstances = 128;             /* max instances per user */

/*
 * Internal global variables.
 */
static kmutex_t         inotify_lock;           /* lock protecting state */
static dev_info_t       *inotify_devi;          /* device info */
static fem_t            *inotify_femp;          /* FEM pointer */
static vmem_t           *inotify_minor;         /* minor number arena */
static void             *inotify_softstate;     /* softstate pointer */
static inotify_state_t  *inotify_state;         /* global list of state */

static void inotify_watch_event(inotify_watch_t *, uint64_t, char *);
static void inotify_watch_insert(inotify_watch_t *, vnode_t *, char *);
static void inotify_watch_delete(inotify_watch_t *, uint32_t);
static void inotify_watch_remove(inotify_state_t *, inotify_watch_t *);

static int
inotify_fop_close(femarg_t *vf, int flag, int count, offset_t offset,
    cred_t *cr, caller_context_t *ct)
{
        inotify_watch_t *watch = vf->fa_fnode->fn_available;
        int rval;

        if ((rval = vnext_close(vf, flag, count, offset, cr, ct)) == 0) {
                inotify_watch_event(watch, flag & FWRITE ?
                    IN_CLOSE_WRITE : IN_CLOSE_NOWRITE, NULL);
        }

        return (rval);
}

static int
inotify_fop_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl,
    int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
    vsecattr_t *vsecp)
{
        inotify_watch_t *watch = vf->fa_fnode->fn_available;
        int rval;

        if ((rval = vnext_create(vf, name, vap, excl, mode,
            vpp, cr, flag, ct, vsecp)) == 0) {
                inotify_watch_insert(watch, *vpp, name);
                inotify_watch_event(watch, IN_CREATE, name);
        }

        return (rval);
}

static int
inotify_fop_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
        inotify_watch_t *watch = vf->fa_fnode->fn_available;
        int rval;

        if ((rval = vnext_link(vf, svp, tnm, cr, ct, flags)) == 0) {
                inotify_watch_insert(watch, svp, tnm);
                inotify_watch_event(watch, IN_CREATE, tnm);
        }

        return (rval);
}

static int
inotify_fop_mkdir(femarg_t *vf, char *name, vattr_t *vap, vnode_t **vpp,
    cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
        inotify_watch_t *watch = vf->fa_fnode->fn_available;
        int rval;

        if ((rval = vnext_mkdir(vf, name, vap, vpp, cr,
            ct, flags, vsecp)) == 0) {
                inotify_watch_insert(watch, *vpp, name);
                inotify_watch_event(watch, IN_CREATE | IN_ISDIR, name);
        }

        return (rval);
}

static int
inotify_fop_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct)
{
        inotify_watch_t *watch = vf->fa_fnode->fn_available;
        int rval;

        if ((rval = vnext_open(vf, mode, cr, ct)) == 0)
                inotify_watch_event(watch, IN_OPEN, NULL);

        return (rval);
}

static int
inotify_fop_read(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr,
    caller_context_t *ct)
{
        inotify_watch_t *watch = vf->fa_fnode->fn_available;
        int rval = vnext_read(vf, uiop, ioflag, cr, ct);
        inotify_watch_event(watch, IN_ACCESS, NULL);

        return (rval);
}

static int
inotify_fop_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
        inotify_watch_t *watch = vf->fa_fnode->fn_available;
        int rval = vnext_readdir(vf, uiop, cr, eofp, ct, flags);
        inotify_watch_event(watch, IN_ACCESS | IN_ISDIR, NULL);

        return (rval);
}

static int
inotify_fop_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct,
    int flags)
{
        inotify_watch_t *watch = vf->fa_fnode->fn_available;
        int rval;

        if ((rval = vnext_remove(vf, nm, cr, ct, flags)) == 0)
                inotify_watch_event(watch, IN_DELETE, nm);

        return (rval);
}

static int
inotify_fop_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr,
    caller_context_t *ct, int flags)
{
        inotify_watch_t *watch = vf->fa_fnode->fn_available;
        int rval;

        if ((rval = vnext_rmdir(vf, nm, cdir, cr, ct, flags)) == 0)
                inotify_watch_event(watch, IN_DELETE | IN_ISDIR, nm);

        return (rval);
}

static int
inotify_fop_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
        inotify_watch_t *watch = vf->fa_fnode->fn_available;
        int rval;

        if ((rval = vnext_setattr(vf, vap, flags, cr, ct)) == 0)
                inotify_watch_event(watch, IN_ATTRIB, NULL);

        return (rval);
}

static int
inotify_fop_write(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr,
    caller_context_t *ct)
{
        inotify_watch_t *watch = vf->fa_fnode->fn_available;
        int rval = vnext_write(vf, uiop, ioflag, cr, ct);
        inotify_watch_event(watch, IN_MODIFY, NULL);

        return (rval);
}

static int
inotify_fop_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *name,
    caller_context_t *ct)
{
        inotify_watch_t *watch = vf->fa_fnode->fn_available;

        switch (vnevent) {
        case VE_RENAME_SRC:
                inotify_watch_event(watch, IN_MOVE_SELF, NULL);
                inotify_watch_delete(watch, IN_MOVE_SELF);
                break;
        case VE_REMOVE:
                /*
                 * Linux will apparently fire an IN_ATTRIB event when the link
                 * count changes (including when it drops to 0 on a remove).
                 * This is merely somewhat odd; what is amazing is that this
                 * IN_ATTRIB event is not visible on an inotify watch on the
                 * parent directory.  (IN_ATTRIB events are normally sent to
                 * watches on the parent directory.)  While it's hard to
                 * believe that this constitutes desired semantics, ltp
                 * unfortunately tests this case (if implicitly); in the name
                 * of bug-for-bug compatibility, we fire IN_ATTRIB iff we are
                 * explicitly watching the file that has been removed.
                 */
                if (watch->inw_parent == NULL)
                        inotify_watch_event(watch, IN_ATTRIB, NULL);

                /*FALLTHROUGH*/
        case VE_RENAME_DEST:
                inotify_watch_event(watch, IN_DELETE_SELF, NULL);
                inotify_watch_delete(watch, IN_DELETE_SELF);
                break;
        case VE_RMDIR:
                /*
                 * It seems that IN_ISDIR should really be OR'd in here, but
                 * Linux doesn't seem to do that in this case; for the sake of
                 * bug-for-bug compatibility, we don't do it either.
                 */
                inotify_watch_event(watch, IN_DELETE_SELF, NULL);
                inotify_watch_delete(watch, IN_DELETE_SELF);
                break;
        case VE_CREATE:
        case VE_TRUNCATE:
        case VE_RESIZE:
                inotify_watch_event(watch, IN_MODIFY | IN_ATTRIB, NULL);
                break;
        case VE_LINK:
                inotify_watch_event(watch, IN_ATTRIB, NULL);
                break;
        case VE_RENAME_SRC_DIR:
                inotify_watch_event(watch, IN_MOVED_FROM, name);
                break;
        case VE_RENAME_DEST_DIR:
                if (name == NULL)
                        name = dvp->v_path;

                inotify_watch_insert(watch, dvp, name);
                inotify_watch_event(watch, IN_MOVED_TO, name);
                break;
        case VE_SUPPORT:
        case VE_MOUNTEDOVER:
        case VE_PRE_RENAME_SRC:
        case VE_PRE_RENAME_DEST:
        case VE_PRE_RENAME_DEST_DIR:
                break;
        }

        return (vnext_vnevent(vf, vnevent, dvp, name, ct));
}

const fs_operation_def_t inotify_vnodesrc_template[] = {
        VOPNAME_CLOSE,          { .femop_close = inotify_fop_close },
        VOPNAME_CREATE,         { .femop_create = inotify_fop_create },
        VOPNAME_LINK,           { .femop_link = inotify_fop_link },
        VOPNAME_MKDIR,          { .femop_mkdir = inotify_fop_mkdir },
        VOPNAME_OPEN,           { .femop_open = inotify_fop_open },
        VOPNAME_READ,           { .femop_read = inotify_fop_read },
        VOPNAME_READDIR,        { .femop_readdir = inotify_fop_readdir },
        VOPNAME_REMOVE,         { .femop_remove = inotify_fop_remove },
        VOPNAME_RMDIR,          { .femop_rmdir = inotify_fop_rmdir },
        VOPNAME_SETATTR,        { .femop_setattr = inotify_fop_setattr },
        VOPNAME_WRITE,          { .femop_write = inotify_fop_write },
        VOPNAME_VNEVENT,        { .femop_vnevent = inotify_fop_vnevent },
        NULL, NULL
};

static int
inotify_watch_cmpwd(inotify_watch_t *lhs, inotify_watch_t *rhs)
{
        if (lhs->inw_wd < rhs->inw_wd)
                return (-1);

        if (lhs->inw_wd > rhs->inw_wd)
                return (1);

        return (0);
}

static int
inotify_watch_cmpvp(inotify_watch_t *lhs, inotify_watch_t *rhs)
{
        uintptr_t lvp = (uintptr_t)lhs->inw_vp, rvp = (uintptr_t)rhs->inw_vp;

        if (lvp < rvp)
                return (-1);

        if (lvp > rvp)
                return (1);

        return (0);
}

static void
inotify_watch_hold(inotify_watch_t *watch)
{
        mutex_enter(&watch->inw_lock);
        VERIFY(watch->inw_refcnt > 0);
        watch->inw_refcnt++;
        mutex_exit(&watch->inw_lock);
}

static void
inotify_watch_release(inotify_watch_t *watch)
{
        mutex_enter(&watch->inw_lock);
        VERIFY(watch->inw_refcnt > 1);

        if (--watch->inw_refcnt == 1 && watch->inw_zombie) {
                /*
                 * We're down to our last reference; kick anyone that might be
                 * waiting.
                 */
                cv_signal(&watch->inw_cv);
        }

        mutex_exit(&watch->inw_lock);
}

static void
inotify_watch_event(inotify_watch_t *watch, uint64_t mask, char *name)
{
        inotify_kevent_t *event, *tail;
        inotify_state_t *state = watch->inw_state;
        uint32_t wd = watch->inw_wd, cookie = 0, len;
        boolean_t removal = mask & IN_REMOVAL ? B_TRUE : B_FALSE;
        inotify_watch_t *source = watch;

        if (!(mask &= watch->inw_mask) || mask == IN_ISDIR)
                return;

        if (watch->inw_parent != NULL) {
                /*
                 * This is an event on the child; if this isn't a valid child
                 * event, return.  Otherwise, we move our watch to be our
                 * parent (which we know is around because we have a hold on
                 * it) and continue.
                 */
                if (!(mask & IN_CHILD_EVENTS))
                        return;

                name = watch->inw_name;
                watch = watch->inw_parent;
                wd = watch->inw_wd;
        }

        if (!removal) {
                mutex_enter(&state->ins_lock);

                if (watch->inw_zombie ||
                    watch->inw_fired || !watch->inw_active) {
                        mutex_exit(&state->ins_lock);
                        return;
                }
        } else {
                if (!watch->inw_active)
                        return;

                VERIFY(MUTEX_HELD(&state->ins_lock));
        }

        /*
         * If this is an operation on a directory and it's a child event
         * (even if it's not on a child), we specify IN_ISDIR.
         */
        if (source->inw_vp->v_type == VDIR && (mask & IN_CHILD_EVENTS))
                mask |= IN_ISDIR;

        if (mask & (IN_MOVED_FROM | IN_MOVED_TO))
                cookie = (uint32_t)curthread->t_did;

        if (name != NULL) {
                len = strlen(name) + 1;
                len = roundup(len, sizeof (struct inotify_event));
        } else {
                len = 0;
        }

        if (state->ins_nevents >= state->ins_maxevents) {
                /*
                 * We're at our maximum number of events -- turn our event
                 * into an IN_Q_OVERFLOW event, which will be coalesced if
                 * it's already the tail event.
                 */
                mask = IN_Q_OVERFLOW;
                wd = (uint32_t)-1;
                cookie = 0;
                len = 0;
                name = NULL;
        }

        if ((tail = state->ins_tail) != NULL && tail->ine_event.wd == wd &&
            tail->ine_event.mask == mask && tail->ine_event.cookie == cookie &&
            ((tail->ine_event.len == 0 && len == 0) ||
            (name != NULL && tail->ine_event.len != 0 &&
            strcmp(tail->ine_event.name, name) == 0))) {
                /*
                 * This is an implicitly coalesced event; we're done.
                 */
                if (!removal)
                        mutex_exit(&state->ins_lock);
                return;
        }

        event = kmem_zalloc(sizeof (inotify_kevent_t) + len, KM_SLEEP);
        event->ine_event.wd = wd;
        event->ine_event.mask = (uint32_t)mask;
        event->ine_event.cookie = cookie;
        event->ine_event.len = len;

        if (name != NULL)
                (void) strcpy(event->ine_event.name, name);

        if (tail != NULL) {
                tail->ine_next = event;
        } else {
                VERIFY(state->ins_head == NULL);
                state->ins_head = event;
                cv_broadcast(&state->ins_cv);
        }

        state->ins_tail = event;
        state->ins_nevents++;
        state->ins_size += sizeof (event->ine_event) + len;

        if (removal)
                return;

        if ((watch->inw_mask & IN_ONESHOT) && !watch->inw_fired) {
                /*
                 * If this is a one-shot, we need to remove the watch.  (Note
                 * that this will recurse back into inotify_watch_event() to
                 * fire the IN_IGNORED event -- but with "removal" set.)
                 */
                watch->inw_fired = 1;
                inotify_watch_remove(state, watch);
        }

        mutex_exit(&state->ins_lock);
        pollwakeup(&state->ins_pollhd, POLLRDNORM | POLLIN);
}
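
/*
 * To make the coalescing above concrete:  two back-to-back write(2) calls on
 * a watched file induce two identical IN_MODIFY events, and because the
 * second matches the tail event in wd, mask, cookie and name, it is dropped
 * rather than enqueued.  The same check keeps a full queue from growing:
 * once an IN_Q_OVERFLOW event is at the tail, subsequent overflows coalesce
 * into it.
 */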

/*
 * Destroy a watch.  By the time we're in here, the watch must have exactly
 * one reference.
 */
static void
inotify_watch_destroy(inotify_watch_t *watch)
{
        VERIFY(MUTEX_HELD(&watch->inw_lock));

        if (watch->inw_name != NULL)
                kmem_free(watch->inw_name, strlen(watch->inw_name) + 1);

        kmem_free(watch, sizeof (inotify_watch_t));
}

/*
 * Zombify a watch.  By the time we come in here, it must be true that the
 * watch has already been fem_uninstall()'d -- the only reference should be
 * in the state's data structure.  If we can get away with freeing it, we'll
 * do that -- but if the reference count is greater than one due to an active
 * vnode operation, we'll put this watch on the zombie list on the state
 * structure.
 */
static void
inotify_watch_zombify(inotify_watch_t *watch)
{
        inotify_state_t *state = watch->inw_state;

        VERIFY(MUTEX_HELD(&state->ins_lock));
        VERIFY(!watch->inw_zombie);

        watch->inw_zombie = 1;

        if (watch->inw_parent != NULL) {
                inotify_watch_release(watch->inw_parent);
        } else {
                avl_remove(&state->ins_byvp, watch);
                avl_remove(&state->ins_bywd, watch);
                vmem_free(state->ins_wds, (void *)(uintptr_t)watch->inw_wd, 1);
                watch->inw_wd = -1;
        }

        mutex_enter(&watch->inw_lock);

        if (watch->inw_refcnt == 1) {
                /*
                 * There are no operations in flight and there is no way
                 * for anyone to discover this watch -- we can destroy it.
                 */
                inotify_watch_destroy(watch);
        } else {
                /*
                 * There are operations in flight; we will need to enqueue
                 * this for later destruction.
                 */
                watch->inw_parent = state->ins_zombies;
                state->ins_zombies = watch;
                mutex_exit(&watch->inw_lock);
        }
}

static inotify_watch_t *
inotify_watch_add(inotify_state_t *state, inotify_watch_t *parent,
    const char *name, vnode_t *vp, uint32_t mask)
{
        inotify_watch_t *watch;
        int err;

        VERIFY(MUTEX_HELD(&state->ins_lock));

        watch = kmem_zalloc(sizeof (inotify_watch_t), KM_SLEEP);

        watch->inw_vp = vp;
        watch->inw_mask = mask;
        watch->inw_state = state;
        watch->inw_refcnt = 1;

        if (parent == NULL) {
                watch->inw_wd = (int)(uintptr_t)vmem_alloc(state->ins_wds,
                    1, VM_BESTFIT | VM_SLEEP);
                avl_add(&state->ins_byvp, watch);
                avl_add(&state->ins_bywd, watch);

                avl_create(&watch->inw_children,
                    (int(*)(const void *, const void *))inotify_watch_cmpvp,
                    sizeof (inotify_watch_t),
                    offsetof(inotify_watch_t, inw_byvp));
        } else {
                VERIFY(name != NULL);
                inotify_watch_hold(parent);
                watch->inw_mask &= IN_CHILD_EVENTS;
                watch->inw_parent = parent;
                watch->inw_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
                /* strcpy() is safe, because strlen(name) bounds us. */
                (void) strcpy(watch->inw_name, name);

                avl_add(&parent->inw_children, watch);
        }

        /*
         * Add our monitor to the vnode.  We must not have the watch lock held
         * when we do this, as it will immediately hold our watch.
         */
        err = fem_install(vp, inotify_femp, watch, OPARGUNIQ,
            (void (*)(void *))inotify_watch_hold,
            (void (*)(void *))inotify_watch_release);

        VERIFY(err == 0);

        return (watch);
}

/*
 * Remove a (non-child) watch.  This is called from either synchronous context
 * via inotify_rm_watch() or monitor context via either a vnevent or a
 * one-shot.
 */
static void
inotify_watch_remove(inotify_state_t *state, inotify_watch_t *watch)
{
        inotify_watch_t *child;
        int err;

        VERIFY(MUTEX_HELD(&state->ins_lock));
        VERIFY(watch->inw_parent == NULL);

        err = fem_uninstall(watch->inw_vp, inotify_femp, watch);
        VERIFY(err == 0);

        /*
         * If we have children, we're going to remove them all and set them
         * all to be zombies.
         */
        while ((child = avl_first(&watch->inw_children)) != NULL) {
                VERIFY(child->inw_parent == watch);
                avl_remove(&watch->inw_children, child);

                err = fem_uninstall(child->inw_vp, inotify_femp, child);
                VERIFY(err == 0);

                /*
                 * If this child watch has been orphaned, remove it from the
                 * state's list of orphans.
                 */
                if (child->inw_orphaned) {
                        list_remove(&state->ins_orphans, child);
                        crfree(child->inw_cred);
                }

                VN_RELE(child->inw_vp);

                /*
                 * We're down (or should be down) to a single reference to
                 * this child watch; it's safe to zombify it.
                 */
                inotify_watch_zombify(child);
        }

        inotify_watch_event(watch, IN_IGNORED | IN_REMOVAL, NULL);
        VN_RELE(watch->inw_vp);

        /*
         * It's now safe to zombify the watch -- we know that the only
         * reference can come from operations in flight.
         */
        inotify_watch_zombify(watch);
}

/*
 * Delete a watch.  Should only be called from VOP context.
 */
static void
inotify_watch_delete(inotify_watch_t *watch, uint32_t event)
{
        inotify_state_t *state = watch->inw_state;
        inotify_watch_t cmp = { .inw_vp = watch->inw_vp }, *parent;
        int err;

        if (event != IN_DELETE_SELF && !(watch->inw_mask & IN_CHILD_EVENTS))
                return;

        mutex_enter(&state->ins_lock);

        if (watch->inw_zombie) {
                mutex_exit(&state->ins_lock);
                return;
        }

        if ((parent = watch->inw_parent) == NULL) {
                if (event == IN_DELETE_SELF) {
                        /*
                         * If we're here because we're being deleted and we
                         * are not a child watch, we need to delete the entire
                         * watch, children and all.
                         */
                        inotify_watch_remove(state, watch);
                }

                mutex_exit(&state->ins_lock);
                return;
        } else {
                if (event == IN_DELETE_SELF &&
                    !(parent->inw_mask & IN_EXCL_UNLINK)) {
                        /*
                         * This is a child watch for a file that is being
                         * removed and IN_EXCL_UNLINK has not been specified;
                         * indicate that it is orphaned and add it to the list
                         * of orphans.  (This list will be checked by the
                         * cleaning cyclic to determine when the watch has
                         * become the only hold on the vnode, at which point
                         * the watch can be zombified.)  Note that we check
                         * if the watch is orphaned before we orphan it:  hard
                         * links make it possible for VE_REMOVE to be called
                         * multiple times on the same vnode. (!)
                         */
                        if (!watch->inw_orphaned) {
                                watch->inw_orphaned = 1;
                                watch->inw_cred = CRED();
                                crhold(watch->inw_cred);
                                list_insert_head(&state->ins_orphans, watch);
                        }

                        mutex_exit(&state->ins_lock);
                        return;
                }

                if (watch->inw_orphaned) {
                        /*
                         * If we're here, a file was orphaned and then later
                         * moved -- which almost certainly means that hard
                         * links are on the scene.  We choose the orphan over
                         * the move because we don't want to spuriously
                         * drop events if we can avoid it.
                         */
                        crfree(watch->inw_cred);
                        list_remove(&state->ins_orphans, watch);
                }
        }

        if (avl_find(&parent->inw_children, &cmp, NULL) == NULL) {
                /*
                 * This watch has already been deleted from the parent.
                 */
                mutex_exit(&state->ins_lock);
                return;
        }

        avl_remove(&parent->inw_children, watch);
        err = fem_uninstall(watch->inw_vp, inotify_femp, watch);
        VERIFY(err == 0);

        VN_RELE(watch->inw_vp);

        /*
         * It's now safe to zombify the watch -- which won't actually delete
         * it as we know that the reference count is greater than 1.
         */
        inotify_watch_zombify(watch);
        mutex_exit(&state->ins_lock);
}

/*
 * Insert a new child watch.  Should only be called from VOP context when
 * a child is created in a watched directory.
 */
static void
inotify_watch_insert(inotify_watch_t *watch, vnode_t *vp, char *name)
{
        inotify_state_t *state = watch->inw_state;
        inotify_watch_t cmp = { .inw_vp = vp };

        if (!(watch->inw_mask & IN_CHILD_EVENTS))
                return;

        mutex_enter(&state->ins_lock);

        if (watch->inw_zombie || watch->inw_parent != NULL || vp == NULL) {
                mutex_exit(&state->ins_lock);
                return;
        }

        if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) {
                mutex_exit(&state->ins_lock);
                return;
        }

        VN_HOLD(vp);
        watch = inotify_watch_add(state, watch, name, vp, watch->inw_mask);
        VERIFY(watch != NULL);

        mutex_exit(&state->ins_lock);
}

static int
inotify_add_watch(inotify_state_t *state, vnode_t *vp, uint32_t mask,
    int32_t *wdp)
{
        inotify_watch_t *watch, cmp = { .inw_vp = vp };
        uint32_t set;

        set = (mask & (IN_ALL_EVENTS | IN_MODIFIERS)) | IN_UNMASKABLE;

        /*
         * Lookup our vnode to determine if we already have a watch on it.
         */
        mutex_enter(&state->ins_lock);

        if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) {
                /*
                 * We don't have this watch; allocate a new one, provided that
                 * we have fewer than our limit.
                 */
                if (avl_numnodes(&state->ins_bywd) >= state->ins_maxwatches) {
                        mutex_exit(&state->ins_lock);
                        return (ENOSPC);
                }

                VN_HOLD(vp);
                watch = inotify_watch_add(state, NULL, NULL, vp, set);
                *wdp = watch->inw_wd;
                mutex_exit(&state->ins_lock);

                return (0);
        }

        VERIFY(!watch->inw_zombie);

        if (!(mask & IN_MASK_ADD)) {
                /*
                 * Note that if we're resetting our event mask and we're
                 * transitioning from an event mask that includes child events
                 * to one that doesn't, there will be potentially some stale
                 * child watches.  This is basically fine:  they won't fire,
                 * and they will correctly be removed when the watch is
                 * removed.
                 */
                watch->inw_mask = 0;
        }

        watch->inw_mask |= set;

        *wdp = watch->inw_wd;

        mutex_exit(&state->ins_lock);

        return (0);
}
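
/*
 * Note the IN_MASK_ADD semantics above:  adding a watch for a vnode that is
 * already watched replaces the event mask unless IN_MASK_ADD is set, in
 * which case the masks are OR'd.  A sketch, in terms of the Linux-side API:
 *
 *	(void) inotify_add_watch(fd, "/tmp", IN_CREATE);
 *	(void) inotify_add_watch(fd, "/tmp", IN_DELETE | IN_MASK_ADD);
 *
 * leaves the (single) watch with IN_CREATE | IN_DELETE in its mask, and
 * both calls return the same watch descriptor.
 */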

static int
inotify_add_child(inotify_state_t *state, vnode_t *vp, char *name)
{
        inotify_watch_t *watch, cmp = { .inw_vp = vp };
        vnode_t *cvp;
        int err;

        /*
         * Verify that the specified child doesn't have a directory component
         * within it.
         */
        if (strchr(name, '/') != NULL)
                return (EINVAL);

        /*
         * Lookup the underlying file.  Note that this will succeed even if
         * we don't have permissions to actually read the file.
         */
        if ((err = lookupnameat(name,
            UIO_SYSSPACE, NO_FOLLOW, NULL, &cvp, vp)) != 0) {
                return (err);
        }

        /*
         * Use our vnode to find our watch, and then add our child watch to it.
         */
        mutex_enter(&state->ins_lock);

        if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) {
                /*
                 * This is unexpected -- it means that we don't have the
                 * watch that we thought we had.
                 */
                mutex_exit(&state->ins_lock);
                VN_RELE(cvp);
                return (ENXIO);
        }

        /*
         * Now lookup the child vnode in the watch; we'll only add it if it
         * isn't already there.
         */
        cmp.inw_vp = cvp;

        if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) {
                mutex_exit(&state->ins_lock);
                VN_RELE(cvp);
                return (0);
        }

        watch = inotify_watch_add(state, watch, name, cvp, watch->inw_mask);
        VERIFY(watch != NULL);
        mutex_exit(&state->ins_lock);

        return (0);
}

static int
inotify_rm_watch(inotify_state_t *state, int32_t wd)
{
        inotify_watch_t *watch, cmp = { .inw_wd = wd };

        mutex_enter(&state->ins_lock);

        if ((watch = avl_find(&state->ins_bywd, &cmp, NULL)) == NULL) {
                mutex_exit(&state->ins_lock);
                return (EINVAL);
        }

        inotify_watch_remove(state, watch);
        mutex_exit(&state->ins_lock);

        return (0);
}

static int
inotify_activate(inotify_state_t *state, int32_t wd)
{
        inotify_watch_t *watch, cmp = { .inw_wd = wd };

        mutex_enter(&state->ins_lock);

        if ((watch = avl_find(&state->ins_bywd, &cmp, NULL)) == NULL) {
                mutex_exit(&state->ins_lock);
                return (EINVAL);
        }

        watch->inw_active = 1;

        mutex_exit(&state->ins_lock);

        return (0);
}

/*
 * Called periodically as a cyclic to process the orphans and zombies.
 */
static void
inotify_clean(void *arg)
{
        inotify_state_t *state = arg;
        inotify_watch_t *watch, *parent, *next, **prev;
        cred_t *savecred;
        int err;

        mutex_enter(&state->ins_lock);

        for (watch = list_head(&state->ins_orphans);
            watch != NULL; watch = next) {
                next = list_next(&state->ins_orphans, watch);

                VERIFY(!watch->inw_zombie);
                VERIFY((parent = watch->inw_parent) != NULL);

                if (watch->inw_vp->v_count > 1)
                        continue;

                avl_remove(&parent->inw_children, watch);
                err = fem_uninstall(watch->inw_vp, inotify_femp, watch);
                VERIFY(err == 0);

                list_remove(&state->ins_orphans, watch);

                /*
                 * For purposes of releasing the vnode, we need to switch our
                 * cred to be the cred of the orphaning thread (which we held
                 * at the time this watch was orphaned).
                 */
                savecred = curthread->t_cred;
                curthread->t_cred = watch->inw_cred;
                VN_RELE(watch->inw_vp);
                crfree(watch->inw_cred);
                curthread->t_cred = savecred;

                inotify_watch_zombify(watch);
        }

        prev = &state->ins_zombies;

        while ((watch = *prev) != NULL) {
                mutex_enter(&watch->inw_lock);

                if (watch->inw_refcnt == 1) {
                        *prev = watch->inw_parent;
                        inotify_watch_destroy(watch);
                        continue;
                }

                prev = &watch->inw_parent;
                mutex_exit(&watch->inw_lock);
        }

        mutex_exit(&state->ins_lock);
}

/*ARGSUSED*/
static int
inotify_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
{
        inotify_state_t *state;
        major_t major = getemajor(*devp);
        minor_t minor = getminor(*devp);
        int instances = 0;
        char c[64];

        if (minor != INOTIFYMNRN_INOTIFY)
                return (ENXIO);

        mutex_enter(&inotify_lock);

        for (state = inotify_state; state != NULL; state = state->ins_next) {
                if (state->ins_cred == cred_p)
                        instances++;
        }

        if (instances >= inotify_maxinstances) {
                mutex_exit(&inotify_lock);
                return (EMFILE);
        }

        minor = (minor_t)(uintptr_t)vmem_alloc(inotify_minor, 1,
            VM_BESTFIT | VM_SLEEP);

        if (ddi_soft_state_zalloc(inotify_softstate, minor) != DDI_SUCCESS) {
                vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1);
                mutex_exit(&inotify_lock);
                return (ENOMEM);
        }

        state = ddi_get_soft_state(inotify_softstate, minor);
        *devp = makedevice(major, minor);

        crhold(cred_p);
        state->ins_cred = cred_p;
        state->ins_next = inotify_state;
        inotify_state = state;

        (void) snprintf(c, sizeof (c), "inotify_watchid_%d", minor);
        state->ins_wds = vmem_create(c, (void *)1, UINT32_MAX, 1,
            NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);

        avl_create(&state->ins_bywd,
            (int(*)(const void *, const void *))inotify_watch_cmpwd,
            sizeof (inotify_watch_t),
            offsetof(inotify_watch_t, inw_bywd));

        avl_create(&state->ins_byvp,
            (int(*)(const void *, const void *))inotify_watch_cmpvp,
            sizeof (inotify_watch_t),
            offsetof(inotify_watch_t, inw_byvp));

        list_create(&state->ins_orphans, sizeof (inotify_watch_t),
            offsetof(inotify_watch_t, inw_orphan));

        state->ins_maxwatches = inotify_maxwatches;
        state->ins_maxevents = inotify_maxevents;

        mutex_exit(&inotify_lock);

        state->ins_cleaner = ddi_periodic_add(inotify_clean,
            state, NANOSEC, DDI_IPL_0);

        return (0);
}

/*ARGSUSED*/
static int
inotify_read(dev_t dev, uio_t *uio, cred_t *cr)
{
        inotify_state_t *state;
        inotify_kevent_t *event;
        minor_t minor = getminor(dev);
        int err = 0, nevents = 0;
        size_t len;

        state = ddi_get_soft_state(inotify_softstate, minor);

        mutex_enter(&state->ins_lock);

        while (state->ins_head == NULL) {
                if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
                        mutex_exit(&state->ins_lock);
                        return (EAGAIN);
                }

                if (!cv_wait_sig_swap(&state->ins_cv, &state->ins_lock)) {
                        mutex_exit(&state->ins_lock);
                        return (EINTR);
                }
        }

        /*
         * We have events and we have our lock; return as many as we can.
         */
        while ((event = state->ins_head) != NULL) {
                len = sizeof (event->ine_event) + event->ine_event.len;

                if (uio->uio_resid < len) {
                        if (nevents == 0)
                                err = EINVAL;
                        break;
                }

                nevents++;

                if ((err = uiomove(&event->ine_event, len, UIO_READ, uio)) != 0)
                        break;

                VERIFY(state->ins_nevents > 0);
                state->ins_nevents--;

                VERIFY(state->ins_size > 0);
                state->ins_size -= len;

                if ((state->ins_head = event->ine_next) == NULL) {
                        VERIFY(event == state->ins_tail);
                        VERIFY(state->ins_nevents == 0);
                        state->ins_tail = NULL;
                }

                kmem_free(event, INOTIFY_EVENT_LENGTH(event));
        }

        mutex_exit(&state->ins_lock);

        return (err);
}
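
/*
 * Note the contract implemented above:  read(2) returns as many whole
 * events as fit in the buffer, and fails with EINVAL if the buffer cannot
 * hold even one.  A consumer can avoid guessing at a buffer size by asking
 * FIONREAD for the queue's current size first (a sketch, with error
 * handling elided):
 *
 *	int qlen;
 *
 *	(void) ioctl(fd, FIONREAD, &qlen);
 *	buf = malloc(qlen);
 *	len = read(fd, buf, qlen);
 */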

/*ARGSUSED*/
static int
inotify_poll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
        inotify_state_t *state;
        minor_t minor = getminor(dev);

        state = ddi_get_soft_state(inotify_softstate, minor);

        mutex_enter(&state->ins_lock);

        if (state->ins_head != NULL) {
                *reventsp = events & (POLLRDNORM | POLLIN);
        } else {
                *reventsp = 0;

                if (!anyyet)
                        *phpp = &state->ins_pollhd;
        }

        mutex_exit(&state->ins_lock);

        return (0);
}

/*ARGSUSED*/
static int
inotify_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
{
        inotify_state_t *state;
        minor_t minor = getminor(dev);
        file_t *fp;
        int rval;

        state = ddi_get_soft_state(inotify_softstate, minor);

        switch (cmd) {
        case INOTIFYIOC_ADD_WATCH: {
                inotify_addwatch_t addwatch;

                if (copyin((void *)arg, &addwatch, sizeof (addwatch)) != 0)
                        return (EFAULT);

                if ((fp = getf(addwatch.inaw_fd)) == NULL)
                        return (EBADF);

                rval = inotify_add_watch(state, fp->f_vnode,
                    addwatch.inaw_mask, rv);

                releasef(addwatch.inaw_fd);
                return (rval);
        }

        case INOTIFYIOC_ADD_CHILD: {
                inotify_addchild_t addchild;
                char name[MAXPATHLEN];

                if (copyin((void *)arg, &addchild, sizeof (addchild)) != 0)
                        return (EFAULT);

                if (copyinstr(addchild.inac_name, name, MAXPATHLEN, NULL) != 0)
                        return (EFAULT);

                if ((fp = getf(addchild.inac_fd)) == NULL)
                        return (EBADF);

                rval = inotify_add_child(state, fp->f_vnode, name);

                releasef(addchild.inac_fd);
                return (rval);
        }

        case INOTIFYIOC_RM_WATCH:
                return (inotify_rm_watch(state, arg));

        case INOTIFYIOC_ACTIVATE:
                return (inotify_activate(state, arg));

        case FIONREAD: {
                int32_t size;

                mutex_enter(&state->ins_lock);
                size = state->ins_size;
                mutex_exit(&state->ins_lock);

                if (copyout(&size, (void *)arg, sizeof (size)) != 0)
                        return (EFAULT);

                return (0);
        }

        default:
                break;
        }

        return (ENOTTY);
}
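
/*
 * The ioctls above are consumed by the library and lx emulation layers
 * rather than by applications directly.  A minimal sketch of adding and
 * activating a watch through them (assuming fd is a descriptor for
 * /dev/inotify and dirfd is a descriptor for the file or directory to be
 * watched):
 *
 *	inotify_addwatch_t aw = { .inaw_fd = dirfd, .inaw_mask = IN_CREATE };
 *	int wd = ioctl(fd, INOTIFYIOC_ADD_WATCH, &aw);
 *
 *	(void) ioctl(fd, INOTIFYIOC_ACTIVATE, wd);
 *
 * The watch descriptor is returned through rv, and events on the watch are
 * dropped until INOTIFYIOC_ACTIVATE is issued (see the inw_active checks in
 * inotify_watch_event()).
 */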

/*ARGSUSED*/
static int
inotify_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
{
        inotify_state_t *state, **sp;
        inotify_watch_t *watch, *zombies;
        inotify_kevent_t *event;
        minor_t minor = getminor(dev);

        state = ddi_get_soft_state(inotify_softstate, minor);

        if (state->ins_pollhd.ph_list != NULL) {
                pollwakeup(&state->ins_pollhd, POLLERR);
                pollhead_clean(&state->ins_pollhd);
        }

        mutex_enter(&state->ins_lock);

        /*
         * First, destroy all of our watches.
         */
        while ((watch = avl_first(&state->ins_bywd)) != NULL)
                inotify_watch_remove(state, watch);

        /*
         * And now destroy our event queue.
         */
        while ((event = state->ins_head) != NULL) {
                state->ins_head = event->ine_next;
                kmem_free(event, INOTIFY_EVENT_LENGTH(event));
        }

        zombies = state->ins_zombies;
        state->ins_zombies = NULL;
        mutex_exit(&state->ins_lock);

        /*
         * Now that our state lock is dropped, we can synchronously wait on
         * any zombies.
         */
        while ((watch = zombies) != NULL) {
                zombies = zombies->inw_parent;

                mutex_enter(&watch->inw_lock);

                while (watch->inw_refcnt > 1)
                        cv_wait(&watch->inw_cv, &watch->inw_lock);

                inotify_watch_destroy(watch);
        }

        if (state->ins_cleaner != NULL) {
                ddi_periodic_delete(state->ins_cleaner);
                state->ins_cleaner = NULL;
        }

        mutex_enter(&inotify_lock);

        /*
         * Remove our state from our global list, and release our hold on
         * the cred.
         */
        for (sp = &inotify_state; *sp != state; sp = &((*sp)->ins_next))
                VERIFY(*sp != NULL);

        *sp = (*sp)->ins_next;
        crfree(state->ins_cred);
        vmem_destroy(state->ins_wds);

        ddi_soft_state_free(inotify_softstate, minor);
        vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1);

        mutex_exit(&inotify_lock);

        return (0);
}

/*ARGSUSED*/
static int
inotify_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
        mutex_enter(&inotify_lock);

        if (ddi_soft_state_init(&inotify_softstate,
            sizeof (inotify_state_t), 0) != 0) {
                cmn_err(CE_NOTE, "/dev/inotify failed to create soft state");
                mutex_exit(&inotify_lock);
                return (DDI_FAILURE);
        }

        if (ddi_create_minor_node(devi, "inotify", S_IFCHR,
            INOTIFYMNRN_INOTIFY, DDI_PSEUDO, NULL) == DDI_FAILURE) {
                cmn_err(CE_NOTE, "/dev/inotify couldn't create minor node");
                ddi_soft_state_fini(&inotify_softstate);
                mutex_exit(&inotify_lock);
                return (DDI_FAILURE);
        }

        if (fem_create("inotify_fem",
            inotify_vnodesrc_template, &inotify_femp) != 0) {
                cmn_err(CE_NOTE, "/dev/inotify couldn't create FEM state");
                ddi_remove_minor_node(devi, NULL);
                ddi_soft_state_fini(&inotify_softstate);
                mutex_exit(&inotify_lock);
                return (DDI_FAILURE);
        }

        ddi_report_dev(devi);
        inotify_devi = devi;

        inotify_minor = vmem_create("inotify_minor", (void *)INOTIFYMNRN_CLONE,
            UINT32_MAX - INOTIFYMNRN_CLONE, 1, NULL, NULL, NULL, 0,
            VM_SLEEP | VMC_IDENTIFIER);

        mutex_exit(&inotify_lock);

        return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
inotify_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
        switch (cmd) {
        case DDI_DETACH:
                break;

        case DDI_SUSPEND:
                return (DDI_SUCCESS);

        default:
                return (DDI_FAILURE);
        }

        mutex_enter(&inotify_lock);
        fem_free(inotify_femp);
        vmem_destroy(inotify_minor);

        ddi_remove_minor_node(inotify_devi, NULL);
        inotify_devi = NULL;

        ddi_soft_state_fini(&inotify_softstate);
        mutex_exit(&inotify_lock);

        return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
inotify_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
        int error;

        switch (infocmd) {
        case DDI_INFO_DEVT2DEVINFO:
                *result = (void *)inotify_devi;
                error = DDI_SUCCESS;
                break;
        case DDI_INFO_DEVT2INSTANCE:
                *result = (void *)0;
                error = DDI_SUCCESS;
                break;
        default:
                error = DDI_FAILURE;
        }
        return (error);
}

static struct cb_ops inotify_cb_ops = {
        inotify_open,           /* open */
        inotify_close,          /* close */
        nulldev,                /* strategy */
        nulldev,                /* print */
        nodev,                  /* dump */
        inotify_read,           /* read */
        nodev,                  /* write */
        inotify_ioctl,          /* ioctl */
        nodev,                  /* devmap */
        nodev,                  /* mmap */
        nodev,                  /* segmap */
        inotify_poll,           /* poll */
        ddi_prop_op,            /* cb_prop_op */
        0,                      /* streamtab */
        D_NEW | D_MP            /* Driver compatibility flag */
};

static struct dev_ops inotify_ops = {
        DEVO_REV,               /* devo_rev */
        0,                      /* refcnt */
        inotify_info,           /* get_dev_info */
        nulldev,                /* identify */
        nulldev,                /* probe */
        inotify_attach,         /* attach */
        inotify_detach,         /* detach */
        nodev,                  /* reset */
        &inotify_cb_ops,        /* driver operations */
        NULL,                   /* bus operations */
        nodev,                  /* dev power */
        ddi_quiesce_not_needed, /* quiesce */
};

static struct modldrv modldrv = {
        &mod_driverops,         /* module type (this is a pseudo driver) */
        "inotify support",      /* name of module */
        &inotify_ops,           /* driver ops */
};

static struct modlinkage modlinkage = {
        MODREV_1,
        (void *)&modldrv,
        NULL
};

int
_init(void)
{
        return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
        return (mod_remove(&modlinkage));
}