1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright (c) 2015 Joyent, Inc.  All rights reserved.
  14  * Copyright (c) 2015 The MathWorks, Inc.  All rights reserved.
  15  */
  16 
  17 /*
  18  * Support for the inotify facility, a Linux-borne facility for asynchronous
  19  * notification of certain events on specified files or directories.  Our
  20  * implementation broadly leverages the file event monitoring facility, and
  21  * would actually be quite straightforward were it not for a very serious
  22  * blunder in the inotify interface:  in addition to allowing for one to be
  23  * notified on events on a particular file or directory, inotify also allows
  24  * for one to be notified on certain events on files _within_ a watched
  25  * directory -- even though those events have absolutely nothing to do with
  26  * the directory itself.  This leads to all sorts of madness because file
  27  * operations are (of course) not undertaken on paths but rather on open
  28  * files -- and the relationships between open files and the paths that resolve
  29  * to those files are neither static nor isomorphic.  We implement this
  30  * concept by having _child watches_ when directories are watched with events
  31  * in IN_CHILD_EVENTS.  We add child watches when a watch on a directory is
  32  * first added, and we modify those child watches dynamically as files are
  33  * created, deleted, moved into or moved out of the specified directory.  This
  34  * mechanism works well, absent hard links.  Hard links, unfortunately, break
  35  * this rather badly, and the user is warned that watches on directories that
  36  * have multiple directory entries referring to the same file may behave
  37  * unexpectedly.
  38  */
  39 
  40 #include <sys/ddi.h>
  41 #include <sys/sunddi.h>
  42 #include <sys/inotify.h>
  43 #include <sys/fem.h>
  44 #include <sys/conf.h>
  45 #include <sys/stat.h>
  46 #include <sys/vfs_opreg.h>
  47 #include <sys/vmem.h>
  48 #include <sys/avl.h>
  49 #include <sys/sysmacros.h>
  50 #include <sys/cyclic.h>
  51 #include <sys/filio.h>
  52 
  53 struct inotify_state;
  54 struct inotify_kevent;
  55 
  56 typedef struct inotify_watch inotify_watch_t;
  57 typedef struct inotify_state inotify_state_t;
  58 typedef struct inotify_kevent inotify_kevent_t;
  59 
/*
 * A single watch.  A watch is either directly added (a "parent" watch, which
 * owns a watch descriptor and -- for directories -- an AVL tree of children)
 * or implicitly created on a file within a watched directory (a "child"
 * watch, which has inw_parent set and a copy of its directory entry name).
 */
struct inotify_watch {
	kmutex_t inw_lock;			/* lock protecting ref count */
	int inw_refcnt;				/* reference count */
	uint8_t inw_zombie:1;			/* boolean: is zombie */
	uint8_t inw_fired:1;			/* boolean: fired one-shot */
	uint8_t inw_active:1;			/* boolean: watch is active */
	uint8_t inw_orphaned:1;			/* boolean: orphaned */
	kcondvar_t inw_cv;			/* condvar for zombifier */
	uint32_t inw_mask;			/* mask of watch */
	int32_t inw_wd;				/* watch descriptor */
	vnode_t *inw_vp;			/* underlying vnode */
	inotify_watch_t *inw_parent;		/* parent, if a child */
	avl_node_t inw_byvp;			/* watches by vnode */
	avl_node_t inw_bywd;			/* watches by descriptor */
	avl_tree_t inw_children;		/* children, if a parent */
	char *inw_name;				/* name, if a child */
	list_node_t inw_orphan;			/* orphan list */
	cred_t *inw_cred;			/* cred, if orphaned */
	inotify_state_t *inw_state;		/* corresponding state */
};
  80 
/*
 * A queued event.  The embedded struct inotify_event is allocated with its
 * variable-length name immediately following it (ine_event.len bytes).
 */
struct inotify_kevent {
	inotify_kevent_t *ine_next;		/* next event in queue */
	struct inotify_event ine_event;		/* event (variable size) */
};
  85 
/*
 * Total allocation size of a queued event:  the fixed inotify_kevent_t
 * header plus the variable-length name recorded in ine_event.len.
 */
#define INOTIFY_EVENT_LENGTH(ev) \
	(sizeof (inotify_kevent_t) + (ev)->ine_event.len)
  88 
/*
 * Per-instance state:  one of these exists for each open of the inotify
 * device, holding the instance's watches (indexed both by vnode and by
 * watch descriptor), its event queue, and its cleanup machinery.
 */
struct inotify_state {
	kmutex_t ins_lock;			/* lock protecting state */
	avl_tree_t ins_byvp;			/* watches by vnode */
	avl_tree_t ins_bywd;			/* watches by descriptor */
	vmem_t *ins_wds;			/* watch identifier arena */
	int ins_maxwatches;			/* maximum number of watches */
	int ins_maxevents;			/* maximum number of events */
	int ins_nevents;			/* current # of events */
	int32_t ins_size;			/* total size of events */
	inotify_kevent_t *ins_head;		/* head of event queue */
	inotify_kevent_t *ins_tail;		/* tail of event queue */
	pollhead_t ins_pollhd;			/* poll head */
	kcondvar_t ins_cv;			/* condvar for reading */
	list_t ins_orphans;			/* orphan list */
	ddi_periodic_t ins_cleaner;		/* cyclic for cleaning */
	inotify_watch_t *ins_zombies;		/* zombie watch list */
	cred_t *ins_cred;			/* creator's credentials */
	inotify_state_t *ins_next;		/* next state on global list */
};
 108 
/*
 * Tunables (exported read-only in lx-branded zones via /proc).  These cap
 * per-instance resource consumption; see ins_maxwatches/ins_maxevents.
 */
int	inotify_maxwatches = 8192;		/* max watches per instance */
int	inotify_maxevents = 16384;		/* max events */
int	inotify_maxinstances = 128;		/* max instances per user */
 115 
/*
 * Internal global variables.
 */
static kmutex_t		inotify_lock;		/* lock protecting state */
static dev_info_t	*inotify_devi;		/* device info */
static fem_t		*inotify_femp;		/* FEM pointer */
static vmem_t		*inotify_minor;		/* minor number arena */
static void		*inotify_softstate;	/* softstate pointer */
static inotify_state_t	*inotify_state;		/* global list of state */
 125 
 126 static void inotify_watch_event(inotify_watch_t *, uint64_t, char *);
 127 static void inotify_watch_insert(inotify_watch_t *, vnode_t *, char *);
 128 static void inotify_watch_delete(inotify_watch_t *, uint32_t);
 129 static void inotify_watch_remove(inotify_state_t *state,
 130         inotify_watch_t *watch);
 131 
 132 static int
 133 inotify_fop_close(femarg_t *vf, int flag, int count, offset_t offset,
 134     cred_t *cr, caller_context_t *ct)
 135 {
 136         inotify_watch_t *watch = vf->fa_fnode->fn_available;
 137         int rval;
 138 
 139         if ((rval = vnext_close(vf, flag, count, offset, cr, ct)) == 0) {
 140                 inotify_watch_event(watch, flag & FWRITE ?
 141                     IN_CLOSE_WRITE : IN_CLOSE_NOWRITE, NULL);
 142         }
 143 
 144         return (rval);
 145 }
 146 
 147 static int
 148 inotify_fop_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl,
 149     int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
 150     vsecattr_t *vsecp)
 151 {
 152         inotify_watch_t *watch = vf->fa_fnode->fn_available;
 153         int rval;
 154 
 155         if ((rval = vnext_create(vf, name, vap, excl, mode,
 156             vpp, cr, flag, ct, vsecp)) == 0) {
 157                 inotify_watch_insert(watch, *vpp, name);
 158                 inotify_watch_event(watch, IN_CREATE, name);
 159         }
 160 
 161         return (rval);
 162 }
 163 
 164 static int
 165 inotify_fop_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr,
 166     caller_context_t *ct, int flags)
 167 {
 168         inotify_watch_t *watch = vf->fa_fnode->fn_available;
 169         int rval;
 170 
 171         if ((rval = vnext_link(vf, svp, tnm, cr, ct, flags)) == 0) {
 172                 inotify_watch_insert(watch, svp, tnm);
 173                 inotify_watch_event(watch, IN_CREATE, tnm);
 174         }
 175 
 176         return (rval);
 177 }
 178 
 179 static int
 180 inotify_fop_mkdir(femarg_t *vf, char *name, vattr_t *vap, vnode_t **vpp,
 181     cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
 182 {
 183         inotify_watch_t *watch = vf->fa_fnode->fn_available;
 184         int rval;
 185 
 186         if ((rval = vnext_mkdir(vf, name, vap, vpp, cr,
 187             ct, flags, vsecp)) == 0) {
 188                 inotify_watch_insert(watch, *vpp, name);
 189                 inotify_watch_event(watch, IN_CREATE | IN_ISDIR, name);
 190         }
 191 
 192         return (rval);
 193 }
 194 
 195 static int
 196 inotify_fop_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct)
 197 {
 198         inotify_watch_t *watch = vf->fa_fnode->fn_available;
 199         int rval;
 200 
 201         if ((rval = vnext_open(vf, mode, cr, ct)) == 0)
 202                 inotify_watch_event(watch, IN_OPEN, NULL);
 203 
 204         return (rval);
 205 }
 206 
 207 static int
 208 inotify_fop_read(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr,
 209     caller_context_t *ct)
 210 {
 211         inotify_watch_t *watch = vf->fa_fnode->fn_available;
 212         int rval = vnext_read(vf, uiop, ioflag, cr, ct);
 213         inotify_watch_event(watch, IN_ACCESS, NULL);
 214 
 215         return (rval);
 216 }
 217 
 218 static int
 219 inotify_fop_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp,
 220     caller_context_t *ct, int flags)
 221 {
 222         inotify_watch_t *watch = vf->fa_fnode->fn_available;
 223         int rval = vnext_readdir(vf, uiop, cr, eofp, ct, flags);
 224         inotify_watch_event(watch, IN_ACCESS | IN_ISDIR, NULL);
 225 
 226         return (rval);
 227 }
 228 
 229 int
 230 inotify_fop_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct,
 231     int flags)
 232 {
 233         inotify_watch_t *watch = vf->fa_fnode->fn_available;
 234         int rval;
 235 
 236         if ((rval = vnext_remove(vf, nm, cr, ct, flags)) == 0)
 237                 inotify_watch_event(watch, IN_DELETE, nm);
 238 
 239         return (rval);
 240 }
 241 
 242 int
 243 inotify_fop_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr,
 244     caller_context_t *ct, int flags)
 245 {
 246         inotify_watch_t *watch = vf->fa_fnode->fn_available;
 247         int rval;
 248 
 249         if ((rval = vnext_rmdir(vf, nm, cdir, cr, ct, flags)) == 0)
 250                 inotify_watch_event(watch, IN_DELETE | IN_ISDIR, nm);
 251 
 252         return (rval);
 253 }
 254 
 255 static int
 256 inotify_fop_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr,
 257     caller_context_t *ct)
 258 {
 259         inotify_watch_t *watch = vf->fa_fnode->fn_available;
 260         int rval;
 261 
 262         if ((rval = vnext_setattr(vf, vap, flags, cr, ct)) == 0)
 263                 inotify_watch_event(watch, IN_ATTRIB, NULL);
 264 
 265         return (rval);
 266 }
 267 
 268 static int
 269 inotify_fop_write(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr,
 270     caller_context_t *ct)
 271 {
 272         inotify_watch_t *watch = vf->fa_fnode->fn_available;
 273         int rval = vnext_write(vf, uiop, ioflag, cr, ct);
 274         inotify_watch_event(watch, IN_MODIFY, NULL);
 275 
 276         return (rval);
 277 }
 278 
/*
 * FEM monitor for vnode events (rename/remove/rmdir/etc.) on a watched
 * vnode:  translates each vnevent into the corresponding inotify event(s)
 * and, where the underlying file is going away, tears down the watch.
 */
static int
inotify_fop_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *name,
    caller_context_t *ct)
{
	inotify_watch_t *watch = vf->fa_fnode->fn_available;

	switch (vnevent) {
	case VE_RENAME_SRC:
		/* the watched file itself is being renamed away */
		inotify_watch_event(watch, IN_MOVE_SELF, NULL);
		inotify_watch_delete(watch, IN_MOVE_SELF);
		break;
	case VE_REMOVE:
		/*
		 * Linux will apparently fire an IN_ATTRIB event when the link
		 * count changes (including when it drops to 0 on a remove).
		 * This is merely somewhat odd; what is amazing is that this
		 * IN_ATTRIB event is not visible on an inotify watch on the
		 * parent directory.  (IN_ATTRIB events are normally sent to
		 * watches on the parent directory).  While it's hard to
		 * believe that this constitutes desired semantics, ltp
		 * unfortunately tests this case (if implicitly); in the name
		 * of bug-for-bug compatibility, we fire IN_ATTRIB iff we are
		 * explicitly watching the file that has been removed.
		 */
		if (watch->inw_parent == NULL)
			inotify_watch_event(watch, IN_ATTRIB, NULL);

		/*FALLTHROUGH*/
	case VE_RENAME_DEST:
		/* the file is being removed (or renamed over) */
		inotify_watch_event(watch, IN_DELETE_SELF, NULL);
		inotify_watch_delete(watch, IN_DELETE_SELF);
		break;
	case VE_RMDIR:
		/*
		 * It seems that IN_ISDIR should really be OR'd in here, but
		 * Linux doesn't seem to do that in this case; for the sake of
		 * bug-for-bug compatibility, we don't do it either.
		 */
		inotify_watch_event(watch, IN_DELETE_SELF, NULL);
		inotify_watch_delete(watch, IN_DELETE_SELF);
		break;
	case VE_CREATE:
	case VE_TRUNCATE:
	case VE_RESIZE:
		inotify_watch_event(watch, IN_MODIFY | IN_ATTRIB, NULL);
		break;
	case VE_LINK:
		/* link count changed; see the VE_REMOVE comment above */
		inotify_watch_event(watch, IN_ATTRIB, NULL);
		break;
	case VE_RENAME_SRC_DIR:
		/* an entry is being renamed out of the watched directory */
		inotify_watch_event(watch, IN_MOVED_FROM, name);
		break;
	case VE_RENAME_DEST_DIR:
		if (name == NULL)
			name = dvp->v_path;

		/* an entry is being renamed into the watched directory */
		inotify_watch_insert(watch, dvp, name);
		inotify_watch_event(watch, IN_MOVED_TO, name);
		break;
	case VE_SUPPORT:
	case VE_MOUNTEDOVER:
	case VE_PRE_RENAME_SRC:
	case VE_PRE_RENAME_DEST:
	case VE_PRE_RENAME_DEST_DIR:
		/* no inotify analogue for these events */
		break;
	}

	return (vnext_vnevent(vf, vnevent, dvp, name, ct));
}
 348 
/*
 * FEM template mapping the monitored vnode operations to the hooks above;
 * installed on each watched vnode via fem_install() in inotify_watch_add().
 */
const fs_operation_def_t inotify_vnodesrc_template[] = {
	VOPNAME_CLOSE,		{ .femop_close = inotify_fop_close },
	VOPNAME_CREATE,		{ .femop_create = inotify_fop_create },
	VOPNAME_LINK,		{ .femop_link = inotify_fop_link },
	VOPNAME_MKDIR,		{ .femop_mkdir = inotify_fop_mkdir },
	VOPNAME_OPEN,		{ .femop_open = inotify_fop_open },
	VOPNAME_READ,		{ .femop_read = inotify_fop_read },
	VOPNAME_READDIR,	{ .femop_readdir = inotify_fop_readdir },
	VOPNAME_REMOVE,		{ .femop_remove = inotify_fop_remove },
	VOPNAME_RMDIR,		{ .femop_rmdir = inotify_fop_rmdir },
	VOPNAME_SETATTR,	{ .femop_setattr = inotify_fop_setattr },
	VOPNAME_WRITE,		{ .femop_write = inotify_fop_write },
	VOPNAME_VNEVENT,	{ .femop_vnevent = inotify_fop_vnevent },
	NULL, NULL
};
 364 
 365 static int
 366 inotify_watch_cmpwd(inotify_watch_t *lhs, inotify_watch_t *rhs)
 367 {
 368         if (lhs->inw_wd < rhs->inw_wd)
 369                 return (-1);
 370 
 371         if (lhs->inw_wd > rhs->inw_wd)
 372                 return (1);
 373 
 374         return (0);
 375 }
 376 
 377 static int
 378 inotify_watch_cmpvp(inotify_watch_t *lhs, inotify_watch_t *rhs)
 379 {
 380         uintptr_t lvp = (uintptr_t)lhs->inw_vp, rvp = (uintptr_t)rhs->inw_vp;
 381 
 382         if (lvp < rvp)
 383                 return (-1);
 384 
 385         if (lvp > rvp)
 386                 return (1);
 387 
 388         return (0);
 389 }
 390 
/*
 * Take an additional hold on a watch.  The caller must already hold a
 * reference; a watch's refcount never resurrects from zero.
 */
static void
inotify_watch_hold(inotify_watch_t *watch)
{
	mutex_enter(&watch->inw_lock);
	VERIFY(watch->inw_refcnt > 0);
	watch->inw_refcnt++;
	mutex_exit(&watch->inw_lock);
}
 399 
 400 static void
 401 inotify_watch_release(inotify_watch_t *watch)
 402 {
 403         mutex_enter(&watch->inw_lock);
 404         VERIFY(watch->inw_refcnt > 1);
 405 
 406         if (--watch->inw_refcnt == 1 && watch->inw_zombie) {
 407                 /*
 408                  * We're down to our last reference; kick anyone that might be
 409                  * waiting.
 410                  */
 411                 cv_signal(&watch->inw_cv);
 412         }
 413 
 414         mutex_exit(&watch->inw_lock);
 415 }
 416 
 417 static void
 418 inotify_watch_event(inotify_watch_t *watch, uint64_t mask, char *name)
 419 {
 420         inotify_kevent_t *event, *tail;
 421         inotify_state_t *state = watch->inw_state;
 422         uint32_t wd = watch->inw_wd, cookie = 0, len;
 423         boolean_t removal = mask & IN_REMOVAL ? B_TRUE : B_FALSE;
 424         inotify_watch_t *source = watch;
 425 
 426         if (!(mask &= watch->inw_mask) || mask == IN_ISDIR)
 427                 return;
 428 
 429         if (watch->inw_parent != NULL) {
 430                 /*
 431                  * This is an event on the child; if this isn't a valid child
 432                  * event, return.  Otherwise, we move our watch to be our
 433                  * parent (which we know is around because we have a hold on
 434                  * it) and continue.
 435                  */
 436                 if (!(mask & IN_CHILD_EVENTS))
 437                         return;
 438 
 439                 name = watch->inw_name;
 440                 watch = watch->inw_parent;
 441                 wd = watch->inw_wd;
 442         }
 443 
 444         if (!removal) {
 445                 mutex_enter(&state->ins_lock);
 446 
 447                 if (watch->inw_zombie ||
 448                     watch->inw_fired || !watch->inw_active) {
 449                         mutex_exit(&state->ins_lock);
 450                         return;
 451                 }
 452         } else {
 453                 if (!watch->inw_active)
 454                         return;
 455 
 456                 VERIFY(MUTEX_HELD(&state->ins_lock));
 457         }
 458 
 459         /*
 460          * If this is an operation on a directory and it's a child event
 461          * (event if it's not on a child), we specify IN_ISDIR.
 462          */
 463         if (source->inw_vp->v_type == VDIR && (mask & IN_CHILD_EVENTS))
 464                 mask |= IN_ISDIR;
 465 
 466         if (mask & (IN_MOVED_FROM | IN_MOVED_TO))
 467                 cookie = (uint32_t)curthread->t_did;
 468 
 469         if (state->ins_nevents >= state->ins_maxevents) {
 470                 /*
 471                  * We're at our maximum number of events -- turn our event
 472                  * into an IN_Q_OVERFLOW event, which will be coalesced if
 473                  * it's already the tail event.
 474                  */
 475                 mask = IN_Q_OVERFLOW;
 476                 wd = (uint32_t)-1;
 477                 cookie = 0;
 478                 len = 0;
 479         }
 480 
 481         if ((tail = state->ins_tail) != NULL && tail->ine_event.wd == wd &&
 482             tail->ine_event.mask == mask && tail->ine_event.cookie == cookie &&
 483             ((tail->ine_event.len == 0 && len == 0) ||
 484             (name != NULL && tail->ine_event.len != 0 &&
 485             strcmp(tail->ine_event.name, name) == 0))) {
 486                 /*
 487                  * This is an implicitly coalesced event; we're done.
 488                  */
 489                 if (!removal)
 490                         mutex_exit(&state->ins_lock);
 491                 return;
 492         }
 493 
 494         if (name != NULL) {
 495                 len = strlen(name) + 1;
 496                 len = roundup(len, sizeof (struct inotify_event));
 497         } else {
 498                 len = 0;
 499         }
 500 
 501         event = kmem_zalloc(sizeof (inotify_kevent_t) + len, KM_SLEEP);
 502         event->ine_event.wd = wd;
 503         event->ine_event.mask = (uint32_t)mask;
 504         event->ine_event.cookie = cookie;
 505         event->ine_event.len = len;
 506 
 507         if (name != NULL)
 508                 strcpy(event->ine_event.name, name);
 509 
 510         if (tail != NULL) {
 511                 tail->ine_next = event;
 512         } else {
 513                 VERIFY(state->ins_head == NULL);
 514                 state->ins_head = event;
 515                 cv_broadcast(&state->ins_cv);
 516         }
 517 
 518         state->ins_tail = event;
 519         state->ins_nevents++;
 520         state->ins_size += sizeof (event->ine_event) + len;
 521 
 522         if (removal)
 523                 return;
 524 
 525         if ((watch->inw_mask & IN_ONESHOT) && !watch->inw_fired) {
 526                 /*
 527                  * If this is a one-shot, we need to remove the watch.  (Note
 528                  * that this will recurse back into inotify_watch_event() to
 529                  * fire the IN_IGNORED event -- but with "removal" set.)
 530                  */
 531                 watch->inw_fired = 1;
 532                 inotify_watch_remove(state, watch);
 533         }
 534 
 535         mutex_exit(&state->ins_lock);
 536         pollwakeup(&state->ins_pollhd, POLLRDNORM | POLLIN);
 537 }
 538 
 539 /*
 540  * Destroy a watch.  By the time we're in here, the watch must have exactly
 541  * one reference.
 542  */
 543 static void
 544 inotify_watch_destroy(inotify_watch_t *watch)
 545 {
 546         VERIFY(MUTEX_HELD(&watch->inw_lock));
 547 
 548         if (watch->inw_name != NULL)
 549                 kmem_free(watch->inw_name, strlen(watch->inw_name) + 1);
 550 
 551         kmem_free(watch, sizeof (inotify_watch_t));
 552 }
 553 
/*
 * Zombify a watch.  By the time we come in here, it must be true that the
 * watch has already been fem_uninstall()'d -- the only reference should be
 * in the state's data structure.  If we can get away with freeing it, we'll
 * do that -- but if the reference count is greater than one due to an active
 * vnode operation, we'll put this watch on the zombie list on the state
 * structure.
 */
static void
inotify_watch_zombify(inotify_watch_t *watch)
{
	inotify_state_t *state = watch->inw_state;

	VERIFY(MUTEX_HELD(&state->ins_lock));
	VERIFY(!watch->inw_zombie);

	watch->inw_zombie = 1;

	if (watch->inw_parent != NULL) {
		/* a child watch holds its parent; drop that hold now */
		inotify_watch_release(watch->inw_parent);
	} else {
		/* a parent watch: unhook it and recycle its descriptor */
		avl_remove(&state->ins_byvp, watch);
		avl_remove(&state->ins_bywd, watch);
		vmem_free(state->ins_wds, (void *)(uintptr_t)watch->inw_wd, 1);
		watch->inw_wd = -1;
	}

	mutex_enter(&watch->inw_lock);

	if (watch->inw_refcnt == 1) {
		/*
		 * There are no operations in flight and there is no way
		 * for anyone to discover this watch -- we can destroy it.
		 * (inotify_watch_destroy() frees the watch -- lock and all --
		 * which is why there is no mutex_exit() on this path.)
		 */
		inotify_watch_destroy(watch);
	} else {
		/*
		 * There are operations in flight; we will need to enqueue
		 * this for later destruction.  inw_parent is reused here as
		 * the zombie-list linkage (it is no longer needed as a
		 * parent pointer once the watch is a zombie).
		 */
		watch->inw_parent = state->ins_zombies;
		state->ins_zombies = watch;
		mutex_exit(&watch->inw_lock);
	}
}
 599 
 600 static inotify_watch_t *
 601 inotify_watch_add(inotify_state_t *state, inotify_watch_t *parent,
 602     const char *name, vnode_t *vp, uint32_t mask)
 603 {
 604         inotify_watch_t *watch;
 605         int err;
 606 
 607         VERIFY(MUTEX_HELD(&state->ins_lock));
 608 
 609         watch = kmem_zalloc(sizeof (inotify_watch_t), KM_SLEEP);
 610 
 611         watch->inw_vp = vp;
 612         watch->inw_mask = mask;
 613         watch->inw_state = state;
 614         watch->inw_refcnt = 1;
 615 
 616         if (parent == NULL) {
 617                 watch->inw_wd = (int)(uintptr_t)vmem_alloc(state->ins_wds,
 618                     1, VM_BESTFIT | VM_SLEEP);
 619                 avl_add(&state->ins_byvp, watch);
 620                 avl_add(&state->ins_bywd, watch);
 621 
 622                 avl_create(&watch->inw_children,
 623                     (int(*)(const void *, const void *))inotify_watch_cmpvp,
 624                     sizeof (inotify_watch_t),
 625                     offsetof(inotify_watch_t, inw_byvp));
 626         } else {
 627                 VERIFY(name != NULL);
 628                 inotify_watch_hold(parent);
 629                 watch->inw_mask &= IN_CHILD_EVENTS;
 630                 watch->inw_parent = parent;
 631                 watch->inw_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
 632                 strcpy(watch->inw_name, name);
 633 
 634                 avl_add(&parent->inw_children, watch);
 635         }
 636 
 637         /*
 638          * Add our monitor to the vnode.  We must not have the watch lock held
 639          * when we do this, as it will immediately hold our watch.
 640          */
 641         err = fem_install(vp, inotify_femp, watch, OPARGUNIQ,
 642             (void (*)(void *))inotify_watch_hold,
 643             (void (*)(void *))inotify_watch_release);
 644 
 645         VERIFY(err == 0);
 646 
 647         return (watch);
 648 }
 649 
/*
 * Remove a (non-child) watch.  This is called from either synchronous context
 * via inotify_rm_watch() or monitor context via either a vnevent or a
 * one-shot.  Uninstalls the FEM monitor from the watch's vnode (and those of
 * all of its children), zombifies the children, fires IN_IGNORED, and
 * finally zombifies the watch itself.
 */
static void
inotify_watch_remove(inotify_state_t *state, inotify_watch_t *watch)
{
	inotify_watch_t *child;
	int err;

	VERIFY(MUTEX_HELD(&state->ins_lock));
	VERIFY(watch->inw_parent == NULL);

	/* no further events will be delivered through the monitor */
	err = fem_uninstall(watch->inw_vp, inotify_femp, watch);
	VERIFY(err == 0);

	/*
	 * If we have children, we're going to remove them all and set them
	 * all to be zombies.
	 */
	while ((child = avl_first(&watch->inw_children)) != NULL) {
		VERIFY(child->inw_parent == watch);
		avl_remove(&watch->inw_children, child);

		err = fem_uninstall(child->inw_vp, inotify_femp, child);
		VERIFY(err == 0);

		/*
		 * If this child watch has been orphaned, remove it from the
		 * state's list of orphans.
		 */
		if (child->inw_orphaned) {
			list_remove(&state->ins_orphans, child);
			crfree(child->inw_cred);
		}

		VN_RELE(child->inw_vp);

		/*
		 * We're down (or should be down) to a single reference to
		 * this child watch; it's safe to zombify it.
		 */
		inotify_watch_zombify(child);
	}

	/* IN_REMOVAL tells inotify_watch_event() that ins_lock is held */
	inotify_watch_event(watch, IN_IGNORED | IN_REMOVAL, NULL);
	VN_RELE(watch->inw_vp);

	/*
	 * It's now safe to zombify the watch -- we know that the only reference
	 * can come from operations in flight.
	 */
	inotify_watch_zombify(watch);
}
 705 
/*
 * Delete a watch.  Should only be called from VOP context.  "event" is the
 * inotify event that motivated the deletion (IN_DELETE_SELF or
 * IN_MOVE_SELF); for child watches of a parent without IN_EXCL_UNLINK, a
 * removed file is orphaned (kept alive until its last non-watch hold goes
 * away) rather than deleted outright.
 */
static void
inotify_watch_delete(inotify_watch_t *watch, uint32_t event)
{
	inotify_state_t *state = watch->inw_state;
	inotify_watch_t cmp = { .inw_vp = watch->inw_vp }, *parent;
	int err;

	/* only child-relevant watches care about anything but self-deletion */
	if (event != IN_DELETE_SELF && !(watch->inw_mask & IN_CHILD_EVENTS))
		return;

	mutex_enter(&state->ins_lock);

	if (watch->inw_zombie) {
		mutex_exit(&state->ins_lock);
		return;
	}

	if ((parent = watch->inw_parent) == NULL) {
		if (event == IN_DELETE_SELF) {
			/*
			 * If we're here because we're being deleted and we
			 * are not a child watch, we need to delete the entire
			 * watch, children and all.
			 */
			inotify_watch_remove(state, watch);
		}

		mutex_exit(&state->ins_lock);
		return;
	} else {
		if (event == IN_DELETE_SELF &&
		    !(parent->inw_mask & IN_EXCL_UNLINK)) {
			/*
			 * This is a child watch for a file that is being
			 * removed and IN_EXCL_UNLINK has not been specified;
			 * indicate that it is orphaned and add it to the list
			 * of orphans.  (This list will be checked by the
			 * cleaning cyclic to determine when the watch has
			 * become the only hold on the vnode, at which point
			 * the watch can be zombified.)  Note that we check
			 * if the watch is orphaned before we orphan it:  hard
			 * links make it possible for VE_REMOVE to be called
			 * multiple times on the same vnode. (!)
			 */
			if (!watch->inw_orphaned) {
				watch->inw_orphaned = 1;
				watch->inw_cred = CRED();
				crhold(watch->inw_cred);
				list_insert_head(&state->ins_orphans, watch);
			}

			mutex_exit(&state->ins_lock);
			return;
		}

		if (watch->inw_orphaned) {
			/*
			 * If we're here, a file was orphaned and then later
			 * moved -- which almost certainly means that hard
			 * links are on the scene.  We choose the orphan over
			 * the move because we don't want to spuriously
			 * drop events if we can avoid it.
			 */
			crfree(watch->inw_cred);
			list_remove(&state->ins_orphans, watch);
		}
	}

	if (avl_find(&parent->inw_children, &cmp, NULL) == NULL) {
		/*
		 * This watch has already been deleted from the parent.
		 */
		mutex_exit(&state->ins_lock);
		return;
	}

	avl_remove(&parent->inw_children, watch);
	err = fem_uninstall(watch->inw_vp, inotify_femp, watch);
	VERIFY(err == 0);

	VN_RELE(watch->inw_vp);

	/*
	 * It's now safe to zombify the watch -- which won't actually delete
	 * it as we know that the reference count is greater than 1.
	 */
	inotify_watch_zombify(watch);
	mutex_exit(&state->ins_lock);
}
 798 
 799 /*
 800  * Insert a new child watch.  Should only be called from VOP context when
 801  * a child is created in a watched directory.
 802  */
 803 static void
 804 inotify_watch_insert(inotify_watch_t *watch, vnode_t *vp, char *name)
 805 {
 806         inotify_state_t *state = watch->inw_state;
 807         inotify_watch_t cmp = { .inw_vp = vp };
 808 
 809         if (!(watch->inw_mask & IN_CHILD_EVENTS))
 810                 return;
 811 
 812         mutex_enter(&state->ins_lock);
 813 
 814         if (watch->inw_zombie || watch->inw_parent != NULL || vp == NULL) {
 815                 mutex_exit(&state->ins_lock);
 816                 return;
 817         }
 818 
 819         if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) {
 820                 mutex_exit(&state->ins_lock);
 821                 return;
 822         }
 823 
 824         VN_HOLD(vp);
 825         watch = inotify_watch_add(state, watch, name, vp, watch->inw_mask);
 826         VERIFY(watch != NULL);
 827 
 828         mutex_exit(&state->ins_lock);
 829 }
 830 
 831 
 832 static int
 833 inotify_add_watch(inotify_state_t *state, vnode_t *vp, uint32_t mask,
 834     int32_t *wdp)
 835 {
 836         inotify_watch_t *watch, cmp = { .inw_vp = vp };
 837         uint32_t set;
 838 
 839         set = (mask & (IN_ALL_EVENTS | IN_MODIFIERS)) | IN_UNMASKABLE;
 840 
 841         /*
 842          * Lookup our vnode to determine if we already have a watch on it.
 843          */
 844         mutex_enter(&state->ins_lock);
 845 
 846         if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) {
 847                 /*
 848                  * We don't have this watch; allocate a new one, provided that
 849                  * we have fewer than our limit.
 850                  */
 851                 if (avl_numnodes(&state->ins_bywd) >= state->ins_maxwatches) {
 852                         mutex_exit(&state->ins_lock);
 853                         return (ENOSPC);
 854                 }
 855 
 856                 VN_HOLD(vp);
 857                 watch = inotify_watch_add(state, NULL, NULL, vp, set);
 858                 *wdp = watch->inw_wd;
 859                 mutex_exit(&state->ins_lock);
 860 
 861                 return (0);
 862         }
 863 
 864         VERIFY(!watch->inw_zombie);
 865 
 866         if (!(mask & IN_MASK_ADD)) {
 867                 /*
 868                  * Note that if we're resetting our event mask and we're
 869                  * transitioning from an event mask that includes child events
 870                  * to one that doesn't, there will be potentially some stale
 871                  * child watches.  This is basically fine:  they won't fire,
 872                  * and they will correctly be removed when the watch is
 873                  * removed.
 874                  */
 875                 watch->inw_mask = 0;
 876         }
 877 
 878         watch->inw_mask |= set;
 879 
 880         *wdp = watch->inw_wd;
 881 
 882         mutex_exit(&state->ins_lock);
 883 
 884         return (0);
 885 }
 886 
/*
 * Add a child watch for the named entry within an already-watched
 * directory (identified by vp).  Returns 0 on success (including the
 * benign case where the child is already watched) or an errno value.
 */
static int
inotify_add_child(inotify_state_t *state, vnode_t *vp, char *name)
{
	inotify_watch_t *watch, cmp = { .inw_vp = vp };
	vnode_t *cvp;
	int err;

	/*
	 * Verify that the specified child doesn't have a directory component
	 * within it.
	 */
	if (strchr(name, '/') != NULL)
		return (EINVAL);

	/*
	 * Lookup the underlying file.  Note that this will succeed even if
	 * we don't have permissions to actually read the file.  On success,
	 * cvp is returned held; every exit path below must account for that
	 * hold.
	 */
	if ((err = lookupnameat(name,
	    UIO_SYSSPACE, NO_FOLLOW, NULL, &cvp, vp)) != 0) {
		return (err);
	}

	/*
	 * Use our vnode to find our watch, and then add our child watch to it.
	 */
	mutex_enter(&state->ins_lock);

	if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) {
		/*
		 * This is unexpected -- it means that we don't have the
		 * watch that we thought we had.
		 */
		mutex_exit(&state->ins_lock);
		VN_RELE(cvp);
		return (ENXIO);
	}

	/*
	 * Now lookup the child vnode in the watch; we'll only add it if it
	 * isn't already there.
	 */
	cmp.inw_vp = cvp;

	if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) {
		mutex_exit(&state->ins_lock);
		VN_RELE(cvp);
		return (0);
	}

	/*
	 * NOTE(review): the hold on cvp from lookupnameat() is deliberately
	 * not released here -- inotify_watch_add() evidently assumes
	 * ownership of it (cf. the explicit VN_HOLD() before the other
	 * inotify_watch_add() calls); confirm against its definition.
	 */
	watch = inotify_watch_add(state, watch, name, cvp, watch->inw_mask);
	VERIFY(watch != NULL);
	mutex_exit(&state->ins_lock);

	return (0);
}
 943 
 944 static int
 945 inotify_rm_watch(inotify_state_t *state, int32_t wd)
 946 {
 947         inotify_watch_t *watch, cmp = { .inw_wd = wd };
 948 
 949         mutex_enter(&state->ins_lock);
 950 
 951         if ((watch = avl_find(&state->ins_bywd, &cmp, NULL)) == NULL) {
 952                 mutex_exit(&state->ins_lock);
 953                 return (EINVAL);
 954         }
 955 
 956         inotify_watch_remove(state, watch);
 957         mutex_exit(&state->ins_lock);
 958 
 959         return (0);
 960 }
 961 
 962 static int
 963 inotify_activate(inotify_state_t *state, int32_t wd)
 964 {
 965         inotify_watch_t *watch, cmp = { .inw_wd = wd };
 966 
 967         mutex_enter(&state->ins_lock);
 968 
 969         if ((watch = avl_find(&state->ins_bywd, &cmp, NULL)) == NULL) {
 970                 mutex_exit(&state->ins_lock);
 971                 return (EINVAL);
 972         }
 973 
 974         watch->inw_active = 1;
 975 
 976         mutex_exit(&state->ins_lock);
 977 
 978         return (0);
 979 }
 980 
 981 /*
 982  * Called periodically as a cyclic to process the orphans and zombies.
 983  */
 984 static void
 985 inotify_clean(void *arg)
 986 {
 987         inotify_state_t *state = arg;
 988         inotify_watch_t *watch, *parent, *next, **prev;
 989         cred_t *savecred;
 990         int err;
 991 
 992         mutex_enter(&state->ins_lock);
 993 
 994         for (watch = list_head(&state->ins_orphans);
 995             watch != NULL; watch = next) {
 996                 next = list_next(&state->ins_orphans, watch);
 997 
 998                 VERIFY(!watch->inw_zombie);
 999                 VERIFY((parent = watch->inw_parent) != NULL);
1000 
1001                 if (watch->inw_vp->v_count > 1)
1002                         continue;
1003 
1004                 avl_remove(&parent->inw_children, watch);
1005                 err = fem_uninstall(watch->inw_vp, inotify_femp, watch);
1006                 VERIFY(err == 0);
1007 
1008                 list_remove(&state->ins_orphans, watch);
1009 
1010                 /*
1011                  * For purposes of releasing the vnode, we need to switch our
1012                  * cred to be the cred of the orphaning thread (which we held
1013                  * at the time this watch was orphaned).
1014                  */
1015                 savecred = curthread->t_cred;
1016                 curthread->t_cred = watch->inw_cred;
1017                 VN_RELE(watch->inw_vp);
1018                 crfree(watch->inw_cred);
1019                 curthread->t_cred = savecred;
1020 
1021                 inotify_watch_zombify(watch);
1022         }
1023 
1024         prev = &state->ins_zombies;
1025 
1026         while ((watch = *prev) != NULL) {
1027                 mutex_enter(&watch->inw_lock);
1028 
1029                 if (watch->inw_refcnt == 1) {
1030                         *prev = watch->inw_parent;
1031                         inotify_watch_destroy(watch);
1032                         continue;
1033                 }
1034 
1035                 prev = &watch->inw_parent;
1036                 mutex_exit(&watch->inw_lock);
1037         }
1038 
1039         mutex_exit(&state->ins_lock);
1040 }
1041 
1042 /*ARGSUSED*/
1043 static int
1044 inotify_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
1045 {
1046         inotify_state_t *state;
1047         major_t major = getemajor(*devp);
1048         minor_t minor = getminor(*devp);
1049         int instances = 0;
1050         char c[64];
1051 
1052         if (minor != INOTIFYMNRN_INOTIFY)
1053                 return (ENXIO);
1054 
1055         mutex_enter(&inotify_lock);
1056 
1057         for (state = inotify_state; state != NULL; state = state->ins_next) {
1058                 if (state->ins_cred == cred_p)
1059                         instances++;
1060         }
1061 
1062         if (instances >= inotify_maxinstances) {
1063                 mutex_exit(&inotify_lock);
1064                 return (EMFILE);
1065         }
1066 
1067         minor = (minor_t)(uintptr_t)vmem_alloc(inotify_minor, 1,
1068             VM_BESTFIT | VM_SLEEP);
1069 
1070         if (ddi_soft_state_zalloc(inotify_softstate, minor) != DDI_SUCCESS) {
1071                 vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1);
1072                 mutex_exit(&inotify_lock);
1073                 return (NULL);
1074         }
1075 
1076         state = ddi_get_soft_state(inotify_softstate, minor);
1077         *devp = makedevice(major, minor);
1078 
1079         crhold(cred_p);
1080         state->ins_cred = cred_p;
1081         state->ins_next = inotify_state;
1082         inotify_state = state;
1083 
1084         (void) snprintf(c, sizeof (c), "inotify_watchid_%d", minor);
1085         state->ins_wds = vmem_create(c, (void *)1, UINT32_MAX, 1,
1086             NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
1087 
1088         avl_create(&state->ins_bywd,
1089             (int(*)(const void *, const void *))inotify_watch_cmpwd,
1090             sizeof (inotify_watch_t),
1091             offsetof(inotify_watch_t, inw_bywd));
1092 
1093         avl_create(&state->ins_byvp,
1094             (int(*)(const void *, const void *))inotify_watch_cmpvp,
1095             sizeof (inotify_watch_t),
1096             offsetof(inotify_watch_t, inw_byvp));
1097 
1098         list_create(&state->ins_orphans, sizeof (inotify_watch_t),
1099             offsetof(inotify_watch_t, inw_orphan));
1100 
1101         state->ins_maxwatches = inotify_maxwatches;
1102         state->ins_maxevents = inotify_maxevents;
1103 
1104         mutex_exit(&inotify_lock);
1105 
1106         state->ins_cleaner = ddi_periodic_add(inotify_clean,
1107             state, NANOSEC, DDI_IPL_0);
1108 
1109         return (0);
1110 }
1111 
/*ARGSUSED*/
static int
inotify_read(dev_t dev, uio_t *uio, cred_t *cr)
{
	inotify_state_t *state;
	inotify_kevent_t *event;
	minor_t minor = getminor(dev);
	int err = 0, nevents = 0;
	size_t len;

	state = ddi_get_soft_state(inotify_softstate, minor);

	mutex_enter(&state->ins_lock);

	/*
	 * Block until at least one event is queued -- unless the descriptor
	 * is non-blocking (EAGAIN) or we're interrupted by a signal (EINTR).
	 */
	while (state->ins_head == NULL) {
		if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
			mutex_exit(&state->ins_lock);
			return (EAGAIN);
		}

		if (!cv_wait_sig_swap(&state->ins_cv, &state->ins_lock)) {
			mutex_exit(&state->ins_lock);
			return (EINTR);
		}
	}

	/*
	 * We have events and we have our lock; return as many as we can.
	 */
	while ((event = state->ins_head) != NULL) {
		/* Total size is the fixed header plus the appended name. */
		len = sizeof (event->ine_event) + event->ine_event.len;

		if (uio->uio_resid < len) {
			/*
			 * If not even the first event fits in the caller's
			 * buffer, fail with EINVAL (as Linux inotify does);
			 * otherwise return what we've copied so far.
			 */
			if (nevents == 0)
				err = EINVAL;
			break;
		}

		nevents++;

		if ((err = uiomove(&event->ine_event, len, UIO_READ, uio)) != 0)
			break;

		/* Maintain the queued-event count and byte-size accounting. */
		VERIFY(state->ins_nevents > 0);
		state->ins_nevents--;

		VERIFY(state->ins_size > 0);
		state->ins_size -= len;

		/* Unlink the consumed event, keeping the tail consistent. */
		if ((state->ins_head = event->ine_next) == NULL) {
			VERIFY(event == state->ins_tail);
			VERIFY(state->ins_nevents == 0);
			state->ins_tail = NULL;
		}

		kmem_free(event, INOTIFY_EVENT_LENGTH(event));
	}

	mutex_exit(&state->ins_lock);

	return (err);
}
1174 
1175 /*ARGSUSED*/
1176 static int
1177 inotify_poll(dev_t dev, short events, int anyyet, short *reventsp,
1178     struct pollhead **phpp)
1179 {
1180         inotify_state_t *state;
1181         minor_t minor = getminor(dev);
1182 
1183         state = ddi_get_soft_state(inotify_softstate, minor);
1184 
1185         mutex_enter(&state->ins_lock);
1186 
1187         if (state->ins_head != NULL) {
1188                 *reventsp = events & (POLLRDNORM | POLLIN);
1189         } else {
1190                 *reventsp = 0;
1191 
1192                 if (!anyyet)
1193                         *phpp = &state->ins_pollhd;
1194         }
1195 
1196         mutex_exit(&state->ins_lock);
1197 
1198         return (0);
1199 }
1200 
1201 /*ARGSUSED*/
1202 static int
1203 inotify_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
1204 {
1205         inotify_state_t *state;
1206         minor_t minor = getminor(dev);
1207         file_t *fp;
1208         int rval;
1209 
1210         state = ddi_get_soft_state(inotify_softstate, minor);
1211 
1212         switch (cmd) {
1213         case INOTIFYIOC_ADD_WATCH: {
1214                 inotify_addwatch_t addwatch;
1215                 file_t *fp;
1216 
1217                 if (copyin((void *)arg, &addwatch, sizeof (addwatch)) != 0)
1218                         return (EFAULT);
1219 
1220                 if ((fp = getf(addwatch.inaw_fd)) == NULL)
1221                         return (EBADF);
1222 
1223                 rval = inotify_add_watch(state, fp->f_vnode,
1224                     addwatch.inaw_mask, rv);
1225 
1226                 releasef(addwatch.inaw_fd);
1227                 return (rval);
1228         }
1229 
1230         case INOTIFYIOC_ADD_CHILD: {
1231                 inotify_addchild_t addchild;
1232                 char name[MAXPATHLEN];
1233 
1234                 if (copyin((void *)arg, &addchild, sizeof (addchild)) != 0)
1235                         return (EFAULT);
1236 
1237                 if (copyinstr(addchild.inac_name, name, MAXPATHLEN, NULL) != 0)
1238                         return (EFAULT);
1239 
1240                 if ((fp = getf(addchild.inac_fd)) == NULL)
1241                         return (EBADF);
1242 
1243                 rval = inotify_add_child(state, fp->f_vnode, name);
1244 
1245                 releasef(addchild.inac_fd);
1246                 return (rval);
1247         }
1248 
1249         case INOTIFYIOC_RM_WATCH:
1250                 return (inotify_rm_watch(state, arg));
1251 
1252         case INOTIFYIOC_ACTIVATE:
1253                 return (inotify_activate(state, arg));
1254 
1255         case FIONREAD: {
1256                 int32_t size;
1257 
1258                 mutex_enter(&state->ins_lock);
1259                 size = state->ins_size;
1260                 mutex_exit(&state->ins_lock);
1261 
1262                 if (copyout(&size, (void *)arg, sizeof (size)) != 0)
1263                         return (EFAULT);
1264 
1265                 return (0);
1266         }
1267 
1268         default:
1269                 break;
1270         }
1271 
1272         return (ENOTTY);
1273 }
1274 
/*ARGSUSED*/
static int
inotify_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
{
	inotify_state_t *state, **sp;
	inotify_watch_t *watch, *zombies;
	inotify_kevent_t *event;
	minor_t minor = getminor(dev);

	state = ddi_get_soft_state(inotify_softstate, minor);

	/*
	 * Wake up and dismiss anyone polling on this instance before we
	 * begin tearing it down.
	 */
	if (state->ins_pollhd.ph_list != NULL) {
		pollwakeup(&state->ins_pollhd, POLLERR);
		pollhead_clean(&state->ins_pollhd);
	}

	mutex_enter(&state->ins_lock);

	/*
	 * First, destroy all of our watches.
	 */
	while ((watch = avl_first(&state->ins_bywd)) != NULL)
		inotify_watch_remove(state, watch);

	/*
	 * And now destroy our event queue.
	 */
	while ((event = state->ins_head) != NULL) {
		state->ins_head = event->ine_next;
		kmem_free(event, INOTIFY_EVENT_LENGTH(event));
	}

	/*
	 * Claim the zombie list while the lock is held; we'll wait on the
	 * zombies after the lock is dropped.  (The zombie list is singly
	 * linked through inw_parent.)
	 */
	zombies = state->ins_zombies;
	state->ins_zombies = NULL;
	mutex_exit(&state->ins_lock);

	/*
	 * Now that our state lock is dropped, we can synchronously wait on
	 * any zombies.
	 */
	while ((watch = zombies) != NULL) {
		zombies = zombies->inw_parent;

		mutex_enter(&watch->inw_lock);

		/* Wait for all other references to drain; ours is the last. */
		while (watch->inw_refcnt > 1)
			cv_wait(&watch->inw_cv, &watch->inw_lock);

		inotify_watch_destroy(watch);
	}

	/*
	 * Stop the cleaning cyclic; all watches, orphans, and zombies have
	 * been disposed of at this point.
	 */
	if (state->ins_cleaner != NULL) {
		ddi_periodic_delete(state->ins_cleaner);
		state->ins_cleaner = NULL;
	}

	mutex_enter(&inotify_lock);

	/*
	 * Remove our state from our global list, and release our hold on
	 * the cred.
	 */
	for (sp = &inotify_state; *sp != state; sp = &((*sp)->ins_next))
		VERIFY(*sp != NULL);

	*sp = (*sp)->ins_next;
	crfree(state->ins_cred);
	vmem_destroy(state->ins_wds);

	ddi_soft_state_free(inotify_softstate, minor);
	vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1);

	mutex_exit(&inotify_lock);

	return (0);
}
1351 
1352 /*ARGSUSED*/
1353 static int
1354 inotify_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
1355 {
1356         mutex_enter(&inotify_lock);
1357 
1358         if (ddi_soft_state_init(&inotify_softstate,
1359             sizeof (inotify_state_t), 0) != 0) {
1360                 cmn_err(CE_NOTE, "/dev/inotify failed to create soft state");
1361                 mutex_exit(&inotify_lock);
1362                 return (DDI_FAILURE);
1363         }
1364 
1365         if (ddi_create_minor_node(devi, "inotify", S_IFCHR,
1366             INOTIFYMNRN_INOTIFY, DDI_PSEUDO, NULL) == DDI_FAILURE) {
1367                 cmn_err(CE_NOTE, "/dev/inotify couldn't create minor node");
1368                 ddi_soft_state_fini(&inotify_softstate);
1369                 mutex_exit(&inotify_lock);
1370                 return (DDI_FAILURE);
1371         }
1372 
1373         if (fem_create("inotify_fem",
1374             inotify_vnodesrc_template, &inotify_femp) != 0) {
1375                 cmn_err(CE_NOTE, "/dev/inotify couldn't create FEM state");
1376                 ddi_remove_minor_node(devi, NULL);
1377                 ddi_soft_state_fini(&inotify_softstate);
1378                 mutex_exit(&inotify_lock);
1379                 return (DDI_FAILURE);
1380         }
1381 
1382         ddi_report_dev(devi);
1383         inotify_devi = devi;
1384 
1385         inotify_minor = vmem_create("inotify_minor", (void *)INOTIFYMNRN_CLONE,
1386             UINT32_MAX - INOTIFYMNRN_CLONE, 1, NULL, NULL, NULL, 0,
1387             VM_SLEEP | VMC_IDENTIFIER);
1388 
1389         mutex_exit(&inotify_lock);
1390 
1391         return (DDI_SUCCESS);
1392 }
1393 
1394 /*ARGSUSED*/
1395 static int
1396 inotify_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1397 {
1398         switch (cmd) {
1399         case DDI_DETACH:
1400                 break;
1401 
1402         case DDI_SUSPEND:
1403                 return (DDI_SUCCESS);
1404 
1405         default:
1406                 return (DDI_FAILURE);
1407         }
1408 
1409         mutex_enter(&inotify_lock);
1410         fem_free(inotify_femp);
1411         vmem_destroy(inotify_minor);
1412 
1413         ddi_remove_minor_node(inotify_devi, NULL);
1414         inotify_devi = NULL;
1415 
1416         ddi_soft_state_fini(&inotify_softstate);
1417         mutex_exit(&inotify_lock);
1418 
1419         return (DDI_SUCCESS);
1420 }
1421 
1422 /*ARGSUSED*/
1423 static int
1424 inotify_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
1425 {
1426         int error;
1427 
1428         switch (infocmd) {
1429         case DDI_INFO_DEVT2DEVINFO:
1430                 *result = (void *)inotify_devi;
1431                 error = DDI_SUCCESS;
1432                 break;
1433         case DDI_INFO_DEVT2INSTANCE:
1434                 *result = (void *)0;
1435                 error = DDI_SUCCESS;
1436                 break;
1437         default:
1438                 error = DDI_FAILURE;
1439         }
1440         return (error);
1441 }
1442 
/*
 * Character device entry points for /dev/inotify.
 */
static struct cb_ops inotify_cb_ops = {
	inotify_open,           /* open */
	inotify_close,          /* close */
	nulldev,                /* strategy */
	nulldev,                /* print */
	nodev,                  /* dump */
	inotify_read,           /* read */
	nodev,                  /* write */
	inotify_ioctl,          /* ioctl */
	nodev,                  /* devmap */
	nodev,                  /* mmap */
	nodev,                  /* segmap */
	inotify_poll,           /* poll */
	ddi_prop_op,            /* cb_prop_op */
	0,                      /* streamtab  */
	D_NEW | D_MP            /* Driver compatibility flag */
};
1460 
/*
 * Device operations vector; this is a pseudo device with no bus or power
 * management operations.
 */
static struct dev_ops inotify_ops = {
	DEVO_REV,               /* devo_rev */
	0,                      /* refcnt */
	inotify_info,           /* get_dev_info */
	nulldev,                /* identify */
	nulldev,                /* probe */
	inotify_attach,         /* attach */
	inotify_detach,         /* detach */
	nodev,                  /* reset */
	&inotify_cb_ops,    /* driver operations */
	NULL,                   /* bus operations */
	nodev,                  /* dev power */
	ddi_quiesce_not_needed, /* quiesce */
};
1475 
/*
 * Module linkage: describes this module as a (pseudo) device driver.
 */
static struct modldrv modldrv = {
	&mod_driverops,             /* module type (this is a pseudo driver) */
	"inotify support",      /* name of module */
	&inotify_ops,               /* driver ops */
};
1481 
/*
 * Single-module linkage consumed by mod_install()/mod_remove()/mod_info().
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};
1487 
/*
 * Loadable module entry point: install the driver into the system.
 */
int
_init(void)
{
	return (mod_install(&modlinkage));
}
1493 
/*
 * Loadable module entry point: report module information.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
1499 
/*
 * Loadable module entry point: remove the driver from the system.
 */
int
_fini(void)
{
	return (mod_remove(&modlinkage));
}