/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
 */

/*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/*        All Rights Reserved   */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/bitmap.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/dnlc.h>
#include <sys/mode.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/acl.h>
#include <sys/var.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_log.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <sys/swap.h>
#include <sys/cpuvar.h>
#include <sys/sysmacros.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <fs/fs_subr.h>
#include <sys/policy.h>

struct kmem_cache *inode_cache;         /* cache of free inodes */

/* UFS Inode Cache Stats -- Not protected */
struct  instats ins = {
        { "size",               KSTAT_DATA_ULONG },
        { "maxsize",            KSTAT_DATA_ULONG },
        { "hits",               KSTAT_DATA_ULONG },
        { "misses",             KSTAT_DATA_ULONG },
        { "kmem allocs",        KSTAT_DATA_ULONG },
        { "kmem frees",         KSTAT_DATA_ULONG },
        { "maxsize reached",    KSTAT_DATA_ULONG },
        { "puts at frontlist",  KSTAT_DATA_ULONG },
        { "puts at backlist",   KSTAT_DATA_ULONG },
        { "queues to free",     KSTAT_DATA_ULONG },
        { "scans",              KSTAT_DATA_ULONG },
        { "thread idles",       KSTAT_DATA_ULONG },
        { "lookup idles",       KSTAT_DATA_ULONG },
        { "vget idles",         KSTAT_DATA_ULONG },
        { "cache allocs",       KSTAT_DATA_ULONG },
        { "cache frees",        KSTAT_DATA_ULONG },
        { "pushes at close",    KSTAT_DATA_ULONG }
};
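
/*
 * Note: the initializer above is positional; the entries must stay in
 * the same order as the in_* kstat_named_t fields of struct instats
 * (see sys/fs/ufs_inode.h), since ufs_inode_kstat_update() below
 * stores into those fields by name.
 */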

/* kstat data */
static kstat_t          *ufs_inode_kstat = NULL;

union ihead *ihead;     /* inode LRU cache, Chris Maltby */
kmutex_t *ih_lock;      /* protect inode cache hash table */
static int ino_hashlen = 4;     /* desired average hash chain length */
int inohsz;             /* number of buckets in the hash table */

kmutex_t        ufs_scan_lock;  /* stop racing multiple ufs_scan_inodes() */
kmutex_t        ufs_iuniqtime_lock; /* protect iuniqtime */
kmutex_t        ufsvfs_mutex;
struct ufsvfs   *oldufsvfslist, *ufsvfslist;

/*
 * time to wait after ufsvfsp->vfs_iotstamp before declaring that no
 * I/Os are going on.
 */
clock_t ufs_iowait;

/*
 * The threads that process idle inodes and free (deleted) inodes
 * have high water marks that are set in ufs_iinit().
 * These values can be no less than the minimum shown below.
 */
int     ufs_idle_max;   /* # of allowable idle inodes */
ulong_t ufs_inode_max;  /* hard limit of allowable idle inodes */
#define UFS_IDLE_MAX    (16)    /* min # of allowable idle inodes */

/*
 * Tunables for ufs write throttling.
 * These are validated in ufs_iinit() since improper settings
 * can lead to filesystem hangs.
 */
#define UFS_HW_DEFAULT  (16 * 1024 * 1024)
#define UFS_LW_DEFAULT  (8 * 1024 * 1024)
volatile int    ufs_HW = UFS_HW_DEFAULT;
volatile int    ufs_LW = UFS_LW_DEFAULT;
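
/*
 * A rough sketch of how these tunables are consumed (the actual logic
 * lives in the UFS write path; details may differ): a writer that
 * pushes an inode's outstanding async write bytes past ufs_HW sleeps
 * on the inode's i_wrcv condition variable, and completing writes
 * wake it once the backlog drains below ufs_LW:
 *
 *      mutex_enter(&ip->i_tlock);
 *      while (ip->i_writes > ufs_HW)
 *              cv_wait(&ip->i_wrcv, &ip->i_tlock);
 *      mutex_exit(&ip->i_tlock);
 */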

static void ihinit(void);
extern int hash2ints(int, int);

static int ufs_iget_internal(struct vfs *, ino_t, struct inode **,
    struct cred *, int);

/* ARGSUSED */
static int
ufs_inode_kstat_update(kstat_t *ksp, int rw)
{
        if (rw == KSTAT_WRITE)
                return (EACCES);

        ins.in_malloc.value.ul  = (ulong_t)kmem_cache_stat(inode_cache,
            "slab_alloc");
        ins.in_mfree.value.ul   = (ulong_t)kmem_cache_stat(inode_cache,
            "slab_free");
        ins.in_kcalloc.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
            "alloc");
        ins.in_kcfree.value.ul  = (ulong_t)kmem_cache_stat(inode_cache,
            "free");
        ins.in_size.value.ul    = (ulong_t)kmem_cache_stat(inode_cache,
            "buf_inuse");
        ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
            "buf_max");
        ins.in_misses.value.ul = ins.in_kcalloc.value.ul;

        return (0);
}
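
/*
 * The counters above are exported as the ufs "inode_cache" named
 * kstat; from userland they can typically be viewed with something
 * like `kstat -m ufs -n inode_cache' (exact usage depends on the
 * platform's kstat(1M) utility).
 */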

void
ufs_iinit(void)
{
        /*
         * Validate that ufs_HW > ufs_LW.
         * The default values for these two tunables have been increased.
         * There is now a range of ufs_HW values that was legal on
         * previous Solaris versions but no longer is.
         * Upgrading a machine which has an /etc/system setting for ufs_HW
         * in that range can lead to filesystem hangs unless the values
         * are checked here.
         */
        if (ufs_HW <= ufs_LW) {
                cmn_err(CE_WARN,
                    "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
                    ufs_HW, ufs_LW);
                ufs_LW = UFS_LW_DEFAULT;
                ufs_HW = UFS_HW_DEFAULT;
                cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
                    ufs_HW, ufs_LW);
        }

        /*
         * Adjust the tunable `ufs_ninode' to a reasonable value
         */
        if (ufs_ninode <= 0)
                ufs_ninode = ncsize;
        if (ufs_inode_max == 0)
                ufs_inode_max =
                    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct inode));
        if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) {
                cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld",
                    ufs_inode_max);
                ufs_ninode = ufs_inode_max;
        }
        /*
         * Wait till third call of ufs_update to declare that no I/Os are
         * going on. This allows deferred access times to be flushed to disk.
         */
        ufs_iowait = v.v_autoup * hz * 2;

        /*
         * idle thread runs when 25% of ufs_ninode entries are on the queue
         */
        if (ufs_idle_max == 0)
                ufs_idle_max = ufs_ninode >> 2;
        if (ufs_idle_max < UFS_IDLE_MAX)
                ufs_idle_max = UFS_IDLE_MAX;
        if (ufs_idle_max > ufs_ninode)
                ufs_idle_max = ufs_ninode;
        /*
         * This is really a misnomer, it is ufs_queue_init
         */
        ufs_thread_init(&ufs_idle_q, ufs_idle_max);
        ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL);

        /*
         * global hlock thread
         */
        ufs_thread_init(&ufs_hlock, 1);
        ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL);

        ihinit();
        qtinit();
        ins.in_maxsize.value.ul = ufs_ninode;
        if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs",
            KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t),
            KSTAT_FLAG_VIRTUAL)) != NULL) {
                ufs_inode_kstat->ks_data = (void *)&ins;
                ufs_inode_kstat->ks_update = ufs_inode_kstat_update;
                kstat_install(ufs_inode_kstat);
        }
        ufsfx_init();           /* fix-on-panic initialization */
        si_cache_init();
        ufs_directio_init();
        lufs_init();
        mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL);
}

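/*
 * kmem cache constructor/destructor pair for inode_cache.  The
 * constructor runs when the allocator populates a new slab object,
 * not on every kmem_cache_alloc(); state set up here (the vnode, the
 * locks and CVs) survives alloc/free cycles, which is why
 * ufs_alloc_inode() only reinitializes the per-incarnation fields.
 */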
/* ARGSUSED */
static int
ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
        struct inode *ip = buf;
        struct vnode *vp;

        vp = ip->i_vnode = vn_alloc(kmflags);
        if (vp == NULL) {
                return (-1);
        }
        vn_setops(vp, ufs_vnodeops);
        vp->v_data = ip;

        rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL);
        rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL);
        mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL);
        dnlc_dir_init(&ip->i_danchor);

        cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL);

        return (0);
}

/* ARGSUSED */
static void
ufs_inode_cache_destructor(void *buf, void *cdrarg)
{
        struct inode *ip = buf;
        struct vnode *vp;

        vp = ITOV(ip);

        rw_destroy(&ip->i_rwlock);
        rw_destroy(&ip->i_contents);
        mutex_destroy(&ip->i_tlock);
        if (vp->v_type == VDIR) {
                dnlc_dir_fini(&ip->i_danchor);
        }

        cv_destroy(&ip->i_wrcv);

        vn_free(vp);
}

/*
 * Initialize hash links for inodes
 * and build inode free list.
 */
void
ihinit(void)
{
        int i;
        union   ihead *ih = ihead;

        mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL);

        inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
        ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP);
        ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP);

        for (i = 0, ih = ihead; i < inohsz; i++,  ih++) {
                ih->ih_head[0] = ih;
                ih->ih_head[1] = ih;
                mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL);
        }
        inode_cache = kmem_cache_create("ufs_inode_cache",
            sizeof (struct inode), 0, ufs_inode_cache_constructor,
            ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
            NULL, NULL, 0);
}
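
/*
 * Because inohsz is rounded to a power of two above, a hash value can
 * be reduced to a bucket index with a cheap mask of (inohsz - 1)
 * rather than a modulo (presumably what the INOHASH() macro relies
 * on).  With the default ino_hashlen of 4, the table is sized for an
 * average chain length of about four entries when the cache holds
 * ufs_ninode inodes.
 */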

/*
 * Free an inode structure
 */
void
ufs_free_inode(struct inode *ip)
{
        vn_invalid(ITOV(ip));
        kmem_cache_free(inode_cache, ip);
}

/*
 * Allocate an inode structure
 */
struct inode *
ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino)
{
        struct inode *ip;
        vnode_t *vp;

        ip = kmem_cache_alloc(inode_cache, KM_SLEEP);
        /*
         * at this point we have a newly allocated inode
         */
        ip->i_freef = ip;
        ip->i_freeb = ip;
        ip->i_flag = IREF;
        ip->i_seq = 0xFF;    /* Unique initial value */
        ip->i_dev = ufsvfsp->vfs_dev;
        ip->i_ufsvfs = ufsvfsp;
        ip->i_devvp = ufsvfsp->vfs_devvp;
        ip->i_number = ino;
        ip->i_diroff = 0;
        ip->i_nextr = 0;
        ip->i_map = NULL;
        ip->i_rdev = 0;
        ip->i_writes = 0;
        ip->i_mode = 0;
        ip->i_delaylen = 0;
        ip->i_delayoff = 0;
        ip->i_nextrio = 0;
        ip->i_ufs_acl = NULL;
        ip->i_cflags = 0;
        ip->i_mapcnt = 0;
        ip->i_dquot = NULL;
        ip->i_cachedir = CD_ENABLED;
        ip->i_writer = NULL;

        /*
         * the vnode for this inode was allocated by the constructor
         */
        vp = ITOV(ip);
        vn_reinit(vp);
        if (ino == (ino_t)UFSROOTINO)
                vp->v_flag = VROOT;
        vp->v_vfsp = ufsvfsp->vfs_vfs;
        vn_exists(vp);
        return (ip);
}

/*
 * Look up an inode by device, inumber.  If it is in core (in the
 * inode structure), honor the locking protocol.  If it is not in
 * core, read it in from the specified device after freeing any pages.
 * In all cases, a pointer to a VN_HELD inode structure is returned.
 */
int
ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr)
{
        return (ufs_iget_internal(vfsp, ino, ipp, cr, 0));
}

/*
 * A version of ufs_iget which returns only allocated, linked inodes.
 * This is appropriate for any callers who do not expect a free inode.
 */
int
ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr)
{
        return (ufs_iget_internal(vfsp, ino, ipp, cr, 1));
}

/*
 * Set vnode attributes based on v_type; this should be called whenever
 * an inode's i_mode is changed.
 */
void
ufs_reset_vnode(vnode_t *vp)
{
        /*
         * an old DBE hack: a sticky, non-executable regular file
         * is treated as swap-like
         */
        if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
                vp->v_flag |= VSWAPLIKE;
        else
                vp->v_flag &= ~VSWAPLIKE;

        /*
         * for a regular file, we want to maintain the vnode's pages
         * sorted by clean/modified for faster syncing to disk
         */
        if (vp->v_type == VREG)
                vp->v_flag |= VMODSORT;
        else
                vp->v_flag &= ~VMODSORT;

        /*
         * Is this a hidden attribute directory?
         */
        if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR)
                vp->v_flag |= V_XATTRDIR;
        else
                vp->v_flag &= ~V_XATTRDIR;
}
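
/*
 * For example (illustrative): a regular file with mode 01644 (sticky
 * bit set, not executable) gets VSWAPLIKE and VMODSORT; an ordinary
 * directory gets neither; an extended attribute directory
 * additionally gets V_XATTRDIR.
 */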

/*
 * Shared implementation of ufs_iget and ufs_iget_alloced.  The 'validate'
 * flag is used to distinguish the two; when true, we validate that the inode
 * being retrieved looks like a linked and allocated inode.
 */
/* ARGSUSED */
static int
ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr, int validate)
{
        struct inode *ip, *sp;
        union ihead *ih;
        kmutex_t *ihm;
        struct buf *bp;
        struct dinode *dp;
        struct vnode *vp;
        extern vfs_t EIO_vfs;
        int error;
        int ftype;      /* XXX - Remove later on */
        dev_t vfs_dev;
        struct ufsvfs *ufsvfsp;
        struct fs *fs;
        int hno;
        daddr_t bno;
        ulong_t ioff;

        CPU_STATS_ADD_K(sys, ufsiget, 1);

        /*
         * Lookup inode in cache.
         */
        vfs_dev = vfsp->vfs_dev;
        hno = INOHASH(ino);
        ih = &ihead[hno];
        ihm = &ih_lock[hno];

again:
        mutex_enter(ihm);
        for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
                if (ino != ip->i_number || vfs_dev != ip->i_dev ||
                    (ip->i_flag & ISTALE))
                        continue;

                /*
                 * Found the interesting inode; hold it and drop the cache lock
                 */
                vp = ITOV(ip);  /* for locknest */
                VN_HOLD(vp);
                mutex_exit(ihm);
                rw_enter(&ip->i_contents, RW_READER);

                /*
                 * if necessary, remove from idle list
                 */
                if ((ip->i_flag & IREF) == 0) {
                        if (ufs_rmidle(ip))
                                VN_RELE(vp);
                }

                /*
                 * Could the inode be read from disk?
                 */
                if (ip->i_flag & ISTALE) {
                        rw_exit(&ip->i_contents);
                        VN_RELE(vp);
                        goto again;
                }

                ins.in_hits.value.ul++;
                *ipp = ip;

                /*
                 * Reset the vnode's attribute flags
                 */
                mutex_enter(&vp->v_lock);
                ufs_reset_vnode(vp);
                mutex_exit(&vp->v_lock);

                rw_exit(&ip->i_contents);

                return (0);
        }
        mutex_exit(ihm);

        /*
         * Inode was not in cache.
         *
         * Allocate a new entry
         */
        ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
        fs = ufsvfsp->vfs_fs;

        ip = ufs_alloc_inode(ufsvfsp, ino);
        vp = ITOV(ip);

        bno = fsbtodb(fs, itod(fs, ino));
        ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
        ip->i_doff = (offset_t)ioff + ldbtob(bno);

        /*
         * put a placeholder in the cache (if not already there)
         */
        mutex_enter(ihm);
        for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
                if (ino == sp->i_number && vfs_dev == sp->i_dev &&
                    ((sp->i_flag & ISTALE) == 0)) {
                        mutex_exit(ihm);
                        ufs_free_inode(ip);
                        goto again;
                }
        /*
         * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
         * here, but if we do, then shadow inode allocations panic the
         * system.  We don't have to hold vfs_dqrwlock for shadow inodes
         * and the ufs_iget() parameters don't tell us what we are getting
         * so we have no way of knowing this is a ufs_iget() call from
         * a ufs_ialloc() call for a shadow inode.
         */
        rw_enter(&ip->i_contents, RW_WRITER);
        insque(ip, ih);
        mutex_exit(ihm);
        /*
         * read the dinode
         */
        bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);

        /*
         * Check I/O errors
         */
        error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
        if (error) {
                brelse(bp);
                ip->i_flag |= ISTALE;   /* in case someone is looking it up */
                rw_exit(&ip->i_contents);
                vp->v_vfsp = &EIO_vfs;
                VN_RELE(vp);
                return (error);
        }
        /*
         * initialize the inode's dinode
         */
        dp = (struct dinode *)(ioff + bp->b_un.b_addr);
        ip->i_ic = dp->di_ic;   /* structure assignment */
        brelse(bp);

        /*
         * Maintain compatibility with Solaris 1.x UFS
         */
        if (ip->i_suid != UID_LONG)
                ip->i_uid = ip->i_suid;
        if (ip->i_sgid != GID_LONG)
                ip->i_gid = ip->i_sgid;

        ftype = ip->i_mode & IFMT;
        if (ftype == IFBLK || ftype == IFCHR) {
                dev_t dv;
                uint_t top16 = ip->i_ordev & 0xffff0000u;

                if (top16 == 0 || top16 == 0xffff0000u)
                        dv = expdev(ip->i_ordev);
                else
                        dv = expldev(ip->i_ordev);
                vp->v_rdev = ip->i_rdev = dv;
        }

        /*
         * if our caller only expects allocated inodes, verify that
         * this inode looks good; throw it out if it's bad.
         */
        if (validate) {
                if ((ftype == 0) || (ip->i_nlink <= 0)) {
                        ip->i_flag |= ISTALE;
                        rw_exit(&ip->i_contents);
                        vp->v_vfsp = &EIO_vfs;
                        VN_RELE(vp);
                        cmn_err(CE_NOTE,
                            "%s: unexpected free inode %d, run fsck(1M)%s",
                            fs->fs_fsmnt, (int)ino,
                            (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
                        return (EIO);
                }
        }

        /*
         * Finish initializing the vnode.  Shadow inodes need special
         * handling: IFTOVT() would produce a v_type of VNON, which is
         * not what we want, so set v_type to VREG explicitly in that
         * case.
         */
        if (ftype == IFSHAD) {
                vp->v_type = VREG;
        } else {
                vp->v_type = IFTOVT((mode_t)ip->i_mode);
        }

        ufs_reset_vnode(vp);

        /*
         * read the shadow
         */
        if (ftype != 0 && ip->i_shadow != 0) {
                if ((error = ufs_si_load(ip, cr)) != 0) {
                        ip->i_flag |= ISTALE;
                        ip->i_ufs_acl = NULL;
                        rw_exit(&ip->i_contents);
                        vp->v_vfsp = &EIO_vfs;
                        VN_RELE(vp);
                        return (error);
                }
        }

        /*
         * Only attach quota information if the inode has a type and if
         * that type is neither a shadow inode nor an attribute directory.
         */
        if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
            ((ip->i_mode & IFMT) != IFATTRDIR)) {
                ip->i_dquot = getinoquota(ip);
        }
        TRANS_MATA_IGET(ufsvfsp, ip);
        *ipp = ip;
        rw_exit(&ip->i_contents);

        return (0);
}
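
/*
 * A sketch of the typical caller pattern (error handling elided): the
 * returned inode is VN_HELD, so the caller is responsible for the
 * matching release.
 *
 *      struct inode *ip;
 *
 *      if (ufs_iget(vfsp, ino, &ip, cr) == 0) {
 *              ... use ip ...
 *              VN_RELE(ITOV(ip));
 *      }
 */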

/*
 * Vnode is no longer referenced, write the inode out
 * and if necessary, truncate and deallocate the file.
 */
void
ufs_iinactive(struct inode *ip)
{
        int             front;
        struct inode    *iq;
        struct inode    *hip;
        struct ufs_q    *uq;
        struct vnode    *vp = ITOV(ip);
        struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
        struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

        /*
         * Because the vnode type might have been changed,
         * the dnlc_dir_purge must be called unconditionally.
         */
        dnlc_dir_purge(&ip->i_danchor);

        /*
         * Get exclusive access to inode data.
         */
        rw_enter(&ip->i_contents, RW_WRITER);
        ASSERT(ip->i_flag & IREF);

        /*
         * Make sure no one reclaimed the inode before we put it on
         * the freelist or destroy it. We keep our 'hold' on the vnode
         * from vn_rele until we are ready to do something with the inode.
         *
         * Pageout may put a VN_HOLD/VN_RELE at anytime during this
         * operation via an async putpage, so we must make sure
         * we don't free/destroy the inode more than once. ufs_iget
         * may also put a VN_HOLD on the inode before it grabs
         * the i_contents lock. This is done so we don't free
         * an inode that a thread is waiting on.
         */
        mutex_enter(&vp->v_lock);

        if (vp->v_count > 1) {
                VN_RELE_LOCKED(vp);
                mutex_exit(&vp->v_lock);
                rw_exit(&ip->i_contents);
                return;
        }
        mutex_exit(&vp->v_lock);

        /*
         * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
         * and clean.  It can be safely destroyed (cyf).
         */
        if (ip->i_ufsvfs == NULL) {
                rw_exit(&ip->i_contents);
                ufs_si_del(ip);
                ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
                ufs_free_inode(ip);
                return;
        }

        /*
         * queue idle inode to appropriate thread. Will check v_count == 1
         * prior to putting this on the appropriate queue.
         * Stale inodes will be unhashed and freed by the ufs idle thread
         * in ufs_idle_free()
         */
        front = 1;
        if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
            ip->i_mode && ip->i_nlink <= 0) {
                /*
                 * Mark the i_flag to indicate that inode is being deleted.
                 * This flag will be cleared when the deletion is complete.
                 * This prevents nfs from sneaking in via ufs_vget() while
                 * the delete is in progress (bugid 1242481).
                 */
                ip->i_flag |= IDEL;

                /*
                 * NOIDEL means that deletes are not allowed at this time;
                 * whoever resets NOIDEL will also send this inode back
                 * through ufs_iinactive.  IREF remains set.
                 */
                if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
                        mutex_enter(&vp->v_lock);
                        VN_RELE_LOCKED(vp);
                        mutex_exit(&vp->v_lock);
                        rw_exit(&ip->i_contents);
                        return;
                }
                if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
                        rw_exit(&ip->i_contents);
                        ufs_delete(ip->i_ufsvfs, ip, 0);
                        return;
                }

                /* queue to delete thread; IREF remains set */
                ins.in_qfree.value.ul++;
                uq = &ip->i_ufsvfs->vfs_delete;

                mutex_enter(&uq->uq_mutex);

                /* add to q */
                if ((iq = uq->uq_ihead) != 0) {
                        ip->i_freef = iq;
                        ip->i_freeb = iq->i_freeb;
                        iq->i_freeb->i_freef = ip;
                        iq->i_freeb = ip;
                        if (front)
                                uq->uq_ihead = ip;
                } else {
                        uq->uq_ihead = ip;
                        ip->i_freef = ip;
                        ip->i_freeb = ip;
                }

                delq_info->delq_unreclaimed_files += 1;
                delq_info->delq_unreclaimed_blocks += ip->i_blocks;
        } else {
                /*
                 * queue to idle thread; check the v_count == 1 again.
                 */
                mutex_enter(&vp->v_lock);
                if (vp->v_count > 1) {
                        VN_RELE_LOCKED(vp);
                        mutex_exit(&vp->v_lock);
                        rw_exit(&ip->i_contents);
                        return;
                }
                mutex_exit(&vp->v_lock);
                uq = &ufs_idle_q;

                /*
                 * useful iff it has pages or is a fastsymlink; otherwise junk
                 */
                mutex_enter(&uq->uq_mutex);

                /* clear IREF means `on idle list' */
                ip->i_flag &= ~(IREF | IDIRECTIO);

                if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
                        ins.in_frback.value.ul++;
                        hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
                        ufs_nuseful_iq++;
                } else {
                        ins.in_frfront.value.ul++;
                        hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
                        ip->i_flag |= IJUNKIQ;
                        ufs_njunk_iq++;
                }
                ip->i_freef = hip;
                ip->i_freeb = hip->i_freeb;
                hip->i_freeb->i_freef = ip;
                hip->i_freeb = ip;
        }

        /* wakeup thread(s) if q is overfull */
        if (++uq->uq_ne == uq->uq_lowat)
                cv_broadcast(&uq->uq_cv);

        /* all done, release the q and inode */
        mutex_exit(&uq->uq_mutex);
        rw_exit(&ip->i_contents);
}

/*
 * Check accessed and update flags on an inode structure.
 * If any are on, update the inode with the (unique) current time.
 * If waitfor is given, ensure I/O ordering by waiting for the write
 * to complete.
 */
void
ufs_iupdat(struct inode *ip, int waitfor)
{
        struct buf      *bp;
        struct fs       *fp;
        struct dinode   *dp;
        struct ufsvfs   *ufsvfsp        = ip->i_ufsvfs;
        int             i;
        int             do_trans_times;
        ushort_t        flag;
        o_uid_t         suid;
        o_gid_t         sgid;

        /*
         * This function is now safe to be called with either the reader
         * or writer i_contents lock.
         */
        ASSERT(RW_LOCK_HELD(&ip->i_contents));

        /*
         * Return if file system has been forcibly umounted.
         */
        if (ufsvfsp == NULL)
                return;

        flag = ip->i_flag;      /* Atomic read */
        /*
         * We'd better not update the disk inode from a stale inode.
         */
        if (flag & ISTALE)
                return;

        fp = ip->i_fs;

        if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
                if (fp->fs_ronly) {
                        mutex_enter(&ip->i_tlock);
                        ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
                        mutex_exit(&ip->i_tlock);
                        return;
                }
                /*
                 * fs is active while metadata is being written
                 */
                mutex_enter(&ufsvfsp->vfs_lock);
                ufs_notclean(ufsvfsp);
                /*
                 * get the dinode
                 */
                bp = UFS_BREAD(ufsvfsp, ip->i_dev,
                    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
                    (int)fp->fs_bsize);
                if (bp->b_flags & B_ERROR) {
                        mutex_enter(&ip->i_tlock);
                        ip->i_flag &=
                            ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
                        mutex_exit(&ip->i_tlock);
                        brelse(bp);
                        return;
                }
                /*
                 * munge inode fields
                 */
                mutex_enter(&ip->i_tlock);
                ITIMES_NOLOCK(ip);
                do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
                ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
                mutex_exit(&ip->i_tlock);

                /*
                 * For reads and concurrent re-writes, no deltas were
                 * entered for the access time changes - do it now.
                 */
                if (do_trans_times) {
                        TRANS_INODE_TIMES(ufsvfsp, ip);
                }

                /*
                 * For SunOS 5.0->5.4, these lines below read:
                 *
                 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
                 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
                 *
                 * where MAXUID was set to 60002.  This was incorrect -
                 * the uids should have been constrained to what fitted into
                 * a 16-bit word.
                 *
                 * This means that files from 4.x filesystems that have an
                 * i_suid field larger than 60002 will have that field
                 * changed to 65535.
                 *
                 * Security note: 4.x UFS could never create a i_suid of
                 * UID_LONG since that would've corresponded to -1.
                 */
                suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
                    UID_LONG : ip->i_uid;
                sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
                    GID_LONG : ip->i_gid;

                if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
                        ip->i_suid = suid;
                        ip->i_sgid = sgid;
                        TRANS_INODE(ufsvfsp, ip);
                }

                if ((ip->i_mode & IFMT) == IFBLK ||
                    (ip->i_mode & IFMT) == IFCHR) {
                        dev_t d = ip->i_rdev;
                        dev32_t dev32;

                        /*
                         * load first direct block only if special device
                         */
                        if (!cmpldev(&dev32, d)) {
                                /*
                                 * We panic here because there's "no way"
                                 * we should have been able to create a large
                                 * inode with a large dev_t.  Earlier layers
                                 * should've caught this.
                                 */
                                panic("ip %p: i_rdev too big", (void *)ip);
                        }

                        if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
                                ip->i_ordev = dev32; /* can't use old fmt. */
                        } else {
                                ip->i_ordev = cmpdev(d);
                        }
                }

                /*
                 * copy inode to dinode (zero fastsymlnk in dinode)
                 */
                dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
                dp->di_ic = ip->i_ic;   /* structure assignment */
                if (flag & IFASTSYMLNK) {
                        for (i = 1; i < NDADDR; i++)
                                dp->di_db[i] = 0;
                        for (i = 0; i < NIADDR; i++)
                                dp->di_ib[i] = 0;
                }
                if (TRANS_ISTRANS(ufsvfsp)) {
                        /*
                         * Pass only a sector size buffer containing
                         * the inode, otherwise when the buffer is copied
                         * into a cached roll buffer then too much memory
                         * gets consumed if 8KB inode buffers are passed.
                         */
                        TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
                            sizeof (struct dinode),
                            (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
                            DEV_BSIZE);

                        brelse(bp);
                } else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
                        UFS_BRWRITE(ufsvfsp, bp);

                        /*
                         * Synchronous write has guaranteed that inode
                         * has been written on disk so clear the flag
                         */
                        mutex_enter(&ip->i_tlock);
                        ip->i_flag &= ~IBDWRITE;
                        mutex_exit(&ip->i_tlock);
                } else {
                        bdrwrite(bp);

                        /*
                         * This write hasn't guaranteed that inode has been
                         * written on the disk.
                         * Since all update flags on the inode are cleared,
                         * we must remember the condition in case the inode
                         * is to be updated synchronously later (e.g.
                         * fsync()/fdatasync()) and the inode has not been
                         * modified yet.
                         */
                        mutex_enter(&ip->i_tlock);
                        ip->i_flag |= IBDWRITE;
                        mutex_exit(&ip->i_tlock);
                }
        } else {
                /*
                 * In case previous inode update was done asynchronously
                 * (IBDWRITE) and this inode update request wants guaranteed
                 * (synchronous) disk update, flush the inode.
                 */
                if (waitfor && (flag & IBDWRITE)) {
                        blkflush(ip->i_dev,
                            (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
                        mutex_enter(&ip->i_tlock);
                        ip->i_flag &= ~IBDWRITE;
                        mutex_exit(&ip->i_tlock);
                }
        }
}
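
/*
 * Callers that need the on-disk inode to be current before returning
 * (fsync(), for example) pass a nonzero waitfor; asynchronous callers
 * pass 0 and rely on the delayed write plus the IBDWRITE bookkeeping
 * above.
 */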

#define SINGLE  0       /* index of single indirect block */
#define DOUBLE  1       /* index of double indirect block */
#define TRIPLE  2       /* index of triple indirect block */
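
/*
 * For scale (illustrative, with an 8 KB fs_bsize): NINDIR(fs) is
 * 8192 / sizeof (daddr32_t) == 2048, so a single indirect block maps
 * 2048 data blocks, a double indirect 2048^2, and a triple indirect
 * 2048^3; actual values depend on the filesystem's block size.
 */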

/*
 * Release blocks associated with the inode ip and
 * stored in the indirect block bn.  Blocks are free'd
 * in LIFO order up to (but not including) lastbn.  If
 * level is greater than SINGLE, the block is an indirect
 * block and recursive calls to indirtrunc must be used to
 * cleanse other indirect blocks.
 *
 * N.B.: triple indirect blocks are untested.
 */
static long
indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
{
        int i;
        struct buf *bp, *copy;
        daddr32_t *bap;
        struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
        struct fs *fs = ufsvfsp->vfs_fs;
        daddr_t nb, last;
        long factor;
        int blocksreleased = 0, nblocks;

        ASSERT(RW_WRITE_HELD(&ip->i_contents));
        /*
         * Calculate index in current block of last
         * block to be kept.  -1 indicates the entire
         * block so we need not calculate the index.
         */
        factor = 1;
        for (i = SINGLE; i < level; i++)
                factor *= NINDIR(fs);
        last = lastbn;
        if (lastbn > 0)
                last /= factor;
        nblocks = btodb(fs->fs_bsize);
        /*
         * Get buffer of block pointers, zero those
         * entries corresponding to blocks to be free'd,
         * and update on disk copy first.
         * *Unless* the root pointer has been synchronously
         * written to disk.  If nothing points to this
         * indirect block then don't bother zero'ing and
         * writing it.
         */
        bp = UFS_BREAD(ufsvfsp,
            ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
        if (bp->b_flags & B_ERROR) {
                brelse(bp);
                return (0);
        }
        bap = bp->b_un.b_daddr;
        if ((flags & I_CHEAP) == 0) {
                uint_t  zb;

                zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));

                if (zb) {
                        /*
                         * push any data into the log before we zero it
                         */
                        if (bp->b_flags & B_DELWRI)
                                TRANS_LOG(ufsvfsp, (caddr_t)bap,
                                    ldbtob(bp->b_blkno), bp->b_bcount,
                                    bp->b_un.b_addr, bp->b_bcount);
                        copy = ngeteblk(fs->fs_bsize);
                        bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
                            (uint_t)fs->fs_bsize);
                        bzero((caddr_t)&bap[last + 1], zb);

                        TRANS_BUF(ufsvfsp,
                            (caddr_t)&bap[last + 1] - (caddr_t)bap,
                            zb, bp, DT_ABZERO);

                        UFS_BRWRITE(ufsvfsp, bp);
                        bp = copy, bap = bp->b_un.b_daddr;
                }
        } else {
                /* make sure write retries are also cleared */
                bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
                bp->b_flags |= B_STALE | B_AGE;
        }

        /*
         * Recursively free totally unused blocks.
         */
        flags |= I_CHEAP;
        for (i = NINDIR(fs) - 1; i > last; i--) {
                nb = bap[i];
                if (nb == 0)
                        continue;
                if (level > SINGLE) {
                        blocksreleased +=
                            indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
                        free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
                } else
                        free(ip, nb, (off_t)fs->fs_bsize, flags);
                blocksreleased += nblocks;
        }
        flags &= ~I_CHEAP;

        /*
         * Recursively free last partial block.
         */
        if (level > SINGLE && lastbn >= 0) {
                last = lastbn % factor;
                nb = bap[i];
                if (nb != 0)
                        blocksreleased +=
                            indirtrunc(ip, nb, last, level - 1, flags);
        }
        brelse(bp);
        return (blocksreleased);
}
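
/*
 * Worked example (illustrative, assuming NINDIR(fs) == 2048): a
 * DOUBLE-level call with lastbn == 5000 computes factor == 2048 and
 * last == 2, frees entries 2047 down to 3 in the loop above, then
 * recurses on bap[2] with lastbn == 5000 % 2048 == 904 to trim the
 * partially retained single indirect block.
 */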

/*
 * Truncate the inode ip to at most length size.
 * Free affected disk blocks -- the blocks of the
 * file are removed in reverse order.
 *
 * N.B.: triple indirect blocks are untested.
 */
static int i_genrand = 1234;
int
ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr)
{
        struct fs *fs = oip->i_fs;
        struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
        struct inode *ip;
        daddr_t lastblock;
        off_t bsize;
        int boff;
        daddr_t bn, lastiblock[NIADDR];
        int level;
        long nblocks, blocksreleased = 0;
        int i;
        ushort_t mode;
        struct inode tip;
        int err;
        u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
            (UFS_MAXOFFSET_T) : (MAXOFF32_T);

        /*
         * Shadow inodes do not need to hold the vfs_dqrwlock lock. Most
         * other uses need the reader lock. opendq() holds the writer lock.
         */
        ASSERT((oip->i_mode & IFMT) == IFSHAD ||
            RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
        ASSERT(RW_WRITE_HELD(&oip->i_contents));
        /*
         * We only allow truncation of regular files and directories
         * to arbitrary lengths here.  In addition, we allow symbolic
         * links to be truncated only to zero length.  Other inode
         * types cannot have their length set here.  Disk blocks are
         * being dealt with - especially device inodes where
         * ip->i_ordev is actually being stored in ip->i_db[0]!
         */
        TRANS_INODE(ufsvfsp, oip);
        mode = oip->i_mode & IFMT;
        if (flags & I_FREE) {
                i_genrand *= 16843009;  /* turns into shift and adds */
                i_genrand++;
                oip->i_gen += ((i_genrand + ddi_get_lbolt()) & 0xffff) + 1;
                oip->i_flag |= ICHG |IUPD;
                oip->i_seq++;
                if (length == oip->i_size)
                        return (0);
                flags |= I_CHEAP;
        }
        if (mode == IFIFO)
                return (0);
        if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
            !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
                return (EINVAL);
        if (length > maxoffset)
                return (EFBIG);
        if ((mode == IFDIR) || (mode == IFATTRDIR))
                flags |= I_DIR;
        if (mode == IFSHAD)
                flags |= I_SHAD;
        if (oip == ufsvfsp->vfs_qinod)
                flags |= I_QUOTA;
        if (length == oip->i_size) {
                /* update ctime and mtime to please POSIX tests */
                oip->i_flag |= ICHG |IUPD;
                oip->i_seq++;
                if (length == 0) {
                        /* nothing to cache so clear the flag */
                        oip->i_flag &= ~IFASTSYMLNK;
                }
                return (0);
        }
        /* wipe out fast symlink till next access */
        if (oip->i_flag & IFASTSYMLNK) {
                int j;

                ASSERT(ITOV(oip)->v_type == VLNK);

                oip->i_flag &= ~IFASTSYMLNK;

                for (j = 1; j < NDADDR; j++)
                        oip->i_db[j] = 0;
                for (j = 0; j < NIADDR; j++)
                        oip->i_ib[j] = 0;
        }
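
        /*
         * (A fast symlink keeps its target string in the inode's own
         * block-pointer area, which is why dropping IFASTSYMLNK above
         * clears i_db[]/i_ib[] rather than freeing disk blocks.)
         */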

        boff = (int)blkoff(fs, length);

        if (length > oip->i_size) {
                /*
                 * Trunc up case.  BMAPALLOC will ensure that the right blocks
                 * are allocated.  This includes extending the old frag to a
                 * full block (if needed) in addition to doing any work
                 * needed for allocating the last block.
                 */
                if (boff == 0)
                        err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr);
                else
                        err = BMAPALLOC(oip, length - 1, boff, cr);

                if (err == 0) {
                        /*
                         * Save old size and set inode's size now
                         * so that we don't cause too much of the
                         * file to be zero'd and pushed.
                         */
                        u_offset_t osize = oip->i_size;
                        oip->i_size  = length;
                        /*
                         * Make sure we zero out the remaining bytes of
                         * the page in case an mmap scribbled on it. We
                         * can't prevent an mmap from writing beyond EOF
                         * on the last page of a file.
                         */
                        if ((boff = (int)blkoff(fs, osize)) != 0) {
                                bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
                                    fs->fs_bsize : fragroundup(fs, boff);
                                pvn_vpzero(ITOV(oip), osize,
                                    (size_t)(bsize - boff));
                        }
                        oip->i_flag |= ICHG|IATTCHG;
                        oip->i_seq++;
                        ITIMES_NOLOCK(oip);
                        /*
                         * MAXOFF32_T is the old 2GB size limit.  If
                         * this operation caused a large file to be
                         * created, turn on the superblock flag
                         * and update the superblock, if the flag
                         * is not already on.
                         */
                        if ((length > (u_offset_t)MAXOFF32_T) &&
                            !(fs->fs_flags & FSLARGEFILES)) {
                                ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
                                mutex_enter(&ufsvfsp->vfs_lock);
                                fs->fs_flags |= FSLARGEFILES;
                                ufs_sbwrite(ufsvfsp);
                                mutex_exit(&ufsvfsp->vfs_lock);
                        }
                }

                return (err);
        }

        /*
         * Update the pages of the file.  If the file is not being
         * truncated to a block boundary, the contents of the
         * pages following the end of the file must be zero'ed
         * in case it ever becomes accessible again because
         * of subsequent file growth.
         */
        if (boff == 0) {
                (void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
                    B_INVAL | B_TRUNC, CRED());
        } else {
                /*
                 * Make sure that the last block is properly allocated.
                 * We only really have to do this if the last block is
                 * actually allocated since ufs_bmap will now handle the case
                 * of a fragment which has no block allocated.  Just to
                 * be sure, we do it now independently of current allocation.
                 */
                err = BMAPALLOC(oip, length - 1, boff, cr);
                if (err)
                        return (err);

                /*
                 * BMAPALLOC will call bmap_write which defers i_seq
                 * processing.  If the timestamps were changed, update
                 * i_seq before rdip drops i_contents or syncs the inode.
                 */
                if (oip->i_flag & (ICHG|IUPD))
                        oip->i_seq++;

                /*
                 * BugId 4069932
                 * Make sure that the relevant partial page appears in
                 * the v_pages list, so that pvn_vpzero() will do its
                 * job.  Since doing this correctly requires everything
                 * in rdip() except for the uiomove(), it's easier and
                 * safer to do the uiomove() rather than duplicate the
                 * rest of rdip() here.
                 *
                 * To get here, we know that length indicates a byte
                 * that is not the first byte of a block.  (length - 1)
                 * is the last actual byte known to exist.  Deduction
                 * shows it is in the same block as byte (length).
                 * Thus, this rdip() invocation should always succeed
                 * except in the face of i/o errors, and give us the
                 * block we care about.
                 *
                 * rdip() makes the same locking assertions and
                 * assumptions as we do.  We do not acquire any locks
                 * before calling it, so we have not changed the locking
                 * situation.  Finally, there do not appear to be any
                 * paths whereby rdip() ends up invoking us again.
                 * Thus, infinite recursion is avoided.
                 */
                {
                        uio_t uio;
                        iovec_t iov[1];
                        char buffer;

                        uio.uio_iov = iov;
                        uio.uio_iovcnt = 1;
                        uio.uio_loffset = length - 1;
                        uio.uio_resid = 1;
                        uio.uio_segflg = UIO_SYSSPACE;
                        uio.uio_extflg = UIO_COPY_CACHED;

                        iov[0].iov_base = &buffer;
                        iov[0].iov_len = 1;

                        err = rdip(oip, &uio, UIO_READ, NULL);
                        if (err)
                                return (err);
                }

                bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
                    fs->fs_bsize : fragroundup(fs, boff);
                pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
                /*
                 * Ensure full fs block is marked as dirty.
                 */
                (void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
                    ufs_putapage, B_INVAL | B_TRUNC, CRED());
        }

        /*
         * Calculate index into inode's block list of
         * last direct and indirect blocks (if any)
         * which we want to keep.  Lastblock is -1 when
         * the file is truncated to 0.
         */
        lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
        lastiblock[SINGLE] = lastblock - NDADDR;
        lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
        lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
        nblocks = btodb(fs->fs_bsize);
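
        /*
         * For example, truncating to length 0 yields lastblock == -1
         * and negative lastiblock[] entries, so every direct and
         * indirect block is released below.
         */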
1393 
1394         /*
1395          * Update file and block pointers
1396          * on disk before we start freeing blocks.
1397          * If we crash before free'ing blocks below,
1398          * the blocks will be returned to the free list.
1399          * lastiblock values are also normalized to -1
1400          * for calls to indirtrunc below.
1401          */
1402         tip = *oip;                     /* structure copy */
1403         ip = &tip;
1404 
1405         for (level = TRIPLE; level >= SINGLE; level--)
1406                 if (lastiblock[level] < 0) {
1407                         oip->i_ib[level] = 0;
1408                         lastiblock[level] = -1;
1409                 }
1410         for (i = NDADDR - 1; i > lastblock; i--) {
1411                 oip->i_db[i] = 0;
1412                 flags |= I_CHEAP;
1413         }
1414         oip->i_size = length;
1415         oip->i_flag |= ICHG|IUPD|IATTCHG;
1416         oip->i_seq++;
1417         if (!TRANS_ISTRANS(ufsvfsp))
1418                 ufs_iupdat(oip, I_SYNC);        /* do sync inode update */
1419 
1420         /*
1421          * Indirect blocks first.
1422          */
1423         for (level = TRIPLE; level >= SINGLE; level--) {
1424                 bn = ip->i_ib[level];
1425                 if (bn != 0) {
1426                         blocksreleased +=
1427                             indirtrunc(ip, bn, lastiblock[level], level, flags);
1428                         if (lastiblock[level] < 0) {
1429                                 ip->i_ib[level] = 0;
1430                                 free(ip, bn, (off_t)fs->fs_bsize,
1431                                     flags | I_IBLK);
1432                                 blocksreleased += nblocks;
1433                         }
1434                 }
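                     /*
                      * If the truncation point falls within this
                      * indirect level, every lower level and all of
                      * the direct blocks are retained in full, so the
                      * remaining freeing work can be skipped.
                      */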
1435                 if (lastiblock[level] >= 0)
1436                         goto done;
1437         }
1438 
1439         /*
1440          * All whole direct blocks or frags.
1441          */
1442         for (i = NDADDR - 1; i > lastblock; i--) {
1443                 bn = ip->i_db[i];
1444                 if (bn == 0)
1445                         continue;
1446                 ip->i_db[i] = 0;
1447                 bsize = (off_t)blksize(fs, ip, i);
1448                 free(ip, bn, bsize, flags);
1449                 blocksreleased += btodb(bsize);
1450         }
1451         if (lastblock < 0)
1452                 goto done;
1453 
1454         /*
1455          * Finally, look for a change in size of the
1456          * last direct block; release any frags.
1457          */
1458         bn = ip->i_db[lastblock];
1459         if (bn != 0) {
1460                 off_t oldspace, newspace;
1461 
1462                 /*
1463                  * Calculate amount of space we're giving
1464                  * back as old block size minus new block size.
1465                  */
1466                 oldspace = blksize(fs, ip, lastblock);
1467                 UFS_SET_ISIZE(length, ip);
1468                 newspace = blksize(fs, ip, lastblock);
1469                 if (newspace == 0) {
1470                         err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
1471                         return (err);
1472                 }
1473                 if (oldspace - newspace > 0) {
1474                         /*
1475                          * Block number of space to be free'd is
1476                          * the old block # plus the number of frags
1477                          * required for the storage we're keeping.
1478                          */
1479                         bn += numfrags(fs, newspace);
1480                         free(ip, bn, oldspace - newspace, flags);
1481                         blocksreleased += btodb(oldspace - newspace);
1482                 }
1483         }
1484 done:
1485 /* BEGIN PARANOIA */
1486         for (level = SINGLE; level <= TRIPLE; level++)
1487                 if (ip->i_ib[level] != oip->i_ib[level]) {
1488                         err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
1489                         return (err);
1490                 }
1491 
1492         for (i = 0; i < NDADDR; i++)
1493                 if (ip->i_db[i] != oip->i_db[i]) {
1494                         err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
1495                         return (err);
1496                 }
1497 /* END PARANOIA */
1498         oip->i_blocks -= blocksreleased;
1499 
1500         if (oip->i_blocks < 0) {          /* sanity */
1501                 cmn_err(CE_NOTE,
1502                     "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
1503                     fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
1504                     (int)oip->i_blocks);
1505                 oip->i_blocks = 0;
1506         }
1507         oip->i_flag |= ICHG|IATTCHG;
1508         oip->i_seq++;
1509         /* blocksreleased is >= zero, so this cannot fail */
1510         (void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL,
1511             (size_t *)NULL);
1512         return (0);
1513 }
1514 
1515 /*
1516  * Check mode permission on inode.  Mode is READ, WRITE or EXEC.
1517  * In the case of WRITE, the read-only status of the file system
1518  * is checked.  Depending on the calling user, the appropriate
1519  * mode bits are selected; privileges to override missing permission
1520  * bits are checked through secpolicy_vnode_access().
1521  * The i_contents lock must be held as reader here to prevent racing with
1522  * the acl subsystem removing/setting/changing acls on this inode.
1523  * The caller indicates via the 'dolock' argument whether the i_contents
1524  * lock must be acquired here or is already held.
1525  */
1526 int
1527 ufs_iaccess(struct inode  *ip, int mode, struct cred *cr, int dolock)
1528 {
1529         int shift = 0;
1530         int ret = 0;
1531 
1532         if (dolock)
1533                 rw_enter(&ip->i_contents, RW_READER);
1534         ASSERT(RW_LOCK_HELD(&ip->i_contents));
1535 
1536         if (mode & IWRITE) {
1537                 /*
1538                  * Disallow write attempts on read-only
1539                  * file systems, unless the file is a block
1540                  * or character device or a FIFO.
1541                  */
1542                 if (ip->i_fs->fs_ronly != 0) {
1543                         if ((ip->i_mode & IFMT) != IFCHR &&
1544                             (ip->i_mode & IFMT) != IFBLK &&
1545                             (ip->i_mode & IFMT) != IFIFO) {
1546                                 ret = EROFS;
1547                                 goto out;
1548                         }
1549                 }
1550         }
1551         /*
1552          * If there is an acl, check the acl and return.
1553          */
1554         if (ip->i_ufs_acl && ip->i_ufs_acl->aowner) {
1555                 ret = ufs_acl_access(ip, mode, cr);
1556                 goto out;
1557         }
1558 
1559         /*
1560          * Access check is based on only one of owner, group, public.
1561          * If not owner, then check group.
1562          * If not a member of the group, then check public access.
1563          */
1564         if (crgetuid(cr) != ip->i_uid) {
1565                 shift += 3;
1566                 if (!groupmember((uid_t)ip->i_gid, cr))
1567                         shift += 3;
1568         }
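             /*
              * Illustrative example (mode value assumed): with
              * i_mode 0754, a caller who is not the owner but is a
              * member of the file's group ends up with shift = 3, so
              * i_mode << shift moves the group bits (5, i.e. r-x) into
              * the owner position where secpolicy_vnode_access2()
              * compares them against the requested mode.
              */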
1569 
1570         /* test missing privilege bits */
1571         ret = secpolicy_vnode_access2(cr, ITOV(ip), ip->i_uid,
1572             ip->i_mode << shift, mode);
1573 out:
1574         if (dolock)
1575                 rw_exit(&ip->i_contents);
1576         return (ret);
1577 }
1578 
1579 /*
1580  * if necessary, remove an inode from the free list
1581  *      i_contents is held except at unmount
1582  *
1583  * Return 1 if the inode is taken off of the ufs_idle_q,
1584  * and the caller is expected to call VN_RELE.
1585  *
1586  * Return 0 otherwise.
1587  */
1588 int
1589 ufs_rmidle(struct inode *ip)
1590 {
1591         int rval = 0;
1592 
1593         mutex_enter(&ip->i_tlock);
1594         if ((ip->i_flag & IREF) == 0) {
1595                 mutex_enter(&ufs_idle_q.uq_mutex);
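                     /*
                      * Unlink the inode from the idle queue's doubly
                      * linked free list, then point i_freef/i_freeb
                      * back at the inode itself to mark it as being
                      * off the list.
                      */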
1596                 ip->i_freef->i_freeb = ip->i_freeb;
1597                 ip->i_freeb->i_freef = ip->i_freef;
1598                 ip->i_freef = ip;
1599                 ip->i_freeb = ip;
1600                 ip->i_flag |= IREF;
1601                 ufs_idle_q.uq_ne--;
1602                 if (ip->i_flag & IJUNKIQ) {
1603                         ufs_njunk_iq--;
1604                         ip->i_flag &= ~IJUNKIQ;
1605                 } else {
1606                         ufs_nuseful_iq--;
1607                 }
1608                 mutex_exit(&ufs_idle_q.uq_mutex);
1609                 rval = 1;
1610         }
1611         mutex_exit(&ip->i_tlock);
1612         return (rval);
1613 }
1614 
1615 /*
1616  * scan the hash of inodes and call func with the inode locked
1617  */
1618 int
1619 ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
1620     struct ufsvfs *ufsvfsp)
1621 {
1622         struct inode            *ip;            /* current inode */
1623         struct inode            *lip = NULL;    /* last/previous inode */
1624         union ihead             *ih;            /* current hash chain */
1625         int                     error, i;
1626         int                     saverror = 0;
1627         int                     lip_held;       /* lip needs a VN_RELE() */
1628 
1629         /*
1630          * If ufsvfsp is NULL, then our caller should be holding
1631          * ufs_scan_lock to avoid conflicts between ufs_unmount() and
1632          * ufs_update().  Otherwise, to avoid false-positives in
1633          * ufs_unmount()'s v_count-based EBUSY check, we only hold
1634          * those inodes that are in the file system our caller cares
1635          * about.
1636          *
1637          * We know that ip is a valid inode in the hash chain (and thus
1638          * we can trust i_ufsvfs) because the inode we chained from
1639          * (lip) is still in the hash chain.  This is true because either:
1640          *
1641          * 1. We did not drop the hash chain lock since the last
1642          *    iteration (because we were not interested in the last inode),
1643          * or
1644          * 2. We maintained a hold on the last inode while we
1645          *    were processing it, so it could not be removed
1646          *    from the hash chain.
1647          *
1648          * The whole reason we're dropping and re-grabbing the chain
1649          * lock on every inode is so that we don't present a major
1650          * choke point on throughput, particularly when we've been
1651          * called on behalf of fsflush.
1652          */
1653 
1654         for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
1655                 mutex_enter(&ih_lock[i]);
1656                 for (ip = ih->ih_chain[0], lip_held = 0;
1657                     ip != (struct inode *)ih;
1658                     ip = lip->i_forw) {
1659 
1660                         ins.in_scan.value.ul++;
1661 
1662                         /*
1663                          * Undo the previous iteration's VN_HOLD(), but
1664                          * only if one was done.
1665                          */
1666                         if (lip_held)
1667                                 VN_RELE(ITOV(lip));
1668 
1669                         lip = ip;
1670                         if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
1671                                 /*
1672                                  * We're not processing all inodes, and
1673                                  * this inode is not in the filesystem of
1674                                  * interest, so skip it.  No need to do a
1675                                  * VN_HOLD() since we're not dropping the
1676                                  * hash chain lock until after we've
1677                                  * done the i_forw traversal above.
1678                                  */
1679                                 lip_held = 0;
1680                                 continue;
1681                         }
1682                         VN_HOLD(ITOV(ip));
1683                         lip_held = 1;
1684                         mutex_exit(&ih_lock[i]);
1685 
1686                         /*
1687                          * Acquire the contents lock as writer to make
1688                          * sure that the inode has been initialized in
1689                          * the cache or removed from the idle list by
1690                          * ufs_iget().  This works because ufs_iget()
1691                          * acquires the contents lock before putting
1692                          * the inode into the cache.  If we can lock
1693                          * it, then ufs_iget() is done with it.
1694                          */
1695 
1696                         if (rwtry) {
1697                                 if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
1698                                         mutex_enter(&ih_lock[i]);
1699                                         continue;
1700                                 }
1701                         } else {
1702                                 rw_enter(&ip->i_contents, RW_WRITER);
1703                         }
1704 
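                             /*
                              * Drop the lock right away; taking it at
                              * all served only as a barrier against a
                              * concurrent ufs_iget() still initializing
                              * this inode.
                              */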
1705                         rw_exit(&ip->i_contents);
1706 
1707                         /*
1708                          * ISTALE means the inode couldn't be read.
1709                          *
1710                          * We don't have to hold the i_contents lock
1711                          * for this check for a couple of
1712                          * reasons. First, if ISTALE is set then the
1713                          * flag cannot be cleared until the inode is
1714                          * removed from the cache and that cannot
1715                          * happen until after we VN_RELE() it.
1716                          * Second, if ISTALE is not set, then the
1717                          * inode is in the cache and does not need to
1718                          * be read from disk so ISTALE cannot be set
1719                          * while we are not looking.
1720                          */
1721                         if ((ip->i_flag & ISTALE) == 0) {
1722                                 if ((error = (*func)(ip, arg)) != 0)
1723                                         saverror = error;
1724                         }
1725 
1726                         mutex_enter(&ih_lock[i]);
1727                 }
1728                 if (lip_held)
1729                         VN_RELE(ITOV(lip));
1730                 mutex_exit(&ih_lock[i]);
1731         }
1732         return (saverror);
1733 }
1734 
1735 /*
1736  * Mark inode with the current time, plus a unique increment.
1737  *
1738  * Since we only keep 32-bit time on disk, if UFS is still alive
1739  * beyond 2038, filesystem times will simply stick at the last
1740  * possible second of 32-bit time. Not ideal, but probably better
1741  * than going into the remote past, or confusing applications with
1742  * negative time.
1743  */
1744 void
1745 ufs_imark(struct inode *ip)
1746 {
1747         timestruc_t now;
1748         int32_t usec, nsec;
1749 
1750         /*
1751          * The update of i_seq may have been deferred; increase i_seq here
1752          * to make sure it is in sync with the timestamps.
1753          */
1754         if (ip->i_flag & ISEQ) {
1755                 ASSERT(ip->i_flag & (IUPD|ICHG));
1756                 ip->i_seq++;
1757                 ip->i_flag &= ~ISEQ;
1758         }
1759 
1760         gethrestime(&now);
1761 
1762         /*
1763          * Fast algorithm to convert nsec to usec -- see hrt2ts()
1764          * in common/os/timers.c for a full description.
1765          */
1766         nsec = now.tv_nsec;
1767         usec = nsec + (nsec >> 2);
1768         usec = nsec + (usec >> 1);
1769         usec = nsec + (usec >> 2);
1770         usec = nsec + (usec >> 4);
1771         usec = nsec - (usec >> 3);
1772         usec = nsec + (usec >> 2);
1773         usec = nsec + (usec >> 3);
1774         usec = nsec + (usec >> 4);
1775         usec = nsec + (usec >> 1);
1776         usec = nsec + (usec >> 6);
1777         usec = usec >> 10;
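             /*
              * The shift/add sequence above converges on
              * usec ~= nsec * 1.024, so the final right shift by 10
              * (a divide by 1024) yields approximately nsec / 1000
              * without a hardware divide.
              */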
1778 
1779         mutex_enter(&ufs_iuniqtime_lock);
1780         if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
1781             usec > iuniqtime.tv_usec) {
1782                 if (now.tv_sec < TIME32_MAX) {
1783                         iuniqtime.tv_sec = (time32_t)now.tv_sec;
1784                         iuniqtime.tv_usec = usec;
1785                 }
1786         } else {
1787                 if (iuniqtime.tv_sec < TIME32_MAX) {
1788                         iuniqtime.tv_usec++;
1789                         /* Check for usec overflow */
1790                         if (iuniqtime.tv_usec >= MICROSEC) {
1791                                 iuniqtime.tv_sec++;
1792                                 iuniqtime.tv_usec = 0;
1793                         }
1794                 }
1795         }
1796 
1797         if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
1798                 ip->i_atime = iuniqtime;
1799         }
1800         if (ip->i_flag & IUPD) {
1801                 ip->i_mtime = iuniqtime;
1802                 ip->i_flag |= IMODTIME;
1803         }
1804         if (ip->i_flag & ICHG) {
1805                 ip->i_diroff = 0;
1806                 ip->i_ctime = iuniqtime;
1807         }
1808         mutex_exit(&ufs_iuniqtime_lock);
1809 }
1810 
1811 /*
1812  * Update timestamps in inode.
1813  */
1814 void
1815 ufs_itimes_nolock(struct inode *ip)
1816 {
1817 
1818         /*
1819          * If noatime is set and the inode access time is the only field
1820          * that must be changed, exit immediately.
1821          */
1822         if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) &&
1823             (ip->i_ufsvfs->vfs_noatime)) {
1824                 return;
1825         }
1826 
1827         if (ip->i_flag & (IUPD|IACC|ICHG)) {
1828                 if (ip->i_flag & ICHG)
1829                         ip->i_flag |= IMOD;
1830                 else
1831                         ip->i_flag |= IMODACC;
1832                 ufs_imark(ip);
1833                 ip->i_flag &= ~(IACC|IUPD|ICHG);
1834         }
1835 }