Print this page
    
re #13613 rb4516 Tunables need volatile keyword
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/ufs/ufs_inode.c
          +++ new/usr/src/uts/common/fs/ufs/ufs_inode.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  
    | 
      ↓ open down ↓ | 
    12 lines elided | 
    
      ↑ open up ↑ | 
  
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
       23 + * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  23   24   * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
  24   25   */
  25   26  
  26   27  /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  27   28  /*        All Rights Reserved   */
  28   29  
  29   30  /*
  30   31   * University Copyright- Copyright (c) 1982, 1986, 1988
  31   32   * The Regents of the University of California
  32   33   * All Rights Reserved
  33   34   *
  34   35   * University Acknowledgment- Portions of this document are derived from
  35   36   * software developed by the University of California, Berkeley, and its
  36   37   * contributors.
  37   38   */
  38   39  
  39   40  #include <sys/types.h>
  40   41  #include <sys/t_lock.h>
  41   42  #include <sys/param.h>
  42   43  #include <sys/systm.h>
  43   44  #include <sys/uio.h>
  44   45  #include <sys/bitmap.h>
  45   46  #include <sys/signal.h>
  46   47  #include <sys/cred.h>
  47   48  #include <sys/user.h>
  48   49  #include <sys/vfs.h>
  49   50  #include <sys/stat.h>
  50   51  #include <sys/vnode.h>
  51   52  #include <sys/buf.h>
  52   53  #include <sys/proc.h>
  53   54  #include <sys/disp.h>
  54   55  #include <sys/dnlc.h>
  55   56  #include <sys/mode.h>
  56   57  #include <sys/cmn_err.h>
  57   58  #include <sys/kstat.h>
  58   59  #include <sys/acl.h>
  59   60  #include <sys/var.h>
  60   61  #include <sys/fs/ufs_inode.h>
  61   62  #include <sys/fs/ufs_fs.h>
  62   63  #include <sys/fs/ufs_trans.h>
  63   64  #include <sys/fs/ufs_acl.h>
  64   65  #include <sys/fs/ufs_bio.h>
  65   66  #include <sys/fs/ufs_quota.h>
  66   67  #include <sys/fs/ufs_log.h>
  67   68  #include <vm/hat.h>
  68   69  #include <vm/as.h>
  69   70  #include <vm/pvn.h>
  70   71  #include <vm/seg.h>
  71   72  #include <sys/swap.h>
  72   73  #include <sys/cpuvar.h>
  73   74  #include <sys/sysmacros.h>
  74   75  #include <sys/errno.h>
  75   76  #include <sys/kmem.h>
  76   77  #include <sys/debug.h>
  77   78  #include <fs/fs_subr.h>
  78   79  #include <sys/policy.h>
  79   80  
  80   81  struct kmem_cache *inode_cache;         /* cache of free inodes */
  81   82  
  82   83  /* UFS Inode Cache Stats -- Not protected */
  83   84  struct  instats ins = {
  84   85          { "size",               KSTAT_DATA_ULONG },
  85   86          { "maxsize",            KSTAT_DATA_ULONG },
  86   87          { "hits",               KSTAT_DATA_ULONG },
  87   88          { "misses",             KSTAT_DATA_ULONG },
  88   89          { "kmem allocs",        KSTAT_DATA_ULONG },
  89   90          { "kmem frees",         KSTAT_DATA_ULONG },
  90   91          { "maxsize reached",    KSTAT_DATA_ULONG },
  91   92          { "puts at frontlist",  KSTAT_DATA_ULONG },
  92   93          { "puts at backlist",   KSTAT_DATA_ULONG },
  93   94          { "queues to free",     KSTAT_DATA_ULONG },
  94   95          { "scans",              KSTAT_DATA_ULONG },
  95   96          { "thread idles",       KSTAT_DATA_ULONG },
  96   97          { "lookup idles",       KSTAT_DATA_ULONG },
  97   98          { "vget idles",         KSTAT_DATA_ULONG },
  98   99          { "cache allocs",       KSTAT_DATA_ULONG },
  99  100          { "cache frees",        KSTAT_DATA_ULONG },
 100  101          { "pushes at close",    KSTAT_DATA_ULONG }
 101  102  };
 102  103  
 103  104  /* kstat data */
 104  105  static kstat_t          *ufs_inode_kstat = NULL;
 105  106  
 106  107  union ihead *ihead;     /* inode LRU cache, Chris Maltby */
 107  108  kmutex_t *ih_lock;      /* protect inode cache hash table */
 108  109  static int ino_hashlen = 4;     /* desired average hash chain length */
 109  110  int inohsz;             /* number of buckets in the hash table */
 110  111  
 111  112  kmutex_t        ufs_scan_lock;  /* stop racing multiple ufs_scan_inodes() */
 112  113  kmutex_t        ufs_iuniqtime_lock; /* protect iuniqtime */
 113  114  kmutex_t        ufsvfs_mutex;
 114  115  struct ufsvfs   *oldufsvfslist, *ufsvfslist;
 115  116  
 116  117  /*
 117  118   * time to wait after ufsvfsp->vfs_iotstamp before declaring that no
 118  119   * I/Os are going on.
 119  120   */
 120  121  clock_t ufs_iowait;
 121  122  
 122  123  /*
 123  124   * the threads that process idle inodes and free (deleted) inodes
 124  125   * have high water marks that are set in ufsinit().
 125  126   * These values but can be no less then the minimum shown below
 126  127   */
 127  128  int     ufs_idle_max;   /* # of allowable idle inodes */
  
    | 
      ↓ open down ↓ | 
    95 lines elided | 
    
      ↑ open up ↑ | 
  
 128  129  ulong_t ufs_inode_max;  /* hard limit of allowable idle inodes */
 129  130  #define UFS_IDLE_MAX    (16)    /* min # of allowable idle inodes */
 130  131  
 131  132  /*
 132  133   * Tunables for ufs write throttling.
 133  134   * These are validated in ufs_iinit() since improper settings
 134  135   * can lead to filesystem hangs.
 135  136   */
 136  137  #define UFS_HW_DEFAULT  (16 * 1024 * 1024)
 137  138  #define UFS_LW_DEFAULT  (8 * 1024 * 1024)
 138      -int     ufs_HW = UFS_HW_DEFAULT;
 139      -int     ufs_LW = UFS_LW_DEFAULT;
      139 +volatile int    ufs_HW = UFS_HW_DEFAULT;
      140 +volatile int    ufs_LW = UFS_LW_DEFAULT;
 140  141  
 141  142  static void ihinit(void);
 142  143  extern int hash2ints(int, int);
 143  144  
 144  145  static int ufs_iget_internal(struct vfs *, ino_t, struct inode **,
 145  146      struct cred *, int);
 146  147  
 147  148  /* ARGSUSED */
 148  149  static int
 149  150  ufs_inode_kstat_update(kstat_t *ksp, int rw)
 150  151  {
 151  152          if (rw == KSTAT_WRITE)
 152  153                  return (EACCES);
 153  154  
 154  155          ins.in_malloc.value.ul  = (ulong_t)kmem_cache_stat(inode_cache,
 155  156              "slab_alloc");
 156  157          ins.in_mfree.value.ul   = (ulong_t)kmem_cache_stat(inode_cache,
 157  158              "slab_free");
 158  159          ins.in_kcalloc.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
 159  160              "alloc");
 160  161          ins.in_kcfree.value.ul  = (ulong_t)kmem_cache_stat(inode_cache,
 161  162              "free");
 162  163          ins.in_size.value.ul    = (ulong_t)kmem_cache_stat(inode_cache,
 163  164              "buf_inuse");
 164  165          ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
 165  166              "buf_max");
 166  167          ins.in_misses.value.ul = ins.in_kcalloc.value.ul;
 167  168  
 168  169          return (0);
 169  170  }
 170  171  
 171  172  void
 172  173  ufs_iinit(void)
 173  174  {
 174  175          /*
 175  176           * Validate that ufs_HW > ufs_LW.
 176  177           * The default values for these two tunables have been increased.
 177  178           * There is now a range of values for ufs_HW that used to be
 178  179           * legal on previous Solaris versions but no longer is now.
 179  180           * Upgrading a machine which has an /etc/system setting for ufs_HW
 180  181           * from that range can lead to filesystem hangs unless the values
 181  182           * are checked here.
 182  183           */
 183  184          if (ufs_HW <= ufs_LW) {
 184  185                  cmn_err(CE_WARN,
 185  186                      "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
 186  187                      ufs_HW, ufs_LW);
 187  188                  ufs_LW = UFS_LW_DEFAULT;
 188  189                  ufs_HW = UFS_HW_DEFAULT;
 189  190                  cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
 190  191                      ufs_HW, ufs_LW);
 191  192          }
 192  193  
 193  194          /*
 194  195           * Adjust the tunable `ufs_ninode' to a reasonable value
 195  196           */
 196  197          if (ufs_ninode <= 0)
 197  198                  ufs_ninode = ncsize;
 198  199          if (ufs_inode_max == 0)
 199  200                  ufs_inode_max =
 200  201                      (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct inode));
 201  202          if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) {
 202  203                  cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld",
 203  204                      ufs_inode_max);
 204  205                  ufs_ninode = ufs_inode_max;
 205  206          }
 206  207          /*
 207  208           * Wait till third call of ufs_update to declare that no I/Os are
 208  209           * going on. This allows deferred access times to be flushed to disk.
 209  210           */
 210  211          ufs_iowait = v.v_autoup * hz * 2;
 211  212  
 212  213          /*
 213  214           * idle thread runs when 25% of ufs_ninode entries are on the queue
 214  215           */
 215  216          if (ufs_idle_max == 0)
 216  217                  ufs_idle_max = ufs_ninode >> 2;
 217  218          if (ufs_idle_max < UFS_IDLE_MAX)
 218  219                  ufs_idle_max = UFS_IDLE_MAX;
 219  220          if (ufs_idle_max > ufs_ninode)
 220  221                  ufs_idle_max = ufs_ninode;
 221  222          /*
 222  223           * This is really a misnomer, it is ufs_queue_init
 223  224           */
 224  225          ufs_thread_init(&ufs_idle_q, ufs_idle_max);
 225  226          ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL);
 226  227  
 227  228          /*
 228  229           * global hlock thread
 229  230           */
 230  231          ufs_thread_init(&ufs_hlock, 1);
 231  232          ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL);
 232  233  
 233  234          ihinit();
 234  235          qtinit();
 235  236          ins.in_maxsize.value.ul = ufs_ninode;
 236  237          if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs",
 237  238              KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t),
 238  239              KSTAT_FLAG_VIRTUAL)) != NULL) {
 239  240                  ufs_inode_kstat->ks_data = (void *)&ins;
 240  241                  ufs_inode_kstat->ks_update = ufs_inode_kstat_update;
 241  242                  kstat_install(ufs_inode_kstat);
 242  243          }
 243  244          ufsfx_init();           /* fix-on-panic initialization */
 244  245          si_cache_init();
 245  246          ufs_directio_init();
 246  247          lufs_init();
 247  248          mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL);
 248  249  }
 249  250  
 250  251  /* ARGSUSED */
 251  252  static int
 252  253  ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags)
 253  254  {
 254  255          struct inode *ip = buf;
 255  256          struct vnode *vp;
 256  257  
 257  258          vp = ip->i_vnode = vn_alloc(kmflags);
 258  259          if (vp == NULL) {
 259  260                  return (-1);
 260  261          }
 261  262          vn_setops(vp, ufs_vnodeops);
 262  263          vp->v_data = ip;
 263  264  
 264  265          rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL);
 265  266          rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL);
 266  267          mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL);
 267  268          dnlc_dir_init(&ip->i_danchor);
 268  269  
 269  270          cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL);
 270  271  
 271  272          return (0);
 272  273  }
 273  274  
 274  275  /* ARGSUSED */
 275  276  static void
 276  277  ufs_inode_cache_destructor(void *buf, void *cdrarg)
 277  278  {
 278  279          struct inode *ip = buf;
 279  280          struct vnode *vp;
 280  281  
 281  282          vp = ITOV(ip);
 282  283  
 283  284          rw_destroy(&ip->i_rwlock);
 284  285          rw_destroy(&ip->i_contents);
 285  286          mutex_destroy(&ip->i_tlock);
 286  287          if (vp->v_type == VDIR) {
 287  288                  dnlc_dir_fini(&ip->i_danchor);
 288  289          }
 289  290  
 290  291          cv_destroy(&ip->i_wrcv);
 291  292  
 292  293          vn_free(vp);
 293  294  }
 294  295  
 295  296  /*
 296  297   * Initialize hash links for inodes
 297  298   * and build inode free list.
 298  299   */
 299  300  void
 300  301  ihinit(void)
 301  302  {
 302  303          int i;
 303  304          union   ihead *ih = ihead;
 304  305  
 305  306          mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL);
 306  307  
 307  308          inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
 308  309          ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP);
 309  310          ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP);
 310  311  
 311  312          for (i = 0, ih = ihead; i < inohsz; i++,  ih++) {
 312  313                  ih->ih_head[0] = ih;
 313  314                  ih->ih_head[1] = ih;
 314  315                  mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL);
 315  316          }
 316  317          inode_cache = kmem_cache_create("ufs_inode_cache",
 317  318              sizeof (struct inode), 0, ufs_inode_cache_constructor,
 318  319              ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
 319  320              NULL, NULL, 0);
 320  321  }
 321  322  
 322  323  /*
 323  324   * Free an inode structure
 324  325   */
 325  326  void
 326  327  ufs_free_inode(struct inode *ip)
 327  328  {
 328  329          vn_invalid(ITOV(ip));
 329  330          kmem_cache_free(inode_cache, ip);
 330  331  }
 331  332  
 332  333  /*
 333  334   * Allocate an inode structure
 334  335   */
 335  336  struct inode *
 336  337  ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino)
 337  338  {
 338  339          struct inode *ip;
 339  340          vnode_t *vp;
 340  341  
 341  342          ip = kmem_cache_alloc(inode_cache, KM_SLEEP);
 342  343          /*
 343  344           * at this point we have a newly allocated inode
 344  345           */
 345  346          ip->i_freef = ip;
 346  347          ip->i_freeb = ip;
 347  348          ip->i_flag = IREF;
 348  349          ip->i_seq = 0xFF;       /* Unique initial value */
 349  350          ip->i_dev = ufsvfsp->vfs_dev;
 350  351          ip->i_ufsvfs = ufsvfsp;
 351  352          ip->i_devvp = ufsvfsp->vfs_devvp;
 352  353          ip->i_number = ino;
 353  354          ip->i_diroff = 0;
 354  355          ip->i_nextr = 0;
 355  356          ip->i_map = NULL;
 356  357          ip->i_rdev = 0;
 357  358          ip->i_writes = 0;
 358  359          ip->i_mode = 0;
 359  360          ip->i_delaylen = 0;
 360  361          ip->i_delayoff = 0;
 361  362          ip->i_nextrio = 0;
 362  363          ip->i_ufs_acl = NULL;
 363  364          ip->i_cflags = 0;
 364  365          ip->i_mapcnt = 0;
 365  366          ip->i_dquot = NULL;
 366  367          ip->i_cachedir = CD_ENABLED;
 367  368          ip->i_writer = NULL;
 368  369  
 369  370          /*
 370  371           * the vnode for this inode was allocated by the constructor
 371  372           */
 372  373          vp = ITOV(ip);
 373  374          vn_reinit(vp);
 374  375          if (ino == (ino_t)UFSROOTINO)
 375  376                  vp->v_flag = VROOT;
 376  377          vp->v_vfsp = ufsvfsp->vfs_vfs;
 377  378          vn_exists(vp);
 378  379          return (ip);
 379  380  }
 380  381  
 381  382  /*
 382  383   * Look up an inode by device, inumber.  If it is in core (in the
 383  384   * inode structure), honor the locking protocol.  If it is not in
 384  385   * core, read it in from the specified device after freeing any pages.
 385  386   * In all cases, a pointer to a VN_HELD inode structure is returned.
 386  387   */
 387  388  int
 388  389  ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr)
 389  390  {
 390  391          return (ufs_iget_internal(vfsp, ino, ipp, cr, 0));
 391  392  }
 392  393  
 393  394  /*
 394  395   * A version of ufs_iget which returns only allocated, linked inodes.
 395  396   * This is appropriate for any callers who do not expect a free inode.
 396  397   */
 397  398  int
 398  399  ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp,
 399  400      struct cred *cr)
 400  401  {
 401  402          return (ufs_iget_internal(vfsp, ino, ipp, cr, 1));
 402  403  }
 403  404  
 404  405  /*
 405  406   * Set vnode attributes based on v_type, this should be called whenever
 406  407   * an inode's i_mode is changed.
 407  408   */
 408  409  void
 409  410  ufs_reset_vnode(vnode_t *vp)
 410  411  {
 411  412          /*
 412  413           * an old DBE hack
 413  414           */
 414  415          if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
 415  416                  vp->v_flag |= VSWAPLIKE;
 416  417          else
 417  418                  vp->v_flag &= ~VSWAPLIKE;
 418  419  
 419  420          /*
 420  421           * if not swap like and it's just a regular file, we want
 421  422           * to maintain the vnode's pages sorted by clean/modified
 422  423           * for faster sync'ing to disk
 423  424           */
 424  425          if (vp->v_type == VREG)
 425  426                  vp->v_flag |= VMODSORT;
 426  427          else
 427  428                  vp->v_flag &= ~VMODSORT;
 428  429  
 429  430          /*
 430  431           * Is this an attribute hidden dir?
 431  432           */
 432  433          if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR)
 433  434                  vp->v_flag |= V_XATTRDIR;
 434  435          else
 435  436                  vp->v_flag &= ~V_XATTRDIR;
 436  437  }
 437  438  
 438  439  /*
 439  440   * Shared implementation of ufs_iget and ufs_iget_alloced.  The 'validate'
 440  441   * flag is used to distinguish the two; when true, we validate that the inode
 441  442   * being retrieved looks like a linked and allocated inode.
 442  443   */
 443  444  /* ARGSUSED */
 444  445  static int
 445  446  ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
 446  447      struct cred *cr, int validate)
 447  448  {
 448  449          struct inode *ip, *sp;
 449  450          union ihead *ih;
 450  451          kmutex_t *ihm;
 451  452          struct buf *bp;
 452  453          struct dinode *dp;
 453  454          struct vnode *vp;
 454  455          extern vfs_t EIO_vfs;
 455  456          int error;
 456  457          int ftype;      /* XXX - Remove later on */
 457  458          dev_t vfs_dev;
 458  459          struct ufsvfs *ufsvfsp;
 459  460          struct fs *fs;
 460  461          int hno;
 461  462          daddr_t bno;
 462  463          ulong_t ioff;
 463  464  
 464  465          CPU_STATS_ADD_K(sys, ufsiget, 1);
 465  466  
 466  467          /*
 467  468           * Lookup inode in cache.
 468  469           */
 469  470          vfs_dev = vfsp->vfs_dev;
 470  471          hno = INOHASH(ino);
 471  472          ih = &ihead[hno];
 472  473          ihm = &ih_lock[hno];
 473  474  
 474  475  again:
 475  476          mutex_enter(ihm);
 476  477          for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
 477  478                  if (ino != ip->i_number || vfs_dev != ip->i_dev ||
 478  479                      (ip->i_flag & ISTALE))
 479  480                          continue;
 480  481  
 481  482                  /*
 482  483                   * Found the interesting inode; hold it and drop the cache lock
 483  484                   */
 484  485                  vp = ITOV(ip);  /* for locknest */
 485  486                  VN_HOLD(vp);
 486  487                  mutex_exit(ihm);
 487  488                  rw_enter(&ip->i_contents, RW_READER);
 488  489  
 489  490                  /*
 490  491                   * if necessary, remove from idle list
 491  492                   */
 492  493                  if ((ip->i_flag & IREF) == 0) {
 493  494                          if (ufs_rmidle(ip))
 494  495                                  VN_RELE(vp);
 495  496                  }
 496  497  
 497  498                  /*
 498  499                   * Could the inode be read from disk?
 499  500                   */
 500  501                  if (ip->i_flag & ISTALE) {
 501  502                          rw_exit(&ip->i_contents);
 502  503                          VN_RELE(vp);
 503  504                          goto again;
 504  505                  }
 505  506  
 506  507                  ins.in_hits.value.ul++;
 507  508                  *ipp = ip;
 508  509  
 509  510                  /*
 510  511                   * Reset the vnode's attribute flags
 511  512                   */
 512  513                  mutex_enter(&vp->v_lock);
 513  514                  ufs_reset_vnode(vp);
 514  515                  mutex_exit(&vp->v_lock);
 515  516  
 516  517                  rw_exit(&ip->i_contents);
 517  518  
 518  519                  return (0);
 519  520          }
 520  521          mutex_exit(ihm);
 521  522  
 522  523          /*
 523  524           * Inode was not in cache.
 524  525           *
 525  526           * Allocate a new entry
 526  527           */
 527  528          ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
 528  529          fs = ufsvfsp->vfs_fs;
 529  530  
 530  531          ip = ufs_alloc_inode(ufsvfsp, ino);
 531  532          vp = ITOV(ip);
 532  533  
 533  534          bno = fsbtodb(fs, itod(fs, ino));
 534  535          ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
 535  536          ip->i_doff = (offset_t)ioff + ldbtob(bno);
 536  537  
 537  538          /*
 538  539           * put a place holder in the cache (if not already there)
 539  540           */
 540  541          mutex_enter(ihm);
 541  542          for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
 542  543                  if (ino == sp->i_number && vfs_dev == sp->i_dev &&
 543  544                      ((sp->i_flag & ISTALE) == 0)) {
 544  545                          mutex_exit(ihm);
 545  546                          ufs_free_inode(ip);
 546  547                          goto again;
 547  548                  }
 548  549          /*
 549  550           * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
 550  551           * here, but if we do, then shadow inode allocations panic the
 551  552           * system.  We don't have to hold vfs_dqrwlock for shadow inodes
 552  553           * and the ufs_iget() parameters don't tell us what we are getting
 553  554           * so we have no way of knowing this is a ufs_iget() call from
 554  555           * a ufs_ialloc() call for a shadow inode.
 555  556           */
 556  557          rw_enter(&ip->i_contents, RW_WRITER);
 557  558          insque(ip, ih);
 558  559          mutex_exit(ihm);
 559  560          /*
 560  561           * read the dinode
 561  562           */
 562  563          bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);
 563  564  
 564  565          /*
 565  566           * Check I/O errors
 566  567           */
 567  568          error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
 568  569          if (error) {
 569  570                  brelse(bp);
 570  571                  ip->i_flag |= ISTALE;   /* in case someone is looking it up */
 571  572                  rw_exit(&ip->i_contents);
 572  573                  vp->v_vfsp = &EIO_vfs;
 573  574                  VN_RELE(vp);
 574  575                  return (error);
 575  576          }
 576  577          /*
 577  578           * initialize the inode's dinode
 578  579           */
 579  580          dp = (struct dinode *)(ioff + bp->b_un.b_addr);
 580  581          ip->i_ic = dp->di_ic;                   /* structure assignment */
 581  582          brelse(bp);
 582  583  
 583  584          /*
 584  585           * Maintain compatibility with Solaris 1.x UFS
 585  586           */
 586  587          if (ip->i_suid != UID_LONG)
 587  588                  ip->i_uid = ip->i_suid;
 588  589          if (ip->i_sgid != GID_LONG)
 589  590                  ip->i_gid = ip->i_sgid;
 590  591  
 591  592          ftype = ip->i_mode & IFMT;
 592  593          if (ftype == IFBLK || ftype == IFCHR) {
 593  594                  dev_t dv;
 594  595                  uint_t top16 = ip->i_ordev & 0xffff0000u;
 595  596  
 596  597                  if (top16 == 0 || top16 == 0xffff0000u)
 597  598                          dv = expdev(ip->i_ordev);
 598  599                  else
 599  600                          dv = expldev(ip->i_ordev);
 600  601                  vp->v_rdev = ip->i_rdev = dv;
 601  602          }
 602  603  
 603  604          /*
 604  605           * if our caller only expects allocated inodes, verify that
 605  606           * this inode looks good; throw it out if it's bad.
 606  607           */
 607  608          if (validate) {
 608  609                  if ((ftype == 0) || (ip->i_nlink <= 0)) {
 609  610                          ip->i_flag |= ISTALE;
 610  611                          rw_exit(&ip->i_contents);
 611  612                          vp->v_vfsp = &EIO_vfs;
 612  613                          VN_RELE(vp);
 613  614                          cmn_err(CE_NOTE,
 614  615                              "%s: unexpected free inode %d, run fsck(1M)%s",
 615  616                              fs->fs_fsmnt, (int)ino,
 616  617                              (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
 617  618                          return (EIO);
 618  619                  }
 619  620          }
 620  621  
 621  622          /*
 622  623           * Finish initializing the vnode, special handling for shadow inodes
 623  624           * because IFTOVT() will produce a v_type of VNON which is not what we
 624  625           * want, set v_type to VREG explicitly in that case.
 625  626           */
 626  627          if (ftype == IFSHAD) {
 627  628                  vp->v_type = VREG;
 628  629          } else {
 629  630                  vp->v_type = IFTOVT((mode_t)ip->i_mode);
 630  631          }
 631  632  
 632  633          ufs_reset_vnode(vp);
 633  634  
 634  635          /*
 635  636           * read the shadow
 636  637           */
 637  638          if (ftype != 0 && ip->i_shadow != 0) {
 638  639                  if ((error = ufs_si_load(ip, cr)) != 0) {
 639  640                          ip->i_flag |= ISTALE;
 640  641                          ip->i_ufs_acl = NULL;
 641  642                          rw_exit(&ip->i_contents);
 642  643                          vp->v_vfsp = &EIO_vfs;
 643  644                          VN_RELE(vp);
 644  645                          return (error);
 645  646                  }
 646  647          }
 647  648  
 648  649          /*
 649  650           * Only attach quota information if the inode has a type and if
 650  651           * that type is not a shadow inode.
 651  652           */
 652  653          if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
 653  654              ((ip->i_mode & IFMT) != IFATTRDIR)) {
 654  655                  ip->i_dquot = getinoquota(ip);
 655  656          }
 656  657          TRANS_MATA_IGET(ufsvfsp, ip);
 657  658          *ipp = ip;
 658  659          rw_exit(&ip->i_contents);
 659  660  
 660  661          return (0);
 661  662  }
 662  663  
 663  664  /*
 664  665   * Vnode is no longer referenced, write the inode out
 665  666   * and if necessary, truncate and deallocate the file.
 666  667   */
 667  668  void
 668  669  ufs_iinactive(struct inode *ip)
 669  670  {
 670  671          int             front;
 671  672          struct inode    *iq;
 672  673          struct inode    *hip;
 673  674          struct ufs_q    *uq;
 674  675          struct vnode    *vp = ITOV(ip);
 675  676          struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
 676  677          struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
 677  678  
 678  679          /*
 679  680           * Because the vnode type might have been changed,
 680  681           * the dnlc_dir_purge must be called unconditionally.
 681  682           */
 682  683          dnlc_dir_purge(&ip->i_danchor);
 683  684  
 684  685          /*
 685  686           * Get exclusive access to inode data.
 686  687           */
 687  688          rw_enter(&ip->i_contents, RW_WRITER);
 688  689          ASSERT(ip->i_flag & IREF);
 689  690  
 690  691          /*
 691  692           * Make sure no one reclaimed the inode before we put it on
 692  693           * the freelist or destroy it. We keep our 'hold' on the vnode
 693  694           * from vn_rele until we are ready to do something with the inode.
 694  695           *
 695  696           * Pageout may put a VN_HOLD/VN_RELE at anytime during this
 696  697           * operation via an async putpage, so we must make sure
 697  698           * we don't free/destroy the inode more than once. ufs_iget
 698  699           * may also put a VN_HOLD on the inode before it grabs
 699  700           * the i_contents lock. This is done so we don't free
 700  701           * an inode that a thread is waiting on.
 701  702           */
 702  703          mutex_enter(&vp->v_lock);
 703  704  
 704  705          if (vp->v_count > 1) {
 705  706                  VN_RELE_LOCKED(vp);
 706  707                  mutex_exit(&vp->v_lock);
 707  708                  rw_exit(&ip->i_contents);
 708  709                  return;
 709  710          }
 710  711          mutex_exit(&vp->v_lock);
 711  712  
 712  713          /*
 713  714           * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
 714  715           * and clean.  It can be safely destroyed (cyf).
 715  716           */
 716  717          if (ip->i_ufsvfs == NULL) {
 717  718                  rw_exit(&ip->i_contents);
 718  719                  ufs_si_del(ip);
 719  720                  ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
 720  721                  ufs_free_inode(ip);
 721  722                  return;
 722  723          }
 723  724  
 724  725          /*
 725  726           * queue idle inode to appropriate thread. Will check v_count == 1
 726  727           * prior to putting this on the appropriate queue.
 727  728           * Stale inodes will be unhashed and freed by the ufs idle thread
 728  729           * in ufs_idle_free()
 729  730           */
 730  731          front = 1;
 731  732          if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
 732  733              ip->i_mode && ip->i_nlink <= 0) {
 733  734                  /*
 734  735                   * Mark the i_flag to indicate that inode is being deleted.
 735  736                   * This flag will be cleared when the deletion is complete.
 736  737                   * This prevents nfs from sneaking in via ufs_vget() while
 737  738                   * the delete is in progress (bugid 1242481).
 738  739                   */
 739  740                  ip->i_flag |= IDEL;
 740  741  
 741  742                  /*
 742  743                   * NOIDEL means that deletes are not allowed at this time;
 743  744                   * whoever resets NOIDEL will also send this inode back
 744  745                   * through ufs_iinactive.  IREF remains set.
 745  746                   */
 746  747                  if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
 747  748                          mutex_enter(&vp->v_lock);
 748  749                          VN_RELE_LOCKED(vp);
 749  750                          mutex_exit(&vp->v_lock);
 750  751                          rw_exit(&ip->i_contents);
 751  752                          return;
 752  753                  }
 753  754                  if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
 754  755                          rw_exit(&ip->i_contents);
 755  756                          ufs_delete(ip->i_ufsvfs, ip, 0);
 756  757                          return;
 757  758                  }
 758  759  
 759  760                  /* queue to delete thread; IREF remains set */
 760  761                  ins.in_qfree.value.ul++;
 761  762                  uq = &ip->i_ufsvfs->vfs_delete;
 762  763  
 763  764                  mutex_enter(&uq->uq_mutex);
 764  765  
 765  766                  /* add to q */
 766  767                  if ((iq = uq->uq_ihead) != 0) {
 767  768                          ip->i_freef = iq;
 768  769                          ip->i_freeb = iq->i_freeb;
 769  770                          iq->i_freeb->i_freef = ip;
 770  771                          iq->i_freeb = ip;
 771  772                          if (front)
 772  773                                  uq->uq_ihead = ip;
 773  774                  } else {
 774  775                          uq->uq_ihead = ip;
 775  776                          ip->i_freef = ip;
 776  777                          ip->i_freeb = ip;
 777  778                  }
 778  779  
 779  780                  delq_info->delq_unreclaimed_files += 1;
 780  781                  delq_info->delq_unreclaimed_blocks += ip->i_blocks;
 781  782          } else {
 782  783                  /*
 783  784                   * queue to idle thread
 784  785                   *  Check the v_count == 1 again.
 785  786                   *
 786  787                   */
 787  788                  mutex_enter(&vp->v_lock);
 788  789                  if (vp->v_count > 1) {
 789  790                          VN_RELE_LOCKED(vp);
 790  791                          mutex_exit(&vp->v_lock);
 791  792                          rw_exit(&ip->i_contents);
 792  793                          return;
 793  794                  }
 794  795                  mutex_exit(&vp->v_lock);
 795  796                  uq = &ufs_idle_q;
 796  797  
 797  798                  /*
 798  799                   * useful iff it has pages or is a fastsymlink; otherwise junk
 799  800                   */
 800  801                  mutex_enter(&uq->uq_mutex);
 801  802  
 802  803                  /* clear IREF means `on idle list' */
 803  804                  ip->i_flag &= ~(IREF | IDIRECTIO);
 804  805  
 805  806                  if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
 806  807                          ins.in_frback.value.ul++;
 807  808                          hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
 808  809                          ufs_nuseful_iq++;
 809  810                  } else {
 810  811                          ins.in_frfront.value.ul++;
 811  812                          hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
 812  813                          ip->i_flag |= IJUNKIQ;
 813  814                          ufs_njunk_iq++;
 814  815                  }
 815  816                  ip->i_freef = hip;
 816  817                  ip->i_freeb = hip->i_freeb;
 817  818                  hip->i_freeb->i_freef = ip;
 818  819                  hip->i_freeb = ip;
 819  820          }
 820  821  
 821  822          /* wakeup thread(s) if q is overfull */
 822  823          if (++uq->uq_ne == uq->uq_lowat)
 823  824                  cv_broadcast(&uq->uq_cv);
 824  825  
 825  826          /* all done, release the q and inode */
 826  827          mutex_exit(&uq->uq_mutex);
 827  828          rw_exit(&ip->i_contents);
 828  829  }
 829  830  
 830  831  /*
 831  832   * Check accessed and update flags on an inode structure.
 832  833   * If any are on, update the inode with the (unique) current time.
 833  834   * If waitfor is given, insure I/O order so wait for write to complete.
 834  835   */
  835  836  void
  836  837  ufs_iupdat(struct inode *ip, int waitfor)
  837  838  {
                    /*
                     * Flush pending timestamp/attribute changes on ip to its
                     * on-disk dinode.  waitfor != 0 requests a synchronous
                     * update: either the inode buffer is written through
                     * (UFS_BRWRITE) or a previously delayed write is flushed
                     * (blkflush) before returning.
                     */
  838  839          struct buf      *bp;
  839  840          struct fs       *fp;
  840  841          struct dinode   *dp;
  841  842          struct ufsvfs   *ufsvfsp        = ip->i_ufsvfs;
  842  843          int             i;
  843  844          int             do_trans_times;
  844  845          ushort_t        flag;
  845  846          o_uid_t         suid;
  846  847          o_gid_t         sgid;
  847  848  
  848  849          /*
  849  850           * This function is now safe to be called with either the reader
  850  851           * or writer i_contents lock.
  851  852           */
  852  853          ASSERT(RW_LOCK_HELD(&ip->i_contents));
  853  854  
  854  855          /*
  855  856           * Return if file system has been forcibly umounted.
  856  857           */
  857  858          if (ufsvfsp == NULL)
  858  859                  return;
  859  860  
  860  861          flag = ip->i_flag;      /* Atomic read */
  861  862          /*
  862  863           * We better not update the disk inode from a stale inode.
  863  864           */
  864  865          if (flag & ISTALE)
  865  866                  return;
  866  867  
  867  868          fp = ip->i_fs;
  868  869  
  869  870          if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
                            /*
                             * Read-only fs: nothing can reach disk, just
                             * drop the pending-update flags under i_tlock.
                             */
  870  871                  if (fp->fs_ronly) {
  871  872                          mutex_enter(&ip->i_tlock);
  872  873                          ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
  873  874                          mutex_exit(&ip->i_tlock);
  874  875                          return;
  875  876                  }
  876  877                  /*
  877  878                   * fs is active while metadata is being written
  878  879                   */
  879  880                  mutex_enter(&ufsvfsp->vfs_lock);
  880  881                  ufs_notclean(ufsvfsp);
                            /*
                             * NOTE(review): vfs_lock is entered above with no
                             * matching mutex_exit() in this function -
                             * presumably ufs_notclean() drops it; confirm
                             * against its definition.
                             */
  881  882                  /*
  882  883                   * get the dinode
  883  884                   */
  884  885                  bp = UFS_BREAD(ufsvfsp, ip->i_dev,
  885  886                      (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
  886  887                      (int)fp->fs_bsize);
  887  888                  if (bp->b_flags & B_ERROR) {
  888  889                          mutex_enter(&ip->i_tlock);
  889  890                          ip->i_flag &=
  890  891                              ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
  891  892                          mutex_exit(&ip->i_tlock);
  892  893                          brelse(bp);
  893  894                          return;
  894  895                  }
  895  896                  /*
  896  897                   * munge inode fields
  897  898                   */
  898  899                  mutex_enter(&ip->i_tlock);
  899  900                  ITIMES_NOLOCK(ip);
                            /*
                             * IMODACC set without IMOD means only access
                             * times changed (e.g. via a read), so no log
                             * deltas were entered for them yet.
                             */
  900  901                  do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
  901  902                  ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
  902  903                  mutex_exit(&ip->i_tlock);
  903  904  
  904  905                  /*
  905  906                   * For reads and concurrent re-writes, no deltas were
  906  907                   * entered for the access time changes - do it now.
  907  908                   */
  908  909                  if (do_trans_times) {
  909  910                          TRANS_INODE_TIMES(ufsvfsp, ip);
  910  911                  }
  911  912  
  912  913                  /*
  913  914                   * For SunOS 5.0->5.4, these lines below read:
  914  915                   *
  915  916                   * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
  916  917                   * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
  917  918                   *
  918  919                   * where MAXUID was set to 60002.  This was incorrect -
  919  920                   * the uids should have been constrained to what fitted into
  920  921                   * a 16-bit word.
  921  922                   *
  922  923                   * This means that files from 4.x filesystems that have an
  923  924                   * i_suid field larger than 60002 will have that field
  924  925                   * changed to 65535.
  925  926                   *
  926  927                   * Security note: 4.x UFS could never create a i_suid of
  927  928                   * UID_LONG since that would've corresponded to -1.
  928  929                   */
  929  930                  suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
  930  931                      UID_LONG : ip->i_uid;
  931  932                  sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
  932  933                      GID_LONG : ip->i_gid;
  933  934  
  934  935                  if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
  935  936                          ip->i_suid = suid;
  936  937                          ip->i_sgid = sgid;
  937  938                          TRANS_INODE(ufsvfsp, ip);
  938  939                  }
  939  940  
                            /*
                             * For special files the device number is kept in
                             * i_ordev (which occupies i_db[0] on disk); use
                             * the old compact format when the dev fits.
                             */
  940  941                  if ((ip->i_mode & IFMT) == IFBLK ||
  941  942                      (ip->i_mode & IFMT) == IFCHR) {
  942  943                          dev_t d = ip->i_rdev;
  943  944                          dev32_t dev32;
  944  945  
  945  946                          /*
  946  947                           * load first direct block only if special device
  947  948                           */
  948  949                          if (!cmpldev(&dev32, d)) {
  949  950                                  /*
  950  951                                   * We panic here because there's "no way"
  951  952                                   * we should have been able to create a large
  952  953                                   * inode with a large dev_t.  Earlier layers
  953  954                                   * should've caught this.
  954  955                                   */
  955  956                                  panic("ip %p: i_rdev too big", (void *)ip);
  956  957                          }
  957  958  
  958  959                          if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
  959  960                                  ip->i_ordev = dev32;    /* can't use old fmt. */
  960  961                          } else {
  961  962                                  ip->i_ordev = cmpdev(d);
  962  963                          }
  963  964                  }
  964  965  
  965  966                  /*
  966  967                   * copy inode to dinode (zero fastsymlnk in dinode)
  967  968                   */
  968  969                  dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
  969  970                  dp->di_ic = ip->i_ic;   /* structure assignment */
  970  971                  if (flag & IFASTSYMLNK) {
  971  972                          for (i = 1; i < NDADDR; i++)
  972  973                                  dp->di_db[i] = 0;
  973  974                          for (i = 0; i < NIADDR; i++)
  974  975                                  dp->di_ib[i] = 0;
  975  976                  }
  976  977                  if (TRANS_ISTRANS(ufsvfsp)) {
  977  978                          /*
  978  979                           * Pass only a sector size buffer containing
  979  980                           * the inode, otherwise when the buffer is copied
  980  981                           * into a cached roll buffer then too much memory
  981  982                           * gets consumed if 8KB inode buffers are passed.
  982  983                           */
  983  984                          TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
  984  985                              sizeof (struct dinode),
  985  986                              (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
  986  987                              DEV_BSIZE);
  987  988  
  988  989                          brelse(bp);
  989  990                  } else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
  990  991                          UFS_BRWRITE(ufsvfsp, bp);
  991  992  
  992  993                          /*
  993  994                           * Synchronous write has guaranteed that inode
  994  995                           * has been written on disk so clear the flag
  995  996                           */
  996  997                          mutex_enter(&ip->i_tlock);
  997  998                          ip->i_flag &= ~IBDWRITE;
  998  999                          mutex_exit(&ip->i_tlock);
  999 1000                  } else {
 1000 1001                          bdrwrite(bp);
 1001 1002  
 1002 1003                          /*
 1003 1004                           * This write hasn't guaranteed that inode has been
 1004 1005                           * written on the disk.
 1005 1006                           * Since, all updat flags on inode are cleared, we must
 1006 1007                           * remember the condition in case inode is to be updated
 1007 1008                           * synchronously later (e.g.- fsync()/fdatasync())
 1008 1009                           * and inode has not been modified yet.
 1009 1010                           */
 1010 1011                          mutex_enter(&ip->i_tlock);
 1011 1012                          ip->i_flag |= IBDWRITE;
 1012 1013                          mutex_exit(&ip->i_tlock);
 1013 1014                  }
 1014 1015          } else {
 1015 1016                  /*
 1016 1017                   * In case previous inode update was done asynchronously
 1017 1018                   * (IBDWRITE) and this inode update request wants guaranteed
 1018 1019                   * (synchronous) disk update, flush the inode.
 1019 1020                   */
 1020 1021                  if (waitfor && (flag & IBDWRITE)) {
 1021 1022                          blkflush(ip->i_dev,
 1022 1023                              (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
 1023 1024                          mutex_enter(&ip->i_tlock);
 1024 1025                          ip->i_flag &= ~IBDWRITE;
 1025 1026                          mutex_exit(&ip->i_tlock);
 1026 1027                  }
 1027 1028          }
 1028 1029  }
1029 1030  
1030 1031  #define SINGLE  0       /* index of single indirect block */
1031 1032  #define DOUBLE  1       /* index of double indirect block */
1032 1033  #define TRIPLE  2       /* index of triple indirect block */
1033 1034  
1034 1035  /*
1035 1036   * Release blocks associated with the inode ip and
1036 1037   * stored in the indirect block bn.  Blocks are free'd
1037 1038   * in LIFO order up to (but not including) lastbn.  If
1038 1039   * level is greater than SINGLE, the block is an indirect
1039 1040   * block and recursive calls to indirtrunc must be used to
1040 1041   * cleanse other indirect blocks.
1041 1042   *
1042 1043   * N.B.: triple indirect blocks are untested.
1043 1044   */
 1044 1045  static long
 1045 1046  indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
 1046 1047  {
                    /*
                     * bn:     disk address of the indirect block to prune.
                     * lastbn: last logical block (relative to this indirect
                     *         block) to keep; -1 frees everything below it.
                     * level:  SINGLE/DOUBLE/TRIPLE indirection depth.
                     * flags:  I_CHEAP et al., passed through to free().
                     * Returns the number of disk (btodb-unit) blocks released.
                     */
 1047 1048          int i;
 1048 1049          struct buf *bp, *copy;
 1049 1050          daddr32_t *bap;
 1050 1051          struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
 1051 1052          struct fs *fs = ufsvfsp->vfs_fs;
 1052 1053          daddr_t nb, last;
 1053 1054          long factor;
 1054 1055          int blocksreleased = 0, nblocks;
 1055 1056  
 1056 1057          ASSERT(RW_WRITE_HELD(&ip->i_contents));
 1057 1058          /*
 1058 1059           * Calculate index in current block of last
 1059 1060           * block to be kept.  -1 indicates the entire
 1060 1061           * block so we need not calculate the index.
 1061 1062           */
            /* Each pointer entry at this level maps "factor" data blocks. */
 1062 1063          factor = 1;
 1063 1064          for (i = SINGLE; i < level; i++)
 1064 1065                  factor *= NINDIR(fs);
 1065 1066          last = lastbn;
 1066 1067          if (lastbn > 0)
 1067 1068                  last /= factor;
 1068 1069          nblocks = btodb(fs->fs_bsize);
 1069 1070          /*
 1070 1071           * Get buffer of block pointers, zero those
 1071 1072           * entries corresponding to blocks to be free'd,
 1072 1073           * and update on disk copy first.
 1073 1074           * *Unless* the root pointer has been synchronously
 1074 1075           * written to disk.  If nothing points to this
 1075 1076           * indirect block then don't bother zero'ing and
 1076 1077           * writing it.
 1077 1078           */
 1078 1079          bp = UFS_BREAD(ufsvfsp,
 1079 1080              ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
 1080 1081          if (bp->b_flags & B_ERROR) {
 1081 1082                  brelse(bp);
 1082 1083                  return (0);
 1083 1084          }
 1084 1085          bap = bp->b_un.b_daddr;
 1085 1086          if ((flags & I_CHEAP) == 0) {
 1086 1087                  uint_t  zb;
 1087 1088  
 1088 1089                  zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));
 1089 1090  
 1090 1091                  if (zb) {
 1091 1092                          /*
 1092 1093                           * push any data into the log before we zero it
 1093 1094                           */
 1094 1095                          if (bp->b_flags & B_DELWRI)
 1095 1096                                  TRANS_LOG(ufsvfsp, (caddr_t)bap,
 1096 1097                                      ldbtob(bp->b_blkno), bp->b_bcount,
 1097 1098                                      bp->b_un.b_addr, bp->b_bcount);
                            /*
                             * Work from a private copy of the pointers so the
                             * zeroed original can be written out synchronously
                             * while we keep walking the saved entries.
                             */
 1098 1099                          copy = ngeteblk(fs->fs_bsize);
 1099 1100                          bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
 1100 1101                              (uint_t)fs->fs_bsize);
 1101 1102                          bzero((caddr_t)&bap[last + 1], zb);
 1102 1103  
 1103 1104                          TRANS_BUF(ufsvfsp,
 1104 1105                              (caddr_t)&bap[last + 1] - (caddr_t)bap,
 1105 1106                              zb, bp, DT_ABZERO);
 1106 1107  
 1107 1108                          UFS_BRWRITE(ufsvfsp, bp);
 1108 1109                          bp = copy, bap = bp->b_un.b_daddr;
 1109 1110                  }
 1110 1111          } else {
 1111 1112                  /* make sure write retries are also cleared */
 1112 1113                  bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
 1113 1114                  bp->b_flags |= B_STALE | B_AGE;
 1114 1115          }
 1115 1116  
 1116 1117          /*
 1117 1118           * Recursively free totally unused blocks.
 1118 1119           */
            /*
             * The sub-blocks below are freed in their entirety, so the
             * recursion can skip the zero-and-rewrite step (I_CHEAP).
             */
 1119 1120          flags |= I_CHEAP;
 1120 1121          for (i = NINDIR(fs) - 1; i > last; i--) {
 1121 1122                  nb = bap[i];
 1122 1123                  if (nb == 0)
 1123 1124                          continue;
 1124 1125                  if (level > SINGLE) {
 1125 1126                          blocksreleased +=
 1126 1127                              indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
 1127 1128                          free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
 1128 1129                  } else
 1129 1130                          free(ip, nb, (off_t)fs->fs_bsize, flags);
 1130 1131                  blocksreleased += nblocks;
 1131 1132          }
 1132 1133          flags &= ~I_CHEAP;
 1133 1134  
 1134 1135          /*
 1135 1136           * Recursively free last partial block.
 1136 1137           */
 1137 1138          if (level > SINGLE && lastbn >= 0) {
 1138 1139                  last = lastbn % factor;
                    /* The loop above exited with i == last: bap[i] is the
                     * partially-kept entry. */
 1139 1140                  nb = bap[i];
 1140 1141                  if (nb != 0)
 1141 1142                          blocksreleased +=
 1142 1143                              indirtrunc(ip, nb, last, level - 1, flags);
 1143 1144          }
 1144 1145          brelse(bp);
 1145 1146          return (blocksreleased);
 1146 1147  }
1147 1148  
1148 1149  /*
1149 1150   * Truncate the inode ip to at most length size.
1150 1151   * Free affected disk blocks -- the blocks of the
1151 1152   * file are removed in reverse order.
1152 1153   *
1153 1154   * N.B.: triple indirect blocks are untested.
1154 1155   */
 1155 1156  static int i_genrand = 1234;    /* seed for the pseudo-random i_gen bump in ufs_itrunc() (I_FREE path) */
1156 1157  int
1157 1158  ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr)
1158 1159  {
1159 1160          struct fs *fs = oip->i_fs;
1160 1161          struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
1161 1162          struct inode *ip;
1162 1163          daddr_t lastblock;
1163 1164          off_t bsize;
1164 1165          int boff;
1165 1166          daddr_t bn, lastiblock[NIADDR];
1166 1167          int level;
1167 1168          long nblocks, blocksreleased = 0;
1168 1169          int i;
1169 1170          ushort_t mode;
1170 1171          struct inode tip;
1171 1172          int err;
1172 1173          u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
1173 1174              (UFS_MAXOFFSET_T) : (MAXOFF32_T);
1174 1175  
1175 1176          /*
1176 1177           * Shadow inodes do not need to hold the vfs_dqrwlock lock. Most
1177 1178           * other uses need the reader lock. opendq() holds the writer lock.
1178 1179           */
1179 1180          ASSERT((oip->i_mode & IFMT) == IFSHAD ||
1180 1181              RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
1181 1182          ASSERT(RW_WRITE_HELD(&oip->i_contents));
1182 1183          /*
1183 1184           * We only allow truncation of regular files and directories
1184 1185           * to arbitrary lengths here.  In addition, we allow symbolic
1185 1186           * links to be truncated only to zero length.  Other inode
1186 1187           * types cannot have their length set here.  Disk blocks are
1187 1188           * being dealt with - especially device inodes where
1188 1189           * ip->i_ordev is actually being stored in ip->i_db[0]!
1189 1190           */
1190 1191          TRANS_INODE(ufsvfsp, oip);
1191 1192          mode = oip->i_mode & IFMT;
1192 1193          if (flags & I_FREE) {
1193 1194                  i_genrand *= 16843009;  /* turns into shift and adds */
1194 1195                  i_genrand++;
1195 1196                  oip->i_gen += ((i_genrand + ddi_get_lbolt()) & 0xffff) + 1;
1196 1197                  oip->i_flag |= ICHG |IUPD;
1197 1198                  oip->i_seq++;
1198 1199                  if (length == oip->i_size)
1199 1200                          return (0);
1200 1201                  flags |= I_CHEAP;
1201 1202          }
1202 1203          if (mode == IFIFO)
1203 1204                  return (0);
1204 1205          if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
1205 1206              !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
1206 1207                  return (EINVAL);
1207 1208          if (length > maxoffset)
1208 1209                  return (EFBIG);
1209 1210          if ((mode == IFDIR) || (mode == IFATTRDIR))
1210 1211                  flags |= I_DIR;
1211 1212          if (mode == IFSHAD)
1212 1213                  flags |= I_SHAD;
1213 1214          if (oip == ufsvfsp->vfs_qinod)
1214 1215                  flags |= I_QUOTA;
1215 1216          if (length == oip->i_size) {
1216 1217                  /* update ctime and mtime to please POSIX tests */
1217 1218                  oip->i_flag |= ICHG |IUPD;
1218 1219                  oip->i_seq++;
1219 1220                  if (length == 0) {
1220 1221                          /* nothing to cache so clear the flag */
1221 1222                          oip->i_flag &= ~IFASTSYMLNK;
1222 1223                  }
1223 1224                  return (0);
1224 1225          }
1225 1226          /* wipe out fast symlink till next access */
1226 1227          if (oip->i_flag & IFASTSYMLNK) {
1227 1228                  int j;
1228 1229  
1229 1230                  ASSERT(ITOV(oip)->v_type == VLNK);
1230 1231  
1231 1232                  oip->i_flag &= ~IFASTSYMLNK;
1232 1233  
1233 1234                  for (j = 1; j < NDADDR; j++)
1234 1235                          oip->i_db[j] = 0;
1235 1236                  for (j = 0; j < NIADDR; j++)
1236 1237                          oip->i_ib[j] = 0;
1237 1238          }
1238 1239  
1239 1240          boff = (int)blkoff(fs, length);
1240 1241  
1241 1242          if (length > oip->i_size) {
1242 1243                  /*
1243 1244                   * Trunc up case.  BMAPALLOC will insure that the right blocks
1244 1245                   * are allocated.  This includes extending the old frag to a
1245 1246                   * full block (if needed) in addition to doing any work
1246 1247                   * needed for allocating the last block.
1247 1248                   */
1248 1249                  if (boff == 0)
1249 1250                          err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr);
1250 1251                  else
1251 1252                          err = BMAPALLOC(oip, length - 1, boff, cr);
1252 1253  
1253 1254                  if (err == 0) {
1254 1255                          /*
1255 1256                           * Save old size and set inode's size now
1256 1257                           * so that we don't cause too much of the
1257 1258                           * file to be zero'd and pushed.
1258 1259                           */
1259 1260                          u_offset_t osize = oip->i_size;
1260 1261                          oip->i_size  = length;
1261 1262                          /*
1262 1263                           * Make sure we zero out the remaining bytes of
1263 1264                           * the page in case a mmap scribbled on it. We
1264 1265                           * can't prevent a mmap from writing beyond EOF
1265 1266                           * on the last page of a file.
1266 1267                           *
1267 1268                           */
1268 1269                          if ((boff = (int)blkoff(fs, osize)) != 0) {
1269 1270                                  bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
1270 1271                                      fs->fs_bsize : fragroundup(fs, boff);
1271 1272                                  pvn_vpzero(ITOV(oip), osize,
1272 1273                                      (size_t)(bsize - boff));
1273 1274                          }
1274 1275                          oip->i_flag |= ICHG|IATTCHG;
1275 1276                          oip->i_seq++;
1276 1277                          ITIMES_NOLOCK(oip);
1277 1278                          /*
1278 1279                           * MAXOFF32_T is old 2GB size limit. If
1279 1280                           * this operation caused a large file to be
1280 1281                           * created, turn on the superblock flag
1281 1282                           * and update the superblock, if the flag
1282 1283                           * is not already on.
1283 1284                           */
1284 1285                          if ((length > (u_offset_t)MAXOFF32_T) &&
1285 1286                              !(fs->fs_flags & FSLARGEFILES)) {
1286 1287                                  ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
1287 1288                                  mutex_enter(&ufsvfsp->vfs_lock);
1288 1289                                  fs->fs_flags |= FSLARGEFILES;
1289 1290                                  ufs_sbwrite(ufsvfsp);
1290 1291                                  mutex_exit(&ufsvfsp->vfs_lock);
1291 1292                          }
1292 1293                  }
1293 1294  
1294 1295                  return (err);
1295 1296          }
1296 1297  
1297 1298          /*
1298 1299           * Update the pages of the file.  If the file is not being
1299 1300           * truncated to a block boundary, the contents of the
1300 1301           * pages following the end of the file must be zero'ed
1301 1302           * in case it ever become accessible again because
1302 1303           * of subsequent file growth.
1303 1304           */
1304 1305          if (boff == 0) {
1305 1306                  (void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
1306 1307                      B_INVAL | B_TRUNC, CRED());
1307 1308          } else {
1308 1309                  /*
1309 1310                   * Make sure that the last block is properly allocated.
1310 1311                   * We only really have to do this if the last block is
1311 1312                   * actually allocated since ufs_bmap will now handle the case
1312 1313                   * of an fragment which has no block allocated.  Just to
1313 1314                   * be sure, we do it now independent of current allocation.
1314 1315                   */
1315 1316                  err = BMAPALLOC(oip, length - 1, boff, cr);
1316 1317                  if (err)
1317 1318                          return (err);
1318 1319  
1319 1320                  /*
1320 1321                   * BMAPALLOC will call bmap_write which defers i_seq
1321 1322                   * processing.  If the timestamps were changed, update
1322 1323                   * i_seq before rdip drops i_contents or syncs the inode.
1323 1324                   */
1324 1325                  if (oip->i_flag & (ICHG|IUPD))
1325 1326                          oip->i_seq++;
1326 1327  
1327 1328                  /*
1328 1329                   * BugId 4069932
1329 1330                   * Make sure that the relevant partial page appears in
1330 1331                   * the v_pages list, so that pvn_vpzero() will do its
1331 1332                   * job.  Since doing this correctly requires everything
1332 1333                   * in rdip() except for the uiomove(), it's easier and
1333 1334                   * safer to do the uiomove() rather than duplicate the
1334 1335                   * rest of rdip() here.
1335 1336                   *
1336 1337                   * To get here, we know that length indicates a byte
1337 1338                   * that is not the first byte of a block.  (length - 1)
1338 1339                   * is the last actual byte known to exist.  Deduction
1339 1340                   * shows it is in the same block as byte (length).
1340 1341                   * Thus, this rdip() invocation should always succeed
1341 1342                   * except in the face of i/o errors, and give us the
1342 1343                   * block we care about.
1343 1344                   *
1344 1345                   * rdip() makes the same locking assertions and
1345 1346                   * assumptions as we do.  We do not acquire any locks
1346 1347                   * before calling it, so we have not changed the locking
1347 1348                   * situation.  Finally, there do not appear to be any
1348 1349                   * paths whereby rdip() ends up invoking us again.
1349 1350                   * Thus, infinite recursion is avoided.
1350 1351                   */
1351 1352                  {
1352 1353                          uio_t uio;
1353 1354                          iovec_t iov[1];
1354 1355                          char buffer;
1355 1356  
1356 1357                          uio.uio_iov = iov;
1357 1358                          uio.uio_iovcnt = 1;
1358 1359                          uio.uio_loffset = length - 1;
1359 1360                          uio.uio_resid = 1;
1360 1361                          uio.uio_segflg = UIO_SYSSPACE;
1361 1362                          uio.uio_extflg = UIO_COPY_CACHED;
1362 1363  
1363 1364                          iov[0].iov_base = &buffer;
1364 1365                          iov[0].iov_len = 1;
1365 1366  
1366 1367                          err = rdip(oip, &uio, UIO_READ, NULL);
1367 1368                          if (err)
1368 1369                                  return (err);
1369 1370                  }
1370 1371  
1371 1372                  bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
1372 1373                      fs->fs_bsize : fragroundup(fs, boff);
1373 1374                  pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
1374 1375                  /*
1375 1376                   * Ensure full fs block is marked as dirty.
1376 1377                   */
1377 1378                  (void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
1378 1379                      ufs_putapage, B_INVAL | B_TRUNC, CRED());
1379 1380          }
1380 1381  
1381 1382          /*
1382 1383           * Calculate index into inode's block list of
1383 1384           * last direct and indirect blocks (if any)
1384 1385           * which we want to keep.  Lastblock is -1 when
1385 1386           * the file is truncated to 0.
1386 1387           */
1387 1388          lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
1388 1389          lastiblock[SINGLE] = lastblock - NDADDR;
1389 1390          lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
1390 1391          lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
1391 1392          nblocks = btodb(fs->fs_bsize);
1392 1393  
1393 1394          /*
1394 1395           * Update file and block pointers
1395 1396           * on disk before we start freeing blocks.
1396 1397           * If we crash before free'ing blocks below,
1397 1398           * the blocks will be returned to the free list.
1398 1399           * lastiblock values are also normalized to -1
1399 1400           * for calls to indirtrunc below.
1400 1401           */
1401 1402          tip = *oip;                     /* structure copy */
1402 1403          ip = &tip;
1403 1404  
1404 1405          for (level = TRIPLE; level >= SINGLE; level--)
1405 1406                  if (lastiblock[level] < 0) {
1406 1407                          oip->i_ib[level] = 0;
1407 1408                          lastiblock[level] = -1;
1408 1409                  }
1409 1410          for (i = NDADDR - 1; i > lastblock; i--) {
1410 1411                  oip->i_db[i] = 0;
1411 1412                  flags |= I_CHEAP;
1412 1413          }
1413 1414          oip->i_size = length;
1414 1415          oip->i_flag |= ICHG|IUPD|IATTCHG;
1415 1416          oip->i_seq++;
1416 1417          if (!TRANS_ISTRANS(ufsvfsp))
1417 1418                  ufs_iupdat(oip, I_SYNC);        /* do sync inode update */
1418 1419  
1419 1420          /*
1420 1421           * Indirect blocks first.
1421 1422           */
1422 1423          for (level = TRIPLE; level >= SINGLE; level--) {
1423 1424                  bn = ip->i_ib[level];
1424 1425                  if (bn != 0) {
1425 1426                          blocksreleased +=
1426 1427                              indirtrunc(ip, bn, lastiblock[level], level, flags);
1427 1428                          if (lastiblock[level] < 0) {
1428 1429                                  ip->i_ib[level] = 0;
1429 1430                                  free(ip, bn, (off_t)fs->fs_bsize,
1430 1431                                      flags | I_IBLK);
1431 1432                                  blocksreleased += nblocks;
1432 1433                          }
1433 1434                  }
1434 1435                  if (lastiblock[level] >= 0)
1435 1436                          goto done;
1436 1437          }
1437 1438  
1438 1439          /*
1439 1440           * All whole direct blocks or frags.
1440 1441           */
1441 1442          for (i = NDADDR - 1; i > lastblock; i--) {
1442 1443                  bn = ip->i_db[i];
1443 1444                  if (bn == 0)
1444 1445                          continue;
1445 1446                  ip->i_db[i] = 0;
1446 1447                  bsize = (off_t)blksize(fs, ip, i);
1447 1448                  free(ip, bn, bsize, flags);
1448 1449                  blocksreleased += btodb(bsize);
1449 1450          }
1450 1451          if (lastblock < 0)
1451 1452                  goto done;
1452 1453  
1453 1454          /*
1454 1455           * Finally, look for a change in size of the
1455 1456           * last direct block; release any frags.
1456 1457           */
1457 1458          bn = ip->i_db[lastblock];
1458 1459          if (bn != 0) {
1459 1460                  off_t oldspace, newspace;
1460 1461  
1461 1462                  /*
1462 1463                   * Calculate amount of space we're giving
1463 1464                   * back as old block size minus new block size.
1464 1465                   */
1465 1466                  oldspace = blksize(fs, ip, lastblock);
1466 1467                  UFS_SET_ISIZE(length, ip);
1467 1468                  newspace = blksize(fs, ip, lastblock);
1468 1469                  if (newspace == 0) {
1469 1470                          err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
1470 1471                          return (err);
1471 1472                  }
1472 1473                  if (oldspace - newspace > 0) {
1473 1474                          /*
1474 1475                           * Block number of space to be free'd is
1475 1476                           * the old block # plus the number of frags
1476 1477                           * required for the storage we're keeping.
1477 1478                           */
1478 1479                          bn += numfrags(fs, newspace);
1479 1480                          free(ip, bn, oldspace - newspace, flags);
1480 1481                          blocksreleased += btodb(oldspace - newspace);
1481 1482                  }
1482 1483          }
1483 1484  done:
1484 1485  /* BEGIN PARANOIA */
1485 1486          for (level = SINGLE; level <= TRIPLE; level++)
1486 1487                  if (ip->i_ib[level] != oip->i_ib[level]) {
1487 1488                          err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
1488 1489                          return (err);
1489 1490                  }
1490 1491  
1491 1492          for (i = 0; i < NDADDR; i++)
1492 1493                  if (ip->i_db[i] != oip->i_db[i]) {
1493 1494                          err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
1494 1495                          return (err);
1495 1496                  }
1496 1497  /* END PARANOIA */
1497 1498          oip->i_blocks -= blocksreleased;
1498 1499  
1499 1500          if (oip->i_blocks < 0) {                /* sanity */
1500 1501                  cmn_err(CE_NOTE,
1501 1502                      "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
1502 1503                      fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
1503 1504                      (int)oip->i_blocks);
1504 1505                  oip->i_blocks = 0;
1505 1506          }
1506 1507          oip->i_flag |= ICHG|IATTCHG;
1507 1508          oip->i_seq++;
1508 1509          /* blocksreleased is >= zero, so this can not fail */
1509 1510          (void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL,
1510 1511              (size_t *)NULL);
1511 1512          return (0);
1512 1513  }
1513 1514  
1514 1515  /*
1515 1516   * Check mode permission on inode.  Mode is READ, WRITE or EXEC.
1516 1517   * In the case of WRITE, the read-only status of the file system
1517 1518   * is checked.  Depending on the calling user, the appropriate
1518 1519   * mode bits are selected; privileges to override missing permission
1519 1520   * bits are checked through secpolicy_vnode_access().
1520 1521   * The i_contens lock must be held as reader here to prevent racing with
1521 1522   * the acl subsystem removing/setting/changing acls on this inode.
1522 1523   * The caller is responsible for indicating whether or not the i_contents
1523 1524   * lock needs to be acquired here or if already held.
1524 1525   */
1525 1526  int
1526 1527  ufs_iaccess(struct inode  *ip, int mode, struct cred *cr, int dolock)
1527 1528  {
1528 1529          int shift = 0;
1529 1530          int ret = 0;
1530 1531  
1531 1532          if (dolock)
1532 1533                  rw_enter(&ip->i_contents, RW_READER);
1533 1534          ASSERT(RW_LOCK_HELD(&ip->i_contents));
1534 1535  
1535 1536          if (mode & IWRITE) {
1536 1537                  /*
1537 1538                   * Disallow write attempts on read-only
1538 1539                   * file systems, unless the file is a block
1539 1540                   * or character device or a FIFO.
1540 1541                   */
1541 1542                  if (ip->i_fs->fs_ronly != 0) {
1542 1543                          if ((ip->i_mode & IFMT) != IFCHR &&
1543 1544                              (ip->i_mode & IFMT) != IFBLK &&
1544 1545                              (ip->i_mode & IFMT) != IFIFO) {
1545 1546                                  ret = EROFS;
1546 1547                                  goto out;
1547 1548                          }
1548 1549                  }
1549 1550          }
1550 1551          /*
1551 1552           * If there is an acl, check the acl and return.
1552 1553           */
1553 1554          if (ip->i_ufs_acl && ip->i_ufs_acl->aowner) {
1554 1555                  ret = ufs_acl_access(ip, mode, cr);
1555 1556                  goto out;
1556 1557          }
1557 1558  
1558 1559          /*
1559 1560           * Access check is based on only one of owner, group, public.
1560 1561           * If not owner, then check group.
1561 1562           * If not a member of the group, then check public access.
1562 1563           */
1563 1564          if (crgetuid(cr) != ip->i_uid) {
1564 1565                  shift += 3;
1565 1566                  if (!groupmember((uid_t)ip->i_gid, cr))
1566 1567                          shift += 3;
1567 1568          }
1568 1569  
1569 1570          /* test missing privilege bits */
1570 1571          ret = secpolicy_vnode_access2(cr, ITOV(ip), ip->i_uid,
1571 1572              ip->i_mode << shift, mode);
1572 1573  out:
1573 1574          if (dolock)
1574 1575                  rw_exit(&ip->i_contents);
1575 1576          return (ret);
1576 1577  }
1577 1578  
1578 1579  /*
1579 1580   * if necessary, remove an inode from the free list
1580 1581   *      i_contents is held except at unmount
1581 1582   *
1582 1583   * Return 1 if the inode is taken off of the ufs_idle_q,
1583 1584   * and the caller is expected to call VN_RELE.
1584 1585   *
1585 1586   * Return 0 otherwise.
1586 1587   */
1587 1588  int
1588 1589  ufs_rmidle(struct inode *ip)
1589 1590  {
1590 1591          int rval = 0;
1591 1592  
1592 1593          mutex_enter(&ip->i_tlock);
1593 1594          if ((ip->i_flag & IREF) == 0) {
1594 1595                  mutex_enter(&ufs_idle_q.uq_mutex);
1595 1596                  ip->i_freef->i_freeb = ip->i_freeb;
1596 1597                  ip->i_freeb->i_freef = ip->i_freef;
1597 1598                  ip->i_freef = ip;
1598 1599                  ip->i_freeb = ip;
1599 1600                  ip->i_flag |= IREF;
1600 1601                  ufs_idle_q.uq_ne--;
1601 1602                  if (ip->i_flag & IJUNKIQ) {
1602 1603                          ufs_njunk_iq--;
1603 1604                          ip->i_flag &= ~IJUNKIQ;
1604 1605                  } else {
1605 1606                          ufs_nuseful_iq--;
1606 1607                  }
1607 1608                  mutex_exit(&ufs_idle_q.uq_mutex);
1608 1609                  rval = 1;
1609 1610          }
1610 1611          mutex_exit(&ip->i_tlock);
1611 1612          return (rval);
1612 1613  }
1613 1614  
1614 1615  /*
1615 1616   * scan the hash of inodes and call func with the inode locked
1616 1617   */
int
ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
    struct ufsvfs *ufsvfsp)
{
	struct inode		*ip;		/* current inode */
	struct inode		*lip = NULL;	/* last/previous inode */
	union ihead		*ih;		/* current hash chain */
	int			error, i;
	int			saverror = 0;	/* last nonzero func() error */
	int			lip_held;	/* lip needs a VN_RELE() */

	/*
	 * If ufsvfsp is NULL, then our caller should be holding
	 * ufs_scan_lock to avoid conflicts between ufs_unmount() and
	 * ufs_update().  Otherwise, to avoid false-positives in
	 * ufs_unmount()'s v_count-based EBUSY check, we only hold
	 * those inodes that are in the file system our caller cares
	 * about.
	 *
	 * We know that ip is a valid inode in the hash chain (and thus
	 * we can trust i_ufsvfs) because the inode we chained from
	 * (lip) is still in the hash chain.  This is true because either:
	 *
	 * 1. We did not drop the hash chain lock since the last
	 *    iteration (because we were not interested in the last inode),
	 * or
	 * 2. We maintained a hold on the last inode while we
	 *    we were processing it, so it could not be removed
	 *    from the hash chain.
	 *
	 * The whole reason we're dropping and re-grabbing the chain
	 * lock on every inode is so that we don't present a major
	 * choke point on throughput, particularly when we've been
	 * called on behalf of fsflush.
	 */

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		mutex_enter(&ih_lock[i]);
		for (ip = ih->ih_chain[0], lip_held = 0;
		    ip != (struct inode *)ih;
		    ip = lip->i_forw) {

			ins.in_scan.value.ul++;

			/*
			 * Undo the previous iteration's VN_HOLD(), but
			 * only if one was done.
			 */
			if (lip_held)
				VN_RELE(ITOV(lip));

			lip = ip;
			if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
				/*
				 * We're not processing all inodes, and
				 * this inode is not in the filesystem of
				 * interest, so skip it.  No need to do a
				 * VN_HOLD() since we're not dropping the
				 * hash chain lock until after we've
				 * done the i_forw traversal above.
				 */
				lip_held = 0;
				continue;
			}
			VN_HOLD(ITOV(ip));
			lip_held = 1;
			mutex_exit(&ih_lock[i]);

			/*
			 * Acquire the contents lock as writer to make
			 * sure that the inode has been initialized in
			 * the cache or removed from the idle list by
			 * ufs_iget().  This works because ufs_iget()
			 * acquires the contents lock before putting
			 * the inode into the cache.  If we can lock
			 * it, then ufs_iget() is done with it.
			 */

			if (rwtry) {
				/*
				 * Non-blocking mode: if the inode is
				 * busy, skip it rather than wait.
				 */
				if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
					mutex_enter(&ih_lock[i]);
					continue;
				}
			} else {
				rw_enter(&ip->i_contents, RW_WRITER);
			}

			/*
			 * The lock was only needed to synchronize with
			 * ufs_iget(); func is invoked below without
			 * i_contents held.
			 */
			rw_exit(&ip->i_contents);

			/*
			 * ISTALE means the inode couldn't be read
			 *
			 * We don't have to hold the i_contents lock
			 * for this check for a couple of
			 * reasons. First, if ISTALE is set then the
			 * flag cannot be cleared until the inode is
			 * removed from the cache and that cannot
			 * happen until after we VN_RELE() it.
			 * Second, if ISTALE is not set, then the
			 * inode is in the cache and does not need to
			 * be read from disk so ISTALE cannot be set
			 * while we are not looking.
			 */
			if ((ip->i_flag & ISTALE) == 0) {
				/* remember the failure, but keep scanning */
				if ((error = (*func)(ip, arg)) != 0)
					saverror = error;
			}

			mutex_enter(&ih_lock[i]);
		}
		if (lip_held)
			VN_RELE(ITOV(lip));
		mutex_exit(&ih_lock[i]);
	}
	return (saverror);
}
1733 1734  
1734 1735  /*
1735 1736   * Mark inode with the current time, plus a unique increment.
1736 1737   *
1737 1738   * Since we only keep 32-bit time on disk, if UFS is still alive
1738 1739   * beyond 2038, filesystem times will simply stick at the last
1739 1740   * possible second of 32-bit time. Not ideal, but probably better
1740 1741   * than going into the remote past, or confusing applications with
1741 1742   * negative time.
1742 1743   */
void
ufs_imark(struct inode *ip)
{
	timestruc_t now;
	int32_t usec, nsec;

	/*
	 * The update of i_seq may have been deferred, increase i_seq here
	 * to make sure it is in sync with the timestamps.
	 */
	if (ip->i_flag & ISEQ) {
		ASSERT(ip->i_flag & (IUPD|ICHG));
		ip->i_seq++;
		ip->i_flag &= ~ISEQ;
	}

	gethrestime(&now);

	/*
	 * Fast algorithm to convert nsec to usec -- see hrt2ts()
	 * in common/os/timers.c for a full description.
	 */
	nsec = now.tv_nsec;
	usec = nsec + (nsec >> 2);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 4);
	usec = nsec - (usec >> 3);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 3);
	usec = nsec + (usec >> 4);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 6);
	usec = usec >> 10;

	/*
	 * Advance the global unique timestamp: take the current time
	 * if it has moved past iuniqtime, otherwise bump iuniqtime by
	 * one microsecond so successive marks remain distinct.  Either
	 * way, never advance past 32-bit time -- timestamps simply
	 * stick at the last representable second (see block comment
	 * above this function).
	 */
	mutex_enter(&ufs_iuniqtime_lock);
	if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
	    usec > iuniqtime.tv_usec) {
		if (now.tv_sec < TIME32_MAX) {
			iuniqtime.tv_sec = (time32_t)now.tv_sec;
			iuniqtime.tv_usec = usec;
		}
	} else {
		if (iuniqtime.tv_sec < TIME32_MAX) {
			iuniqtime.tv_usec++;
			/* Check for usec overflow */
			if (iuniqtime.tv_usec >= MICROSEC) {
				iuniqtime.tv_sec++;
				iuniqtime.tv_usec = 0;
			}
		}
	}

	/* Apply the chosen stamp to whichever times are flagged. */
	if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
		ip->i_atime = iuniqtime;
	}
	if (ip->i_flag & IUPD) {
		ip->i_mtime = iuniqtime;
		ip->i_flag |= IMODTIME;
	}
	if (ip->i_flag & ICHG) {
		/*
		 * NOTE(review): i_diroff is reset on change time update --
		 * presumably a cached directory search offset invalidated
		 * by the modification; confirm against ufs_dir.c.
		 */
		ip->i_diroff = 0;
		ip->i_ctime = iuniqtime;
	}
	mutex_exit(&ufs_iuniqtime_lock);
}
1809 1810  
1810 1811  /*
1811 1812   * Update timestamps in inode.
1812 1813   */
1813 1814  void
1814 1815  ufs_itimes_nolock(struct inode *ip)
1815 1816  {
1816 1817  
1817 1818          /*
1818 1819           * if noatime is set and the inode access time is the only field that
1819 1820           * must be changed, exit immediately.
1820 1821           */
1821 1822          if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) &&
1822 1823              (ip->i_ufsvfs->vfs_noatime)) {
1823 1824                  return;
1824 1825          }
1825 1826  
1826 1827          if (ip->i_flag & (IUPD|IACC|ICHG)) {
1827 1828                  if (ip->i_flag & ICHG)
1828 1829                          ip->i_flag |= IMOD;
1829 1830                  else
1830 1831                          ip->i_flag |= IMODACC;
1831 1832                  ufs_imark(ip);
1832 1833                  ip->i_flag &= ~(IACC|IUPD|ICHG);
1833 1834          }
1834 1835  }
  
    | 
      ↓ open down ↓ | 
    1685 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX