re #13613 rb4516 Tunables need volatile keyword
--- old/usr/src/uts/common/fs/ufs/ufs_inode.c
+++ new/usr/src/uts/common/fs/ufs/ufs_inode.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
23 + * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
23 24 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
24 25 */
25 26
26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 28 /* All Rights Reserved */
28 29
29 30 /*
30 31 * University Copyright- Copyright (c) 1982, 1986, 1988
31 32 * The Regents of the University of California
32 33 * All Rights Reserved
33 34 *
34 35 * University Acknowledgment- Portions of this document are derived from
35 36 * software developed by the University of California, Berkeley, and its
36 37 * contributors.
37 38 */
38 39
39 40 #include <sys/types.h>
40 41 #include <sys/t_lock.h>
41 42 #include <sys/param.h>
42 43 #include <sys/systm.h>
43 44 #include <sys/uio.h>
44 45 #include <sys/bitmap.h>
45 46 #include <sys/signal.h>
46 47 #include <sys/cred.h>
47 48 #include <sys/user.h>
48 49 #include <sys/vfs.h>
49 50 #include <sys/stat.h>
50 51 #include <sys/vnode.h>
51 52 #include <sys/buf.h>
52 53 #include <sys/proc.h>
53 54 #include <sys/disp.h>
54 55 #include <sys/dnlc.h>
55 56 #include <sys/mode.h>
56 57 #include <sys/cmn_err.h>
57 58 #include <sys/kstat.h>
58 59 #include <sys/acl.h>
59 60 #include <sys/var.h>
60 61 #include <sys/fs/ufs_inode.h>
61 62 #include <sys/fs/ufs_fs.h>
62 63 #include <sys/fs/ufs_trans.h>
63 64 #include <sys/fs/ufs_acl.h>
64 65 #include <sys/fs/ufs_bio.h>
65 66 #include <sys/fs/ufs_quota.h>
66 67 #include <sys/fs/ufs_log.h>
67 68 #include <vm/hat.h>
68 69 #include <vm/as.h>
69 70 #include <vm/pvn.h>
70 71 #include <vm/seg.h>
71 72 #include <sys/swap.h>
72 73 #include <sys/cpuvar.h>
73 74 #include <sys/sysmacros.h>
74 75 #include <sys/errno.h>
75 76 #include <sys/kmem.h>
76 77 #include <sys/debug.h>
77 78 #include <fs/fs_subr.h>
78 79 #include <sys/policy.h>
79 80
80 81 struct kmem_cache *inode_cache; /* cache of free inodes */
81 82
82 83 /* UFS Inode Cache Stats -- Not protected */
83 84 struct instats ins = {
84 85 { "size", KSTAT_DATA_ULONG },
85 86 { "maxsize", KSTAT_DATA_ULONG },
86 87 { "hits", KSTAT_DATA_ULONG },
87 88 { "misses", KSTAT_DATA_ULONG },
88 89 { "kmem allocs", KSTAT_DATA_ULONG },
89 90 { "kmem frees", KSTAT_DATA_ULONG },
90 91 { "maxsize reached", KSTAT_DATA_ULONG },
91 92 { "puts at frontlist", KSTAT_DATA_ULONG },
92 93 { "puts at backlist", KSTAT_DATA_ULONG },
93 94 { "queues to free", KSTAT_DATA_ULONG },
94 95 { "scans", KSTAT_DATA_ULONG },
95 96 { "thread idles", KSTAT_DATA_ULONG },
96 97 { "lookup idles", KSTAT_DATA_ULONG },
97 98 { "vget idles", KSTAT_DATA_ULONG },
98 99 { "cache allocs", KSTAT_DATA_ULONG },
99 100 { "cache frees", KSTAT_DATA_ULONG },
100 101 { "pushes at close", KSTAT_DATA_ULONG }
101 102 };
102 103
103 104 /* kstat data */
104 105 static kstat_t *ufs_inode_kstat = NULL;
105 106
106 107 union ihead *ihead; /* inode LRU cache, Chris Maltby */
107 108 kmutex_t *ih_lock; /* protect inode cache hash table */
108 109 static int ino_hashlen = 4; /* desired average hash chain length */
109 110 int inohsz; /* number of buckets in the hash table */
110 111
111 112 kmutex_t ufs_scan_lock; /* stop racing multiple ufs_scan_inodes() */
112 113 kmutex_t ufs_iuniqtime_lock; /* protect iuniqtime */
113 114 kmutex_t ufsvfs_mutex;
114 115 struct ufsvfs *oldufsvfslist, *ufsvfslist;
115 116
116 117 /*
117 118 * time to wait after ufsvfsp->vfs_iotstamp before declaring that no
118 119 * I/Os are going on.
119 120 */
120 121 clock_t ufs_iowait;
121 122
122 123 /*
123 124 * the threads that process idle inodes and free (deleted) inodes
124 125 * have high water marks that are set in ufsinit().
125 126 * These values can be no less than the minimum shown below
126 127 */
127 128 int ufs_idle_max; /* # of allowable idle inodes */
128 129 ulong_t ufs_inode_max; /* hard limit of allowable idle inodes */
129 130 #define UFS_IDLE_MAX (16) /* min # of allowable idle inodes */
130 131
131 132 /*
132 133 * Tunables for ufs write throttling.
133 134 * These are validated in ufs_iinit() since improper settings
134 135 * can lead to filesystem hangs.
135 136 */
136 137 #define UFS_HW_DEFAULT (16 * 1024 * 1024)
137 138 #define UFS_LW_DEFAULT (8 * 1024 * 1024)
138 -int ufs_HW = UFS_HW_DEFAULT;
139 -int ufs_LW = UFS_LW_DEFAULT;
139 +volatile int ufs_HW = UFS_HW_DEFAULT;
140 +volatile int ufs_LW = UFS_LW_DEFAULT;
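The whole of this change is the two declarations above: ufs_HW and ufs_LW are written only from outside the compiled code (/etc/system at boot, or mdb -kw on a live kernel), so an optimizer that can prove no in-tree writes exist (for example under link-time optimization) is free to constant-fold every read, and patching the variable would then change memory but not behavior. volatile forces a fresh load at each use. A minimal user-land sketch of the hazard; the names and values below are illustrative, not the kernel code:

    #include <stdio.h>

    /* Patchable tunable: volatile makes every use re-load it. */
    static volatile int demo_HW = 16 * 1024 * 1024;

    static int
    over_high_water(long pending)
    {
            /*
             * Without volatile, this comparison could be folded to a
             * constant once the compiler sees no writes to demo_HW.
             */
            return (pending > demo_HW);
    }

    int
    main(void)
    {
            printf("%d\n", over_high_water(1L << 25));
            return (0);
    }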
140 141
141 142 static void ihinit(void);
142 143 extern int hash2ints(int, int);
143 144
144 145 static int ufs_iget_internal(struct vfs *, ino_t, struct inode **,
145 146 struct cred *, int);
146 147
147 148 /* ARGSUSED */
148 149 static int
149 150 ufs_inode_kstat_update(kstat_t *ksp, int rw)
150 151 {
151 152 if (rw == KSTAT_WRITE)
152 153 return (EACCES);
153 154
154 155 ins.in_malloc.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
155 156 "slab_alloc");
156 157 ins.in_mfree.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
157 158 "slab_free");
158 159 ins.in_kcalloc.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
159 160 "alloc");
160 161 ins.in_kcfree.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
161 162 "free");
162 163 ins.in_size.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
163 164 "buf_inuse");
164 165 ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
165 166 "buf_max");
166 167 ins.in_misses.value.ul = ins.in_kcalloc.value.ul;
167 168
168 169 return (0);
169 170 }
170 171
171 172 void
172 173 ufs_iinit(void)
173 174 {
174 175 /*
175 176 * Validate that ufs_HW > ufs_LW.
176 177 * The default values for these two tunables have been increased.
177 178 * There is now a range of values for ufs_HW that was legal
178 179 * on previous Solaris versions but no longer is.
179 180 * Upgrading a machine which has an /etc/system setting for ufs_HW
180 181 * from that range can lead to filesystem hangs unless the values
181 182 * are checked here.
182 183 */
183 184 if (ufs_HW <= ufs_LW) {
184 185 cmn_err(CE_WARN,
185 186 "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
186 187 ufs_HW, ufs_LW);
187 188 ufs_LW = UFS_LW_DEFAULT;
188 189 ufs_HW = UFS_HW_DEFAULT;
189 190 cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
190 191 ufs_HW, ufs_LW);
191 192 }
192 193
193 194 /*
194 195 * Adjust the tunable `ufs_ninode' to a reasonable value
195 196 */
196 197 if (ufs_ninode <= 0)
197 198 ufs_ninode = ncsize;
198 199 if (ufs_inode_max == 0)
199 200 ufs_inode_max =
200 201 (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct inode));
201 202 if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) {
202 203 cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld",
203 204 ufs_inode_max);
204 205 ufs_ninode = ufs_inode_max;
205 206 }
206 207 /*
207 208 * Wait till third call of ufs_update to declare that no I/Os are
208 209 * going on. This allows deferred access times to be flushed to disk.
209 210 */
210 211 ufs_iowait = v.v_autoup * hz * 2;
211 212
212 213 /*
213 214 * idle thread runs when 25% of ufs_ninode entries are on the queue
214 215 */
215 216 if (ufs_idle_max == 0)
216 217 ufs_idle_max = ufs_ninode >> 2;
217 218 if (ufs_idle_max < UFS_IDLE_MAX)
218 219 ufs_idle_max = UFS_IDLE_MAX;
219 220 if (ufs_idle_max > ufs_ninode)
220 221 ufs_idle_max = ufs_ninode;
221 222 /*
222 223 * This is really a misnomer; it is ufs_queue_init
223 224 */
224 225 ufs_thread_init(&ufs_idle_q, ufs_idle_max);
225 226 ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL);
226 227
227 228 /*
228 229 * global hlock thread
229 230 */
230 231 ufs_thread_init(&ufs_hlock, 1);
231 232 ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL);
232 233
233 234 ihinit();
234 235 qtinit();
235 236 ins.in_maxsize.value.ul = ufs_ninode;
236 237 if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs",
237 238 KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t),
238 239 KSTAT_FLAG_VIRTUAL)) != NULL) {
239 240 ufs_inode_kstat->ks_data = (void *)&ins;
240 241 ufs_inode_kstat->ks_update = ufs_inode_kstat_update;
241 242 kstat_install(ufs_inode_kstat);
242 243 }
243 244 ufsfx_init(); /* fix-on-panic initialization */
244 245 si_cache_init();
245 246 ufs_directio_init();
246 247 lufs_init();
247 248 mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL);
248 249 }
249 250
250 251 /* ARGSUSED */
251 252 static int
252 253 ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags)
253 254 {
254 255 struct inode *ip = buf;
255 256 struct vnode *vp;
256 257
257 258 vp = ip->i_vnode = vn_alloc(kmflags);
258 259 if (vp == NULL) {
259 260 return (-1);
260 261 }
261 262 vn_setops(vp, ufs_vnodeops);
262 263 vp->v_data = ip;
263 264
264 265 rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL);
265 266 rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL);
266 267 mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL);
267 268 dnlc_dir_init(&ip->i_danchor);
268 269
269 270 cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL);
270 271
271 272 return (0);
272 273 }
273 274
274 275 /* ARGSUSED */
275 276 static void
276 277 ufs_inode_cache_destructor(void *buf, void *cdrarg)
277 278 {
278 279 struct inode *ip = buf;
279 280 struct vnode *vp;
280 281
281 282 vp = ITOV(ip);
282 283
283 284 rw_destroy(&ip->i_rwlock);
284 285 rw_destroy(&ip->i_contents);
285 286 mutex_destroy(&ip->i_tlock);
286 287 if (vp->v_type == VDIR) {
287 288 dnlc_dir_fini(&ip->i_danchor);
288 289 }
289 290
290 291 cv_destroy(&ip->i_wrcv);
291 292
292 293 vn_free(vp);
293 294 }
294 295
295 296 /*
296 297 * Initialize hash links for inodes
297 298 * and build inode free list.
298 299 */
299 300 void
300 301 ihinit(void)
301 302 {
302 303 int i;
303 304 union ihead *ih = ihead;
304 305
305 306 mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL);
306 307
307 308 inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
308 309 ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP);
309 310 ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP);
310 311
311 312 for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
312 313 ih->ih_head[0] = ih;
313 314 ih->ih_head[1] = ih;
314 315 mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL);
315 316 }
316 317 inode_cache = kmem_cache_create("ufs_inode_cache",
317 318 sizeof (struct inode), 0, ufs_inode_cache_constructor,
318 319 ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
319 320 NULL, NULL, 0);
320 321 }
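ihinit() sizes the table so the mean chain length stays near ino_hashlen: since highbit() returns the 1-based index of the highest set bit, 1 << highbit(x) is the smallest power of two strictly greater than x. A stand-alone sketch of the arithmetic; the local highbit() below is a user-land stand-in for the kernel routine, and the tunable values are made up:

    #include <stdio.h>

    /* Stand-in for the kernel's highbit(): 1-based index of the
     * highest set bit, 0 for 0. */
    static int
    highbit(unsigned long i)
    {
            int h = 0;

            while (i != 0) {
                    h++;
                    i >>= 1;
            }
            return (h);
    }

    int
    main(void)
    {
            long ufs_ninode = 130000;       /* sample tunable value */
            int ino_hashlen = 4;            /* desired mean chain length */
            int inohsz = 1 << highbit(ufs_ninode / ino_hashlen);

            /* 130000 / 4 = 32500, so inohsz = 32768 and the mean
             * chain length is 130000 / 32768, just under 4. */
            printf("inohsz = %d\n", inohsz);
            return (0);
    }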
321 322
322 323 /*
323 324 * Free an inode structure
324 325 */
325 326 void
326 327 ufs_free_inode(struct inode *ip)
327 328 {
328 329 vn_invalid(ITOV(ip));
329 330 kmem_cache_free(inode_cache, ip);
330 331 }
331 332
332 333 /*
333 334 * Allocate an inode structure
334 335 */
335 336 struct inode *
336 337 ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino)
337 338 {
338 339 struct inode *ip;
339 340 vnode_t *vp;
340 341
341 342 ip = kmem_cache_alloc(inode_cache, KM_SLEEP);
342 343 /*
343 344 * at this point we have a newly allocated inode
344 345 */
345 346 ip->i_freef = ip;
346 347 ip->i_freeb = ip;
347 348 ip->i_flag = IREF;
348 349 ip->i_seq = 0xFF; /* Unique initial value */
349 350 ip->i_dev = ufsvfsp->vfs_dev;
350 351 ip->i_ufsvfs = ufsvfsp;
351 352 ip->i_devvp = ufsvfsp->vfs_devvp;
352 353 ip->i_number = ino;
353 354 ip->i_diroff = 0;
354 355 ip->i_nextr = 0;
355 356 ip->i_map = NULL;
356 357 ip->i_rdev = 0;
357 358 ip->i_writes = 0;
358 359 ip->i_mode = 0;
359 360 ip->i_delaylen = 0;
360 361 ip->i_delayoff = 0;
361 362 ip->i_nextrio = 0;
362 363 ip->i_ufs_acl = NULL;
363 364 ip->i_cflags = 0;
364 365 ip->i_mapcnt = 0;
365 366 ip->i_dquot = NULL;
366 367 ip->i_cachedir = CD_ENABLED;
367 368 ip->i_writer = NULL;
368 369
369 370 /*
370 371 * the vnode for this inode was allocated by the constructor
371 372 */
372 373 vp = ITOV(ip);
373 374 vn_reinit(vp);
374 375 if (ino == (ino_t)UFSROOTINO)
375 376 vp->v_flag = VROOT;
376 377 vp->v_vfsp = ufsvfsp->vfs_vfs;
377 378 vn_exists(vp);
378 379 return (ip);
379 380 }
380 381
381 382 /*
382 383 * Look up an inode by device, inumber. If it is in core (in the
383 384 * inode structure), honor the locking protocol. If it is not in
384 385 * core, read it in from the specified device after freeing any pages.
385 386 * In all cases, a pointer to a VN_HELD inode structure is returned.
386 387 */
387 388 int
388 389 ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr)
389 390 {
390 391 return (ufs_iget_internal(vfsp, ino, ipp, cr, 0));
391 392 }
392 393
393 394 /*
394 395 * A version of ufs_iget which returns only allocated, linked inodes.
395 396 * This is appropriate for any callers who do not expect a free inode.
396 397 */
397 398 int
398 399 ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp,
399 400 struct cred *cr)
400 401 {
401 402 return (ufs_iget_internal(vfsp, ino, ipp, cr, 1));
402 403 }
403 404
404 405 /*
405 406 * Set vnode attributes based on v_type, this should be called whenever
406 407 * an inode's i_mode is changed.
407 408 */
408 409 void
409 410 ufs_reset_vnode(vnode_t *vp)
410 411 {
411 412 /*
412 413 * an old DBE hack
413 414 */
414 415 if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
415 416 vp->v_flag |= VSWAPLIKE;
416 417 else
417 418 vp->v_flag &= ~VSWAPLIKE;
418 419
419 420 /*
420 421 * if not swap like and it's just a regular file, we want
421 422 * to maintain the vnode's pages sorted by clean/modified
422 423 * for faster sync'ing to disk
423 424 */
424 425 if (vp->v_type == VREG)
425 426 vp->v_flag |= VMODSORT;
426 427 else
427 428 vp->v_flag &= ~VMODSORT;
428 429
429 430 /*
430 431 * Is this an attribute hidden dir?
431 432 */
432 433 if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR)
433 434 vp->v_flag |= V_XATTRDIR;
434 435 else
435 436 vp->v_flag &= ~V_XATTRDIR;
436 437 }
437 438
438 439 /*
439 440 * Shared implementation of ufs_iget and ufs_iget_alloced. The 'validate'
440 441 * flag is used to distinguish the two; when true, we validate that the inode
441 442 * being retrieved looks like a linked and allocated inode.
442 443 */
443 444 /* ARGSUSED */
444 445 static int
445 446 ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
446 447 struct cred *cr, int validate)
447 448 {
448 449 struct inode *ip, *sp;
449 450 union ihead *ih;
450 451 kmutex_t *ihm;
451 452 struct buf *bp;
452 453 struct dinode *dp;
453 454 struct vnode *vp;
454 455 extern vfs_t EIO_vfs;
455 456 int error;
456 457 int ftype; /* XXX - Remove later on */
457 458 dev_t vfs_dev;
458 459 struct ufsvfs *ufsvfsp;
459 460 struct fs *fs;
460 461 int hno;
461 462 daddr_t bno;
462 463 ulong_t ioff;
463 464
464 465 CPU_STATS_ADD_K(sys, ufsiget, 1);
465 466
466 467 /*
467 468 * Lookup inode in cache.
468 469 */
469 470 vfs_dev = vfsp->vfs_dev;
470 471 hno = INOHASH(ino);
471 472 ih = &ihead[hno];
472 473 ihm = &ih_lock[hno];
473 474
474 475 again:
475 476 mutex_enter(ihm);
476 477 for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
477 478 if (ino != ip->i_number || vfs_dev != ip->i_dev ||
478 479 (ip->i_flag & ISTALE))
479 480 continue;
480 481
481 482 /*
482 483 * Found the interesting inode; hold it and drop the cache lock
483 484 */
484 485 vp = ITOV(ip); /* for locknest */
485 486 VN_HOLD(vp);
486 487 mutex_exit(ihm);
487 488 rw_enter(&ip->i_contents, RW_READER);
488 489
489 490 /*
490 491 * if necessary, remove from idle list
491 492 */
492 493 if ((ip->i_flag & IREF) == 0) {
493 494 if (ufs_rmidle(ip))
494 495 VN_RELE(vp);
495 496 }
496 497
497 498 /*
498 499 * Could the inode be read from disk?
499 500 */
500 501 if (ip->i_flag & ISTALE) {
501 502 rw_exit(&ip->i_contents);
502 503 VN_RELE(vp);
503 504 goto again;
504 505 }
505 506
506 507 ins.in_hits.value.ul++;
507 508 *ipp = ip;
508 509
509 510 /*
510 511 * Reset the vnode's attribute flags
511 512 */
512 513 mutex_enter(&vp->v_lock);
513 514 ufs_reset_vnode(vp);
514 515 mutex_exit(&vp->v_lock);
515 516
516 517 rw_exit(&ip->i_contents);
517 518
518 519 return (0);
519 520 }
520 521 mutex_exit(ihm);
521 522
522 523 /*
523 524 * Inode was not in cache.
524 525 *
525 526 * Allocate a new entry
526 527 */
527 528 ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
528 529 fs = ufsvfsp->vfs_fs;
529 530
530 531 ip = ufs_alloc_inode(ufsvfsp, ino);
531 532 vp = ITOV(ip);
532 533
533 534 bno = fsbtodb(fs, itod(fs, ino));
534 535 ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
535 536 ip->i_doff = (offset_t)ioff + ldbtob(bno);
536 537
537 538 /*
538 539 * put a place holder in the cache (if not already there)
539 540 */
540 541 mutex_enter(ihm);
541 542 for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
542 543 if (ino == sp->i_number && vfs_dev == sp->i_dev &&
543 544 ((sp->i_flag & ISTALE) == 0)) {
544 545 mutex_exit(ihm);
545 546 ufs_free_inode(ip);
546 547 goto again;
547 548 }
548 549 /*
549 550 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
550 551 * here, but if we do, then shadow inode allocations panic the
551 552 * system. We don't have to hold vfs_dqrwlock for shadow inodes
552 553 * and the ufs_iget() parameters don't tell us what we are getting
553 554 * so we have no way of knowing this is a ufs_iget() call from
554 555 * a ufs_ialloc() call for a shadow inode.
555 556 */
556 557 rw_enter(&ip->i_contents, RW_WRITER);
557 558 insque(ip, ih);
558 559 mutex_exit(ihm);
559 560 /*
560 561 * read the dinode
561 562 */
562 563 bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);
563 564
564 565 /*
565 566 * Check I/O errors
566 567 */
567 568 error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
568 569 if (error) {
569 570 brelse(bp);
570 571 ip->i_flag |= ISTALE; /* in case someone is looking it up */
571 572 rw_exit(&ip->i_contents);
572 573 vp->v_vfsp = &EIO_vfs;
573 574 VN_RELE(vp);
574 575 return (error);
575 576 }
576 577 /*
577 578 * initialize the inode's dinode
578 579 */
579 580 dp = (struct dinode *)(ioff + bp->b_un.b_addr);
580 581 ip->i_ic = dp->di_ic; /* structure assignment */
581 582 brelse(bp);
582 583
583 584 /*
584 585 * Maintain compatibility with Solaris 1.x UFS
585 586 */
586 587 if (ip->i_suid != UID_LONG)
587 588 ip->i_uid = ip->i_suid;
588 589 if (ip->i_sgid != GID_LONG)
589 590 ip->i_gid = ip->i_sgid;
590 591
591 592 ftype = ip->i_mode & IFMT;
592 593 if (ftype == IFBLK || ftype == IFCHR) {
593 594 dev_t dv;
594 595 uint_t top16 = ip->i_ordev & 0xffff0000u;
595 596
596 597 if (top16 == 0 || top16 == 0xffff0000u)
597 598 dv = expdev(ip->i_ordev);
598 599 else
599 600 dv = expldev(ip->i_ordev);
600 601 vp->v_rdev = ip->i_rdev = dv;
601 602 }
602 603
603 604 /*
604 605 * if our caller only expects allocated inodes, verify that
605 606 * this inode looks good; throw it out if it's bad.
606 607 */
607 608 if (validate) {
608 609 if ((ftype == 0) || (ip->i_nlink <= 0)) {
609 610 ip->i_flag |= ISTALE;
610 611 rw_exit(&ip->i_contents);
611 612 vp->v_vfsp = &EIO_vfs;
612 613 VN_RELE(vp);
613 614 cmn_err(CE_NOTE,
614 615 "%s: unexpected free inode %d, run fsck(1M)%s",
615 616 fs->fs_fsmnt, (int)ino,
616 617 (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
617 618 return (EIO);
618 619 }
619 620 }
620 621
621 622 /*
622 623 * Finish initializing the vnode, special handling for shadow inodes
623 624 * because IFTOVT() will produce a v_type of VNON which is not what we
624 625 * want, set v_type to VREG explicitly in that case.
625 626 */
626 627 if (ftype == IFSHAD) {
627 628 vp->v_type = VREG;
628 629 } else {
629 630 vp->v_type = IFTOVT((mode_t)ip->i_mode);
630 631 }
631 632
632 633 ufs_reset_vnode(vp);
633 634
634 635 /*
635 636 * read the shadow
636 637 */
637 638 if (ftype != 0 && ip->i_shadow != 0) {
638 639 if ((error = ufs_si_load(ip, cr)) != 0) {
639 640 ip->i_flag |= ISTALE;
640 641 ip->i_ufs_acl = NULL;
641 642 rw_exit(&ip->i_contents);
642 643 vp->v_vfsp = &EIO_vfs;
643 644 VN_RELE(vp);
644 645 return (error);
645 646 }
646 647 }
647 648
648 649 /*
649 650 * Only attach quota information if the inode has a type and if
650 651 * that type is not a shadow inode.
651 652 */
652 653 if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
653 654 ((ip->i_mode & IFMT) != IFATTRDIR)) {
654 655 ip->i_dquot = getinoquota(ip);
655 656 }
656 657 TRANS_MATA_IGET(ufsvfsp, ip);
657 658 *ipp = ip;
658 659 rw_exit(&ip->i_contents);
659 660
660 661 return (0);
661 662 }
662 663
663 664 /*
664 665 * Vnode is no longer referenced, write the inode out
665 666 * and if necessary, truncate and deallocate the file.
666 667 */
667 668 void
668 669 ufs_iinactive(struct inode *ip)
669 670 {
670 671 int front;
671 672 struct inode *iq;
672 673 struct inode *hip;
673 674 struct ufs_q *uq;
674 675 struct vnode *vp = ITOV(ip);
675 676 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
676 677 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
677 678
678 679 /*
679 680 * Because the vnode type might have been changed,
680 681 * the dnlc_dir_purge must be called unconditionally.
681 682 */
682 683 dnlc_dir_purge(&ip->i_danchor);
683 684
684 685 /*
685 686 * Get exclusive access to inode data.
686 687 */
687 688 rw_enter(&ip->i_contents, RW_WRITER);
688 689 ASSERT(ip->i_flag & IREF);
689 690
690 691 /*
691 692 * Make sure no one reclaimed the inode before we put it on
692 693 * the freelist or destroy it. We keep our 'hold' on the vnode
693 694 * from vn_rele until we are ready to do something with the inode.
694 695 *
695 696 * Pageout may put a VN_HOLD/VN_RELE at anytime during this
696 697 * operation via an async putpage, so we must make sure
697 698 * we don't free/destroy the inode more than once. ufs_iget
698 699 * may also put a VN_HOLD on the inode before it grabs
699 700 * the i_contents lock. This is done so we don't free
700 701 * an inode that a thread is waiting on.
701 702 */
702 703 mutex_enter(&vp->v_lock);
703 704
704 705 if (vp->v_count > 1) {
705 706 VN_RELE_LOCKED(vp);
706 707 mutex_exit(&vp->v_lock);
707 708 rw_exit(&ip->i_contents);
708 709 return;
709 710 }
710 711 mutex_exit(&vp->v_lock);
711 712
712 713 /*
713 714 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
714 715 * and clean. It can be safely destroyed (cyf).
715 716 */
716 717 if (ip->i_ufsvfs == NULL) {
717 718 rw_exit(&ip->i_contents);
718 719 ufs_si_del(ip);
719 720 ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
720 721 ufs_free_inode(ip);
721 722 return;
722 723 }
723 724
724 725 /*
725 726 * queue idle inode to appropriate thread. Will check v_count == 1
726 727 * prior to putting this on the appropriate queue.
727 728 * Stale inodes will be unhashed and freed by the ufs idle thread
728 729 * in ufs_idle_free()
729 730 */
730 731 front = 1;
731 732 if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
732 733 ip->i_mode && ip->i_nlink <= 0) {
733 734 /*
734 735 * Mark the i_flag to indicate that inode is being deleted.
735 736 * This flag will be cleared when the deletion is complete.
736 737 * This prevents nfs from sneaking in via ufs_vget() while
737 738 * the delete is in progress (bugid 1242481).
738 739 */
739 740 ip->i_flag |= IDEL;
740 741
741 742 /*
742 743 * NOIDEL means that deletes are not allowed at this time;
743 744 * whoever resets NOIDEL will also send this inode back
744 745 * through ufs_iinactive. IREF remains set.
745 746 */
746 747 if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
747 748 mutex_enter(&vp->v_lock);
748 749 VN_RELE_LOCKED(vp);
749 750 mutex_exit(&vp->v_lock);
750 751 rw_exit(&ip->i_contents);
751 752 return;
752 753 }
753 754 if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
754 755 rw_exit(&ip->i_contents);
755 756 ufs_delete(ip->i_ufsvfs, ip, 0);
756 757 return;
757 758 }
758 759
759 760 /* queue to delete thread; IREF remains set */
760 761 ins.in_qfree.value.ul++;
761 762 uq = &ip->i_ufsvfs->vfs_delete;
762 763
763 764 mutex_enter(&uq->uq_mutex);
764 765
765 766 /* add to q */
766 767 if ((iq = uq->uq_ihead) != 0) {
767 768 ip->i_freef = iq;
768 769 ip->i_freeb = iq->i_freeb;
769 770 iq->i_freeb->i_freef = ip;
770 771 iq->i_freeb = ip;
771 772 if (front)
772 773 uq->uq_ihead = ip;
773 774 } else {
774 775 uq->uq_ihead = ip;
775 776 ip->i_freef = ip;
776 777 ip->i_freeb = ip;
777 778 }
778 779
779 780 delq_info->delq_unreclaimed_files += 1;
780 781 delq_info->delq_unreclaimed_blocks += ip->i_blocks;
781 782 } else {
782 783 /*
783 784 * queue to idle thread
784 785 * Check the v_count == 1 again.
785 786 *
786 787 */
787 788 mutex_enter(&vp->v_lock);
788 789 if (vp->v_count > 1) {
789 790 VN_RELE_LOCKED(vp);
790 791 mutex_exit(&vp->v_lock);
791 792 rw_exit(&ip->i_contents);
792 793 return;
793 794 }
794 795 mutex_exit(&vp->v_lock);
795 796 uq = &ufs_idle_q;
796 797
797 798 /*
798 799 * useful iff it has pages or is a fastsymlink; otherwise junk
799 800 */
800 801 mutex_enter(&uq->uq_mutex);
801 802
802 803 /* clear IREF means `on idle list' */
803 804 ip->i_flag &= ~(IREF | IDIRECTIO);
804 805
805 806 if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
806 807 ins.in_frback.value.ul++;
807 808 hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
808 809 ufs_nuseful_iq++;
809 810 } else {
810 811 ins.in_frfront.value.ul++;
811 812 hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
812 813 ip->i_flag |= IJUNKIQ;
813 814 ufs_njunk_iq++;
814 815 }
815 816 ip->i_freef = hip;
816 817 ip->i_freeb = hip->i_freeb;
817 818 hip->i_freeb->i_freef = ip;
818 819 hip->i_freeb = ip;
819 820 }
820 821
821 822 /* wakeup thread(s) if q is overfull */
822 823 if (++uq->uq_ne == uq->uq_lowat)
823 824 cv_broadcast(&uq->uq_cv);
824 825
825 826 /* all done, release the q and inode */
826 827 mutex_exit(&uq->uq_mutex);
827 828 rw_exit(&ip->i_contents);
828 829 }
829 830
830 831 /*
831 832 * Check accessed and update flags on an inode structure.
832 833 * If any are on, update the inode with the (unique) current time.
833 834 * If waitfor is given, ensure I/O ordering by waiting for the write to complete.
834 835 */
835 836 void
836 837 ufs_iupdat(struct inode *ip, int waitfor)
837 838 {
838 839 struct buf *bp;
839 840 struct fs *fp;
840 841 struct dinode *dp;
841 842 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
842 843 int i;
843 844 int do_trans_times;
844 845 ushort_t flag;
845 846 o_uid_t suid;
846 847 o_gid_t sgid;
847 848
848 849 /*
849 850 * This function is now safe to be called with either the reader
850 851 * or writer i_contents lock.
851 852 */
852 853 ASSERT(RW_LOCK_HELD(&ip->i_contents));
853 854
854 855 /*
855 856 * Return if file system has been forcibly umounted.
856 857 */
857 858 if (ufsvfsp == NULL)
858 859 return;
859 860
860 861 flag = ip->i_flag; /* Atomic read */
861 862 /*
862 863 * We better not update the disk inode from a stale inode.
863 864 */
864 865 if (flag & ISTALE)
865 866 return;
866 867
867 868 fp = ip->i_fs;
868 869
869 870 if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
870 871 if (fp->fs_ronly) {
871 872 mutex_enter(&ip->i_tlock);
872 873 ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
873 874 mutex_exit(&ip->i_tlock);
874 875 return;
875 876 }
876 877 /*
877 878 * fs is active while metadata is being written
878 879 */
879 880 mutex_enter(&ufsvfsp->vfs_lock);
880 881 ufs_notclean(ufsvfsp);
881 882 /*
882 883 * get the dinode
883 884 */
884 885 bp = UFS_BREAD(ufsvfsp, ip->i_dev,
885 886 (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
886 887 (int)fp->fs_bsize);
887 888 if (bp->b_flags & B_ERROR) {
888 889 mutex_enter(&ip->i_tlock);
889 890 ip->i_flag &=
890 891 ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
891 892 mutex_exit(&ip->i_tlock);
892 893 brelse(bp);
893 894 return;
894 895 }
895 896 /*
896 897 * munge inode fields
897 898 */
898 899 mutex_enter(&ip->i_tlock);
899 900 ITIMES_NOLOCK(ip);
900 901 do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
901 902 ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
902 903 mutex_exit(&ip->i_tlock);
903 904
904 905 /*
905 906 * For reads and concurrent re-writes, no deltas were
906 907 * entered for the access time changes - do it now.
907 908 */
908 909 if (do_trans_times) {
909 910 TRANS_INODE_TIMES(ufsvfsp, ip);
910 911 }
911 912
912 913 /*
913 914 * For SunOS 5.0->5.4, these lines below read:
914 915 *
915 916 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
916 917 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
917 918 *
918 919 * where MAXUID was set to 60002. This was incorrect -
919 920 * the uids should have been constrained to what fitted into
920 921 * a 16-bit word.
921 922 *
922 923 * This means that files from 4.x filesystems that have an
923 924 * i_suid field larger than 60002 will have that field
924 925 * changed to 65535.
925 926 *
926 927 * Security note: 4.x UFS could never create an i_suid of
927 928 * UID_LONG since that would've corresponded to -1.
928 929 */
929 930 suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
930 931 UID_LONG : ip->i_uid;
931 932 sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
932 933 GID_LONG : ip->i_gid;
933 934
934 935 if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
935 936 ip->i_suid = suid;
936 937 ip->i_sgid = sgid;
937 938 TRANS_INODE(ufsvfsp, ip);
938 939 }
939 940
940 941 if ((ip->i_mode & IFMT) == IFBLK ||
941 942 (ip->i_mode & IFMT) == IFCHR) {
942 943 dev_t d = ip->i_rdev;
943 944 dev32_t dev32;
944 945
945 946 /*
946 947 * load first direct block only if special device
947 948 */
948 949 if (!cmpldev(&dev32, d)) {
949 950 /*
950 951 * We panic here because there's "no way"
951 952 * we should have been able to create a large
952 953 * inode with a large dev_t. Earlier layers
953 954 * should've caught this.
954 955 */
955 956 panic("ip %p: i_rdev too big", (void *)ip);
956 957 }
957 958
958 959 if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
959 960 ip->i_ordev = dev32; /* can't use old fmt. */
960 961 } else {
961 962 ip->i_ordev = cmpdev(d);
962 963 }
963 964 }
964 965
965 966 /*
966 967 * copy inode to dinode (zero fastsymlnk in dinode)
967 968 */
968 969 dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
969 970 dp->di_ic = ip->i_ic; /* structure assignment */
970 971 if (flag & IFASTSYMLNK) {
971 972 for (i = 1; i < NDADDR; i++)
972 973 dp->di_db[i] = 0;
973 974 for (i = 0; i < NIADDR; i++)
974 975 dp->di_ib[i] = 0;
975 976 }
976 977 if (TRANS_ISTRANS(ufsvfsp)) {
977 978 /*
978 979 * Pass only a sector size buffer containing
979 980 * the inode, otherwise when the buffer is copied
980 981 * into a cached roll buffer then too much memory
981 982 * gets consumed if 8KB inode buffers are passed.
982 983 */
983 984 TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
984 985 sizeof (struct dinode),
985 986 (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
986 987 DEV_BSIZE);
987 988
988 989 brelse(bp);
989 990 } else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
990 991 UFS_BRWRITE(ufsvfsp, bp);
991 992
992 993 /*
993 994 * Synchronous write has guaranteed that inode
994 995 * has been written on disk so clear the flag
995 996 */
996 997 mutex_enter(&ip->i_tlock);
997 998 ip->i_flag &= ~IBDWRITE;
998 999 mutex_exit(&ip->i_tlock);
999 1000 } else {
1000 1001 bdrwrite(bp);
1001 1002
1002 1003 /*
1003 1004 * This write hasn't guaranteed that inode has been
1004 1005 * written on the disk.
1005 1006 * Since all update flags on the inode are cleared, we must
1006 1007 * remember the condition in case the inode is to be updated
1007 1008 * synchronously later (e.g., fsync()/fdatasync())
1008 1009 * and the inode has not been modified yet.
1009 1010 */
1010 1011 mutex_enter(&ip->i_tlock);
1011 1012 ip->i_flag |= IBDWRITE;
1012 1013 mutex_exit(&ip->i_tlock);
1013 1014 }
1014 1015 } else {
1015 1016 /*
1016 1017 * In case previous inode update was done asynchronously
1017 1018 * (IBDWRITE) and this inode update request wants guaranteed
1018 1019 * (synchronous) disk update, flush the inode.
1019 1020 */
1020 1021 if (waitfor && (flag & IBDWRITE)) {
1021 1022 blkflush(ip->i_dev,
1022 1023 (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
1023 1024 mutex_enter(&ip->i_tlock);
1024 1025 ip->i_flag &= ~IBDWRITE;
1025 1026 mutex_exit(&ip->i_tlock);
1026 1027 }
1027 1028 }
1028 1029 }
1029 1030
1030 1031 #define SINGLE 0 /* index of single indirect block */
1031 1032 #define DOUBLE 1 /* index of double indirect block */
1032 1033 #define TRIPLE 2 /* index of triple indirect block */
1033 1034
1034 1035 /*
1035 1036 * Release blocks associated with the inode ip and
1036 1037 * stored in the indirect block bn. Blocks are free'd
1037 1038 * in LIFO order up to (but not including) lastbn. If
1038 1039 * level is greater than SINGLE, the block is an indirect
1039 1040 * block and recursive calls to indirtrunc must be used to
1040 1041 * cleanse other indirect blocks.
1041 1042 *
1042 1043 * N.B.: triple indirect blocks are untested.
1043 1044 */
1044 1045 static long
1045 1046 indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
1046 1047 {
1047 1048 int i;
1048 1049 struct buf *bp, *copy;
1049 1050 daddr32_t *bap;
1050 1051 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
1051 1052 struct fs *fs = ufsvfsp->vfs_fs;
1052 1053 daddr_t nb, last;
1053 1054 long factor;
1054 1055 int blocksreleased = 0, nblocks;
1055 1056
1056 1057 ASSERT(RW_WRITE_HELD(&ip->i_contents));
1057 1058 /*
1058 1059 * Calculate index in current block of last
1059 1060 * block to be kept. -1 indicates the entire
1060 1061 * block so we need not calculate the index.
1061 1062 */
1062 1063 factor = 1;
1063 1064 for (i = SINGLE; i < level; i++)
1064 1065 factor *= NINDIR(fs);
1065 1066 last = lastbn;
1066 1067 if (lastbn > 0)
1067 1068 last /= factor;
1068 1069 nblocks = btodb(fs->fs_bsize);
1069 1070 /*
1070 1071 * Get buffer of block pointers, zero those
1071 1072 * entries corresponding to blocks to be free'd,
1072 1073 * and update on disk copy first.
1073 1074 * *Unless* the root pointer has been synchronously
1074 1075 * written to disk. If nothing points to this
1075 1076 * indirect block then don't bother zero'ing and
1076 1077 * writing it.
1077 1078 */
1078 1079 bp = UFS_BREAD(ufsvfsp,
1079 1080 ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
1080 1081 if (bp->b_flags & B_ERROR) {
1081 1082 brelse(bp);
1082 1083 return (0);
1083 1084 }
1084 1085 bap = bp->b_un.b_daddr;
1085 1086 if ((flags & I_CHEAP) == 0) {
1086 1087 uint_t zb;
1087 1088
1088 1089 zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));
1089 1090
1090 1091 if (zb) {
1091 1092 /*
1092 1093 * push any data into the log before we zero it
1093 1094 */
1094 1095 if (bp->b_flags & B_DELWRI)
1095 1096 TRANS_LOG(ufsvfsp, (caddr_t)bap,
1096 1097 ldbtob(bp->b_blkno), bp->b_bcount,
1097 1098 bp->b_un.b_addr, bp->b_bcount);
1098 1099 copy = ngeteblk(fs->fs_bsize);
1099 1100 bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
1100 1101 (uint_t)fs->fs_bsize);
1101 1102 bzero((caddr_t)&bap[last + 1], zb);
1102 1103
1103 1104 TRANS_BUF(ufsvfsp,
1104 1105 (caddr_t)&bap[last + 1] - (caddr_t)bap,
1105 1106 zb, bp, DT_ABZERO);
1106 1107
1107 1108 UFS_BRWRITE(ufsvfsp, bp);
1108 1109 bp = copy, bap = bp->b_un.b_daddr;
1109 1110 }
1110 1111 } else {
1111 1112 /* make sure write retries are also cleared */
1112 1113 bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
1113 1114 bp->b_flags |= B_STALE | B_AGE;
1114 1115 }
1115 1116
1116 1117 /*
1117 1118 * Recursively free totally unused blocks.
1118 1119 */
1119 1120 flags |= I_CHEAP;
1120 1121 for (i = NINDIR(fs) - 1; i > last; i--) {
1121 1122 nb = bap[i];
1122 1123 if (nb == 0)
1123 1124 continue;
1124 1125 if (level > SINGLE) {
1125 1126 blocksreleased +=
1126 1127 indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
1127 1128 free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
1128 1129 } else
1129 1130 free(ip, nb, (off_t)fs->fs_bsize, flags);
1130 1131 blocksreleased += nblocks;
1131 1132 }
1132 1133 flags &= ~I_CHEAP;
1133 1134
1134 1135 /*
1135 1136 * Recursively free last partial block.
1136 1137 */
1137 1138 if (level > SINGLE && lastbn >= 0) {
1138 1139 last = lastbn % factor;
1139 1140 nb = bap[i];
1140 1141 if (nb != 0)
1141 1142 blocksreleased +=
1142 1143 indirtrunc(ip, nb, last, level - 1, flags);
1143 1144 }
1144 1145 brelse(bp);
1145 1146 return (blocksreleased);
1146 1147 }
1147 1148
1148 1149 /*
1149 1150 * Truncate the inode ip to at most length size.
1150 1151 * Free affected disk blocks -- the blocks of the
1151 1152 * file are removed in reverse order.
1152 1153 *
1153 1154 * N.B.: triple indirect blocks are untested.
1154 1155 */
1155 1156 static int i_genrand = 1234;
1156 1157 int
1157 1158 ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr)
1158 1159 {
1159 1160 struct fs *fs = oip->i_fs;
1160 1161 struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
1161 1162 struct inode *ip;
1162 1163 daddr_t lastblock;
1163 1164 off_t bsize;
1164 1165 int boff;
1165 1166 daddr_t bn, lastiblock[NIADDR];
1166 1167 int level;
1167 1168 long nblocks, blocksreleased = 0;
1168 1169 int i;
1169 1170 ushort_t mode;
1170 1171 struct inode tip;
1171 1172 int err;
1172 1173 u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
1173 1174 (UFS_MAXOFFSET_T) : (MAXOFF32_T);
1174 1175
1175 1176 /*
1176 1177 * Shadow inodes do not need to hold the vfs_dqrwlock lock. Most
1177 1178 * other uses need the reader lock. opendq() holds the writer lock.
1178 1179 */
1179 1180 ASSERT((oip->i_mode & IFMT) == IFSHAD ||
1180 1181 RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
1181 1182 ASSERT(RW_WRITE_HELD(&oip->i_contents));
1182 1183 /*
1183 1184 * We only allow truncation of regular files and directories
1184 1185 * to arbitrary lengths here. In addition, we allow symbolic
1185 1186 * links to be truncated only to zero length. Other inode
1186 1187 * types cannot have their length set here. Disk blocks are
1187 1188 * being dealt with - especially device inodes where
1188 1189 * ip->i_ordev is actually being stored in ip->i_db[0]!
1189 1190 */
1190 1191 TRANS_INODE(ufsvfsp, oip);
1191 1192 mode = oip->i_mode & IFMT;
1192 1193 if (flags & I_FREE) {
1193 1194 i_genrand *= 16843009; /* turns into shift and adds */
1194 1195 i_genrand++;
1195 1196 oip->i_gen += ((i_genrand + ddi_get_lbolt()) & 0xffff) + 1;
1196 1197 oip->i_flag |= ICHG |IUPD;
1197 1198 oip->i_seq++;
1198 1199 if (length == oip->i_size)
1199 1200 return (0);
1200 1201 flags |= I_CHEAP;
1201 1202 }
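On the magic number above: 16843009 is 0x01010101, so the multiply strength-reduces to three shifts and adds (hence the comment), and together with the increment it forms a cheap linear-congruential step for scrambling i_gen. A stand-alone check of the equivalence, illustrative only:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
            uint32_t x = 1234;              /* i_genrand's initial value */
            uint32_t mul = x * 16843009u;   /* x * 0x01010101 */
            uint32_t alt = x + (x << 8) + (x << 16) + (x << 24);

            /* Both forms agree modulo 2^32; +1 is the LCG increment. */
            printf("%u %u next=%u\n", mul, alt, mul + 1u);
            return (0);
    }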
1202 1203 if (mode == IFIFO)
1203 1204 return (0);
1204 1205 if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
1205 1206 !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
1206 1207 return (EINVAL);
1207 1208 if (length > maxoffset)
1208 1209 return (EFBIG);
1209 1210 if ((mode == IFDIR) || (mode == IFATTRDIR))
1210 1211 flags |= I_DIR;
1211 1212 if (mode == IFSHAD)
1212 1213 flags |= I_SHAD;
1213 1214 if (oip == ufsvfsp->vfs_qinod)
1214 1215 flags |= I_QUOTA;
1215 1216 if (length == oip->i_size) {
1216 1217 /* update ctime and mtime to please POSIX tests */
1217 1218 oip->i_flag |= ICHG |IUPD;
1218 1219 oip->i_seq++;
1219 1220 if (length == 0) {
1220 1221 /* nothing to cache so clear the flag */
1221 1222 oip->i_flag &= ~IFASTSYMLNK;
1222 1223 }
1223 1224 return (0);
1224 1225 }
1225 1226 /* wipe out fast symlink till next access */
1226 1227 if (oip->i_flag & IFASTSYMLNK) {
1227 1228 int j;
1228 1229
1229 1230 ASSERT(ITOV(oip)->v_type == VLNK);
1230 1231
1231 1232 oip->i_flag &= ~IFASTSYMLNK;
1232 1233
1233 1234 for (j = 1; j < NDADDR; j++)
1234 1235 oip->i_db[j] = 0;
1235 1236 for (j = 0; j < NIADDR; j++)
1236 1237 oip->i_ib[j] = 0;
1237 1238 }
1238 1239
1239 1240 boff = (int)blkoff(fs, length);
1240 1241
1241 1242 if (length > oip->i_size) {
1242 1243 /*
1243 1244 * Trunc up case. BMAPALLOC will ensure that the right blocks
1244 1245 * are allocated. This includes extending the old frag to a
1245 1246 * full block (if needed) in addition to doing any work
1246 1247 * needed for allocating the last block.
1247 1248 */
1248 1249 if (boff == 0)
1249 1250 err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr);
1250 1251 else
1251 1252 err = BMAPALLOC(oip, length - 1, boff, cr);
1252 1253
1253 1254 if (err == 0) {
1254 1255 /*
1255 1256 * Save old size and set inode's size now
1256 1257 * so that we don't cause too much of the
1257 1258 * file to be zero'd and pushed.
1258 1259 */
1259 1260 u_offset_t osize = oip->i_size;
1260 1261 oip->i_size = length;
1261 1262 /*
1262 1263 * Make sure we zero out the remaining bytes of
1263 1264 * the page in case a mmap scribbled on it. We
1264 1265 * can't prevent a mmap from writing beyond EOF
1265 1266 * on the last page of a file.
1266 1267 *
1267 1268 */
1268 1269 if ((boff = (int)blkoff(fs, osize)) != 0) {
1269 1270 bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
1270 1271 fs->fs_bsize : fragroundup(fs, boff);
1271 1272 pvn_vpzero(ITOV(oip), osize,
1272 1273 (size_t)(bsize - boff));
1273 1274 }
1274 1275 oip->i_flag |= ICHG|IATTCHG;
1275 1276 oip->i_seq++;
1276 1277 ITIMES_NOLOCK(oip);
1277 1278 /*
1278 1279 * MAXOFF32_T is old 2GB size limit. If
1279 1280 * this operation caused a large file to be
1280 1281 * created, turn on the superblock flag
1281 1282 * and update the superblock, if the flag
1282 1283 * is not already on.
1283 1284 */
1284 1285 if ((length > (u_offset_t)MAXOFF32_T) &&
1285 1286 !(fs->fs_flags & FSLARGEFILES)) {
1286 1287 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
1287 1288 mutex_enter(&ufsvfsp->vfs_lock);
1288 1289 fs->fs_flags |= FSLARGEFILES;
1289 1290 ufs_sbwrite(ufsvfsp);
1290 1291 mutex_exit(&ufsvfsp->vfs_lock);
1291 1292 }
1292 1293 }
1293 1294
1294 1295 return (err);
1295 1296 }
1296 1297
1297 1298 /*
1298 1299 * Update the pages of the file. If the file is not being
1299 1300 * truncated to a block boundary, the contents of the
1300 1301 * pages following the end of the file must be zero'ed
1301 1302 * in case it ever becomes accessible again because
1302 1303 * of subsequent file growth.
1303 1304 */
1304 1305 if (boff == 0) {
1305 1306 (void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
1306 1307 B_INVAL | B_TRUNC, CRED());
1307 1308 } else {
1308 1309 /*
1309 1310 * Make sure that the last block is properly allocated.
1310 1311 * We only really have to do this if the last block is
1311 1312 * actually allocated since ufs_bmap will now handle the case
1312 1313 * of a fragment which has no block allocated. Just to
1313 1314 * be sure, we do it now independent of current allocation.
1314 1315 */
1315 1316 err = BMAPALLOC(oip, length - 1, boff, cr);
1316 1317 if (err)
1317 1318 return (err);
1318 1319
1319 1320 /*
1320 1321 * BMAPALLOC will call bmap_write which defers i_seq
1321 1322 * processing. If the timestamps were changed, update
1322 1323 * i_seq before rdip drops i_contents or syncs the inode.
1323 1324 */
1324 1325 if (oip->i_flag & (ICHG|IUPD))
1325 1326 oip->i_seq++;
1326 1327
1327 1328 /*
1328 1329 * BugId 4069932
1329 1330 * Make sure that the relevant partial page appears in
1330 1331 * the v_pages list, so that pvn_vpzero() will do its
1331 1332 * job. Since doing this correctly requires everything
1332 1333 * in rdip() except for the uiomove(), it's easier and
1333 1334 * safer to do the uiomove() rather than duplicate the
1334 1335 * rest of rdip() here.
1335 1336 *
1336 1337 * To get here, we know that length indicates a byte
1337 1338 * that is not the first byte of a block. (length - 1)
1338 1339 * is the last actual byte known to exist. Deduction
1339 1340 * shows it is in the same block as byte (length).
1340 1341 * Thus, this rdip() invocation should always succeed
1341 1342 * except in the face of i/o errors, and give us the
1342 1343 * block we care about.
1343 1344 *
1344 1345 * rdip() makes the same locking assertions and
1345 1346 * assumptions as we do. We do not acquire any locks
1346 1347 * before calling it, so we have not changed the locking
1347 1348 * situation. Finally, there do not appear to be any
1348 1349 * paths whereby rdip() ends up invoking us again.
1349 1350 * Thus, infinite recursion is avoided.
1350 1351 */
1351 1352 {
1352 1353 uio_t uio;
1353 1354 iovec_t iov[1];
1354 1355 char buffer;
1355 1356
1356 1357 uio.uio_iov = iov;
1357 1358 uio.uio_iovcnt = 1;
1358 1359 uio.uio_loffset = length - 1;
1359 1360 uio.uio_resid = 1;
1360 1361 uio.uio_segflg = UIO_SYSSPACE;
1361 1362 uio.uio_extflg = UIO_COPY_CACHED;
1362 1363
1363 1364 iov[0].iov_base = &buffer;
1364 1365 iov[0].iov_len = 1;
1365 1366
1366 1367 err = rdip(oip, &uio, UIO_READ, NULL);
1367 1368 if (err)
1368 1369 return (err);
1369 1370 }
1370 1371
1371 1372 bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
1372 1373 fs->fs_bsize : fragroundup(fs, boff);
1373 1374 pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
1374 1375 /*
1375 1376 * Ensure full fs block is marked as dirty.
1376 1377 */
1377 1378 (void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
1378 1379 ufs_putapage, B_INVAL | B_TRUNC, CRED());
1379 1380 }
1380 1381
1381 1382 /*
1382 1383 * Calculate index into inode's block list of
1383 1384 * last direct and indirect blocks (if any)
1384 1385 * which we want to keep. Lastblock is -1 when
1385 1386 * the file is truncated to 0.
1386 1387 */
1387 1388 lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
1388 1389 lastiblock[SINGLE] = lastblock - NDADDR;
1389 1390 lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
1390 1391 lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
1391 1392 nblocks = btodb(fs->fs_bsize);
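A worked example of the index arithmetic above, using made-up but typical UFS geometry (8K blocks, NDADDR = 12, NINDIR = 8192/4 = 2048):

    #include <stdio.h>

    int
    main(void)
    {
            long bsize = 8192, ndaddr = 12, nindir = 2048;
            long long length = 100000;      /* truncate to ~97.7K */

            /* lblkno(fs, length + fs->fs_bsize - 1) - 1 */
            long lastblock = (length + bsize - 1) / bsize - 1;
            long single = lastblock - ndaddr;
            long dbl = single - nindir;
            long triple = dbl - nindir * nindir;

            /* Thirteen blocks survive: twelve direct plus entry 0 of
             * the single indirect; double/triple go negative (unused). */
            printf("last=%ld s=%ld d=%ld t=%ld\n",
                lastblock, single, dbl, triple);
            return (0);
    }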
1392 1393
1393 1394 /*
1394 1395 * Update file and block pointers
1395 1396 * on disk before we start freeing blocks.
1396 1397 * If we crash before free'ing blocks below,
1397 1398 * the blocks will be returned to the free list.
1398 1399 * lastiblock values are also normalized to -1
1399 1400 * for calls to indirtrunc below.
1400 1401 */
1401 1402 tip = *oip; /* structure copy */
1402 1403 ip = &tip;
1403 1404
1404 1405 for (level = TRIPLE; level >= SINGLE; level--)
1405 1406 if (lastiblock[level] < 0) {
1406 1407 oip->i_ib[level] = 0;
1407 1408 lastiblock[level] = -1;
1408 1409 }
1409 1410 for (i = NDADDR - 1; i > lastblock; i--) {
1410 1411 oip->i_db[i] = 0;
1411 1412 flags |= I_CHEAP;
1412 1413 }
1413 1414 oip->i_size = length;
1414 1415 oip->i_flag |= ICHG|IUPD|IATTCHG;
1415 1416 oip->i_seq++;
1416 1417 if (!TRANS_ISTRANS(ufsvfsp))
1417 1418 ufs_iupdat(oip, I_SYNC); /* do sync inode update */
1418 1419
1419 1420 /*
1420 1421 * Indirect blocks first.
1421 1422 */
1422 1423 for (level = TRIPLE; level >= SINGLE; level--) {
1423 1424 bn = ip->i_ib[level];
1424 1425 if (bn != 0) {
1425 1426 blocksreleased +=
1426 1427 indirtrunc(ip, bn, lastiblock[level], level, flags);
1427 1428 if (lastiblock[level] < 0) {
1428 1429 ip->i_ib[level] = 0;
1429 1430 free(ip, bn, (off_t)fs->fs_bsize,
1430 1431 flags | I_IBLK);
1431 1432 blocksreleased += nblocks;
1432 1433 }
1433 1434 }
1434 1435 if (lastiblock[level] >= 0)
1435 1436 goto done;
1436 1437 }
1437 1438
1438 1439 /*
1439 1440 * All whole direct blocks or frags.
1440 1441 */
1441 1442 for (i = NDADDR - 1; i > lastblock; i--) {
1442 1443 bn = ip->i_db[i];
1443 1444 if (bn == 0)
1444 1445 continue;
1445 1446 ip->i_db[i] = 0;
1446 1447 bsize = (off_t)blksize(fs, ip, i);
1447 1448 free(ip, bn, bsize, flags);
1448 1449 blocksreleased += btodb(bsize);
1449 1450 }
1450 1451 if (lastblock < 0)
1451 1452 goto done;
1452 1453
1453 1454 /*
1454 1455 * Finally, look for a change in size of the
1455 1456 * last direct block; release any frags.
1456 1457 */
1457 1458 bn = ip->i_db[lastblock];
1458 1459 if (bn != 0) {
1459 1460 off_t oldspace, newspace;
1460 1461
1461 1462 /*
1462 1463 * Calculate amount of space we're giving
1463 1464 * back as old block size minus new block size.
1464 1465 */
1465 1466 oldspace = blksize(fs, ip, lastblock);
1466 1467 UFS_SET_ISIZE(length, ip);
1467 1468 newspace = blksize(fs, ip, lastblock);
1468 1469 if (newspace == 0) {
1469 1470 err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
1470 1471 return (err);
1471 1472 }
1472 1473 if (oldspace - newspace > 0) {
1473 1474 /*
1474 1475 * Block number of space to be free'd is
1475 1476 * the old block # plus the number of frags
1476 1477 * required for the storage we're keeping.
1477 1478 */
1478 1479 bn += numfrags(fs, newspace);
1479 1480 free(ip, bn, oldspace - newspace, flags);
1480 1481 blocksreleased += btodb(oldspace - newspace);
1481 1482 }
1482 1483 }
1483 1484 done:
1484 1485 /* BEGIN PARANOIA */
1485 1486 for (level = SINGLE; level <= TRIPLE; level++)
1486 1487 if (ip->i_ib[level] != oip->i_ib[level]) {
1487 1488 err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
1488 1489 return (err);
1489 1490 }
1490 1491
1491 1492 for (i = 0; i < NDADDR; i++)
1492 1493 if (ip->i_db[i] != oip->i_db[i]) {
1493 1494 err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
1494 1495 return (err);
1495 1496 }
1496 1497 /* END PARANOIA */
1497 1498 oip->i_blocks -= blocksreleased;
1498 1499
1499 1500 if (oip->i_blocks < 0) { /* sanity */
1500 1501 cmn_err(CE_NOTE,
1501 1502 "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
1502 1503 fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
1503 1504 (int)oip->i_blocks);
1504 1505 oip->i_blocks = 0;
1505 1506 }
1506 1507 oip->i_flag |= ICHG|IATTCHG;
1507 1508 oip->i_seq++;
1508 1509 /* blocksreleased is >= zero, so this can not fail */
1509 1510 (void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL,
1510 1511 (size_t *)NULL);
1511 1512 return (0);
1512 1513 }
1513 1514
1514 1515 /*
1515 1516 * Check mode permission on inode. Mode is READ, WRITE or EXEC.
1516 1517 * In the case of WRITE, the read-only status of the file system
1517 1518 * is checked. Depending on the calling user, the appropriate
1518 1519 * mode bits are selected; privileges to override missing permission
1519 1520 * bits are checked through secpolicy_vnode_access().
1520 1521 * The i_contents lock must be held as reader here to prevent racing with
1521 1522 * the acl subsystem removing/setting/changing acls on this inode.
1522 1523 * The caller is responsible for indicating whether or not the i_contents
1523 1524 * lock needs to be acquired here or if already held.
1524 1525 */
1525 1526 int
1526 1527 ufs_iaccess(struct inode *ip, int mode, struct cred *cr, int dolock)
1527 1528 {
1528 1529 int shift = 0;
1529 1530 int ret = 0;
1530 1531
1531 1532 if (dolock)
1532 1533 rw_enter(&ip->i_contents, RW_READER);
1533 1534 ASSERT(RW_LOCK_HELD(&ip->i_contents));
1534 1535
1535 1536 if (mode & IWRITE) {
1536 1537 /*
1537 1538 * Disallow write attempts on read-only
1538 1539 * file systems, unless the file is a block
1539 1540 * or character device or a FIFO.
1540 1541 */
1541 1542 if (ip->i_fs->fs_ronly != 0) {
1542 1543 if ((ip->i_mode & IFMT) != IFCHR &&
1543 1544 (ip->i_mode & IFMT) != IFBLK &&
1544 1545 (ip->i_mode & IFMT) != IFIFO) {
1545 1546 ret = EROFS;
1546 1547 goto out;
1547 1548 }
1548 1549 }
1549 1550 }
1550 1551 /*
1551 1552 * If there is an acl, check the acl and return.
1552 1553 */
1553 1554 if (ip->i_ufs_acl && ip->i_ufs_acl->aowner) {
1554 1555 ret = ufs_acl_access(ip, mode, cr);
1555 1556 goto out;
1556 1557 }
1557 1558
1558 1559 /*
1559 1560 * Access check is based on only one of owner, group, public.
1560 1561 * If not owner, then check group.
1561 1562 * If not a member of the group, then check public access.
1562 1563 */
1563 1564 if (crgetuid(cr) != ip->i_uid) {
1564 1565 shift += 3;
1565 1566 if (!groupmember((uid_t)ip->i_gid, cr))
1566 1567 shift += 3;
1567 1568 }
1568 1569
1569 1570 /* test missing privilege bits */
1570 1571 ret = secpolicy_vnode_access2(cr, ITOV(ip), ip->i_uid,
1571 1572 ip->i_mode << shift, mode);
1572 1573 out:
1573 1574 if (dolock)
1574 1575 rw_exit(&ip->i_contents);
1575 1576 return (ret);
1576 1577 }
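The shift trick works because the mode triplets sit 3 bits apart: shifting the group (shift 3) or other (shift 6) bits left lands them in the owner position that secpolicy_vnode_access2() examines. A small illustration with made-up values:

    #include <stdio.h>

    int
    main(void)
    {
            int mode = 0754;        /* rwxr-xr-- */
            int shift = 6;          /* caller: not owner, not in group */

            /* The "other" bits (04) now occupy the owner position
             * (0400), so only read permission would be granted. */
            printf("checked bits: %o\n", (mode << shift) & 0700);
            return (0);
    }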
1577 1578
1578 1579 /*
1579 1580 * if necessary, remove an inode from the free list
1580 1581 * i_contents is held except at unmount
1581 1582 *
1582 1583 * Return 1 if the inode is taken off of the ufs_idle_q,
1583 1584 * and the caller is expected to call VN_RELE.
1584 1585 *
1585 1586 * Return 0 otherwise.
1586 1587 */
1587 1588 int
1588 1589 ufs_rmidle(struct inode *ip)
1589 1590 {
1590 1591 int rval = 0;
1591 1592
1592 1593 mutex_enter(&ip->i_tlock);
1593 1594 if ((ip->i_flag & IREF) == 0) {
1594 1595 mutex_enter(&ufs_idle_q.uq_mutex);
1595 1596 ip->i_freef->i_freeb = ip->i_freeb;
1596 1597 ip->i_freeb->i_freef = ip->i_freef;
1597 1598 ip->i_freef = ip;
1598 1599 ip->i_freeb = ip;
1599 1600 ip->i_flag |= IREF;
1600 1601 ufs_idle_q.uq_ne--;
1601 1602 if (ip->i_flag & IJUNKIQ) {
1602 1603 ufs_njunk_iq--;
1603 1604 ip->i_flag &= ~IJUNKIQ;
1604 1605 } else {
1605 1606 ufs_nuseful_iq--;
1606 1607 }
1607 1608 mutex_exit(&ufs_idle_q.uq_mutex);
1608 1609 rval = 1;
1609 1610 }
1610 1611 mutex_exit(&ip->i_tlock);
1611 1612 return (rval);
1612 1613 }
1613 1614
1614 1615 /*
1615 1616 * scan the hash of inodes and call func with the inode locked
1616 1617 */
1617 1618 int
1618 1619 ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
1619 1620 struct ufsvfs *ufsvfsp)
1620 1621 {
1621 1622 struct inode *ip; /* current inode */
1622 1623 struct inode *lip = NULL; /* last/previous inode */
1623 1624 union ihead *ih; /* current hash chain */
1624 1625 int error, i;
1625 1626 int saverror = 0;
1626 1627 int lip_held; /* lip needs a VN_RELE() */
1627 1628
1628 1629 /*
1629 1630 * If ufsvfsp is NULL, then our caller should be holding
1630 1631 * ufs_scan_lock to avoid conflicts between ufs_unmount() and
1631 1632 * ufs_update(). Otherwise, to avoid false-positives in
1632 1633 * ufs_unmount()'s v_count-based EBUSY check, we only hold
1633 1634 * those inodes that are in the file system our caller cares
1634 1635 * about.
1635 1636 *
1636 1637 * We know that ip is a valid inode in the hash chain (and thus
1637 1638 * we can trust i_ufsvfs) because the inode we chained from
1638 1639 * (lip) is still in the hash chain. This is true because either:
1639 1640 *
1640 1641 * 1. We did not drop the hash chain lock since the last
1641 1642 * iteration (because we were not interested in the last inode),
1642 1643 * or
1643 1644 * 2. We maintained a hold on the last inode while we
1644 1645 * were processing it, so it could not be removed
1645 1646 * from the hash chain.
1646 1647 *
1647 1648 * The whole reason we're dropping and re-grabbing the chain
1648 1649 * lock on every inode is so that we don't present a major
1649 1650 * choke point on throughput, particularly when we've been
1650 1651 * called on behalf of fsflush.
1651 1652 */
1652 1653
1653 1654 for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
1654 1655 mutex_enter(&ih_lock[i]);
1655 1656 for (ip = ih->ih_chain[0], lip_held = 0;
1656 1657 ip != (struct inode *)ih;
1657 1658 ip = lip->i_forw) {
1658 1659
1659 1660 ins.in_scan.value.ul++;
1660 1661
1661 1662 /*
1662 1663 * Undo the previous iteration's VN_HOLD(), but
1663 1664 * only if one was done.
1664 1665 */
1665 1666 if (lip_held)
1666 1667 VN_RELE(ITOV(lip));
1667 1668
1668 1669 lip = ip;
1669 1670 if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
1670 1671 /*
1671 1672 * We're not processing all inodes, and
1672 1673 * this inode is not in the filesystem of
1673 1674 * interest, so skip it. No need to do a
1674 1675 * VN_HOLD() since we're not dropping the
1675 1676 * hash chain lock until after we've
1676 1677 * done the i_forw traversal above.
1677 1678 */
1678 1679 lip_held = 0;
1679 1680 continue;
1680 1681 }
1681 1682 VN_HOLD(ITOV(ip));
1682 1683 lip_held = 1;
1683 1684 mutex_exit(&ih_lock[i]);
1684 1685
1685 1686 /*
1686 1687 * Acquire the contents lock as writer to make
1687 1688 * sure that the inode has been initialized in
1688 1689 * the cache or removed from the idle list by
1689 1690 * ufs_iget(). This works because ufs_iget()
1690 1691 * acquires the contents lock before putting
1691 1692 * the inode into the cache. If we can lock
1692 1693 * it, then ufs_iget() is done with it.
1693 1694 */
1694 1695
1695 1696 if (rwtry) {
1696 1697 if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
1697 1698 mutex_enter(&ih_lock[i]);
1698 1699 continue;
1699 1700 }
1700 1701 } else {
1701 1702 rw_enter(&ip->i_contents, RW_WRITER);
1702 1703 }
1703 1704
1704 1705 rw_exit(&ip->i_contents);
1705 1706
1706 1707 /*
1707 1708 * ISTALE means the inode couldn't be read
1708 1709 *
1709 1710 * We don't have to hold the i_contents lock
1710 1711 * for this check for a couple of
1711 1712 * reasons. First, if ISTALE is set then the
1712 1713 * flag cannot be cleared until the inode is
1713 1714 * removed from the cache and that cannot
1714 1715 * happen until after we VN_RELE() it.
1715 1716 * Second, if ISTALE is not set, then the
1716 1717 * inode is in the cache and does not need to
1717 1718 * be read from disk so ISTALE cannot be set
1718 1719 * while we are not looking.
1719 1720 */
1720 1721 if ((ip->i_flag & ISTALE) == 0) {
1721 1722 if ((error = (*func)(ip, arg)) != 0)
1722 1723 saverror = error;
1723 1724 }
1724 1725
1725 1726 mutex_enter(&ih_lock[i]);
1726 1727 }
1727 1728 if (lip_held)
1728 1729 VN_RELE(ITOV(lip));
1729 1730 mutex_exit(&ih_lock[i]);
1730 1731 }
1731 1732 return (saverror);
1732 1733 }
1733 1734
1734 1735 /*
1735 1736 * Mark inode with the current time, plus a unique increment.
1736 1737 *
1737 1738 * Since we only keep 32-bit time on disk, if UFS is still alive
1738 1739 * beyond 2038, filesystem times will simply stick at the last
1739 1740 * possible second of 32-bit time. Not ideal, but probably better
1740 1741 * than going into the remote past, or confusing applications with
1741 1742 * negative time.
1742 1743 */
1743 1744 void
1744 1745 ufs_imark(struct inode *ip)
1745 1746 {
1746 1747 timestruc_t now;
1747 1748 int32_t usec, nsec;
1748 1749
1749 1750 /*
1750 1751 * The update of i_seq may have been deferred, increase i_seq here
1751 1752 * to make sure it is in sync with the timestamps.
1752 1753 */
1753 1754 if (ip->i_flag & ISEQ) {
1754 1755 ASSERT(ip->i_flag & (IUPD|ICHG));
1755 1756 ip->i_seq++;
1756 1757 ip->i_flag &= ~ISEQ;
1757 1758 }
1758 1759
1759 1760 gethrestime(&now);
1760 1761
1761 1762 /*
1762 1763 * Fast algorithm to convert nsec to usec -- see hrt2ts()
1763 1764 * in common/os/timers.c for a full description.
1764 1765 */
1765 1766 nsec = now.tv_nsec;
1766 1767 usec = nsec + (nsec >> 2);
1767 1768 usec = nsec + (usec >> 1);
1768 1769 usec = nsec + (usec >> 2);
1769 1770 usec = nsec + (usec >> 4);
1770 1771 usec = nsec - (usec >> 3);
1771 1772 usec = nsec + (usec >> 2);
1772 1773 usec = nsec + (usec >> 3);
1773 1774 usec = nsec + (usec >> 4);
1774 1775 usec = nsec + (usec >> 1);
1775 1776 usec = nsec + (usec >> 6);
1776 1777 usec = usec >> 10;
1777 1778
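The shift/add ladder above evaluates roughly nsec * 1.024 in integer arithmetic, so the final >> 10 (divide by 1024) approximates nsec / 1000 without an integer division. A stand-alone spot-check against exact division, sketch only:

    #include <stdio.h>
    #include <stdint.h>

    static int32_t
    fast_usec(int32_t nsec)
    {
            int32_t usec;

            usec = nsec + (nsec >> 2);
            usec = nsec + (usec >> 1);
            usec = nsec + (usec >> 2);
            usec = nsec + (usec >> 4);
            usec = nsec - (usec >> 3);
            usec = nsec + (usec >> 2);
            usec = nsec + (usec >> 3);
            usec = nsec + (usec >> 4);
            usec = nsec + (usec >> 1);
            usec = nsec + (usec >> 6);
            return (usec >> 10);
    }

    int
    main(void)
    {
            int32_t n[] = { 0, 999, 1000, 123456789, 999999999 };
            int i;

            for (i = 0; i < 5; i++)
                    printf("%d: fast %d exact %d\n",
                        n[i], fast_usec(n[i]), n[i] / 1000);
            return (0);
    }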
1778 1779 mutex_enter(&ufs_iuniqtime_lock);
1779 1780 if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
1780 1781 usec > iuniqtime.tv_usec) {
1781 1782 if (now.tv_sec < TIME32_MAX) {
1782 1783 iuniqtime.tv_sec = (time32_t)now.tv_sec;
1783 1784 iuniqtime.tv_usec = usec;
1784 1785 }
1785 1786 } else {
1786 1787 if (iuniqtime.tv_sec < TIME32_MAX) {
1787 1788 iuniqtime.tv_usec++;
1788 1789 /* Check for usec overflow */
1789 1790 if (iuniqtime.tv_usec >= MICROSEC) {
1790 1791 iuniqtime.tv_sec++;
1791 1792 iuniqtime.tv_usec = 0;
1792 1793 }
1793 1794 }
1794 1795 }
1795 1796
1796 1797 if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
1797 1798 ip->i_atime = iuniqtime;
1798 1799 }
1799 1800 if (ip->i_flag & IUPD) {
1800 1801 ip->i_mtime = iuniqtime;
1801 1802 ip->i_flag |= IMODTIME;
1802 1803 }
1803 1804 if (ip->i_flag & ICHG) {
1804 1805 ip->i_diroff = 0;
1805 1806 ip->i_ctime = iuniqtime;
1806 1807 }
1807 1808 mutex_exit(&ufs_iuniqtime_lock);
1808 1809 }
1809 1810
1810 1811 /*
1811 1812 * Update timestamps in inode.
1812 1813 */
1813 1814 void
1814 1815 ufs_itimes_nolock(struct inode *ip)
1815 1816 {
1816 1817
1817 1818 /*
1818 1819 * if noatime is set and the inode access time is the only field that
1819 1820 * must be changed, exit immediately.
1820 1821 */
1821 1822 if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) &&
1822 1823 (ip->i_ufsvfs->vfs_noatime)) {
1823 1824 return;
1824 1825 }
1825 1826
1826 1827 if (ip->i_flag & (IUPD|IACC|ICHG)) {
1827 1828 if (ip->i_flag & ICHG)
1828 1829 ip->i_flag |= IMOD;
1829 1830 else
1830 1831 ip->i_flag |= IMODACC;
1831 1832 ufs_imark(ip);
1832 1833 ip->i_flag &= ~(IACC|IUPD|ICHG);
1833 1834 }
1834 1835 }