re #13613 rb4516 Tunables need volatile keyword
--- old/usr/src/uts/common/fs/fsflush.c
+++ new/usr/src/uts/common/fs/fsflush.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
22 22 /* All Rights Reserved */
23 23
24 -
25 24 /*
26 25 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
27 26 * Use is subject to license terms.
28 27 */
28 +/*
29 + * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
30 + */
29 31
30 32 #include <sys/types.h>
31 33 #include <sys/t_lock.h>
32 34 #include <sys/param.h>
33 35 #include <sys/tuneable.h>
34 36 #include <sys/inline.h>
35 37 #include <sys/systm.h>
36 38 #include <sys/proc.h>
37 39 #include <sys/user.h>
38 40 #include <sys/var.h>
39 41 #include <sys/buf.h>
40 42 #include <sys/vfs.h>
41 43 #include <sys/cred.h>
42 44 #include <sys/kmem.h>
43 45 #include <sys/vnode.h>
44 46 #include <sys/swap.h>
45 47 #include <sys/vm.h>
46 48 #include <sys/debug.h>
47 49 #include <sys/cmn_err.h>
48 50 #include <sys/sysinfo.h>
49 51 #include <sys/callb.h>
50 52 #include <sys/reboot.h>
51 53 #include <sys/time.h>
52 54 #include <sys/fs/ufs_inode.h>
53 55 #include <sys/fs/ufs_bio.h>
54 56
55 57 #include <vm/hat.h>
56 58 #include <vm/page.h>
57 59 #include <vm/pvn.h>
58 60 #include <vm/seg_kmem.h>
59 61
60 -int doiflush = 1; /* non-zero to turn inode flushing on */
61 -int dopageflush = 1; /* non-zero to turn page flushing on */
62 +volatile int doiflush = 1; /* non-zero to turn inode flushing on */
63 +volatile int dopageflush = 1; /* non-zero to turn page flushing on */
62 64
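The reason the two declarations above gain the volatile qualifier: fsflush() below loops forever, and without volatile an optimizing compiler may load doiflush and dopageflush once and test cached register copies on every pass, so a value patched into the running kernel would never be observed. A minimal sketch of the hazard, using hypothetical names (flush_daemon() and flush_pages() are stand-ins, not the real routines):

	#include <unistd.h>

	/* hypothetical stand-in for the daemon's real work */
	static void
	flush_pages(void)
	{
	}

	/*
	 * Without volatile, the compiler may hoist the load of do_flush out
	 * of the loop; a debugger writing 0 into the variable would then
	 * have no effect.  volatile forces a fresh load on every iteration.
	 */
	volatile int do_flush = 1;	/* patched at run time, e.g. via mdb -kw */

	void
	flush_daemon(void)
	{
		for (;;) {
			sleep(1);	/* stand-in for the cv_wait() wakeup */
			if (do_flush)
				flush_pages();
		}
	}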
63 65 /*
64 66 * To improve boot performance, don't run the inode flushing loop until
65 67 * the specified number of seconds after boot. To revert to the old
66 68 * behavior, set fsflush_iflush_delay to 0. We have not created any new
67 69 * filesystem danger that did not exist previously, since there is always a
68 70 * window in between when fsflush does the inode flush loop during which the
69 71 * system could crash, fail to sync the filesystem, and fsck will be needed
70 72 * to recover. We have, however, widened this window. Finally,
71 73 * we never delay inode flushing if we're booting into single user mode,
72 74 * where the administrator may be modifying files or using fsck. This
73 75 * modification avoids inode flushes during boot whose only purpose is to
74 76 * update atimes on files which have been accessed during boot.
75 77 */
76 78 int fsflush_iflush_delay = 60;
77 79
78 80 kcondvar_t fsflush_cv;
79 81 static kmutex_t fsflush_lock; /* just for the cv_wait */
80 82 ksema_t fsflush_sema; /* to serialize with reboot */
81 83
82 84 /*
83 85 * some statistics for fsflush_do_pages
84 86 */
85 87 typedef struct {
86 88 ulong_t fsf_scan; /* number of pages scanned */
87 89 ulong_t fsf_examined; /* number of page_t's actually examined, can */
88 90 /* be less than fsf_scan due to large pages */
89 91 ulong_t fsf_locked; /* pages we actually page_lock()ed */
90 92 ulong_t fsf_modified; /* number of modified pages found */
91 93 ulong_t fsf_coalesce; /* number of page coalesces done */
92 94 ulong_t fsf_time; /* nanoseconds of run time */
93 95 ulong_t fsf_releases; /* number of page_release() done */
94 96 } fsf_stat_t;
95 97
96 98 fsf_stat_t fsf_recent; /* counts for most recent duty cycle */
97 99 fsf_stat_t fsf_total; /* total of counts */
98 100 ulong_t fsf_cycles; /* number of runs reflected in fsf_total */
99 101
100 102 /*
101 103 * data used to determine when we can coalesce consecutive free pages
102 104 * into larger pages.
103 105 */
104 106 #define MAX_PAGESIZES 32
105 107 static ulong_t fsf_npgsz;
106 108 static pgcnt_t fsf_pgcnt[MAX_PAGESIZES];
107 109 static pgcnt_t fsf_mask[MAX_PAGESIZES];
108 110
109 111
110 112 /*
111 113 * Scan page_t's and issue I/O's for modified pages.
112 114 *
113 115 * Also coalesces consecutive small sized free pages into the next larger
114 116 * pagesize. This costs a tiny bit of time in fsflush, but will reduce time
115 117 * spent scanning on later passes and for anybody allocating large pages.
116 118 */
117 119 static void
118 120 fsflush_do_pages()
119 121 {
120 122 vnode_t *vp;
121 123 ulong_t pcount;
122 124 hrtime_t timer = gethrtime();
123 125 ulong_t releases = 0;
124 126 ulong_t nexamined = 0;
125 127 ulong_t nlocked = 0;
126 128 ulong_t nmodified = 0;
127 129 ulong_t ncoalesce = 0;
128 130 ulong_t cnt;
129 131 int mod;
130 132 int fspage = 1;
131 133 u_offset_t offset;
132 134 uint_t szc;
133 135
134 136 page_t *coal_page = NULL; /* 1st page in group to coalesce */
135 137 uint_t coal_szc = 0; /* size code, coal_page->p_szc */
136 138 uint_t coal_cnt = 0; /* count of pages seen */
137 139
138 140 static ulong_t nscan = 0;
139 141 static pgcnt_t last_total_pages = 0;
140 142 static page_t *pp = NULL;
141 143
142 144 /*
143 145 * Check to see if total_pages has changed.
144 146 */
145 147 if (total_pages != last_total_pages) {
146 148 last_total_pages = total_pages;
147 149 nscan = (last_total_pages * (tune.t_fsflushr))/v.v_autoup;
148 150 }
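A worked example of the nscan computation above, assuming the stock defaults tune.t_fsflushr = 1 and v.v_autoup = 30: on a machine with 1,048,576 physical pages (4 GB of 4 KB pages), nscan = 1048576 * 1 / 30 ≈ 34952, so each one-second wakeup scans about 1/30th of memory and every page is visited roughly once per autoup interval.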
149 151
150 152 if (pp == NULL)
151 153 pp = memsegs->pages;
152 154
153 155 pcount = 0;
154 156 while (pcount < nscan) {
155 157
156 158 /*
157 159 * move to the next page, skipping over large pages
158 160 * and issuing prefetches.
159 161 */
160 162 if (pp->p_szc && fspage == 0) {
161 163 pfn_t pfn;
162 164
163 165 pfn = page_pptonum(pp);
164 166 cnt = page_get_pagecnt(pp->p_szc);
165 167 cnt -= pfn & (cnt - 1);
166 168 } else
167 169 cnt = 1;
168 170
169 171 pp = page_nextn(pp, cnt);
170 172 prefetch_page_r((void *)pp);
171 173 ASSERT(pp != NULL);
172 174 pcount += cnt;
173 175
174 176 /*
175 177 * Do a bunch of dirty tests (ie. no locking) to determine
176 178 * if we can quickly skip this page. These tests are repeated
177 179 * after acquiring the page lock.
178 180 */
179 181 ++nexamined;
180 182 if (PP_ISSWAP(pp)) {
181 183 fspage = 0;
182 184 coal_page = NULL;
183 185 continue;
184 186 }
185 187
186 188 /*
187 189 * skip free pages too, but try coalescing them into larger
188 190 * pagesizes
189 191 */
190 192 if (PP_ISFREE(pp)) {
191 193 /*
192 194 * skip pages with a file system identity or that
193 195 * are already maximum size
194 196 */
195 197 fspage = 0;
196 198 szc = pp->p_szc;
197 199 if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
198 200 coal_page = NULL;
199 201 continue;
200 202 }
201 203
202 204 /*
203 205 * If not in a coalescing candidate page or the size
204 206 * codes are different, start a new candidate.
205 207 */
206 208 if (coal_page == NULL || coal_szc != szc) {
207 209
208 210 /*
209 211 * page must be properly aligned
210 212 */
211 213 if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
212 214 coal_page = NULL;
213 215 continue;
214 216 }
215 217 coal_page = pp;
216 218 coal_szc = szc;
217 219 coal_cnt = 1;
218 220 continue;
219 221 }
220 222
221 223 /*
222 224 * acceptable to add this to existing candidate page
223 225 */
224 226 ++coal_cnt;
225 227 if (coal_cnt < fsf_pgcnt[coal_szc])
226 228 continue;
227 229
228 230 /*
229 231 * We've got enough pages to coalesce, so do it.
230 232 * After promoting, we clear coal_page, so it will
231 233 * take another pass to promote this to an even
232 234 * larger page.
233 235 */
234 236 ++ncoalesce;
235 237 (void) page_promote_size(coal_page, coal_szc);
236 238 coal_page = NULL;
237 239 continue;
238 240 } else {
239 241 coal_page = NULL;
240 242 }
241 243
242 244 if (PP_ISKAS(pp) ||
243 245 PAGE_LOCKED(pp) ||
244 246 pp->p_lckcnt != 0 ||
245 247 pp->p_cowcnt != 0) {
246 248 fspage = 0;
247 249 continue;
248 250 }
249 251
250 252
251 253 /*
252 254 * Reject pages that can't be "exclusively" locked.
253 255 */
254 256 if (!page_trylock(pp, SE_EXCL))
255 257 continue;
256 258 ++nlocked;
257 259
258 260
259 261 /*
260 262 * After locking the page, redo the above checks.
261 263 * Since we locked the page, leave out the PAGE_LOCKED() test.
262 264 */
263 265 vp = pp->p_vnode;
264 266 if (PP_ISSWAP(pp) ||
265 267 PP_ISFREE(pp) ||
266 268 vp == NULL ||
267 269 PP_ISKAS(pp) ||
268 270 (vp->v_flag & VISSWAP) != 0) {
269 271 page_unlock(pp);
270 272 fspage = 0;
271 273 continue;
272 274 }
273 275 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
274 276 page_unlock(pp);
275 277 continue;
276 278 }
277 279
278 280 fspage = 1;
279 281 ASSERT(vp->v_type != VCHR);
280 282
281 283 /*
282 284 * Check the modified bit. Leaving the bit alone in hardware.
283 285 * It will be cleared if we do the putpage.
284 286 */
285 287 if (IS_VMODSORT(vp))
286 288 mod = hat_ismod(pp);
287 289 else
288 290 mod = hat_pagesync(pp,
289 291 HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;
290 292
291 293 if (mod) {
292 294 ++nmodified;
293 295 offset = pp->p_offset;
294 296
295 297 /*
296 298 * Hold the vnode before releasing the page lock
297 299 * to prevent it from being freed and re-used by
298 300 * some other thread.
299 301 */
300 302 VN_HOLD(vp);
301 303
302 304 page_unlock(pp);
303 305
304 306 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
305 307 kcred, NULL);
306 308
307 309 VN_RELE(vp);
308 310 } else {
309 311
310 312 /*
311 313 * Catch any pages which should be on the cache list,
312 314 * but aren't yet.
313 315 */
314 316 if (hat_page_is_mapped(pp) == 0) {
315 317 ++releases;
316 318 (void) page_release(pp, 1);
317 319 } else {
318 320 page_unlock(pp);
319 321 }
320 322 }
321 323 }
322 324
323 325 /*
324 326 * maintain statistics
325 327 * reset every million wakeups, just to avoid overflow
326 328 */
327 329 if (++fsf_cycles == 1000000) {
328 330 fsf_cycles = 0;
329 331 fsf_total.fsf_scan = 0;
330 332 fsf_total.fsf_examined = 0;
331 333 fsf_total.fsf_locked = 0;
332 334 fsf_total.fsf_modified = 0;
333 335 fsf_total.fsf_coalesce = 0;
334 336 fsf_total.fsf_time = 0;
335 337 fsf_total.fsf_releases = 0;
336 338 } else {
337 339 fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
338 340 fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
339 341 fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
340 342 fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
341 343 fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
342 344 fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer;
343 345 fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
344 346 }
345 347 }
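A side note on the accumulation statements above: they rely on assignment being an expression in C, so each line records the per-cycle count in fsf_recent and adds the same value to fsf_total in one statement. A standalone illustration (hypothetical variables):

	#include <stdio.h>

	int
	main(void)
	{
		long total = 0, recent = 0;
		long sample = 42;	/* hypothetical per-cycle measurement */

		/* assignment yields the stored value: record and accumulate */
		total += recent = sample;

		printf("recent=%ld total=%ld\n", recent, total);
		return (0);
	}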
346 348
347 349 /*
348 350 * As part of file system hardening, this daemon is awakened
349 351 * every second to flush cached data which includes the
350 352 * buffer cache, the inode cache and mapped pages.
351 353 */
352 354 void
353 355 fsflush()
354 356 {
355 357 struct buf *bp, *dwp;
356 358 struct hbuf *hp;
357 359 int autoup;
358 360 unsigned int ix, icount, count = 0;
359 361 callb_cpr_t cprinfo;
360 362 uint_t bcount;
361 363 kmutex_t *hmp;
362 364 struct vfssw *vswp;
363 365
364 366 proc_fsflush = ttoproc(curthread);
365 367 proc_fsflush->p_cstime = 0;
366 368 proc_fsflush->p_stime = 0;
367 369 proc_fsflush->p_cutime = 0;
368 370 proc_fsflush->p_utime = 0;
369 371 bcopy("fsflush", curproc->p_user.u_psargs, 8);
370 372 bcopy("fsflush", curproc->p_user.u_comm, 7);
371 373
372 374 mutex_init(&fsflush_lock, NULL, MUTEX_DEFAULT, NULL);
373 375 sema_init(&fsflush_sema, 0, NULL, SEMA_DEFAULT, NULL);
374 376
375 377 /*
376 378 * Setup page coalescing.
377 379 */
378 380 fsf_npgsz = page_num_pagesizes();
379 381 ASSERT(fsf_npgsz < MAX_PAGESIZES);
380 382 for (ix = 0; ix < fsf_npgsz - 1; ++ix) {
381 383 fsf_pgcnt[ix] =
382 384 page_get_pagesize(ix + 1) / page_get_pagesize(ix);
383 385 fsf_mask[ix] = page_get_pagecnt(ix + 1) - 1;
384 386 }
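A worked example of the table this loop builds, assuming an x86 machine exposing 4 KB and 2 MB page sizes (so fsf_npgsz == 2): fsf_pgcnt[0] = 2 MB / 4 KB = 512 and fsf_mask[0] = 512 - 1 = 511. In fsflush_do_pages() above, a free 4 KB page can therefore only start a coalescing candidate when its page frame number is 512-aligned, and page_promote_size() is called once 512 consecutive free pages of that size code have been seen.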
385 387
386 388 autoup = v.v_autoup * hz;
387 389 icount = v.v_autoup / tune.t_fsflushr;
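With the same defaults (v.v_autoup = 30, tune.t_fsflushr = 1) and assuming hz = 100, autoup works out to 3000 ticks, so the delayed-write loop below writes back only buffers that have been dirty for at least 30 seconds of lbolt time; icount = 30, so the inode-flushing code at the bottom of the main loop runs on every 30th wakeup.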
388 390 CALLB_CPR_INIT(&cprinfo, &fsflush_lock, callb_generic_cpr, "fsflush");
389 391 loop:
390 392 sema_v(&fsflush_sema);
391 393 mutex_enter(&fsflush_lock);
392 394 CALLB_CPR_SAFE_BEGIN(&cprinfo);
393 395 cv_wait(&fsflush_cv, &fsflush_lock); /* wait for clock */
394 396 CALLB_CPR_SAFE_END(&cprinfo, &fsflush_lock);
395 397 mutex_exit(&fsflush_lock);
396 398 sema_p(&fsflush_sema);
397 399
398 400 /*
399 401 * Write back all old B_DELWRI buffers on the freelist.
400 402 */
401 403 bcount = 0;
402 404 for (ix = 0; ix < v.v_hbuf; ix++) {
403 405
404 406 hp = &hbuf[ix];
405 407 dwp = (struct buf *)&dwbuf[ix];
406 408
407 409 bcount += (hp->b_length);
408 410
409 411 if (dwp->av_forw == dwp) {
410 412 continue;
411 413 }
412 414
413 415 hmp = &hbuf[ix].b_lock;
414 416 mutex_enter(hmp);
415 417 bp = dwp->av_forw;
416 418
417 419 /*
418 420 * Go down only on the delayed write lists.
419 421 */
420 422 while (bp != dwp) {
421 423
422 424 ASSERT(bp->b_flags & B_DELWRI);
423 425
424 426 if ((bp->b_flags & B_DELWRI) &&
425 427 (ddi_get_lbolt() - bp->b_start >= autoup) &&
426 428 sema_tryp(&bp->b_sem)) {
427 429 bp->b_flags |= B_ASYNC;
428 430 hp->b_length--;
429 431 notavail(bp);
430 432 mutex_exit(hmp);
431 433 if (bp->b_vp == NULL) {
432 434 BWRITE(bp);
433 435 } else {
434 436 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs,
435 437 bp);
436 438 }
437 439 mutex_enter(hmp);
438 440 bp = dwp->av_forw;
439 441 } else {
440 442 bp = bp->av_forw;
441 443 }
442 444 }
443 445 mutex_exit(hmp);
444 446 }
445 447
446 448 /*
447 449 *
448 450 * There is no need to wakeup any thread waiting on bio_mem_cv
449 451 * since brelse will wake them up as soon as IO is complete.
450 452 */
451 453 bfreelist.b_bcount = bcount;
452 454
453 455 if (dopageflush)
454 456 fsflush_do_pages();
455 457
456 458 if (!doiflush)
457 459 goto loop;
458 460
459 461 /*
460 462 * If the system was not booted to single user mode, skip the
461 463 * inode flushing until after fsflush_iflush_delay secs have elapsed.
462 464 */
463 465 if ((boothowto & RB_SINGLE) == 0 &&
464 466 (ddi_get_lbolt64() / hz) < fsflush_iflush_delay)
465 467 goto loop;
466 468
467 469 /*
468 470 * Flush cached attribute information (e.g. inodes).
469 471 */
470 472 if (++count >= icount) {
471 473 count = 0;
472 474
473 475 /*
474 476 * Sync back cached data.
475 477 */
476 478 RLOCK_VFSSW();
477 479 for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
478 480 if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
479 481 vfs_refvfssw(vswp);
480 482 RUNLOCK_VFSSW();
481 483 (void) fsop_sync_by_kind(vswp - vfssw,
482 484 SYNC_ATTR, kcred);
483 485 vfs_unrefvfssw(vswp);
484 486 RLOCK_VFSSW();
485 487 }
486 488 }
487 489 RUNLOCK_VFSSW();
488 490 }
489 491 goto loop;
490 492 }
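With doiflush and dopageflush volatile, a value patched into the live kernel now takes effect on the daemon's next wakeup. For illustration, the standard Solaris ways to adjust such a tunable (shown for dopageflush):

	# disable page flushing on a running system (mdb writes the
	# 32-bit value 0 over the kernel variable)
	echo 'dopageflush/W 0' | mdb -kw

	# or set it persistently across reboots in /etc/system
	set dopageflush = 0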