/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/tuneable.h>
#include <sys/inline.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/var.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/swap.h>
#include <sys/vm.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/sysinfo.h>
#include <sys/callb.h>
#include <sys/reboot.h>
#include <sys/time.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_bio.h>

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>

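/*
 * Tunables: either of these can be set to zero (e.g. from /etc/system,
 * or patched at runtime) to disable the corresponding flush activity.
 */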
volatile int doiflush = 1;	/* non-zero to turn inode flushing on */
volatile int dopageflush = 1;	/* non-zero to turn page flushing on */

/*
 * To improve boot performance, don't run the inode flushing loop until
 * the specified number of seconds after boot.  To revert to the old
 * behavior, set fsflush_iflush_delay to 0.  This creates no new
 * filesystem danger: there has always been a window between inode flush
 * loops during which the system could crash, fail to sync the filesystem,
 * and need fsck to recover.  We have, however, widened that window.
 * Finally, we never delay inode flushing when booting into single user
 * mode, where the administrator may be modifying files or using fsck.
 * This avoids inode flushes during boot whose only purpose is to update
 * atimes on files accessed during boot.
 */
int fsflush_iflush_delay = 60;
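
/*
 * For example, "set fsflush_iflush_delay = 0" in /etc/system (the usual
 * tunable syntax) restores the old flush-from-boot behavior.
 */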

kcondvar_t fsflush_cv;
static kmutex_t fsflush_lock;	/* just for the cv_wait */
ksema_t fsflush_sema;		/* to serialize with reboot */

/*
 * some statistics for fsflush_do_pages
 */
typedef struct {
	ulong_t fsf_scan;	/* number of pages scanned */
	ulong_t fsf_examined;	/* number of page_t's actually examined, can */
				/* be less than fsf_scan due to large pages */
	ulong_t fsf_locked;	/* pages we actually page_lock()ed */
	ulong_t fsf_modified;	/* number of modified pages found */
	ulong_t fsf_coalesce;	/* number of page coalesces done */
	ulong_t fsf_time;	/* nanoseconds of run time */
	ulong_t fsf_releases;	/* number of page_release() done */
} fsf_stat_t;

fsf_stat_t fsf_recent;	/* counts for most recent duty cycle */
fsf_stat_t fsf_total;	/* total of counts */
ulong_t fsf_cycles;	/* number of runs reflected in fsf_total */

/*
 * data used to determine when we can coalesce consecutive free pages
 * into larger pages.
 */
#define	MAX_PAGESIZES	32
static ulong_t fsf_npgsz;		  /* number of supported page sizes */
static pgcnt_t fsf_pgcnt[MAX_PAGESIZES];  /* szc pages per (szc + 1) page */
static pgcnt_t fsf_mask[MAX_PAGESIZES];	  /* pfn alignment, (szc + 1) page */


/*
 * Scan page_t's and issue I/O's for modified pages.
 *
 * Also coalesces consecutive small sized free pages into the next larger
 * pagesize.  This costs a tiny bit of time in fsflush, but will reduce time
 * spent scanning on later passes and for anybody allocating large pages.
 */
static void
fsflush_do_pages()
{
	vnode_t		*vp;
	ulong_t		pcount;
	hrtime_t	timer = gethrtime();
	ulong_t		releases = 0;
	ulong_t		nexamined = 0;
	ulong_t		nlocked = 0;
	ulong_t		nmodified = 0;
	ulong_t		ncoalesce = 0;
	ulong_t		cnt;
	int		mod;
	int		fspage = 1;
	u_offset_t	offset;
	uint_t		szc;

	page_t		*coal_page = NULL;  /* 1st page in group to coalesce */
	uint_t		coal_szc = 0;	    /* size code, coal_page->p_szc */
	uint_t		coal_cnt = 0;	    /* count of pages seen */

	static ulong_t	nscan = 0;		/* pages to scan per wakeup */
	static pgcnt_t	last_total_pages = 0;
	static page_t	*pp = NULL;		/* scan resume point */

	/*
	 * If total_pages has changed, recompute the number of pages to
	 * scan per wakeup so that all of memory is examined once every
	 * v_autoup seconds (fsflush runs every tune.t_fsflushr seconds).
	 */
	if (total_pages != last_total_pages) {
		last_total_pages = total_pages;
		nscan = (last_total_pages * (tune.t_fsflushr)) / v.v_autoup;
	}

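	/*
	 * The scan resumes where the previous call left off; pp is only
	 * NULL on the very first call, when we start with the first
	 * page of the first memseg.
	 */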
	if (pp == NULL)
		pp = memsegs->pages;

	pcount = 0;
	while (pcount < nscan) {

		/*
		 * Move to the next page, skipping over the remaining
		 * constituent pages of a large page, and issue a prefetch.
		 */
		if (pp->p_szc && fspage == 0) {
			pfn_t pfn;

			pfn = page_pptonum(pp);
			cnt = page_get_pagecnt(pp->p_szc);
			cnt -= pfn & (cnt - 1);
		} else
			cnt = 1;

		pp = page_nextn(pp, cnt);
		prefetch_page_r((void *)pp);
		ASSERT(pp != NULL);
		pcount += cnt;

		/*
		 * Do a bunch of dirty tests (i.e. no locking) to determine
		 * if we can quickly skip this page.  These tests are repeated
		 * after acquiring the page lock.
		 */
		++nexamined;
		if (PP_ISSWAP(pp)) {
			fspage = 0;
			coal_page = NULL;
			continue;
		}

		/*
		 * skip free pages too, but try coalescing them into larger
		 * pagesizes
		 */
		if (PP_ISFREE(pp)) {
			/*
			 * skip pages with a file system identity or that
			 * are already maximum size
			 */
			fspage = 0;
			szc = pp->p_szc;
			if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
				coal_page = NULL;
				continue;
			}

			/*
			 * If not in a coalescing candidate page or the size
			 * codes are different, start a new candidate.
			 */
			if (coal_page == NULL || coal_szc != szc) {

				/*
				 * page must be properly aligned
				 */
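				/*
				 * For example (actual sizes are platform
				 * dependent): if the next larger page size
				 * held 8 of these pages, fsf_mask[szc] would
				 * be 7, so a candidate run could only start
				 * on a pfn that is a multiple of 8.
				 */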
				if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
					coal_page = NULL;
					continue;
				}
				coal_page = pp;
				coal_szc = szc;
				coal_cnt = 1;
				continue;
			}

			/*
			 * acceptable to add this to existing candidate page
			 */
			++coal_cnt;
			if (coal_cnt < fsf_pgcnt[coal_szc])
				continue;

			/*
			 * We've got enough pages to coalesce, so do it.
			 * After promoting, we clear coal_page, so it will
			 * take another pass to promote this to an even
			 * larger page.
			 */
			++ncoalesce;
			(void) page_promote_size(coal_page, coal_szc);
			coal_page = NULL;
			continue;
		} else {
			coal_page = NULL;
		}

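		/*
		 * Skip kernel (kas) pages and any page that is currently
		 * locked or has outstanding lock or copy-on-write counts.
		 */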
		if (PP_ISKAS(pp) ||
		    PAGE_LOCKED(pp) ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0) {
			fspage = 0;
			continue;
		}

		/*
		 * Reject pages that can't be "exclusively" locked.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;
		++nlocked;

		/*
		 * After locking the page, redo the above checks.
		 * Since we locked the page, leave out the PAGE_LOCKED() test.
		 */
		vp = pp->p_vnode;
		if (PP_ISSWAP(pp) ||
		    PP_ISFREE(pp) ||
		    vp == NULL ||
		    PP_ISKAS(pp) ||
		    (vp->v_flag & VISSWAP) != 0) {
			page_unlock(pp);
			fspage = 0;
			continue;
		}
		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
			page_unlock(pp);
			continue;
		}

		fspage = 1;
		ASSERT(vp->v_type != VCHR);

		/*
		 * Check the modified bit, leaving the bit alone in hardware;
		 * it will be cleared if we do the putpage.
		 */
		if (IS_VMODSORT(vp))
			mod = hat_ismod(pp);
		else
			mod = hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;

		if (mod) {
			++nmodified;
			offset = pp->p_offset;

			/*
			 * Hold the vnode before releasing the page lock
			 * to prevent it from being freed and re-used by
			 * some other thread.
			 */
			VN_HOLD(vp);

			page_unlock(pp);

			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
			    kcred, NULL);

			VN_RELE(vp);
		} else {

			/*
			 * Catch any pages which should be on the cache list,
			 * but aren't yet.
			 */
			if (hat_page_is_mapped(pp) == 0) {
				++releases;
				(void) page_release(pp, 1);
			} else {
				page_unlock(pp);
			}
		}
	}

	/*
	 * Maintain statistics: record this duty cycle's counts in
	 * fsf_recent and accumulate them into fsf_total, resetting
	 * every million wakeups just to avoid overflow.
	 */
	if (++fsf_cycles == 1000000) {
		fsf_cycles = 0;
		fsf_total.fsf_scan = 0;
		fsf_total.fsf_examined = 0;
		fsf_total.fsf_locked = 0;
		fsf_total.fsf_modified = 0;
		fsf_total.fsf_coalesce = 0;
		fsf_total.fsf_time = 0;
		fsf_total.fsf_releases = 0;
	} else {
		fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
		fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
		fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
		fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
		fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
		fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer;
		fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
	}
}

/*
 * As part of file system hardening, this daemon is awakened every
 * second to flush cached data, which includes the buffer cache, the
 * inode cache, and mapped pages.
 */
void
fsflush()
{
	struct buf *bp, *dwp;
	struct hbuf *hp;
	int autoup;
	unsigned int ix, icount, count = 0;
	callb_cpr_t cprinfo;
	uint_t bcount;
	kmutex_t *hmp;
	struct vfssw *vswp;

	proc_fsflush = ttoproc(curthread);
	proc_fsflush->p_cstime = 0;
	proc_fsflush->p_stime = 0;
	proc_fsflush->p_cutime = 0;
	proc_fsflush->p_utime = 0;
	bcopy("fsflush", curproc->p_user.u_psargs, 8);
	bcopy("fsflush", curproc->p_user.u_comm, 7);

	mutex_init(&fsflush_lock, NULL, MUTEX_DEFAULT, NULL);
	sema_init(&fsflush_sema, 0, NULL, SEMA_DEFAULT, NULL);

	/*
	 * Set up page coalescing: for each page size, record how many
	 * pages of that size make up one page of the next size up
	 * (fsf_pgcnt) and the pfn alignment mask for the larger size
	 * (fsf_mask).
	 */
	fsf_npgsz = page_num_pagesizes();
	ASSERT(fsf_npgsz < MAX_PAGESIZES);
	for (ix = 0; ix < fsf_npgsz - 1; ++ix) {
		fsf_pgcnt[ix] =
		    page_get_pagesize(ix + 1) / page_get_pagesize(ix);
		fsf_mask[ix] = page_get_pagecnt(ix + 1) - 1;
	}

	autoup = v.v_autoup * hz;		/* buf age limit, in ticks */
	icount = v.v_autoup / tune.t_fsflushr;	/* wakeups per inode flush */
	CALLB_CPR_INIT(&cprinfo, &fsflush_lock, callb_generic_cpr, "fsflush");
loop:
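	/*
	 * Drop fsflush_sema while waiting so that a reboot can proceed;
	 * reacquire it before doing any flushing.
	 */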
	sema_v(&fsflush_sema);
	mutex_enter(&fsflush_lock);
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&fsflush_cv, &fsflush_lock);	/* wait for clock */
	CALLB_CPR_SAFE_END(&cprinfo, &fsflush_lock);
	mutex_exit(&fsflush_lock);
	sema_p(&fsflush_sema);

	/*
	 * Write back all old B_DELWRI buffers on the freelist.
	 */
	bcount = 0;
	for (ix = 0; ix < v.v_hbuf; ix++) {

		hp = &hbuf[ix];
		dwp = (struct buf *)&dwbuf[ix];

		bcount += (hp->b_length);

		if (dwp->av_forw == dwp) {
			continue;
		}

		hmp = &hbuf[ix].b_lock;
		mutex_enter(hmp);
		bp = dwp->av_forw;

		/*
		 * Walk only the delayed write list.
		 */
		while (bp != dwp) {

			ASSERT(bp->b_flags & B_DELWRI);

			if ((bp->b_flags & B_DELWRI) &&
			    (ddi_get_lbolt() - bp->b_start >= autoup) &&
			    sema_tryp(&bp->b_sem)) {
				bp->b_flags |= B_ASYNC;
				hp->b_length--;
				notavail(bp);
				mutex_exit(hmp);
				if (bp->b_vp == NULL) {
					BWRITE(bp);
				} else {
					UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs,
					    bp);
				}
				mutex_enter(hmp);
				/* list may have changed; restart from head */
				bp = dwp->av_forw;
			} else {
				bp = bp->av_forw;
			}
		}
		mutex_exit(hmp);
	}

	/*
	 * There is no need to wake up any thread waiting on bio_mem_cv
	 * since brelse will wake them up as soon as IO is complete.
	 */
	bfreelist.b_bcount = bcount;

	if (dopageflush)
		fsflush_do_pages();

	if (!doiflush)
		goto loop;

	/*
	 * If the system was not booted to single user mode, skip the
	 * inode flushing until after fsflush_iflush_delay secs have elapsed.
	 */
	if ((boothowto & RB_SINGLE) == 0 &&
	    (ddi_get_lbolt64() / hz) < fsflush_iflush_delay)
		goto loop;

	/*
	 * Flush cached attribute information (e.g. inodes).
	 */
	if (++count >= icount) {
		count = 0;

		/*
		 * Sync back cached data.
		 */
		RLOCK_VFSSW();
		for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
			if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
				vfs_refvfssw(vswp);
				RUNLOCK_VFSSW();
				(void) fsop_sync_by_kind(vswp - vfssw,
				    SYNC_ATTR, kcred);
				vfs_unrefvfssw(vswp);
				RLOCK_VFSSW();
			}
		}
		RUNLOCK_VFSSW();
	}
	goto loop;
}