/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/inttypes.h>
#include <sys/callb.h>
#include <sys/tnf_probe.h>

/*
 * Kernel threads for logging
 * Currently only one for rolling the log (one per log).
 */

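/*
 * Default and bounding values for the number of 8K roll buffers;
 * log_roll_buffers() below clamps the tunable to these bounds.
 */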
#define	LUFS_DEFAULT_NUM_ROLL_BUFS	16
#define	LUFS_DEFAULT_MIN_ROLL_BUFS	4
#define	LUFS_DEFAULT_MAX_ROLL_BUFS	64

/*
 * Macros
 */
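/*
 * logmap_need_roll: the logmap holds more mapentries than logmap_maxnme
 * allows, so deltas should be rolled to the master.
 * ldl_empty: the on-disk log head has caught up with the tail, i.e.
 * there is nothing left to roll.
 */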
#define	logmap_need_roll(logmap) ((logmap)->mtm_nme > logmap_maxnme)
#define	ldl_empty(ul) ((ul)->un_head_lof == (ul)->un_tail_lof)

/*
 * Tunables
 */
uint32_t lufs_num_roll_bufs = LUFS_DEFAULT_NUM_ROLL_BUFS;
uint32_t lufs_min_roll_bufs = LUFS_DEFAULT_MIN_ROLL_BUFS;
uint32_t lufs_max_roll_bufs = LUFS_DEFAULT_MAX_ROLL_BUFS;
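/*
 * Illustrative example (assuming these globals live in the ufs module):
 * the tunables above can be overridden at boot in /etc/system, e.g.
 *	set ufs:lufs_num_roll_bufs = 32
 */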
long logmap_maxnme = 1536;
int trans_roll_tics = 0;
uint64_t trans_roll_new_delta = 0;
uint64_t lrr_wait = 0;
/*
 * Key for thread-specific data for the roll thread to
 * bypass snapshot throttling
 */
uint_t bypass_snapshot_throttle_key;

/*
 * externs
 */
extern kmutex_t ml_scan;
extern kcondvar_t ml_scan_cv;

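/*
 * Wait for more roll work or for the poll interval to expire.
 * Clears any force-roll request (waking its waiters) and the rolling
 * state, then sleeps on mtm_to_roll_cv for up to trans_roll_tics
 * before marking the logmap as rolling again.
 */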
static void
trans_roll_wait(mt_map_t *logmap, callb_cpr_t *cprinfop)
{
	mutex_enter(&logmap->mtm_mutex);
	logmap->mtm_ref = 0;
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		cv_broadcast(&logmap->mtm_from_roll_cv);
	}
	logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLLING);
	CALLB_CPR_SAFE_BEGIN(cprinfop);
	(void) cv_reltimedwait(&logmap->mtm_to_roll_cv, &logmap->mtm_mutex,
	    trans_roll_tics, TR_CLOCK_TICK);
	CALLB_CPR_SAFE_END(cprinfop, &logmap->mtm_mutex);
	logmap->mtm_flags |= MTM_ROLLING;
	mutex_exit(&logmap->mtm_mutex);
}

/*
 * returns the number of 8K buffers to use for rolling the log
 */
static uint32_t
log_roll_buffers(void)
{
	/*
	 * sanity check the tunable lufs_num_roll_bufs
	 */
	if (lufs_num_roll_bufs < lufs_min_roll_bufs) {
		return (lufs_min_roll_bufs);
	}
	if (lufs_num_roll_bufs > lufs_max_roll_bufs) {
		return (lufs_max_roll_bufs);
	}
	return (lufs_num_roll_bufs);
}

/*
 * Find something to roll, then if we don't have cached roll buffers
 * covering all the deltas in that MAPBLOCK then read the master
 * and overlay the deltas.
 * returns:
 *	0 if successful
 *	1 on finding nothing to roll
 *	2 on error
 */
int
log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
    int *retnbuf)
{
	offset_t	mof;
	buf_t		*bp;
	rollbuf_t	*rbp;
	mt_map_t	*logmap = ul->un_logmap;
	daddr_t		mblkno;
	int		i;
	int		error;
	int		nbuf;

	/*
	 * Make sure there is really something to roll
	 */
	mof = 0;
	if (!logmap_next_roll(logmap, &mof)) {
		return (1);
	}

	/*
	 * build some master blocks + deltas to roll forward
	 */
	rw_enter(&logmap->mtm_rwlock, RW_READER);
	nbuf = 0;
	do {
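		/*
		 * Round the master offset down to its MAPBLOCK boundary
		 * and convert the byte offset to a disk block number.
		 */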
		mof = mof & (offset_t)MAPBLOCKMASK;
		mblkno = lbtodb(mof);

		/*
		 * Check for the case of a new delta arriving for a
		 * buffer that has already been set up
		 */
		for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
			if (P2ALIGN(rbp->rb_bh.b_blkno,
			    MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
				TNF_PROBE_0(trans_roll_new_delta, "lufs",
				    /* CSTYLED */);
				trans_roll_new_delta++;
				/* Flush out the current set of buffers */
				goto flush_bufs;
			}
		}

		/*
		 * Work out what to roll next. If it isn't cached then read
		 * it asynchronously from the master.
		 */
		bp = &rbp->rb_bh;
		bp->b_blkno = mblkno;
		bp->b_flags = B_READ;
		bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
		bp->b_bufsize = MAPBLOCKSIZE;
		if (top_read_roll(rbp, ul)) {
			/* logmap deltas were in use */
			if (nbuf == 0) {
				/*
				 * On first buffer wait for the logmap user
				 * to finish by grabbing the logmap lock
				 * exclusively rather than spinning
				 */
				rw_exit(&logmap->mtm_rwlock);
				lrr_wait++;
				rw_enter(&logmap->mtm_rwlock, RW_WRITER);
				rw_exit(&logmap->mtm_rwlock);
				return (1);
			}
			/* we have at least one buffer - flush it */
			goto flush_bufs;
		}
		if ((bp->b_flags & B_INVAL) == 0) {
			nbuf++;
		}
		mof += MAPBLOCKSIZE;
	} while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));

	/*
	 * If there was nothing to roll, cycle back
	 */
	if (nbuf == 0) {
		rw_exit(&logmap->mtm_rwlock);
		return (1);
	}

flush_bufs:
	/*
	 * For each buffer, if it isn't cached then wait for the read to
	 * finish and overlay the deltas.
	 */
	for (error = 0, i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		if (!rbp->rb_crb) {
			bp = &rbp->rb_bh;
			if (trans_not_wait(bp)) {
				ldl_seterror(ul,
				    "Error reading master during ufs log roll");
				error = 1;
			}
			/*
			 * sync read the data from the log
			 */
			if (ldl_read(ul, bp->b_un.b_addr,
			    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK,
			    MAPBLOCKSIZE, rbp->rb_age)) {
				error = 1;
			}
		}

		/*
		 * reset the age bit in the age list
		 */
		logmap_list_put_roll(logmap, rbp->rb_age);

		if (ul->un_flags & LDL_ERROR) {
			error = 1;
		}
	}
	rw_exit(&logmap->mtm_rwlock);
	if (error)
		return (2);
	*retnbuf = nbuf;
	return (0);
}

/*
 * Write out a cached roll buffer
 */
void
log_roll_write_crb(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	crb_t *crb = rbp->rb_crb;
	buf_t *bp = &rbp->rb_bh;

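	/*
	 * Point the preallocated buf header at the cached roll buffer
	 * and at its home location on the master device.  The caller,
	 * log_roll_write(), waits for the I/O via trans_not_wait().
	 */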
	bp->b_blkno = lbtodb(crb->c_mof);
	bp->b_un.b_addr = crb->c_buf;
	bp->b_bcount = crb->c_nb;
	bp->b_bufsize = crb->c_nb;
	ASSERT((crb->c_nb & DEV_BMASK) == 0);
	bp->b_flags = B_WRITE;
	logstats.ls_rwrites.value.ui64++;

	/* if snapshots are enabled, write via the snapshot driver */
	if (ufsvfsp->vfs_snapshot) {
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	} else {
		(void) bdev_strategy(bp);
	}
}

/*
 * Write out a set of non-cached roll buffers
 */
void
log_roll_write_bufs(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	buf_t		*bp = &rbp->rb_bh;
	buf_t		*bp2;
	rbsecmap_t	secmap = rbp->rb_secmap;
	int		j, k;

	ASSERT(secmap);
	ASSERT((bp->b_flags & B_INVAL) == 0);

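	/*
	 * secmap has one bit per DEV_BSIZE sector of the MAPBLOCK; a set
	 * bit marks a sector holding deltas that must be written.  Scan
	 * for runs of set bits and issue one write per contiguous run.
	 */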
	do { /* for each contiguous block of sectors */
		/* find start of next sector to write */
		for (j = 0; j < 16; ++j) {
			if (secmap & UINT16_C(1))
				break;
			secmap >>= 1;
		}
		bp->b_un.b_addr += (j << DEV_BSHIFT);
		bp->b_blkno += j;

		/* calculate number of sectors */
		secmap >>= 1;
		j++;
		for (k = 1; j < 16; ++j) {
			if ((secmap & UINT16_C(1)) == 0)
				break;
			secmap >>= 1;
			k++;
		}
		bp->b_bcount = k << DEV_BSHIFT;
		bp->b_flags = B_WRITE;
		logstats.ls_rwrites.value.ui64++;

		/* if snapshots are enabled, write via the snapshot driver */
		if (ufsvfsp->vfs_snapshot)
			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
		else
			(void) bdev_strategy(bp);
		if (secmap) {
			/*
			 * Allocate another buf_t to handle
			 * the next write in this MAPBLOCK.
			 * Chain them via b_list; the caller waits for
			 * and frees these headers after the I/O is done.
			 */
			bp2 = kmem_alloc(sizeof (buf_t), KM_SLEEP);
			bp->b_list = bp2;
			bioinit(bp2);
			bp2->b_iodone = trans_not_done;
			bp2->b_bufsize = MAPBLOCKSIZE;
			bp2->b_edev = bp->b_edev;
			bp2->b_un.b_addr =
			    bp->b_un.b_addr + bp->b_bcount;
			bp2->b_blkno = bp->b_blkno + k;
			bp = bp2;
		}
	} while (secmap);
}

/*
 * Asynchronously roll the deltas, using the sector map
 * in each rollbuf_t.
 */
int
log_roll_write(ml_unit_t *ul, rollbuf_t *rbs, int nbuf)
{

	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
	rollbuf_t	*rbp;
	buf_t		*bp, *bp2;
	rollbuf_t	*head, *prev, *rbp2;

	/*
	 * Order the buffers by blkno
	 */
	ASSERT(nbuf > 0);
#ifdef lint
	prev = rbs;
#endif
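	/*
	 * Simple insertion sort: build a list ordered by ascending
	 * b_blkno, headed by "head" and linked through rb_next.
	 */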
	for (head = rbs, rbp = rbs + 1; rbp < rbs + nbuf; rbp++) {
		for (rbp2 = head; rbp2; prev = rbp2, rbp2 = rbp2->rb_next) {
			if (rbp->rb_bh.b_blkno < rbp2->rb_bh.b_blkno) {
				if (rbp2 == head) {
					rbp->rb_next = head;
					head = rbp;
				} else {
					prev->rb_next = rbp;
					rbp->rb_next = rbp2;
				}
				break;
			}
		}
		if (rbp2 == NULL) {
			prev->rb_next = rbp;
			rbp->rb_next = NULL;
		}
	}

	/*
	 * issue the in-order writes
	 */
	for (rbp = head; rbp; rbp = rbp2) {
		if (rbp->rb_crb) {
			log_roll_write_crb(ufsvfsp, rbp);
		} else {
			log_roll_write_bufs(ufsvfsp, rbp);
		}
		/* null out the rb_next link for next set of rolling */
		rbp2 = rbp->rb_next;
		rbp->rb_next = NULL;
	}

	/*
	 * wait for all the writes to finish
	 */
	for (rbp = rbs; rbp < rbs + nbuf; rbp++) {
		bp = &rbp->rb_bh;
		if (trans_not_wait(bp)) {
			ldl_seterror(ul,
			    "Error writing master during ufs log roll");
		}

		/*
		 * Now wait for all the "cloned" buffer writes (if any)
		 * and free those headers
		 */
		bp2 = bp->b_list;
		bp->b_list = NULL;
		while (bp2) {
			if (trans_not_wait(bp2)) {
				ldl_seterror(ul,
				    "Error writing master during ufs log roll");
			}
			bp = bp2;
			bp2 = bp2->b_list;
			kmem_free(bp, sizeof (buf_t));
		}
	}

	if (ul->un_flags & LDL_ERROR)
		return (1);
	return (0);
}

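/*
 * The log roll thread.  Loops forever, rolling committed deltas from
 * the log back to their home locations in the master file system,
 * until asked to exit via MTM_ROLL_EXIT or a log error occurs.
 */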
void
trans_roll(ml_unit_t *ul)
{
	callb_cpr_t	cprinfo;
	mt_map_t	*logmap = ul->un_logmap;
	rollbuf_t	*rbs;
	rollbuf_t	*rbp;
	buf_t		*bp;
	caddr_t		roll_bufs;
	uint32_t	nmblk;
	int		i;
	int		doingforceroll;
	int		nbuf;

	CALLB_CPR_INIT(&cprinfo, &logmap->mtm_mutex, callb_generic_cpr,
	    "trans_roll");

	/*
	 * We do not want the roll thread's writes to be
	 * throttled by the snapshot.
	 * If they are throttled then we can have a deadlock
	 * between the roll thread and the snapshot taskq thread:
	 * the roll thread wants the throttling semaphore and
	 * the snapshot taskq thread cannot release the semaphore
	 * because it is writing to the log and the log is full.
	 */

	(void) tsd_set(bypass_snapshot_throttle_key, (void *)1);

	/*
	 * set up some roll parameters
	 */
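	/* default: poll for roll work every five seconds */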
	if (trans_roll_tics == 0)
		trans_roll_tics = 5 * hz;
	nmblk = log_roll_buffers();

	/*
	 * allocate the buffers and buffer headers
	 */
	roll_bufs = kmem_alloc(nmblk * MAPBLOCKSIZE, KM_SLEEP);
	rbs = kmem_alloc(nmblk * sizeof (rollbuf_t), KM_SLEEP);

	/*
	 * initialize the buffer headers
	 */
	for (i = 0, rbp = rbs; i < nmblk; ++i, ++rbp) {
		rbp->rb_next = NULL;
		bp = &rbp->rb_bh;
		bioinit(bp);
		bp->b_edev = ul->un_dev;
		bp->b_iodone = trans_not_done;
		bp->b_bufsize = MAPBLOCKSIZE;
	}

	doingforceroll = 0;

again:
	/*
	 * LOOP FOREVER
	 */

	/*
	 * exit on demand
	 */
	mutex_enter(&logmap->mtm_mutex);
	if ((ul->un_flags & LDL_ERROR) || (logmap->mtm_flags & MTM_ROLL_EXIT)) {
		kmem_free(rbs, nmblk * sizeof (rollbuf_t));
		kmem_free(roll_bufs, nmblk * MAPBLOCKSIZE);
		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_RUNNING |
		    MTM_ROLL_EXIT | MTM_ROLLING);
		cv_broadcast(&logmap->mtm_from_roll_cv);
		CALLB_CPR_EXIT(&cprinfo);
		thread_exit();
		/* NOTREACHED */
	}

	/*
	 * MT_SCAN debug mode
	 * don't roll except in FORCEROLL situations
	 */
	if (logmap->mtm_debug & MT_SCAN)
		if ((logmap->mtm_flags & MTM_FORCE_ROLL) == 0) {
			mutex_exit(&logmap->mtm_mutex);
			trans_roll_wait(logmap, &cprinfo);
			goto again;
		}
	ASSERT(logmap->mtm_trimlof == 0);

	/*
	 * If we've finished a force roll cycle then wake up any
	 * waiters.
	 */
	if (doingforceroll) {
		doingforceroll = 0;
		logmap->mtm_flags &= ~MTM_FORCE_ROLL;
		mutex_exit(&logmap->mtm_mutex);
		cv_broadcast(&logmap->mtm_from_roll_cv);
	} else {
		mutex_exit(&logmap->mtm_mutex);
	}

	/*
	 * If someone wants us to roll something, then do it
	 */
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		doingforceroll = 1;
		goto rollsomething;
	}

	/*
	 * Log is busy, check if logmap is getting full.
	 */
	if (logmap_need_roll(logmap)) {
		goto rollsomething;
	}

	/*
	 * Check if the log is idle and is not empty
	 */
	if (!logmap->mtm_ref && !ldl_empty(ul)) {
		goto rollsomething;
	}

	/*
	 * Log is busy, check if it's getting full
	 */
	if (ldl_need_roll(ul)) {
		goto rollsomething;
	}

	/*
	 * nothing to do; wait a bit and then start over
	 */
	trans_roll_wait(logmap, &cprinfo);
	goto again;

	/*
	 * ROLL SOMETHING
	 */

rollsomething:
	/*
	 * Use the cached roll buffers, or read the master
	 * and overlay the deltas
	 */
	switch (log_roll_read(ul, rbs, nmblk, roll_bufs, &nbuf)) {
	case 1:	trans_roll_wait(logmap, &cprinfo);
		/* FALLTHROUGH */
	case 2:	goto again;
	/* default case is success */
	}

	/*
	 * Asynchronously write out the deltas
	 */
	if (log_roll_write(ul, rbs, nbuf))
		goto again;

	/*
	 * free up the deltas in the logmap
	 */
	for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		bp = &rbp->rb_bh;
		logmap_remove_roll(logmap,
		    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK, MAPBLOCKSIZE);
	}

	/*
	 * free up log space, if possible
	 */
	logmap_sethead(logmap, ul);

	/*
	 * LOOP
	 */
	goto again;
}