3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2015 by Delphix. All rights reserved.
24 * Copyright (c) 2014 Integros [integros.com]
25 */
26
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/sysmacros.h>
31 #include <sys/cmn_err.h>
32 #include <sys/kmem.h>
33 #include <sys/thread.h>
34 #include <sys/file.h>
35 #include <sys/vfs.h>
36 #include <sys/zfs_znode.h>
37 #include <sys/zfs_dir.h>
38 #include <sys/zil.h>
39 #include <sys/zil_impl.h>
40 #include <sys/byteorder.h>
41 #include <sys/policy.h>
42 #include <sys/stat.h>
43 #include <sys/mode.h>
44 #include <sys/acl.h>
45 #include <sys/dmu.h>
46 #include <sys/spa.h>
47 #include <sys/zfs_fuid.h>
48 #include <sys/ddi.h>
49 #include <sys/dsl_dataset.h>
50
51 /*
52 * These zfs_log_* functions must be called within a dmu tx, in one
53 * of 2 contexts depending on zilog->z_replay:
54 *
55 * Non replay mode
56 * ---------------
57 * We need to record the transaction so that if it is committed to
58 * the Intent Log then it can be replayed. An intent log transaction
59 * structure (itx_t) is allocated and all the information necessary to
60 * possibly replay the transaction is saved in it. The itx is then assigned
61 * a sequence number and inserted in the in-memory list anchored in the zilog.
62 *
63 * Replay mode
64 * -----------
65 * We need to mark the intent log record as replayed in the log header.
66 * This is done in the same transaction as the replay so that they
67 * commit atomically.
68 */
69
437 itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
438 lr = (lr_rename_t *)&itx->itx_lr;
439 lr->lr_sdoid = sdzp->z_id;
440 lr->lr_tdoid = tdzp->z_id;
441 bcopy(sname, (char *)(lr + 1), snamesize);
442 bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
443 itx->itx_oid = szp->z_id;
444
445 zil_itx_assign(zilog, itx, tx);
446 }
447
448 /*
449 * Handles TX_WRITE transactions.
450 */
451 ssize_t zfs_immediate_write_sz = 32768; /* no-slog WR_INDIRECT cutoff for resid */
452
453 void
454 zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
455 znode_t *zp, offset_t off, ssize_t resid, int ioflag)
456 {
457 uint32_t blocksize = zp->z_blksz;
458 itx_wr_state_t write_state;
459 uintptr_t fsync_cnt;
460
461 if (zil_replaying(zilog, tx) || zp->z_unlinked)
462 return;
463
464 if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
465 write_state = WR_INDIRECT;
466 else if (!spa_has_slogs(zilog->zl_spa) &&
467 resid >= zfs_immediate_write_sz)
468 write_state = WR_INDIRECT;
469 else if (ioflag & (FSYNC | FDSYNC))
470 write_state = WR_COPIED;
471 else
472 write_state = WR_NEED_COPY;
473
474 if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
475 (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
476 }
477
478 while (resid) {
479 itx_t *itx;
480 lr_write_t *lr;
481 itx_wr_state_t wr_state = write_state;
482 ssize_t len = resid;
483
484 if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
485 wr_state = WR_NEED_COPY;
486 else if (wr_state == WR_INDIRECT)
487 len = MIN(blocksize - P2PHASE(off, blocksize), resid);
488
489 itx = zil_itx_create(txtype, sizeof (*lr) +
490 (wr_state == WR_COPIED ? len : 0));
491 lr = (lr_write_t *)&itx->itx_lr;
492 if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
493 zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
|
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2015 by Delphix. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 */
27
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/sysmacros.h>
32 #include <sys/cmn_err.h>
33 #include <sys/kmem.h>
34 #include <sys/thread.h>
35 #include <sys/file.h>
36 #include <sys/vfs.h>
37 #include <sys/zfs_znode.h>
38 #include <sys/zfs_dir.h>
39 #include <sys/zil.h>
40 #include <sys/zil_impl.h>
41 #include <sys/byteorder.h>
42 #include <sys/policy.h>
43 #include <sys/stat.h>
44 #include <sys/mode.h>
45 #include <sys/acl.h>
46 #include <sys/dmu.h>
47 #include <sys/spa.h>
48 #include <sys/spa_impl.h>
49 #include <sys/zfs_fuid.h>
50 #include <sys/ddi.h>
51 #include <sys/dsl_dataset.h>
52 #include <sys/special.h>
53
54 /*
55 * These zfs_log_* functions must be called within a dmu tx, in one
56 * of 2 contexts depending on zilog->z_replay:
57 *
58 * Non replay mode
59 * ---------------
60 * We need to record the transaction so that if it is committed to
61 * the Intent Log then it can be replayed. An intent log transaction
62 * structure (itx_t) is allocated and all the information necessary to
63 * possibly replay the transaction is saved in it. The itx is then assigned
64 * a sequence number and inserted in the in-memory list anchored in the zilog.
65 *
66 * Replay mode
67 * -----------
68 * We need to mark the intent log record as replayed in the log header.
69 * This is done in the same transaction as the replay so that they
70 * commit atomically.
71 */
72
440 itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
441 lr = (lr_rename_t *)&itx->itx_lr;
442 lr->lr_sdoid = sdzp->z_id;
443 lr->lr_tdoid = tdzp->z_id;
444 bcopy(sname, (char *)(lr + 1), snamesize);
445 bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
446 itx->itx_oid = szp->z_id;
447
448 zil_itx_assign(zilog, itx, tx);
449 }
450
451 /*
452 * Handles TX_WRITE transactions.
453 */
454 ssize_t zfs_immediate_write_sz = 32768; /* no-slog WR_INDIRECT cutoff for resid */
455
456 void
457 zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
458 znode_t *zp, offset_t off, ssize_t resid, int ioflag)
459 {
460 spa_t *spa = zilog->zl_spa;
461 spa_meta_placement_t *mp = &spa->spa_meta_policy;
462 itx_wr_state_t write_state;
463 boolean_t slogging, zil_to_special, write_to_special;
464 size_t immediate_write_sz;
465 uint32_t blocksize = zp->z_blksz;
466 uintptr_t fsync_cnt;
467
468 if (zil_replaying(zilog, tx) || zp->z_unlinked)
469 return;
470
471 /*
472 * Decide how to handle the write:
473 * - WR_INDIRECT - synchronously write in zfs format, via dmu_sync()
474 * - WR_COPIED - write to slog following the tx descriptor as
475 * immediate data
476 * - WR_NEED_COPY - copy out in the future (e.g. with next sync)
477 *
478 * Special vdevs are as fast as slogs - therefore a conservative
479 * extension to the existing logic allows for the following
480 * zpool-configurable options:
481 *
482 * (1) SYNC_TO_SPECIAL_DISABLED: do not use special vdev,
483 * neither for zil, nor for WR_INDIRECT
484 * (2) SYNC_TO_SPECIAL_STANDARD (default): use special vdev
485 * exactly like slog
486 * The remaining two options add the capability to sync data to
487 * special vdev:
488 * (3) SYNC_TO_SPECIAL_BALANCED: same as "standard", plus
489 * load balance writes to the special vdev
490 * (4) SYNC_TO_SPECIAL_ALWAYS: same as "standard" plus always
491 * write to the special vdev
492 *
493 * Presence of special vdev has no effect if slog is configured:
494 * the latter indicates that user expects conventional zfs
495 * sync-write behavior.
496 */
497
498 immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
499 ? 0 : zfs_immediate_write_sz;
500
501 /* use special only if all of the following are true */
502 zil_to_special = !spa_has_slogs(spa) &&
503 spa_can_special_be_used(spa) &&
504 mp->spa_sync_to_special != SYNC_TO_SPECIAL_DISABLED;
505
506 /*
507 * synchronously write data to special in zfs format - the
508 * WR_INDIRECT case
509 *
510 * for the "balanced" option distribute the load based on the
511 * special-to-normal ratio - the value that is periodically
512 * recomputed by the load balancer implementing one of
513 * SPA_SPECIAL_SELECTION_LATENCY etc. strategies
514 */
515 write_to_special = !spa_has_slogs(spa) &&
516 spa_write_data_to_special(spa, zilog->zl_os) &&
517 (mp->spa_sync_to_special == SYNC_TO_SPECIAL_ALWAYS ||
518 (mp->spa_sync_to_special == SYNC_TO_SPECIAL_BALANCED &&
519 spa->spa_avg_stat_rotor % 100 < spa->spa_special_to_normal_ratio));
520
521 slogging = (spa_has_slogs(spa) || zil_to_special) &&
522 zilog->zl_logbias == ZFS_LOGBIAS_LATENCY;
523
524 if (resid > immediate_write_sz && !slogging && resid <= blocksize)
525 write_state = WR_INDIRECT;
526 else if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
527 write_state = WR_INDIRECT;
528 else if (!spa_has_slogs(zilog->zl_spa) &&
529 resid >= zfs_immediate_write_sz)
530 write_state = WR_INDIRECT;
531 else if (write_to_special)
532 write_state = WR_INDIRECT;
533 else if (ioflag & (FSYNC | FDSYNC))
534 write_state = WR_COPIED;
535 else
536 write_state = WR_NEED_COPY;
537
538 DTRACE_PROBE3(zfs_lwr, ssize_t, immediate_write_sz,
539 itx_wr_state_t, write_state, uint_t, zp->z_blksz);
540
541 if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
542 (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
543 }
544
545 while (resid) {
546 itx_t *itx;
547 lr_write_t *lr;
548 itx_wr_state_t wr_state = write_state;
549 ssize_t len = resid;
550
551 if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
552 wr_state = WR_NEED_COPY;
553 else if (wr_state == WR_INDIRECT)
554 len = MIN(blocksize - P2PHASE(off, blocksize), resid);
555
556 itx = zil_itx_create(txtype, sizeof (*lr) +
557 (wr_state == WR_COPIED ? len : 0));
558 lr = (lr_write_t *)&itx->itx_lr;
559 if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
560 zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
|