Print this page
NEX-5367 special vdev: sync-write options (NEW)
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
NEX-4794 Write Back Cache sync and async writes: adjust routing according to watermark limits
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
6328 Fix cstyle errors in zfs codebase (fix studio)
6328 Fix cstyle errors in zfs codebase
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Alex Reece <alex@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed by: Jorgen Lundman <lundman@lundman.net>
Approved by: Robert Mustacchi <rm@joyent.com>
Issues #7: Reconcile L2ARC and "special" use by datasets
re #12616 rb4051 zfs_log_write()/dmu_sync() write once to special refactoring
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code


   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.

  23  * Copyright (c) 2015 by Delphix. All rights reserved.
  24  * Copyright (c) 2014 Integros [integros.com]
  25  */
  26 
  27 #include <sys/types.h>
  28 #include <sys/param.h>
  29 #include <sys/systm.h>
  30 #include <sys/sysmacros.h>
  31 #include <sys/cmn_err.h>
  32 #include <sys/kmem.h>
  33 #include <sys/thread.h>
  34 #include <sys/file.h>
  35 #include <sys/vfs.h>
  36 #include <sys/zfs_znode.h>
  37 #include <sys/zfs_dir.h>
  38 #include <sys/zil.h>
  39 #include <sys/zil_impl.h>
  40 #include <sys/byteorder.h>
  41 #include <sys/policy.h>
  42 #include <sys/stat.h>
  43 #include <sys/mode.h>
  44 #include <sys/acl.h>
  45 #include <sys/dmu.h>
  46 #include <sys/spa.h>

  47 #include <sys/zfs_fuid.h>
  48 #include <sys/ddi.h>
  49 #include <sys/dsl_dataset.h>

  50 
  51 /*
  52  * These zfs_log_* functions must be called within a dmu tx, in one
  53  * of 2 contexts depending on zilog->z_replay:
  54  *
  55  * Non replay mode
  56  * ---------------
  57  * We need to record the transaction so that if it is committed to
  58  * the Intent Log then it can be replayed.  An intent log transaction
  59  * structure (itx_t) is allocated and all the information necessary to
  60  * possibly replay the transaction is saved in it. The itx is then assigned
  61  * a sequence number and inserted in the in-memory list anchored in the zilog.
  62  *
  63  * Replay mode
  64  * -----------
  65  * We need to mark the intent log record as replayed in the log header.
  66  * This is done in the same transaction as the replay so that they
  67  * commit atomically.
  68  */
  69 


 437         itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
 438         lr = (lr_rename_t *)&itx->itx_lr;
 439         lr->lr_sdoid = sdzp->z_id;
 440         lr->lr_tdoid = tdzp->z_id;
 441         bcopy(sname, (char *)(lr + 1), snamesize);
 442         bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
 443         itx->itx_oid = szp->z_id;
 444 
 445         zil_itx_assign(zilog, itx, tx);
 446 }
 447 
 448 /*
 449  * Handles TX_WRITE transactions.
 450  */
 451 ssize_t zfs_immediate_write_sz = 32768;
 452 
 453 void
 454 zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 455     znode_t *zp, offset_t off, ssize_t resid, int ioflag)
 456 {
 457         uint32_t blocksize = zp->z_blksz;

 458         itx_wr_state_t write_state;



 459         uintptr_t fsync_cnt;
 460 
 461         if (zil_replaying(zilog, tx) || zp->z_unlinked)
 462                 return;
 463 
 464         if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)





















































 465                 write_state = WR_INDIRECT;


 466         else if (!spa_has_slogs(zilog->zl_spa) &&
 467             resid >= zfs_immediate_write_sz)
 468                 write_state = WR_INDIRECT;


 469         else if (ioflag & (FSYNC | FDSYNC))
 470                 write_state = WR_COPIED;
 471         else
 472                 write_state = WR_NEED_COPY;
 473 



 474         if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
 475                 (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
 476         }
 477 
 478         while (resid) {
 479                 itx_t *itx;
 480                 lr_write_t *lr;
 481                 itx_wr_state_t wr_state = write_state;
 482                 ssize_t len = resid;
 483 
 484                 if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
 485                         wr_state = WR_NEED_COPY;
 486                 else if (wr_state == WR_INDIRECT)
 487                         len = MIN(blocksize - P2PHASE(off, blocksize), resid);
 488 
 489                 itx = zil_itx_create(txtype, sizeof (*lr) +
 490                     (wr_state == WR_COPIED ? len : 0));
 491                 lr = (lr_write_t *)&itx->itx_lr;
 492                 if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
 493                     zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {




   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2015 by Delphix. All rights reserved.
  25  * Copyright (c) 2014 Integros [integros.com]
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/param.h>
  30 #include <sys/systm.h>
  31 #include <sys/sysmacros.h>
  32 #include <sys/cmn_err.h>
  33 #include <sys/kmem.h>
  34 #include <sys/thread.h>
  35 #include <sys/file.h>
  36 #include <sys/vfs.h>
  37 #include <sys/zfs_znode.h>
  38 #include <sys/zfs_dir.h>
  39 #include <sys/zil.h>
  40 #include <sys/zil_impl.h>
  41 #include <sys/byteorder.h>
  42 #include <sys/policy.h>
  43 #include <sys/stat.h>
  44 #include <sys/mode.h>
  45 #include <sys/acl.h>
  46 #include <sys/dmu.h>
  47 #include <sys/spa.h>
  48 #include <sys/spa_impl.h>
  49 #include <sys/zfs_fuid.h>
  50 #include <sys/ddi.h>
  51 #include <sys/dsl_dataset.h>
  52 #include <sys/special.h>
  53 
  54 /*
  55  * These zfs_log_* functions must be called within a dmu tx, in one
  56  * of 2 contexts depending on zilog->z_replay:
  57  *
  58  * Non replay mode
  59  * ---------------
  60  * We need to record the transaction so that if it is committed to
  61  * the Intent Log then it can be replayed.  An intent log transaction
  62  * structure (itx_t) is allocated and all the information necessary to
  63  * possibly replay the transaction is saved in it. The itx is then assigned
  64  * a sequence number and inserted in the in-memory list anchored in the zilog.
  65  *
  66  * Replay mode
  67  * -----------
  68  * We need to mark the intent log record as replayed in the log header.
  69  * This is done in the same transaction as the replay so that they
  70  * commit atomically.
  71  */
  72 


 440         itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
 441         lr = (lr_rename_t *)&itx->itx_lr;
 442         lr->lr_sdoid = sdzp->z_id;
 443         lr->lr_tdoid = tdzp->z_id;
 444         bcopy(sname, (char *)(lr + 1), snamesize);
 445         bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
 446         itx->itx_oid = szp->z_id;
 447 
 448         zil_itx_assign(zilog, itx, tx);
 449 }
 450 
 451 /*
 452  * Handles TX_WRITE transactions.
 453  */
 454 ssize_t zfs_immediate_write_sz = 32768;
 455 
 456 void
 457 zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 458     znode_t *zp, offset_t off, ssize_t resid, int ioflag)
 459 {
 460         spa_t *spa = zilog->zl_spa;
 461         spa_meta_placement_t *mp = &spa->spa_meta_policy;
 462         itx_wr_state_t write_state;
 463         boolean_t slogging, zil_to_special, write_to_special;
 464         size_t immediate_write_sz;
 465         uint32_t blocksize = zp->z_blksz;
 466         uintptr_t fsync_cnt;
 467 
 468         if (zil_replaying(zilog, tx) || zp->z_unlinked)
 469                 return;
 470 
 471         /*
 472          * Decide how to handle the write:
 473          * - WR_INDIRECT  - synchronously write in zfs format, via dmu_sync()
 474          * - WR_COPIED    - write to slog following the tx descriptor as
 475          *                  immediate data
 476          * - WR_NEED_COPY - copy out in the future (e.g. with next sync)
 477          *
 478          * Special vdevs are as fast as slogs - therefore a conservative
 479          * extension to the existing logic allows for the following
 480          * zpool-configurable options:
 481          *
 482          * (1) SYNC_TO_SPECIAL_DISABLED: do not use special vdev,
 483          *     neither for zil, nor for WR_INDIRECT
 484          * (2) SYNC_TO_SPECIAL_STANDARD (default): use special vdev
 485          *     exactly like slog
 486          * The remaining two options add the capability to sync data to
 487          * special vdev:
 488          * (3) SYNC_TO_SPECIAL_BALANCED: same as "standard", plus
 489          *     load balance writes to the special vdev
 490          * (4) SYNC_TO_SPECIAL_ALWAYS: same as "standard" plus always
 491          *     write to the special vdev
 492          *
 493          * Presence of special vdev has no effect if slog is configured:
 494          * the latter indicates that user expects conventional zfs
 495          * sync-write behavior.
 496          */
 497 
 498         immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
 499             ? 0 : zfs_immediate_write_sz;
 500 
 501         /* use special only if all of the following are true */
 502         zil_to_special = !spa_has_slogs(spa) &&
 503             spa_can_special_be_used(spa) &&
 504             mp->spa_sync_to_special != SYNC_TO_SPECIAL_DISABLED;
 505 
 506         /*
 507          * synchronously write data to special in zfs format - the
 508          * WR_INDIRECT case
 509          *
 510          * for the "balanced" option distribute the load based on the
 511          * special-to-normal ratio - the value that is periodically
 512          * recomputed by the load balancer implementing one of
 513          * SPA_SPECIAL_SELECTION_LATENCY etc. strategies
 514          */
 515         write_to_special = !spa_has_slogs(spa) &&
 516             spa_write_data_to_special(spa, zilog->zl_os) &&
 517             (mp->spa_sync_to_special == SYNC_TO_SPECIAL_ALWAYS ||
 518             (mp->spa_sync_to_special == SYNC_TO_SPECIAL_BALANCED &&
 519             spa->spa_avg_stat_rotor % 100 < spa->spa_special_to_normal_ratio));
 520 
 521         slogging = (spa_has_slogs(spa) || zil_to_special) &&
 522             zilog->zl_logbias == ZFS_LOGBIAS_LATENCY;
 523 
 524         if (resid > immediate_write_sz && !slogging && resid <= blocksize)
 525                 write_state = WR_INDIRECT;
 526         else if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
 527                 write_state = WR_INDIRECT;
 528         else if (!spa_has_slogs(zilog->zl_spa) &&
 529             resid >= zfs_immediate_write_sz)
 530                 write_state = WR_INDIRECT;
 531         else if (write_to_special)
 532                 write_state = WR_INDIRECT;
 533         else if (ioflag & (FSYNC | FDSYNC))
 534                 write_state = WR_COPIED;
 535         else
 536                 write_state = WR_NEED_COPY;
 537 
 538         DTRACE_PROBE3(zfs_lwr, ssize_t, immediate_write_sz,
 539             itx_wr_state_t, write_state, uint_t, zp->z_blksz);
 540 
 541         if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
 542                 (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
 543         }
 544 
 545         while (resid) {
 546                 itx_t *itx;
 547                 lr_write_t *lr;
 548                 itx_wr_state_t wr_state = write_state;
 549                 ssize_t len = resid;
 550 
 551                 if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
 552                         wr_state = WR_NEED_COPY;
 553                 else if (wr_state == WR_INDIRECT)
 554                         len = MIN(blocksize - P2PHASE(off, blocksize), resid);
 555 
 556                 itx = zil_itx_create(txtype, sizeof (*lr) +
 557                     (wr_state == WR_COPIED ? len : 0));
 558                 lr = (lr_write_t *)&itx->itx_lr;
 559                 if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
 560                     zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {