Print this page
NEX-5367 special vdev: sync-write options (NEW)
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
NEX-4794 Write Back Cache sync and async writes: adjust routing according to watermark limits
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
6328 Fix cstyle errors in zfs codebase (fix studio)
6328 Fix cstyle errors in zfs codebase
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Alex Reece <alex@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed by: Jorgen Lundman <lundman@lundman.net>
Approved by: Robert Mustacchi <rm@joyent.com>
Issues #7: Reconcile L2ARC and "special" use by datasets
re #12616 rb4051 zfs_log_write()/dmu_sync() write once to special refactoring
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
        
*** 18,27 ****
--- 18,28 ----
   *
   * CDDL HEADER END
   */
  /*
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
   * Copyright (c) 2015 by Delphix. All rights reserved.
   * Copyright (c) 2014 Integros [integros.com]
   */
  
  #include <sys/types.h>
*** 42,54 ****
--- 43,57 ----
  #include <sys/stat.h>
  #include <sys/mode.h>
  #include <sys/acl.h>
  #include <sys/dmu.h>
  #include <sys/spa.h>
+ #include <sys/spa_impl.h>
  #include <sys/zfs_fuid.h>
  #include <sys/ddi.h>
  #include <sys/dsl_dataset.h>
+ #include <sys/special.h>
  
  /*
   * These zfs_log_* functions must be called within a dmu tx, in one
   * of 2 contexts depending on zilog->z_replay:
   *
*** 452,478 ****
  
  void
  zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
      znode_t *zp, offset_t off, ssize_t resid, int ioflag)
  {
!         uint32_t blocksize = zp->z_blksz;
          itx_wr_state_t write_state;
          uintptr_t fsync_cnt;
  
          if (zil_replaying(zilog, tx) || zp->z_unlinked)
                  return;
  
!         if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
                  write_state = WR_INDIRECT;
          else if (!spa_has_slogs(zilog->zl_spa) &&
              resid >= zfs_immediate_write_sz)
                  write_state = WR_INDIRECT;
          else if (ioflag & (FSYNC | FDSYNC))
                  write_state = WR_COPIED;
          else
                  write_state = WR_NEED_COPY;
  
          if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
                  (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
          }
  
          while (resid) {
--- 455,545 ----
  
  void
  zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
      znode_t *zp, offset_t off, ssize_t resid, int ioflag)
  {
!         spa_t *spa = zilog->zl_spa;
!         spa_meta_placement_t *mp = &spa->spa_meta_policy;
          itx_wr_state_t write_state;
+         boolean_t slogging, zil_to_special, write_to_special;
+         size_t immediate_write_sz;
+         uint32_t blocksize = zp->z_blksz;
          uintptr_t fsync_cnt;
  
          if (zil_replaying(zilog, tx) || zp->z_unlinked)
                  return;
  
!         /*
!          * Decide how to handle the write:
!          * - WR_INDIRECT  - synchronously write in zfs format, via dmu_sync()
!          * - WR_COPIED    - write to slog following the tx descriptor as
!          *                  immediate data
!          * - WR_NEED_COPY - copy out in the future (e.g. with next sync)
!          *
!          * Special vdevs are as fast as slogs - therefore a conservative
!          * extension to the existing logic allows for the following
!          * zpool-configurable options:
!          *
!          * (1) SYNC_TO_SPECIAL_DISABLED: do not use special vdev,
!          *     neither for zil, nor for WR_INDIRECT
!          * (2) SYNC_TO_SPECIAL_STANDARD (default): use special vdev
!          *     exactly like slog
!          * The remaining two options add the capability to sync data to
!          * special vdev:
!          * (3) SYNC_TO_SPECIAL_BALANCED: same as "standard", plus
!          *     load balance writes to the special vdev
!          * (4) SYNC_TO_SPECIAL_ALWAYS: same as "standard" plus always
!          *     write to the special vdev
!          *
!          * Presence of special vdev has no effect if slog is configured:
!          * the latter indicates that user expects conventional zfs
!          * sync-write behavior.
!          */
! 
!         immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
!             ? 0 : zfs_immediate_write_sz;
! 
!         /* use special only if all of the following are true */
!         zil_to_special = !spa_has_slogs(spa) &&
!             spa_can_special_be_used(spa) &&
!             mp->spa_sync_to_special != SYNC_TO_SPECIAL_DISABLED;
! 
!         /*
!          * synchronously write data to special in zfs format - the
!          * WR_INDIRECT case
!          *
!          * for the "balanced" option distribute the load based on the
!          * special-to-normal ratio - the value that is periodically
!          * recomputed by the load balancer implementing one of
!          * SPA_SPECIAL_SELECTION_LATENCY etc. strategies
!          */
!         write_to_special = !spa_has_slogs(spa) &&
!             spa_write_data_to_special(spa, zilog->zl_os) &&
!             (mp->spa_sync_to_special == SYNC_TO_SPECIAL_ALWAYS ||
!             (mp->spa_sync_to_special == SYNC_TO_SPECIAL_BALANCED &&
!             spa->spa_avg_stat_rotor % 100 < spa->spa_special_to_normal_ratio));
! 
!         slogging = (spa_has_slogs(spa) || zil_to_special) &&
!             zilog->zl_logbias == ZFS_LOGBIAS_LATENCY;
! 
!         if (resid > immediate_write_sz && !slogging && resid <= blocksize)
                  write_state = WR_INDIRECT;
+         else if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+                 write_state = WR_INDIRECT;
          else if (!spa_has_slogs(zilog->zl_spa) &&
              resid >= zfs_immediate_write_sz)
                  write_state = WR_INDIRECT;
+         else if (write_to_special)
+                 write_state = WR_INDIRECT;
          else if (ioflag & (FSYNC | FDSYNC))
                  write_state = WR_COPIED;
          else
                  write_state = WR_NEED_COPY;
  
+         DTRACE_PROBE3(zfs_lwr, ssize_t, immediate_write_sz,
+             itx_wr_state_t, write_state, uint_t, zp->z_blksz);
+ 
          if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
                  (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
          }
  
          while (resid) {