Print this page
NEX-5367 special vdev: sync-write options (NEW)
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
NEX-4794 Write Back Cache sync and async writes: adjust routing according to watermark limits
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
6328 Fix cstyle errors in zfs codebase (fix studio)
6328 Fix cstyle errors in zfs codebase
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Alex Reece <alex@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed by: Jorgen Lundman <lundman@lundman.net>
Approved by: Robert Mustacchi <rm@joyent.com>
Issues #7: Reconcile L2ARC and "special" use by datasets
re #12616 rb4051 zfs_log_write()/dmu_sync() write once to special refactoring
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/zfs_log.c
          +++ new/usr/src/uts/common/fs/zfs/zfs_log.c
↓ open down ↓ 12 lines elided ↑ open up ↑
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
       23 + * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  23   24   * Copyright (c) 2015 by Delphix. All rights reserved.
  24   25   * Copyright (c) 2014 Integros [integros.com]
  25   26   */
  26   27  
  27   28  #include <sys/types.h>
  28   29  #include <sys/param.h>
  29   30  #include <sys/systm.h>
  30   31  #include <sys/sysmacros.h>
  31   32  #include <sys/cmn_err.h>
  32   33  #include <sys/kmem.h>
↓ open down ↓ 4 lines elided ↑ open up ↑
  37   38  #include <sys/zfs_dir.h>
  38   39  #include <sys/zil.h>
  39   40  #include <sys/zil_impl.h>
  40   41  #include <sys/byteorder.h>
  41   42  #include <sys/policy.h>
  42   43  #include <sys/stat.h>
  43   44  #include <sys/mode.h>
  44   45  #include <sys/acl.h>
  45   46  #include <sys/dmu.h>
  46   47  #include <sys/spa.h>
       48 +#include <sys/spa_impl.h>
  47   49  #include <sys/zfs_fuid.h>
  48   50  #include <sys/ddi.h>
  49   51  #include <sys/dsl_dataset.h>
       52 +#include <sys/special.h>
  50   53  
  51   54  /*
  52   55   * These zfs_log_* functions must be called within a dmu tx, in one
  53   56   * of 2 contexts depending on zilog->z_replay:
  54   57   *
  55   58   * Non replay mode
  56   59   * ---------------
  57   60   * We need to record the transaction so that if it is committed to
  58   61   * the Intent Log then it can be replayed.  An intent log transaction
  59   62   * structure (itx_t) is allocated and all the information necessary to
↓ open down ↓ 387 lines elided ↑ open up ↑
 447  450  
 448  451  /*
 449  452   * Handles TX_WRITE transactions.
 450  453   */
 451  454  ssize_t zfs_immediate_write_sz = 32768;
 452  455  
 453  456  void
 454  457  zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 455  458      znode_t *zp, offset_t off, ssize_t resid, int ioflag)
 456  459  {
 457      -        uint32_t blocksize = zp->z_blksz;
      460 +        spa_t *spa = zilog->zl_spa;
      461 +        spa_meta_placement_t *mp = &spa->spa_meta_policy;
 458  462          itx_wr_state_t write_state;
      463 +        boolean_t slogging, zil_to_special, write_to_special;
      464 +        size_t immediate_write_sz;
      465 +        uint32_t blocksize = zp->z_blksz;
 459  466          uintptr_t fsync_cnt;
 460  467  
 461  468          if (zil_replaying(zilog, tx) || zp->z_unlinked)
 462  469                  return;
 463  470  
 464      -        if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
      471 +        /*
      472 +         * Decide how to handle the write:
      473 +         * - WR_INDIRECT  - synchronously write in zfs format, via dmu_sync()
      474 +         * - WR_COPIED    - write to slog following the tx descriptor as
      475 +         *                  immediate data
      476 +         * - WR_NEED_COPY - copy out in the future (e.g. with next sync)
      477 +         *
      478 +         * Special vdevs are as fast as slogs - therefore a conservative
      479 +         * extension to the existing logic allows for the following
      480 +         * zpool-configurable options:
      481 +         *
      482 +         * (1) SYNC_TO_SPECIAL_DISABLED: do not use special vdev,
      483 +         *     neither for zil, nor for WR_INDIRECT
      484 +         * (2) SYNC_TO_SPECIAL_STANDARD (default): use special vdev
      485 +         *     exactly like slog
      486 +         * The remaining two options add the capability to sync data to
      487 +         * special vdev:
      488 +         * (3) SYNC_TO_SPECIAL_BALANCED: same as "standard", plus
      489 +         *     load balance writes to the special vdev
      490 +         * (4) SYNC_TO_SPECIAL_ALWAYS: same as "standard" plus always
      491 +         *     write to the special vdev
      492 +         *
      493 +         * Presence of special vdev has no effect if slog is configured:
      494 +         * the latter indicates that user expects conventional zfs
      495 +         * sync-write behavior.
      496 +         */
      497 +
      498 +        immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
      499 +            ? 0 : zfs_immediate_write_sz;
      500 +
      501 +        /* use special only if all of the following are true */
      502 +        zil_to_special = !spa_has_slogs(spa) &&
      503 +            spa_can_special_be_used(spa) &&
      504 +            mp->spa_sync_to_special != SYNC_TO_SPECIAL_DISABLED;
      505 +
      506 +        /*
      507 +         * synchronously write data to special in zfs format - the
      508 +         * WR_INDIRECT case
      509 +         *
      510 +         * for the "balanced" option distribute the load based on the
      511 +         * special-to-normal ratio - the value that is periodically
      512 +         * recomputed by the load balancer implementing one of
      513 +         * SPA_SPECIAL_SELECTION_LATENCY etc. strategies
      514 +         */
      515 +        write_to_special = !spa_has_slogs(spa) &&
      516 +            spa_write_data_to_special(spa, zilog->zl_os) &&
      517 +            (mp->spa_sync_to_special == SYNC_TO_SPECIAL_ALWAYS ||
      518 +            (mp->spa_sync_to_special == SYNC_TO_SPECIAL_BALANCED &&
      519 +            spa->spa_avg_stat_rotor % 100 < spa->spa_special_to_normal_ratio));
      520 +
      521 +        slogging = (spa_has_slogs(spa) || zil_to_special) &&
      522 +            zilog->zl_logbias == ZFS_LOGBIAS_LATENCY;
      523 +
      524 +        if (resid > immediate_write_sz && !slogging && resid <= blocksize)
 465  525                  write_state = WR_INDIRECT;
      526 +        else if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
      527 +                write_state = WR_INDIRECT;
 466  528          else if (!spa_has_slogs(zilog->zl_spa) &&
 467  529              resid >= zfs_immediate_write_sz)
 468  530                  write_state = WR_INDIRECT;
      531 +        else if (write_to_special)
      532 +                write_state = WR_INDIRECT;
 469  533          else if (ioflag & (FSYNC | FDSYNC))
 470  534                  write_state = WR_COPIED;
 471  535          else
 472  536                  write_state = WR_NEED_COPY;
 473  537  
      538 +        DTRACE_PROBE3(zfs_lwr, ssize_t, immediate_write_sz,
      539 +            itx_wr_state_t, write_state, uint_t, zp->z_blksz);
      540 +
 474  541          if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
 475  542                  (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
 476  543          }
 477  544  
 478  545          while (resid) {
 479  546                  itx_t *itx;
 480  547                  lr_write_t *lr;
 481  548                  itx_wr_state_t wr_state = write_state;
 482  549                  ssize_t len = resid;
 483  550  
↓ open down ↓ 186 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX