NEX-19083 backport OS-7314 zil_commit should omit cache thrash
9962 zil_commit should omit cache thrash
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Joshua M. Clulow <josh@sysmgr.org>
NEX-9752 backport illumos 6950 ARC should cache compressed data
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
6950 ARC should cache compressed data
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Don Brady <don.brady@intel.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
NEX-5367 special vdev: sync-write options (NEW)
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
5269 zpool import slow
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Dan McDonald <danmcd@omniti.com>
4370 avoid transmitting holes during zfs send
4371 DMU code clean up
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
Bug 11205: add missing libzfs_closed_stubs.c to fix opensource-only build.
ZFS plus work: special vdevs, cos, cos/vdev properties

          --- old/usr/src/uts/common/fs/zfs/zil.c
          +++ new/usr/src/uts/common/fs/zfs/zil.c
↓ open down ↓ 12 lines elided ↑ open up ↑
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
       23 + * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  23   24   * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  24   25   * Copyright (c) 2014 Integros [integros.com]
  25   26   */
  26   27  
  27   28  /* Portions Copyright 2010 Robert Milkowski */
  28   29  
  29   30  #include <sys/zfs_context.h>
  30   31  #include <sys/spa.h>
  31   32  #include <sys/dmu.h>
  32   33  #include <sys/zap.h>
↓ open down ↓ 51 lines elided ↑ open up ↑
  84   85   * This controls the amount of time that a ZIL block (lwb) will remain
  85   86   * "open" when it isn't "full", and it has a thread waiting for it to be
  86   87   * committed to stable storage. Please refer to the zil_commit_waiter()
  87   88   * function (and the comments within it) for more details.
  88   89   */
  89   90  int zfs_commit_timeout_pct = 5;
  90   91  
  91   92  /*
  92   93   * Disable intent logging replay.  This global ZIL switch affects all pools.
  93   94   */
  94      -int zil_replay_disable = 0;
       95 +int zil_replay_disable = 0;    /* disable intent logging replay */
  95   96  
  96   97  /*
  97   98   * Tunable parameter for debugging or performance analysis.  Setting
  98   99   * zfs_nocacheflush will cause corruption on power loss if a volatile
  99  100   * out-of-order write cache is enabled.
 100  101   */
 101  102  boolean_t zfs_nocacheflush = B_FALSE;
 102  103  
 103  104  /*
 104  105   * Limit SLOG write size per commit executed with synchronous priority.
↓ open down ↓ 406 lines elided ↑ open up ↑
 511  512  zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
 512  513  {
 513  514          ASSERT(MUTEX_HELD(&zilog->zl_lock));
 514  515          ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
 515  516          VERIFY(list_is_empty(&lwb->lwb_waiters));
 516  517          ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
 517  518          ASSERT3P(lwb->lwb_write_zio, ==, NULL);
 518  519          ASSERT3P(lwb->lwb_root_zio, ==, NULL);
 519  520          ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
 520  521          ASSERT(lwb->lwb_state == LWB_STATE_CLOSED ||
 521      -            lwb->lwb_state == LWB_STATE_DONE);
      522 +            lwb->lwb_state == LWB_STATE_FLUSH_DONE);
 522  523  
 523  524          /*
 524  525           * Clear the zilog's field to indicate this lwb is no longer
 525  526           * valid, and prevent use-after-free errors.
 526  527           */
 527  528          if (zilog->zl_last_lwb_opened == lwb)
 528  529                  zilog->zl_last_lwb_opened = NULL;
 529  530  
 530  531          kmem_cache_free(zil_lwb_cache, lwb);
 531  532  }
↓ open down ↓ 337 lines elided ↑ open up ↑
 869  870           * The lwb_waiters field of the lwb is protected by the zilog's
 870  871           * zl_lock, thus it must be held when calling this function.
 871  872           */
 872  873          ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock));
 873  874  
 874  875          mutex_enter(&zcw->zcw_lock);
 875  876          ASSERT(!list_link_active(&zcw->zcw_node));
 876  877          ASSERT3P(zcw->zcw_lwb, ==, NULL);
 877  878          ASSERT3P(lwb, !=, NULL);
 878  879          ASSERT(lwb->lwb_state == LWB_STATE_OPENED ||
 879      -            lwb->lwb_state == LWB_STATE_ISSUED);
      880 +            lwb->lwb_state == LWB_STATE_ISSUED ||
      881 +            lwb->lwb_state == LWB_STATE_WRITE_DONE);
 880  882  
 881  883          list_insert_tail(&lwb->lwb_waiters, zcw);
 882  884          zcw->zcw_lwb = lwb;
 883  885          mutex_exit(&zcw->zcw_lock);
 884  886  }
 885  887  
 886  888  /*
 887  889   * This function is used when zio_alloc_zil() fails to allocate a ZIL
 888  890   * block, and the given waiter must be linked to the "nolwb waiters"
 889  891   * list inside of zil_process_commit_list().
↓ open down ↓ 25 lines elided ↑ open up ↑
 915  917                  zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
 916  918                  if (avl_find(t, &zvsearch, &where) == NULL) {
 917  919                          zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
 918  920                          zv->zv_vdev = zvsearch.zv_vdev;
 919  921                          avl_insert(t, zv, where);
 920  922                  }
 921  923          }
 922  924          mutex_exit(&lwb->lwb_vdev_lock);
 923  925  }
 924  926  
      927 +static void
      928 +zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
      929 +{
      930 +        avl_tree_t *src = &lwb->lwb_vdev_tree;
      931 +        avl_tree_t *dst = &nlwb->lwb_vdev_tree;
      932 +        void *cookie = NULL;
      933 +        zil_vdev_node_t *zv;
      934 +
      935 +        ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
      936 +        ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
      937 +        ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
      938 +
      939 +        /*
       940 +         * At this point in its lifetime, 'lwb' no longer needs the
       941 +         * protection of lwb_vdev_lock for lwb_vdev_tree (it will only
       942 +         * be modified while holding zilog->zl_lock), as its writes and
       943 +         * those of its children have all completed.  The younger 'nlwb'
       944 +         * may be waiting on future writes to additional vdevs.
      945 +         */
      946 +        mutex_enter(&nlwb->lwb_vdev_lock);
      947 +        /*
      948 +         * Tear down the 'lwb' vdev tree, ensuring that entries which do not
      949 +         * exist in 'nlwb' are moved to it, freeing any would-be duplicates.
      950 +         */
      951 +        while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) {
      952 +                avl_index_t where;
      953 +
      954 +                if (avl_find(dst, zv, &where) == NULL) {
      955 +                        avl_insert(dst, zv, where);
      956 +                } else {
      957 +                        kmem_free(zv, sizeof (*zv));
      958 +                }
      959 +        }
      960 +        mutex_exit(&nlwb->lwb_vdev_lock);
      961 +}
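
A minimal userland sketch of the merge-with-dedup idiom used by zil_lwb_flush_defer()
above, written against illumos libavl (compile with -lavl). It is illustrative only and
not part of this change: the vd_node_t type, the guid values, and the use of malloc/free
in place of kmem_alloc/kmem_free are all hypothetical stand-ins.

    #include <sys/types.h>
    #include <sys/avl.h>
    #include <stddef.h>
    #include <stdlib.h>
    #include <stdio.h>

    typedef struct vd_node {
            uint64_t        vd_guid;        /* stands in for zv_vdev */
            avl_node_t      vd_link;
    } vd_node_t;

    static int
    vd_node_cmp(const void *a, const void *b)
    {
            uint64_t ga = ((const vd_node_t *)a)->vd_guid;
            uint64_t gb = ((const vd_node_t *)b)->vd_guid;

            if (ga < gb)
                    return (-1);
            return (ga > gb ? 1 : 0);
    }

    /* Tear down 'src', moving unique nodes into 'dst' and freeing duplicates. */
    static void
    merge_vdev_trees(avl_tree_t *src, avl_tree_t *dst)
    {
            void *cookie = NULL;
            vd_node_t *vd;

            while ((vd = avl_destroy_nodes(src, &cookie)) != NULL) {
                    avl_index_t where;

                    if (avl_find(dst, vd, &where) == NULL)
                            avl_insert(dst, vd, where);
                    else
                            free(vd);
            }
    }

    static void
    add_guid(avl_tree_t *t, uint64_t guid)
    {
            vd_node_t *vd = calloc(1, sizeof (*vd));

            vd->vd_guid = guid;
            avl_add(t, vd);
    }

    int
    main(void)
    {
            avl_tree_t src, dst;

            avl_create(&src, vd_node_cmp, sizeof (vd_node_t),
                offsetof(vd_node_t, vd_link));
            avl_create(&dst, vd_node_cmp, sizeof (vd_node_t),
                offsetof(vd_node_t, vd_link));

            add_guid(&src, 1); add_guid(&src, 2); add_guid(&src, 3);
            add_guid(&dst, 2); add_guid(&dst, 4);

            merge_vdev_trees(&src, &dst);
            printf("dst holds %lu unique vdevs\n", avl_numnodes(&dst));
            return (0);
    }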
      962 +
 925  963  void
 926  964  zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
 927  965  {
 928  966          lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
 929  967  }
 930  968  
 931  969  /*
 932      - * This function is a called after all VDEVs associated with a given lwb
       970 + * This function is called after all vdevs associated with a given lwb
 933  971   * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
 934      - * as the lwb write completes, if "zfs_nocacheflush" is set.
       972 + * as the lwb write completes, if "zfs_nocacheflush" is set. Further,
       973 + * all "previous" lwbs will have completed before this function is
       974 + * called; i.e. this function is called for all previous lwbs before
       975 + * it's called for "this" lwb (enforced via the zio dependencies
       976 + * configured in zil_lwb_set_zio_dependency()).
 935  977   *
 936  978   * The intention is for this function to be called as soon as the
 937  979   * contents of an lwb are considered "stable" on disk, and will survive
 938  980   * any sudden loss of power. At this point, any threads waiting for the
 939  981   * lwb to reach this state are signalled, and the "waiter" structures
 940  982   * are marked "done".
 941  983   */
 942  984  static void
 943  985  zil_lwb_flush_vdevs_done(zio_t *zio)
 944  986  {
↓ open down ↓ 16 lines elided ↑ open up ↑
 961 1003           * zil_process_commit_list(). zil_sync() will only remove the
 962 1004           * lwb if lwb_buf is null.
 963 1005           */
 964 1006          lwb->lwb_buf = NULL;
 965 1007          lwb->lwb_tx = NULL;
 966 1008  
 967 1009          ASSERT3U(lwb->lwb_issued_timestamp, >, 0);
 968 1010          zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp;
 969 1011  
 970 1012          lwb->lwb_root_zio = NULL;
 971      -        lwb->lwb_state = LWB_STATE_DONE;
 972 1013  
     1014 +        ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
     1015 +        lwb->lwb_state = LWB_STATE_FLUSH_DONE;
     1016 +
 973 1017          if (zilog->zl_last_lwb_opened == lwb) {
 974 1018                  /*
 975 1019                   * Remember the highest committed log sequence number
 976 1020                   * for ztest. We only update this value when all the log
 977 1021                   * writes succeeded, because ztest wants to ASSERT that
 978 1022                   * it got the whole log chain.
 979 1023                   */
 980 1024                  zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
 981 1025          }
 982 1026  
↓ open down ↓ 19 lines elided ↑ open up ↑
1002 1046  
1003 1047          /*
1004 1048           * Now that we've written this log block, we have a stable pointer
1005 1049           * to the next block in the chain, so it's OK to let the txg in
1006 1050           * which we allocated the next block sync.
1007 1051           */
1008 1052          dmu_tx_commit(tx);
1009 1053  }
1010 1054  
1011 1055  /*
1012      - * This is called when an lwb write completes. This means, this specific
1013      - * lwb was written to disk, and all dependent lwb have also been
1014      - * written to disk.
1015      - *
1016      - * At this point, a DKIOCFLUSHWRITECACHE command hasn't been issued to
1017      - * the VDEVs involved in writing out this specific lwb. The lwb will be
1018      - * "done" once zil_lwb_flush_vdevs_done() is called, which occurs in the
1019      - * zio completion callback for the lwb's root zio.
     1056 + * This is called when an lwb's write zio completes. The callback's
     1057 + * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs
     1058 + * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved
     1059 + * in writing out this specific lwb's data, and in the case that cache
     1060 + * flushes have been deferred, vdevs involved in writing the data for
     1061 + * previous lwbs. The writes corresponding to all the vdevs in the
     1062 + * lwb_vdev_tree will have completed by the time this is called, due to
     1063 + * the zio dependencies configured in zil_lwb_set_zio_dependency(),
     1064 + * which takes deferred flushes into account. The lwb will be "done"
     1065 + * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio
     1066 + * completion callback for the lwb's root zio.
1020 1067   */
1021 1068  static void
1022 1069  zil_lwb_write_done(zio_t *zio)
1023 1070  {
1024 1071          lwb_t *lwb = zio->io_private;
1025 1072          spa_t *spa = zio->io_spa;
1026 1073          zilog_t *zilog = lwb->lwb_zilog;
1027 1074          avl_tree_t *t = &lwb->lwb_vdev_tree;
1028 1075          void *cookie = NULL;
1029 1076          zil_vdev_node_t *zv;
     1077 +        lwb_t *nlwb;
1030 1078  
1031 1079          ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
1032 1080  
1033 1081          ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
1034 1082          ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
1035 1083          ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
1036 1084          ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
1037 1085          ASSERT(!BP_IS_GANG(zio->io_bp));
1038 1086          ASSERT(!BP_IS_HOLE(zio->io_bp));
1039 1087          ASSERT(BP_GET_FILL(zio->io_bp) == 0);
1040 1088  
1041 1089          abd_put(zio->io_abd);
1042 1090  
1043      -        ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
1044      -
1045 1091          mutex_enter(&zilog->zl_lock);
     1092 +        ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
     1093 +        lwb->lwb_state = LWB_STATE_WRITE_DONE;
1046 1094          lwb->lwb_write_zio = NULL;
     1095 +        nlwb = list_next(&zilog->zl_lwb_list, lwb);
1047 1096          mutex_exit(&zilog->zl_lock);
1048 1097  
1049 1098          if (avl_numnodes(t) == 0)
1050 1099                  return;
1051 1100  
1052 1101          /*
1053 1102           * If there was an IO error, we're not going to call zio_flush()
1054 1103           * on these vdevs, so we simply empty the tree and free the
1055 1104           * nodes. We avoid calling zio_flush() since there isn't any
1056 1105           * good reason for doing so, after the lwb block failed to be
1057 1106           * written out.
1058 1107           */
1059 1108          if (zio->io_error != 0) {
1060 1109                  while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
1061 1110                          kmem_free(zv, sizeof (*zv));
1062 1111                  return;
1063 1112          }
1064 1113  
     1114 +        /*
     1115 +         * If this lwb does not have any threads waiting for it to
     1116 +         * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE
     1117 +         * command to the vdevs written to by "this" lwb, and instead
     1118 +         * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE
     1119 +         * command for those vdevs. Thus, we merge the vdev tree of
     1120 +         * "this" lwb with the vdev tree of the "next" lwb in the list,
     1121 +         * and assume the "next" lwb will handle flushing the vdevs (or
      1122 +         * deferring the flush(es) again).
     1123 +         *
     1124 +         * This is a useful performance optimization, especially for
     1125 +         * workloads with lots of async write activity and few sync
     1126 +         * write and/or fsync activity, as it has the potential to
     1127 +         * coalesce multiple flush commands to a vdev into one.
     1128 +         */
     1129 +        if (list_head(&lwb->lwb_waiters) == NULL && nlwb != NULL) {
     1130 +                zil_lwb_flush_defer(lwb, nlwb);
     1131 +                ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
     1132 +                return;
     1133 +        }
     1134 +
1065 1135          while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
1066 1136                  vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
1067 1137                  if (vd != NULL)
1068 1138                          zio_flush(lwb->lwb_root_zio, vd);
1069 1139                  kmem_free(zv, sizeof (*zv));
1070 1140          }
1071 1141  }
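
A toy model (not ZFS code, not from this webrev) of the flush-coalescing behaviour
described in the comment above. The lwb stream and waiter pattern below are hypothetical;
the point is only that deferring the flush to the next lwb turns one DKIOCFLUSHWRITECACHE
per lwb into roughly one per commit waiter.

    #include <stdio.h>
    #include <stdbool.h>

    int
    main(void)
    {
            /* true == some thread called zil_commit() and waits on this lwb */
            bool has_waiter[] = { false, false, false, true, false, false, true };
            int n = sizeof (has_waiter) / sizeof (has_waiter[0]);
            int eager = 0, deferred = 0;

            for (int i = 0; i < n; i++) {
                    eager++;                /* old behaviour: flush every lwb */

                    /*
                     * New behaviour: flush only when a waiter needs the data
                     * to be stable, or when there is no younger lwb to defer to.
                     */
                    if (has_waiter[i] || i == n - 1)
                            deferred++;
            }

            printf("flush commands issued: eager=%d deferred=%d\n", eager, deferred);
            return (0);
    }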
1072 1142  
     1143 +static void
     1144 +zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
     1145 +{
     1146 +        lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
     1147 +
     1148 +        ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
     1149 +        ASSERT(MUTEX_HELD(&zilog->zl_lock));
     1150 +
     1151 +        /*
     1152 +         * The zilog's "zl_last_lwb_opened" field is used to build the
     1153 +         * lwb/zio dependency chain, which is used to preserve the
     1154 +         * ordering of lwb completions that is required by the semantics
     1155 +         * of the ZIL. Each new lwb zio becomes a parent of the
     1156 +         * "previous" lwb zio, such that the new lwb's zio cannot
     1157 +         * complete until the "previous" lwb's zio completes.
     1158 +         *
     1159 +         * This is required by the semantics of zil_commit(); the commit
     1160 +         * waiters attached to the lwbs will be woken in the lwb zio's
     1161 +         * completion callback, so this zio dependency graph ensures the
     1162 +         * waiters are woken in the correct order (the same order the
     1163 +         * lwbs were created).
     1164 +         */
     1165 +        if (last_lwb_opened != NULL &&
     1166 +            last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) {
     1167 +                ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
     1168 +                    last_lwb_opened->lwb_state == LWB_STATE_ISSUED ||
     1169 +                    last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE);
     1170 +
     1171 +                ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
     1172 +                zio_add_child(lwb->lwb_root_zio,
     1173 +                    last_lwb_opened->lwb_root_zio);
     1174 +
     1175 +                /*
     1176 +                 * If the previous lwb's write hasn't already completed,
     1177 +                 * we also want to order the completion of the lwb write
     1178 +                 * zios (above, we only order the completion of the lwb
     1179 +                 * root zios). This is required because of how we can
     1180 +                 * defer the DKIOCFLUSHWRITECACHE commands for each lwb.
     1181 +                 *
      1182 +                 * When the DKIOCFLUSHWRITECACHE commands are deferred,
     1183 +                 * the previous lwb will rely on this lwb to flush the
     1184 +                 * vdevs written to by that previous lwb. Thus, we need
     1185 +                 * to ensure this lwb doesn't issue the flush until
     1186 +                 * after the previous lwb's write completes. We ensure
     1187 +                 * this ordering by setting the zio parent/child
     1188 +                 * relationship here.
     1189 +                 *
     1190 +                 * Without this relationship on the lwb's write zio,
     1191 +                 * it's possible for this lwb's write to complete prior
     1192 +                 * to the previous lwb's write completing; and thus, the
     1193 +                 * vdevs for the previous lwb would be flushed prior to
     1194 +                 * that lwb's data being written to those vdevs (the
     1195 +                 * vdevs are flushed in the lwb write zio's completion
     1196 +                 * handler, zil_lwb_write_done()).
     1197 +                 */
     1198 +                if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) {
     1199 +                        ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
     1200 +                            last_lwb_opened->lwb_state == LWB_STATE_ISSUED);
     1201 +
     1202 +                        ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL);
     1203 +                        zio_add_child(lwb->lwb_write_zio,
     1204 +                            last_lwb_opened->lwb_write_zio);
     1205 +                }
     1206 +        }
     1207 +}
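
A condensed, illustrative restatement of the rules zil_lwb_set_zio_dependency() applies
above, using a stand-alone model of the lwb states this webrev introduces. The enum and
the lwb_dependencies_needed() helper are hypothetical (the real state definitions live in
zil_impl.h): the new lwb's root zio always depends on the previous root zio unless that
lwb is already FLUSH_DONE, and its write zio additionally depends on the previous write
zio while that write is still outstanding, so a deferred flush cannot run ahead of the
data it covers.

    #include <stdio.h>

    typedef enum {
            LWB_STATE_CLOSED,
            LWB_STATE_OPENED,
            LWB_STATE_ISSUED,
            LWB_STATE_WRITE_DONE,   /* write zio done; flush possibly deferred */
            LWB_STATE_FLUSH_DONE    /* flush done; commit waiters may be woken */
    } lwb_state_t;

    #define DEP_NONE        0x0
    #define DEP_ROOT        0x1     /* chain the root zios */
    #define DEP_WRITE       0x2     /* additionally chain the write zios */

    /*
     * Which parent/child edges the new lwb needs, given the state of the
     * previously opened lwb (no edges at all if there is no previous lwb).
     */
    static int
    lwb_dependencies_needed(lwb_state_t prev_state)
    {
            if (prev_state == LWB_STATE_FLUSH_DONE)
                    return (DEP_NONE);              /* nothing left to wait on */
            if (prev_state == LWB_STATE_WRITE_DONE)
                    return (DEP_ROOT);              /* only the flush is pending */
            return (DEP_ROOT | DEP_WRITE);          /* write still outstanding */
    }

    int
    main(void)
    {
            printf("prev ISSUED     -> deps 0x%x\n",
                lwb_dependencies_needed(LWB_STATE_ISSUED));
            printf("prev WRITE_DONE -> deps 0x%x\n",
                lwb_dependencies_needed(LWB_STATE_WRITE_DONE));
            printf("prev FLUSH_DONE -> deps 0x%x\n",
                lwb_dependencies_needed(LWB_STATE_FLUSH_DONE));
            return (0);
    }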
     1208 +
     1209 +
1073 1210  /*
1074 1211   * This function's purpose is to "open" an lwb such that it is ready to
1075 1212   * accept new itxs being committed to it. To do this, the lwb's zio
1076 1213   * structures are created, and linked to the lwb. This function is
1077 1214   * idempotent; if the passed in lwb has already been opened, this
1078 1215   * function is essentially a no-op.
1079 1216   */
1080 1217  static void
1081 1218  zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
1082 1219  {
↓ open down ↓ 24 lines elided ↑ open up ↑
1107 1244  
1108 1245                  lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio,
1109 1246                      zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd,
1110 1247                      BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb,
1111 1248                      prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
1112 1249                  ASSERT3P(lwb->lwb_write_zio, !=, NULL);
1113 1250  
1114 1251                  lwb->lwb_state = LWB_STATE_OPENED;
1115 1252  
1116 1253                  mutex_enter(&zilog->zl_lock);
1117      -
1118      -                /*
1119      -                 * The zilog's "zl_last_lwb_opened" field is used to
1120      -                 * build the lwb/zio dependency chain, which is used to
1121      -                 * preserve the ordering of lwb completions that is
1122      -                 * required by the semantics of the ZIL. Each new lwb
1123      -                 * zio becomes a parent of the "previous" lwb zio, such
1124      -                 * that the new lwb's zio cannot complete until the
1125      -                 * "previous" lwb's zio completes.
1126      -                 *
1127      -                 * This is required by the semantics of zil_commit();
1128      -                 * the commit waiters attached to the lwbs will be woken
1129      -                 * in the lwb zio's completion callback, so this zio
1130      -                 * dependency graph ensures the waiters are woken in the
1131      -                 * correct order (the same order the lwbs were created).
1132      -                 */
1133      -                lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
1134      -                if (last_lwb_opened != NULL &&
1135      -                    last_lwb_opened->lwb_state != LWB_STATE_DONE) {
1136      -                        ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
1137      -                            last_lwb_opened->lwb_state == LWB_STATE_ISSUED);
1138      -                        ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
1139      -                        zio_add_child(lwb->lwb_root_zio,
1140      -                            last_lwb_opened->lwb_root_zio);
1141      -                }
     1254 +                zil_lwb_set_zio_dependency(zilog, lwb);
1142 1255                  zilog->zl_last_lwb_opened = lwb;
1143      -
1144 1256                  mutex_exit(&zilog->zl_lock);
1145 1257          }
1146 1258  
1147 1259          ASSERT3P(lwb->lwb_root_zio, !=, NULL);
1148 1260          ASSERT3P(lwb->lwb_write_zio, !=, NULL);
1149 1261          ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
1150 1262  }
1151 1263  
1152 1264  /*
1153 1265   * Define a limited set of intent log block sizes.
↓ open down ↓ 47 lines elided ↑ open up ↑
1201 1313           * Note that if the allocation of nlwb synced before we wrote
1202 1314           * the block that points at it (lwb), we'd leak it if we crashed.
1203 1315           * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
1204 1316           * We dirty the dataset to ensure that zil_sync() will be called
1205 1317           * to clean up in the event of allocation failure or I/O failure.
1206 1318           */
1207 1319  
1208 1320          tx = dmu_tx_create(zilog->zl_os);
1209 1321  
1210 1322          /*
1211      -         * Since we are not going to create any new dirty data, and we
1212      -         * can even help with clearing the existing dirty data, we
1213      -         * should not be subject to the dirty data based delays. We
1214      -         * use TXG_NOTHROTTLE to bypass the delay mechanism.
     1323 +         * Since we are not going to create any new dirty data and we can even
     1324 +         * help with clearing the existing dirty data, we should not be subject
     1325 +         * to the dirty data based delays.
     1326 +         * We (ab)use TXG_WAITED to bypass the delay mechanism.
     1327 +         * One side effect from using TXG_WAITED is that dmu_tx_assign() can
     1328 +         * fail if the pool is suspended.  Those are dramatic circumstances,
     1329 +         * so we return NULL to signal that the normal ZIL processing is not
     1330 +         * possible and txg_wait_synced() should be used to ensure that the data
     1331 +         * is on disk.
1215 1332           */
1216      -        VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
1217      -
     1333 +        error = dmu_tx_assign(tx, TXG_WAITED);
     1334 +        if (error != 0) {
     1335 +                ASSERT3S(error, ==, EIO);
     1336 +                dmu_tx_abort(tx);
     1337 +                return (NULL);
     1338 +        }
1218 1339          dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
1219 1340          txg = dmu_tx_get_txg(tx);
1220 1341  
1221 1342          lwb->lwb_tx = tx;
1222 1343  
1223 1344          /*
1224 1345           * Log blocks are pre-allocated. Here we select the size of the next
1225 1346           * block, based on size used in the last block.
1226 1347           * - first find the smallest bucket that will fit the block from a
1227 1348           *   limited set of block sizes. This is because it's faster to write
↓ open down ↓ 605 lines elided ↑ open up ↑
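
A userland toy (not ZFS code) mirroring the fallback contract described in the TXG_WAITED
comment in the hunk above: if the tx cannot be assigned because the pool is suspended, the
issue path aborts the tx and returns NULL, and the caller is expected to fall back to the
equivalent of txg_wait_synced(). All toy_* names below are hypothetical stand-ins for the
kernel interfaces.

    #include <stdio.h>
    #include <stdbool.h>
    #include <stddef.h>
    #include <errno.h>

    struct toy_tx { bool assigned; };

    static bool pool_suspended = true;      /* flip to false for the normal path */

    static int
    toy_tx_assign(struct toy_tx *tx)
    {
            if (pool_suspended)
                    return (EIO);           /* mirrors dmu_tx_assign() failing */
            tx->assigned = true;
            return (0);
    }

    /* Returns NULL when normal ZIL processing is not possible. */
    static struct toy_tx *
    toy_lwb_write_issue(struct toy_tx *tx)
    {
            if (toy_tx_assign(tx) != 0)
                    return (NULL);          /* caller must wait for the txg to sync */
            return (tx);
    }

    int
    main(void)
    {
            struct toy_tx tx = { false };

            if (toy_lwb_write_issue(&tx) == NULL)
                    printf("pool suspended: falling back to txg_wait_synced()\n");
            else
                    printf("lwb issued; tx assigned to a txg\n");
            return (0);
    }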
1833 1954          ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
1834 1955  
1835 1956          while (itx = list_head(&zilog->zl_itx_commit_list)) {
1836 1957                  lr_t *lrc = &itx->itx_lr;
1837 1958                  if (lrc->lrc_txtype != TX_COMMIT)
1838 1959                          break;
1839 1960  
1840 1961                  mutex_enter(&zilog->zl_lock);
1841 1962  
1842 1963                  lwb_t *last_lwb = zilog->zl_last_lwb_opened;
1843      -                if (last_lwb == NULL || last_lwb->lwb_state == LWB_STATE_DONE) {
     1964 +                if (last_lwb == NULL ||
     1965 +                    last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) {
1844 1966                          /*
1845 1967                           * All of the itxs this waiter was waiting on
1846 1968                           * must have already completed (or there were
1847 1969                           * never any itx's for it to wait on), so it's
1848 1970                           * safe to skip this waiter and mark it done.
1849 1971                           */
1850 1972                          zil_commit_waiter_skip(itx->itx_private);
1851 1973                  } else {
1852 1974                          zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
1853 1975                          itx->itx_private = NULL;
↓ open down ↓ 60 lines elided ↑ open up ↑
1914 2036                  return;
1915 2037  
1916 2038          list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
1917 2039              offsetof(zil_commit_waiter_t, zcw_node));
1918 2040  
1919 2041          lwb = list_tail(&zilog->zl_lwb_list);
1920 2042          if (lwb == NULL) {
1921 2043                  lwb = zil_create(zilog);
1922 2044          } else {
1923 2045                  ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
1924      -                ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE);
     2046 +                ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
     2047 +                ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
1925 2048          }
1926 2049  
1927 2050          while (itx = list_head(&zilog->zl_itx_commit_list)) {
1928 2051                  lr_t *lrc = &itx->itx_lr;
1929 2052                  uint64_t txg = lrc->lrc_txg;
1930 2053  
1931 2054                  ASSERT3U(txg, !=, 0);
1932 2055  
1933 2056                  if (lrc->lrc_txtype == TX_COMMIT) {
1934 2057                          DTRACE_PROBE2(zil__process__commit__itx,
↓ open down ↓ 81 lines elided ↑ open up ↑
2016 2139                   */
2017 2140                  zil_commit_waiter_t *zcw;
2018 2141                  while (zcw = list_head(&nolwb_waiters)) {
2019 2142                          zil_commit_waiter_skip(zcw);
2020 2143                          list_remove(&nolwb_waiters, zcw);
2021 2144                  }
2022 2145          } else {
2023 2146                  ASSERT(list_is_empty(&nolwb_waiters));
2024 2147                  ASSERT3P(lwb, !=, NULL);
2025 2148                  ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
2026      -                ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE);
     2149 +                ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
     2150 +                ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
2027 2151  
2028 2152                  /*
2029 2153                   * At this point, the ZIL block pointed at by the "lwb"
2030 2154                   * variable is in one of the following states: "closed"
2031 2155                   * or "open".
2032 2156                   *
2033 2157                   * If its "closed", then no itxs have been committed to
2034 2158                   * it, so there's no point in issuing its zio (i.e.
2035 2159                   * it's "empty").
2036 2160                   *
↓ open down ↓ 100 lines elided ↑ open up ↑
2137 2261          ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED);
2138 2262  
2139 2263          /*
2140 2264           * If the lwb has already been issued by another thread, we can
2141 2265           * immediately return since there's no work to be done (the
2142 2266           * point of this function is to issue the lwb). Additionally, we
2143 2267           * do this prior to acquiring the zl_issuer_lock, to avoid
2144 2268           * acquiring it when it's not necessary to do so.
2145 2269           */
2146 2270          if (lwb->lwb_state == LWB_STATE_ISSUED ||
2147      -            lwb->lwb_state == LWB_STATE_DONE)
     2271 +            lwb->lwb_state == LWB_STATE_WRITE_DONE ||
     2272 +            lwb->lwb_state == LWB_STATE_FLUSH_DONE)
2148 2273                  return;
2149 2274  
2150 2275          /*
2151 2276           * In order to call zil_lwb_write_issue() we must hold the
2152 2277           * zilog's "zl_issuer_lock". We can't simply acquire that lock,
2153 2278           * since we're already holding the commit waiter's "zcw_lock",
2154 2279           * and those two locks are aquired in the opposite order
2155 2280           * elsewhere.
2156 2281           */
2157 2282          mutex_exit(&zcw->zcw_lock);
↓ open down ↓ 27 lines elided ↑ open up ↑
2185 2310           * The important thing, is we treat the lwb differently depending on
2186 2311           * if it's ISSUED or OPENED, and block any other threads that might
2187 2312           * attempt to issue this lwb. For that reason we hold the
2188 2313           * zl_issuer_lock when checking the lwb_state; we must not call
2189 2314           * zil_lwb_write_issue() if the lwb had already been issued.
2190 2315           *
2191 2316           * See the comment above the lwb_state_t structure definition for
2192 2317           * more details on the lwb states, and locking requirements.
2193 2318           */
2194 2319          if (lwb->lwb_state == LWB_STATE_ISSUED ||
2195      -            lwb->lwb_state == LWB_STATE_DONE)
     2320 +            lwb->lwb_state == LWB_STATE_WRITE_DONE ||
     2321 +            lwb->lwb_state == LWB_STATE_FLUSH_DONE)
2196 2322                  goto out;
2197 2323  
2198 2324          ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
2199 2325  
2200 2326          /*
2201 2327           * As described in the comments above zil_commit_waiter() and
2202 2328           * zil_process_commit_list(), we need to issue this lwb's zio
2203 2329           * since we've reached the commit waiter's timeout and it still
2204 2330           * hasn't been issued.
2205 2331           */
↓ open down ↓ 152 lines elided ↑ open up ↑
2358 2484                           * complete.
2359 2485                           *
2360 2486                           * Additionally, if the lwb is NULL, the waiter
2361 2487                           * will soon be signalled and marked done via
2362 2488                           * zil_clean() and zil_itxg_clean(), so no timeout
2363 2489                           * is required.
2364 2490                           */
2365 2491  
2366 2492                          IMPLY(lwb != NULL,
2367 2493                              lwb->lwb_state == LWB_STATE_ISSUED ||
2368      -                            lwb->lwb_state == LWB_STATE_DONE);
     2494 +                            lwb->lwb_state == LWB_STATE_WRITE_DONE ||
     2495 +                            lwb->lwb_state == LWB_STATE_FLUSH_DONE);
2369 2496                          cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
2370 2497                  }
2371 2498          }
2372 2499  
2373 2500          mutex_exit(&zcw->zcw_lock);
2374 2501  }
2375 2502  
2376 2503  static zil_commit_waiter_t *
2377 2504  zil_alloc_commit_waiter()
2378 2505  {
↓ open down ↓ 621 lines elided ↑ open up ↑
3000 3127  
3001 3128          zilog->zl_suspending = B_TRUE;
3002 3129          mutex_exit(&zilog->zl_lock);
3003 3130  
3004 3131          /*
3005 3132           * We need to use zil_commit_impl to ensure we wait for all
3006 3133           * LWB_STATE_OPENED and LWB_STATE_ISSUED lwb's to be committed
3007 3134           * to disk before proceeding. If we used zil_commit instead, it
3008 3135           * would just call txg_wait_synced(), because zl_suspend is set.
3009 3136           * txg_wait_synced() doesn't wait for these lwb's to be
3010      -         * LWB_STATE_DONE before returning.
     3137 +         * LWB_STATE_FLUSH_DONE before returning.
3011 3138           */
3012 3139          zil_commit_impl(zilog, 0);
3013 3140  
3014 3141          /*
3015      -         * Now that we've ensured all lwb's are LWB_STATE_DONE, we use
3016      -         * txg_wait_synced() to ensure the data from the zilog has
     3142 +         * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we
     3143 +         * use txg_wait_synced() to ensure the data from the zilog has
3017 3144           * migrated to the main pool before calling zil_destroy().
3018 3145           */
3019 3146          txg_wait_synced(zilog->zl_dmu_pool, 0);
3020 3147  
3021 3148          zil_destroy(zilog, B_FALSE);
3022 3149  
3023 3150          mutex_enter(&zilog->zl_lock);
3024 3151          zilog->zl_suspending = B_FALSE;
3025 3152          cv_broadcast(&zilog->zl_cv_suspend);
3026 3153          mutex_exit(&zilog->zl_lock);
↓ open down ↓ 183 lines elided ↑ open up ↑
3210 3337                  zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
3211 3338                      zilog->zl_replaying_seq;
3212 3339                  return (B_TRUE);
3213 3340          }
3214 3341  
3215 3342          return (B_FALSE);
3216 3343  }
3217 3344  
3218 3345  /* ARGSUSED */
3219 3346  int
3220      -zil_reset(const char *osname, void *arg)
     3347 +zil_vdev_offline(const char *osname, void *arg)
3221 3348  {
3222 3349          int error;
3223 3350  
3224 3351          error = zil_suspend(osname, NULL);
3225 3352          if (error != 0)
3226 3353                  return (SET_ERROR(EEXIST));
3227 3354          return (0);
3228 3355  }
    