Print this page
    
6281 prefetching should apply to 1MB reads
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Alexander Motin <mav@freebsd.org>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: Justin Gibbs <gibbs@scsiguy.com>
Reviewed by: Xin Li <delphij@freebsd.org>
Approved by: Gordon Ross <gordon.ross@nexenta.com>
5987 zfs prefetch code needs work
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Approved by: Gordon Ross <gordon.ross@nexenta.com>
NEX-4582 update wrc test cases to allow use of write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/zfs/dmu_zfetch.c
          +++ new/usr/src/uts/common/fs/zfs/dmu_zfetch.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
  25   25  
  26   26  /*
  27   27   * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
  28   28   */
  29   29  
  30   30  #include <sys/zfs_context.h>
  31   31  #include <sys/dnode.h>
  32   32  #include <sys/dmu_objset.h>
  33   33  #include <sys/dmu_zfetch.h>
  34   34  #include <sys/dmu.h>
  35   35  #include <sys/dbuf.h>
  36   36  #include <sys/kstat.h>
  37   37  
  38   38  /*
  39   39   * This tunable disables predictive prefetch.  Note that it leaves "prescient"
  40   40   * prefetch (e.g. prefetch for zfs send) intact.  Unlike predictive prefetch,
  41   41   * prescient prefetch never issues i/os that end up not being needed,
  42   42   * so it can't hurt performance.
  43   43   */
  44   44  boolean_t zfs_prefetch_disable = B_FALSE;
  45   45  
  46   46  /* max # of streams per zfetch */
  47   47  uint32_t        zfetch_max_streams = 8;
  48   48  /* min time before stream reclaim */
  49   49  uint32_t        zfetch_min_sec_reap = 2;
  50   50  /* max bytes to prefetch per stream (default 8MB) */
  51   51  uint32_t        zfetch_max_distance = 8 * 1024 * 1024;
  52   52  /* max bytes to prefetch indirects for per stream (default 64MB) */
  53   53  uint32_t        zfetch_max_idistance = 64 * 1024 * 1024;
  54   54  /* max number of bytes in an array_read in which we allow prefetching (1MB) */
  55   55  uint64_t        zfetch_array_rd_sz = 1024 * 1024;
  56   56  
  57   57  typedef struct zfetch_stats {
  58   58          kstat_named_t zfetchstat_hits;
  59   59          kstat_named_t zfetchstat_misses;
  60   60          kstat_named_t zfetchstat_max_streams;
  61   61  } zfetch_stats_t;
  62   62  
  63   63  static zfetch_stats_t zfetch_stats = {
  64   64          { "hits",                       KSTAT_DATA_UINT64 },
  65   65          { "misses",                     KSTAT_DATA_UINT64 },
  66   66          { "max_streams",                KSTAT_DATA_UINT64 },
  67   67  };
  68   68  
  69   69  #define ZFETCHSTAT_BUMP(stat) \
  70   70          atomic_inc_64(&zfetch_stats.stat.value.ui64);
  71   71  
  72   72  kstat_t         *zfetch_ksp;
  73   73  
  74   74  void
  75   75  zfetch_init(void)
  76   76  {
  77   77          zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
  78   78              KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
  79   79              KSTAT_FLAG_VIRTUAL);
  80   80  
  81   81          if (zfetch_ksp != NULL) {
  82   82                  zfetch_ksp->ks_data = &zfetch_stats;
  83   83                  kstat_install(zfetch_ksp);
  84   84          }
  85   85  }
  86   86  
  87   87  void
  88   88  zfetch_fini(void)
  89   89  {
  90   90          if (zfetch_ksp != NULL) {
  91   91                  kstat_delete(zfetch_ksp);
  92   92                  zfetch_ksp = NULL;
  93   93          }
  94   94  }
  95   95  
  96   96  /*
  97   97   * This takes a pointer to a zfetch structure and a dnode.  It performs the
  98   98   * necessary setup for the zfetch structure, grokking data from the
  99   99   * associated dnode.
 100  100   */
 101  101  void
 102  102  dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
 103  103  {
 104  104          if (zf == NULL)
 105  105                  return;
 106  106  
 107  107          zf->zf_dnode = dno;
 108  108  
 109  109          list_create(&zf->zf_stream, sizeof (zstream_t),
 110  110              offsetof(zstream_t, zs_node));
 111  111  
 112  112          rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
 113  113  }
 114  114  
 115  115  static void
 116  116  dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
 117  117  {
 118  118          ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
 119  119          list_remove(&zf->zf_stream, zs);
 120  120          mutex_destroy(&zs->zs_lock);
 121  121          kmem_free(zs, sizeof (*zs));
 122  122  }
 123  123  
 124  124  /*
 125  125   * Clean-up state associated with a zfetch structure (e.g. destroy the
 126  126   * streams).  This doesn't free the zfetch_t itself, that's left to the caller.
 127  127   */
 128  128  void
 129  129  dmu_zfetch_fini(zfetch_t *zf)
 130  130  {
 131  131          zstream_t *zs;
 132  132  
 133  133          ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
 134  134  
 135  135          rw_enter(&zf->zf_rwlock, RW_WRITER);
 136  136          while ((zs = list_head(&zf->zf_stream)) != NULL)
 137  137                  dmu_zfetch_stream_remove(zf, zs);
 138  138          rw_exit(&zf->zf_rwlock);
 139  139          list_destroy(&zf->zf_stream);
 140  140          rw_destroy(&zf->zf_rwlock);
 141  141  
 142  142          zf->zf_dnode = NULL;
 143  143  }
 144  144  
 145  145  /*
 146  146   * If there aren't too many streams already, create a new stream.
 147  147   * The "blkid" argument is the next block that we expect this stream to access.
 148  148   * While we're here, clean up old streams (which haven't been
 149  149   * accessed for at least zfetch_min_sec_reap seconds).
 150  150   */
 151  151  static void
 152  152  dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 153  153  {
 154  154          zstream_t *zs_next;
 155  155          int numstreams = 0;
 156  156  
 157  157          ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
 158  158  
 159  159          /*
 160  160           * Clean up old streams.
 161  161           */
 162  162          for (zstream_t *zs = list_head(&zf->zf_stream);
 163  163              zs != NULL; zs = zs_next) {
 164  164                  zs_next = list_next(&zf->zf_stream, zs);
 165  165                  if (((gethrtime() - zs->zs_atime) / NANOSEC) >
 166  166                      zfetch_min_sec_reap)
 167  167                          dmu_zfetch_stream_remove(zf, zs);
 168  168                  else
 169  169                          numstreams++;
 170  170          }
 171  171  
 172  172          /*
 173  173           * The maximum number of streams is normally zfetch_max_streams,
 174  174           * but for small files we lower it such that it's at least possible
 175  175           * for all the streams to be non-overlapping.
 176  176           *
 177  177           * If we are already at the maximum number of streams for this file,
 178  178           * even after removing old streams, then don't create this stream.
 179  179           */
 180  180          uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
 181  181              zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
 182  182              zfetch_max_distance));
 183  183          if (numstreams >= max_streams) {
 184  184                  ZFETCHSTAT_BUMP(zfetchstat_max_streams);
 185  185                  return;
 186  186          }
 187  187  
 188  188          zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
 189  189          zs->zs_blkid = blkid;
 190  190          zs->zs_pf_blkid = blkid;
 191  191          zs->zs_ipf_blkid = blkid;
 192  192          zs->zs_atime = gethrtime();
 193  193          mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
 194  194  
 195  195          list_insert_head(&zf->zf_stream, zs);
 196  196  }
 197  197  
 198  198  /*
 199  199   * This is the predictive prefetch entry point.  It associates dnode access
 200  200   * specified with blkid and nblks arguments with prefetch stream, predicts
 201  201   * further accesses based on that stats and initiates speculative prefetch.
 202  202   * fetch_data argument specifies whether actual data blocks should be fetched:
 203  203   *   FALSE -- prefetch only indirect blocks for predicted data blocks;
  
    | 
      ↓ open down ↓ | 
    203 lines elided | 
    
      ↑ open up ↑ | 
  
 204  204   *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
 205  205   */
 206  206  void
 207  207  dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
 208  208  {
 209  209          zstream_t *zs;
 210  210          int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
 211  211          int64_t pf_ahead_blks, max_blks;
 212  212          int epbs, max_dist_blks, pf_nblks, ipf_nblks;
 213  213          uint64_t end_of_access_blkid = blkid + nblks;
 214      -        spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
 215  214  
 216  215          if (zfs_prefetch_disable)
 217  216                  return;
 218  217  
 219  218          /*
 220      -         * If we haven't yet loaded the indirect vdevs' mappings, we
 221      -         * can only read from blocks that we carefully ensure are on
 222      -         * concrete vdevs (or previously-loaded indirect vdevs).  So we
 223      -         * can't allow the predictive prefetcher to attempt reads of other
  224      -         * blocks (e.g. of the MOS's dnode object).
 225      -         */
 226      -        if (!spa_indirect_vdevs_loaded(spa))
 227      -                return;
 228      -
 229      -        /*
 230  219           * As a fast path for small (single-block) files, ignore access
 231  220           * to the first block.
 232  221           */
 233  222          if (blkid == 0)
 234  223                  return;
 235  224  
 236  225          rw_enter(&zf->zf_rwlock, RW_READER);
 237  226  
 238      -        /*
 239      -         * Find matching prefetch stream.  Depending on whether the accesses
 240      -         * are block-aligned, first block of the new access may either follow
 241      -         * the last block of the previous access, or be equal to it.
 242      -         */
 243  227          for (zs = list_head(&zf->zf_stream); zs != NULL;
 244  228              zs = list_next(&zf->zf_stream, zs)) {
 245      -                if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) {
      229 +                if (blkid == zs->zs_blkid) {
 246  230                          mutex_enter(&zs->zs_lock);
 247  231                          /*
 248  232                           * zs_blkid could have changed before we
 249  233                           * acquired zs_lock; re-check them here.
 250  234                           */
 251      -                        if (blkid == zs->zs_blkid) {
 252      -                                break;
 253      -                        } else if (blkid + 1 == zs->zs_blkid) {
 254      -                                blkid++;
 255      -                                nblks--;
 256      -                                if (nblks == 0) {
 257      -                                        /* Already prefetched this before. */
 258      -                                        mutex_exit(&zs->zs_lock);
 259      -                                        rw_exit(&zf->zf_rwlock);
 260      -                                        return;
 261      -                                }
 262      -                                break;
      235 +                        if (blkid != zs->zs_blkid) {
      236 +                                mutex_exit(&zs->zs_lock);
      237 +                                continue;
 263  238                          }
 264      -                        mutex_exit(&zs->zs_lock);
      239 +                        break;
 265  240                  }
 266  241          }
 267  242  
 268  243          if (zs == NULL) {
 269  244                  /*
 270  245                   * This access is not part of any existing stream.  Create
 271  246                   * a new stream for it.
 272  247                   */
 273  248                  ZFETCHSTAT_BUMP(zfetchstat_misses);
 274  249                  if (rw_tryupgrade(&zf->zf_rwlock))
 275  250                          dmu_zfetch_stream_create(zf, end_of_access_blkid);
 276  251                  rw_exit(&zf->zf_rwlock);
 277  252                  return;
 278  253          }
 279  254  
 280  255          /*
 281  256           * This access was to a block that we issued a prefetch for on
 282  257           * behalf of this stream. Issue further prefetches for this stream.
 283  258           *
 284  259           * Normally, we start prefetching where we stopped
 285  260           * prefetching last (zs_pf_blkid).  But when we get our first
 286  261           * hit on this stream, zs_pf_blkid == zs_blkid, we don't
 287  262           * want to prefetch the block we just accessed.  In this case,
 288  263           * start just after the block we just accessed.
 289  264           */
 290  265          pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
 291  266  
 292  267          /*
 293  268           * Double our amount of prefetched data, but don't let the
 294  269           * prefetch get further ahead than zfetch_max_distance.
 295  270           */
 296  271          if (fetch_data) {
 297  272                  max_dist_blks =
 298  273                      zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
 299  274                  /*
 300  275                   * Previously, we were (zs_pf_blkid - blkid) ahead.  We
 301  276                   * want to now be double that, so read that amount again,
 302  277                   * plus the amount we are catching up by (i.e. the amount
 303  278                   * read just now).
 304  279                   */
 305  280                  pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
 306  281                  max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
 307  282                  pf_nblks = MIN(pf_ahead_blks, max_blks);
 308  283          } else {
 309  284                  pf_nblks = 0;
 310  285          }
 311  286  
 312  287          zs->zs_pf_blkid = pf_start + pf_nblks;
 313  288  
 314  289          /*
 315  290           * Do the same for indirects, starting from where we stopped last,
 316  291           * or where we will stop reading data blocks (and the indirects
 317  292           * that point to them).
 318  293           */
 319  294          ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
 320  295          max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
 321  296          /*
 322  297           * We want to double our distance ahead of the data prefetch
 323  298           * (or reader, if we are not prefetching data).  Previously, we
 324  299           * were (zs_ipf_blkid - blkid) ahead.  To double that, we read
 325  300           * that amount again, plus the amount we are catching up by
 326  301           * (i.e. the amount read now + the amount of data prefetched now).
 327  302           */
 328  303          pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
 329  304          max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
 330  305          ipf_nblks = MIN(pf_ahead_blks, max_blks);
 331  306          zs->zs_ipf_blkid = ipf_start + ipf_nblks;
 332  307  
 333  308          epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
 334  309          ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
 335  310          ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
 336  311  
 337  312          zs->zs_atime = gethrtime();
 338  313          zs->zs_blkid = end_of_access_blkid;
 339  314          mutex_exit(&zs->zs_lock);
 340  315          rw_exit(&zf->zf_rwlock);
 341  316  
 342  317          /*
 343  318           * dbuf_prefetch() is asynchronous (even when it needs to read
 344  319           * indirect blocks), but we still prefer to drop our locks before
 345  320           * calling it to reduce the time we hold them.
 346  321           */
 347  322  
 348  323          for (int i = 0; i < pf_nblks; i++) {
 349  324                  dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
 350  325                      ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
 351  326          }
 352  327          for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
 353  328                  dbuf_prefetch(zf->zf_dnode, 1, iblk,
 354  329                      ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
 355  330          }
 356  331          ZFETCHSTAT_BUMP(zfetchstat_hits);
 357  332  }
  
    | 
      ↓ open down ↓ | 
    83 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX