9018 Replace kmem_cache_reap_now() with kmem_cache_reap_soon()
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Yuri Pankov <yuripv@yuripv.net>

--- old/usr/src/uts/common/fs/zfs/arc.c
+++ new/usr/src/uts/common/fs/zfs/arc.c
... 12 lines elided ...
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23      - * Copyright (c) 2012, Joyent, Inc. All rights reserved.
       23 + * Copyright (c) 2018, Joyent, Inc.
  24   24   * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  25   25   * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
  26   26   * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
  27   27   */
  28   28  
  29   29  /*
  30   30   * DVA-based Adjustable Replacement Cache
  31   31   *
  32   32   * While much of the theory of operation used here is
  33   33   * based on the self-tuning, low overhead replacement cache
... 260 lines elided ...
 294  294   * dropping the sublist lock and evicting from another sublist. A lower
 295  295   * value means we're more likely to evict the "correct" header (i.e. the
 296  296   * oldest header in the arc state), but comes with higher overhead
 297  297   * (i.e. more invocations of arc_evict_state_impl()).
 298  298   */
 299  299  int zfs_arc_evict_batch_limit = 10;
 300  300  
 301  301  /* number of seconds before growing cache again */
 302  302  static int              arc_grow_retry = 60;
 303  303  
      304 +/* number of milliseconds before attempting a kmem-cache-reap */
      305 +static int              arc_kmem_cache_reap_retry_ms = 1000;
      306 +
 304  307  /* shift of arc_c for calculating overflow limit in arc_get_data_impl */
 305  308  int             zfs_arc_overflow_shift = 8;
 306  309  
 307  310  /* shift of arc_c for calculating both min and max arc_p */
 308  311  static int              arc_p_min_shift = 4;
 309  312  
 310  313  /* log2(fraction of arc to reclaim) */
 311  314  static int              arc_shrink_shift = 7;
 312  315  
 313  316  /*
... 3726 lines elided ...
4040 4043                  dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
4041 4044          }
4042 4045  #if defined(__i386)
4043 4046          /*
4044 4047           * Reclaim unused memory from all kmem caches.
4045 4048           */
4046 4049          kmem_reap();
4047 4050  #endif
4048 4051  #endif
4049 4052  
     4053 +        /*
     4054 +         * If a kmem reap is already active, don't schedule more.  We must
     4055 +         * check for this because kmem_cache_reap_soon() won't actually
     4056 +         * block on the cache being reaped (this is to prevent callers from
     4057 +         * becoming implicitly blocked by a system-wide kmem reap -- which,
     4058 +         * on a system with many, many full magazines, can take minutes).
     4059 +         */
     4060 +        if (kmem_cache_reap_active())
     4061 +                return;
     4062 +
4050 4063          for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
4051 4064                  if (zio_buf_cache[i] != prev_cache) {
4052 4065                          prev_cache = zio_buf_cache[i];
4053      -                        kmem_cache_reap_now(zio_buf_cache[i]);
     4066 +                        kmem_cache_reap_soon(zio_buf_cache[i]);
4054 4067                  }
4055 4068                  if (zio_data_buf_cache[i] != prev_data_cache) {
4056 4069                          prev_data_cache = zio_data_buf_cache[i];
4057      -                        kmem_cache_reap_now(zio_data_buf_cache[i]);
     4070 +                        kmem_cache_reap_soon(zio_data_buf_cache[i]);
4058 4071                  }
4059 4072          }
4060      -        kmem_cache_reap_now(abd_chunk_cache);
4061      -        kmem_cache_reap_now(buf_cache);
4062      -        kmem_cache_reap_now(hdr_full_cache);
4063      -        kmem_cache_reap_now(hdr_l2only_cache);
4064      -        kmem_cache_reap_now(range_seg_cache);
     4073 +        kmem_cache_reap_soon(abd_chunk_cache);
     4074 +        kmem_cache_reap_soon(buf_cache);
     4075 +        kmem_cache_reap_soon(hdr_full_cache);
     4076 +        kmem_cache_reap_soon(hdr_l2only_cache);
     4077 +        kmem_cache_reap_soon(range_seg_cache);
4065 4078  
4066 4079          if (zio_arena != NULL) {
4067 4080                  /*
4068 4081                   * Ask the vmem arena to reclaim unused memory from its
4069 4082                   * quantum caches.
4070 4083                   */
4071 4084                  vmem_qcache_reap(zio_arena);
4072 4085          }
4073 4086  }
4074 4087  
... 11 lines elided ...
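
Reviewer note: the core of this change is the move from a synchronous reap to a dispatch-and-skip pattern. kmem_cache_reap_soon() only schedules the reap and returns, and kmem_cache_reap_active() reports whether previously dispatched reap work is still in flight. A minimal sketch of the pattern, under the same assumptions as the hunk above (example_reap, my_caches, and my_ncaches are hypothetical; only the two kmem calls come from this change):

    /*
     * Sketch: non-blocking reap dispatch.  my_caches[] stands in for
     * the zio/ARC caches reaped in arc_kmem_reap_now() above.
     */
    static void
    example_reap(kmem_cache_t **my_caches, int my_ncaches)
    {
            int i;

            /*
             * If earlier reap work is still running, don't queue more:
             * kmem_cache_reap_soon() never blocks on completion, so
             * piling on requests only adds overhead.
             */
            if (kmem_cache_reap_active())
                    return;

            for (i = 0; i < my_ncaches; i++)
                    kmem_cache_reap_soon(my_caches[i]);
    }
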
4086 4099   *    fails, and goes to sleep forever.
4087 4100   *
4088 4101   * This possible deadlock is avoided by always acquiring a hash lock
4089 4102   * using mutex_tryenter() from arc_reclaim_thread().
4090 4103   */
4091 4104  /* ARGSUSED */
4092 4105  static void
4093 4106  arc_reclaim_thread(void *unused)
4094 4107  {
4095 4108          hrtime_t                growtime = 0;
     4109 +        hrtime_t                kmem_reap_time = 0;
4096 4110          callb_cpr_t             cpr;
4097 4111  
4098 4112          CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
4099 4113  
4100 4114          mutex_enter(&arc_reclaim_lock);
4101 4115          while (!arc_reclaim_thread_exit) {
4102 4116                  uint64_t evicted = 0;
4103 4117  
4104 4118                  /*
4105 4119                   * This is necessary in order for the mdb ::arc dcmd to
... 13 lines elided ...
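
Reviewer note: the comment preceding arc_reclaim_thread() describes the classic lock-ordering escape used here: the reclaim thread never blocks on a hash lock, it only tries it and moves on. A hedged sketch of that pattern (example_try_evict is hypothetical; kmutex_t, mutex_tryenter(), and mutex_exit() are the real illumos primitives):

    /*
     * Sketch: deadlock avoidance via trylock.  Because the reclaim
     * thread never sleeps on a hash lock, it can never close the
     * lock-ordering cycle described in the comment above.
     */
    static boolean_t
    example_try_evict(kmutex_t *hash_lock)
    {
            if (!mutex_tryenter(hash_lock)) {
                    /* Lock is busy: skip this header, don't block. */
                    return (B_FALSE);
            }
            /* Evict under the lock here. */
            mutex_exit(hash_lock);
            return (B_TRUE);
    }
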
4119 4133  
4120 4134                  /*
4121 4135                   * We call arc_adjust() before (possibly) calling
4122 4136                   * arc_kmem_reap_now(), so that we can wake up
4123 4137                   * arc_get_data_impl() sooner.
4124 4138                   */
4125 4139                  evicted = arc_adjust();
4126 4140  
4127 4141                  int64_t free_memory = arc_available_memory();
4128 4142                  if (free_memory < 0) {
4129      -
     4143 +                        hrtime_t curtime = gethrtime();
4130 4144                          arc_no_grow = B_TRUE;
4131 4145                          arc_warm = B_TRUE;
4132 4146  
4133 4147                          /*
4134 4148                           * Wait at least zfs_grow_retry (default 60) seconds
4135 4149                           * before considering growing.
4136 4150                           */
4137      -                        growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
     4151 +                        growtime = curtime + SEC2NSEC(arc_grow_retry);
4138 4152  
4139      -                        arc_kmem_reap_now();
     4153 +                        /*
     4154 +                         * Wait at least arc_kmem_cache_reap_retry_ms
     4155 +                         * between arc_kmem_reap_now() calls. Without
     4156 +                         * this check it is possible to end up in a
     4157 +                         * situation where we spend lots of time
     4158 +                         * reaping caches, while we're near arc_c_min.
     4159 +                         */
     4160 +                        if (curtime >= kmem_reap_time) {
     4161 +                                arc_kmem_reap_now();
     4162 +                                kmem_reap_time = gethrtime() +
     4163 +                                    MSEC2NSEC(arc_kmem_cache_reap_retry_ms);
     4164 +                        }
4140 4165  
4141 4166                          /*
4142 4167                           * If we are still low on memory, shrink the ARC
4143 4168                           * so that we have arc_shrink_min free space.
4144 4169                           */
4145 4170                          free_memory = arc_available_memory();
4146 4171  
4147 4172                          int64_t to_free =
4148 4173                              (arc_c >> arc_shrink_shift) - free_memory;
4149 4174                          if (to_free > 0) {
... 3133 lines elided ...
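
Reviewer note: the retry logic added to arc_reclaim_thread() is a plain deadline-based rate limit: record the earliest time the next reap may be dispatched, and skip the call until the clock passes that deadline. A condensed, self-contained sketch (example_maybe_reap is hypothetical; the variable names, gethrtime(), and MSEC2NSEC() match the diff):

    static int arc_kmem_cache_reap_retry_ms = 1000; /* from the hunk above */
    static hrtime_t kmem_reap_time = 0;             /* next allowed reap */

    static void
    example_maybe_reap(void)
    {
            hrtime_t curtime = gethrtime();

            /*
             * Dispatch at most one reap per retry window so a reclaim
             * loop running near arc_c_min doesn't spend all of its
             * time reaping caches.
             */
            if (curtime >= kmem_reap_time) {
                    arc_kmem_reap_now();
                    kmem_reap_time = gethrtime() +
                        MSEC2NSEC(arc_kmem_cache_reap_retry_ms);
            }
    }
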