9018 Replace kmem_cache_reap_now() with kmem_cache_reap_soon()
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Yuri Pankov <yuripv@yuripv.net>

--- old file (before the change):

   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  24  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  25  * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
  26  * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
  27  */
  28 
  29 /*
  30  * DVA-based Adjustable Replacement Cache
  31  *
  32  * While much of the theory of operation used here is
  33  * based on the self-tuning, low overhead replacement cache
  34  * presented by Megiddo and Modha at FAST 2003, there are some
  35  * significant differences:
  36  *
  37  * 1. The Megiddo and Modha model assumes any page is evictable.
  38  * Pages in its cache cannot be "locked" into memory.  This makes
  39  * the eviction algorithm simple: evict the last page in the list.
   40  * This also makes the performance characteristics easy to reason
  41  * about.  Our cache is not so simple.  At any given moment, some
  42  * subset of the blocks in the cache are un-evictable because we
  43  * have handed out a reference to them.  Blocks are only evictable


 284 
 285 static kmutex_t         arc_reclaim_lock;
 286 static kcondvar_t       arc_reclaim_thread_cv;
 287 static boolean_t        arc_reclaim_thread_exit;
 288 static kcondvar_t       arc_reclaim_waiters_cv;
 289 
 290 uint_t arc_reduce_dnlc_percent = 3;
 291 
 292 /*
 293  * The number of headers to evict in arc_evict_state_impl() before
 294  * dropping the sublist lock and evicting from another sublist. A lower
 295  * value means we're more likely to evict the "correct" header (i.e. the
 296  * oldest header in the arc state), but comes with higher overhead
 297  * (i.e. more invocations of arc_evict_state_impl()).
 298  */
 299 int zfs_arc_evict_batch_limit = 10;
 300 
 301 /* number of seconds before growing cache again */
 302 static int              arc_grow_retry = 60;
 303 
 304 /* shift of arc_c for calculating overflow limit in arc_get_data_impl */
 305 int             zfs_arc_overflow_shift = 8;
 306 
 307 /* shift of arc_c for calculating both min and max arc_p */
 308 static int              arc_p_min_shift = 4;
 309 
 310 /* log2(fraction of arc to reclaim) */
 311 static int              arc_shrink_shift = 7;
 312 
 313 /*
 314  * log2(fraction of ARC which must be free to allow growing).
 315  * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
 316  * when reading a new block into the ARC, we will evict an equal-sized block
 317  * from the ARC.
 318  *
 319  * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 320  * we will still not allow it to grow.
 321  */
 322 int                     arc_no_grow_shift = 5;
 323 


4030         extern kmem_cache_t     *zio_data_buf_cache[];
4031         extern kmem_cache_t     *range_seg_cache;
4032         extern kmem_cache_t     *abd_chunk_cache;
4033 
4034 #ifdef _KERNEL
4035         if (arc_meta_used >= arc_meta_limit) {
4036                 /*
4037                  * We are exceeding our meta-data cache limit.
4038                  * Purge some DNLC entries to release holds on meta-data.
4039                  */
4040                 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
4041         }
4042 #if defined(__i386)
4043         /*
4044          * Reclaim unused memory from all kmem caches.
4045          */
4046         kmem_reap();
4047 #endif
4048 #endif
4049 
4050         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
4051                 if (zio_buf_cache[i] != prev_cache) {
4052                         prev_cache = zio_buf_cache[i];
4053                         kmem_cache_reap_now(zio_buf_cache[i]);
4054                 }
4055                 if (zio_data_buf_cache[i] != prev_data_cache) {
4056                         prev_data_cache = zio_data_buf_cache[i];
4057                         kmem_cache_reap_now(zio_data_buf_cache[i]);
4058                 }
4059         }
4060         kmem_cache_reap_now(abd_chunk_cache);
4061         kmem_cache_reap_now(buf_cache);
4062         kmem_cache_reap_now(hdr_full_cache);
4063         kmem_cache_reap_now(hdr_l2only_cache);
4064         kmem_cache_reap_now(range_seg_cache);
4065 
4066         if (zio_arena != NULL) {
4067                 /*
4068                  * Ask the vmem arena to reclaim unused memory from its
4069                  * quantum caches.
4070                  */
4071                 vmem_qcache_reap(zio_arena);
4072         }
4073 }
4074 
4075 /*
4076  * Threads can block in arc_get_data_impl() waiting for this thread to evict
4077  * enough data and signal them to proceed. When this happens, the threads in
4078  * arc_get_data_impl() are sleeping while holding the hash lock for their
4079  * particular arc header. Thus, we must be careful to never sleep on a
4080  * hash lock in this thread. This is to prevent the following deadlock:
4081  *
4082  *  - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L",
4083  *    waiting for the reclaim thread to signal it.
4084  *
4085  *  - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
4086  *    fails, and goes to sleep forever.
4087  *
4088  * This possible deadlock is avoided by always acquiring a hash lock
4089  * using mutex_tryenter() from arc_reclaim_thread().
4090  */
4091 /* ARGSUSED */
4092 static void
4093 arc_reclaim_thread(void *unused)
4094 {
4095         hrtime_t                growtime = 0;
4096         callb_cpr_t             cpr;
4097 
4098         CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
4099 
4100         mutex_enter(&arc_reclaim_lock);
4101         while (!arc_reclaim_thread_exit) {
4102                 uint64_t evicted = 0;
4103 
4104                 /*
4105                  * This is necessary in order for the mdb ::arc dcmd to
4106                  * show up to date information. Since the ::arc command
4107                  * does not call the kstat's update function, without
4108                  * this call, the command may show stale stats for the
4109                  * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
4110                  * with this change, the data might be up to 1 second
4111                  * out of date; but that should suffice. The arc_state_t
4112                  * structures can be queried directly if more accurate
4113                  * information is needed.
4114                  */
4115                 if (arc_ksp != NULL)
4116                         arc_ksp->ks_update(arc_ksp, KSTAT_READ);
4117 
4118                 mutex_exit(&arc_reclaim_lock);
4119 
4120                 /*
4121                  * We call arc_adjust() before (possibly) calling
4122                  * arc_kmem_reap_now(), so that we can wake up
4123                  * arc_get_data_impl() sooner.
4124                  */
4125                 evicted = arc_adjust();
4126 
4127                 int64_t free_memory = arc_available_memory();
4128                 if (free_memory < 0) {
4129 
4130                         arc_no_grow = B_TRUE;
4131                         arc_warm = B_TRUE;
4132 
4133                         /*
4134                          * Wait at least zfs_grow_retry (default 60) seconds
4135                          * before considering growing.
4136                          */
4137                         growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
4138 
4139                         arc_kmem_reap_now();
4140 
4141                         /*
4142                          * If we are still low on memory, shrink the ARC
4143                          * so that we have arc_shrink_min free space.
4144                          */
4145                         free_memory = arc_available_memory();
4146 
4147                         int64_t to_free =
4148                             (arc_c >> arc_shrink_shift) - free_memory;
4149                         if (to_free > 0) {
4150 #ifdef _KERNEL
4151                                 to_free = MAX(to_free, ptob(needfree));
4152 #endif
4153                                 arc_shrink(to_free);
4154                         }
4155                 } else if (free_memory < arc_c >> arc_no_grow_shift) {
4156                         arc_no_grow = B_TRUE;
4157                 } else if (gethrtime() >= growtime) {
4158                         arc_no_grow = B_FALSE;
4159                 }

+++ new file (after the change):

   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2018, Joyent, Inc.
  24  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  25  * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
  26  * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
  27  */
  28 
  29 /*
  30  * DVA-based Adjustable Replacement Cache
  31  *
  32  * While much of the theory of operation used here is
  33  * based on the self-tuning, low overhead replacement cache
  34  * presented by Megiddo and Modha at FAST 2003, there are some
  35  * significant differences:
  36  *
  37  * 1. The Megiddo and Modha model assumes any page is evictable.
  38  * Pages in its cache cannot be "locked" into memory.  This makes
  39  * the eviction algorithm simple: evict the last page in the list.
   40  * This also makes the performance characteristics easy to reason
  41  * about.  Our cache is not so simple.  At any given moment, some
  42  * subset of the blocks in the cache are un-evictable because we
  43  * have handed out a reference to them.  Blocks are only evictable

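As an aside, difference (1) above can be made concrete with a toy LRU list whose tail entry is pinned. The sketch below is purely illustrative (entry_t and find_evictable are hypothetical, not ARC code): eviction must skip any entry with an outstanding reference rather than blindly taking the last entry in the list.

#include <stdio.h>

/* Toy cache entry: refcnt > 0 means a caller holds a reference. */
typedef struct entry {
	const char	*name;
	int		refcnt;
	struct entry	*prev;		/* toward the MRU end */
} entry_t;

/*
 * Walk from the LRU tail toward the MRU end and return the first
 * entry that nobody holds.  Unlike the Megiddo/Modha model, the
 * last entry in the list is not necessarily evictable.
 */
static entry_t *
find_evictable(entry_t *lru_tail)
{
	for (entry_t *e = lru_tail; e != NULL; e = e->prev) {
		if (e->refcnt == 0)
			return (e);
	}
	return (NULL);			/* everything is pinned */
}

int
main(void)
{
	entry_t a = { "A", 0, NULL };	/* MRU end */
	entry_t b = { "B", 0, &a };
	entry_t c = { "C", 1, &b };	/* LRU tail, but referenced */

	entry_t *victim = find_evictable(&c);
	(void) printf("evict %s\n", victim != NULL ? victim->name : "(none)");
	return (0);
}

Here the program prints "evict B": C sits at the LRU end but is held, so the next-oldest unreferenced entry is taken instead.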

 284 
 285 static kmutex_t         arc_reclaim_lock;
 286 static kcondvar_t       arc_reclaim_thread_cv;
 287 static boolean_t        arc_reclaim_thread_exit;
 288 static kcondvar_t       arc_reclaim_waiters_cv;
 289 
 290 uint_t arc_reduce_dnlc_percent = 3;
 291 
 292 /*
 293  * The number of headers to evict in arc_evict_state_impl() before
 294  * dropping the sublist lock and evicting from another sublist. A lower
 295  * value means we're more likely to evict the "correct" header (i.e. the
 296  * oldest header in the arc state), but comes with higher overhead
 297  * (i.e. more invocations of arc_evict_state_impl()).
 298  */
 299 int zfs_arc_evict_batch_limit = 10;
 300 
 301 /* number of seconds before growing cache again */
 302 static int              arc_grow_retry = 60;
 303 
 304 /* number of milliseconds before attempting a kmem-cache-reap */
 305 static int              arc_kmem_cache_reap_retry_ms = 1000;
 306 
 307 /* shift of arc_c for calculating overflow limit in arc_get_data_impl */
 308 int             zfs_arc_overflow_shift = 8;
 309 
 310 /* shift of arc_c for calculating both min and max arc_p */
 311 static int              arc_p_min_shift = 4;
 312 
 313 /* log2(fraction of arc to reclaim) */
 314 static int              arc_shrink_shift = 7;
 315 
 316 /*
 317  * log2(fraction of ARC which must be free to allow growing).
 318  * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
 319  * when reading a new block into the ARC, we will evict an equal-sized block
 320  * from the ARC.
 321  *
 322  * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 323  * we will still not allow it to grow.
 324  */
 325 int                     arc_no_grow_shift = 5;
 326 
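A quick worked example of how these shifts interact, under an assumed ARC target size of 4 GiB (the program below is a stand-alone sketch, not kernel code): the no-grow threshold is arc_c >> 5 = 128 MiB, while a shrink aims to leave about arc_c >> 7 = 32 MiB free, so a shrink leaves free memory well below the no-grow threshold, exactly as the comment above requires.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t arc_c = 4ULL << 30;	/* assumed 4 GiB ARC target */
	int no_grow_shift = 5;		/* arc_no_grow_shift */
	int shrink_shift = 7;		/* arc_shrink_shift */

	/* Below this much free memory, the ARC stops growing. */
	uint64_t no_grow = arc_c >> no_grow_shift;

	/* arc_shrink() aims to leave roughly this much memory free. */
	uint64_t shrink = arc_c >> shrink_shift;

	(void) printf("no-grow threshold: %llu MiB\n",
	    (unsigned long long)(no_grow >> 20));	/* 128 MiB */
	(void) printf("post-shrink free:  %llu MiB\n",
	    (unsigned long long)(shrink >> 20));	/* 32 MiB */

	/*
	 * Because no_grow_shift < shrink_shift, 32 MiB < 128 MiB:
	 * after a shrink, free memory is still under the no-grow
	 * threshold, so the ARC does not immediately regrow.
	 */
	return (0);
}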


4033         extern kmem_cache_t     *zio_data_buf_cache[];
4034         extern kmem_cache_t     *range_seg_cache;
4035         extern kmem_cache_t     *abd_chunk_cache;
4036 
4037 #ifdef _KERNEL
4038         if (arc_meta_used >= arc_meta_limit) {
4039                 /*
4040                  * We are exceeding our meta-data cache limit.
4041                  * Purge some DNLC entries to release holds on meta-data.
4042                  */
4043                 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
4044         }
4045 #if defined(__i386)
4046         /*
4047          * Reclaim unused memory from all kmem caches.
4048          */
4049         kmem_reap();
4050 #endif
4051 #endif
4052 
4053         /*
4054          * If a kmem reap is already active, don't schedule more.  We must
4055          * check for this because kmem_cache_reap_soon() won't actually
4056          * block on the cache being reaped (this is to prevent callers from
4057          * becoming implicitly blocked by a system-wide kmem reap -- which,
4058          * on a system with many, many full magazines, can take minutes).
4059          */
4060         if (kmem_cache_reap_active())
4061                 return;
4062 
4063         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
4064                 if (zio_buf_cache[i] != prev_cache) {
4065                         prev_cache = zio_buf_cache[i];
4066                         kmem_cache_reap_soon(zio_buf_cache[i]);
4067                 }
4068                 if (zio_data_buf_cache[i] != prev_data_cache) {
4069                         prev_data_cache = zio_data_buf_cache[i];
4070                         kmem_cache_reap_soon(zio_data_buf_cache[i]);
4071                 }
4072         }
4073         kmem_cache_reap_soon(abd_chunk_cache);
4074         kmem_cache_reap_soon(buf_cache);
4075         kmem_cache_reap_soon(hdr_full_cache);
4076         kmem_cache_reap_soon(hdr_l2only_cache);
4077         kmem_cache_reap_soon(range_seg_cache);
4078 
4079         if (zio_arena != NULL) {
4080                 /*
4081                  * Ask the vmem arena to reclaim unused memory from its
4082                  * quantum caches.
4083                  */
4084                 vmem_qcache_reap(zio_arena);
4085         }
4086 }
4087 
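The kmem_cache_reap_active() guard at the top of this function reflects a general pattern: because kmem_cache_reap_soon() only queues work and returns, the caller must check whether an earlier reap is still draining before dispatching another one. Below is a minimal user-space sketch of that pattern; reap_active/reap_soon/maybe_reap are hypothetical stand-ins, and in the real kernel the in-progress state clears when the reap completes rather than staying set as it does here.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for kmem_cache_reap_active()/_soon(). */
static bool reap_in_progress;

static bool
reap_active(void)
{
	return (reap_in_progress);
}

static void
reap_soon(const char *cache)
{
	/* Queue the reap and return at once; never block the caller. */
	reap_in_progress = true;
	(void) printf("queued reap of %s\n", cache);
}

static void
maybe_reap(void)
{
	/*
	 * If a reap is already draining, don't pile more requests on;
	 * a system-wide reap can take minutes when many magazines are
	 * full, and callers must not be implicitly blocked by it.
	 */
	if (reap_active())
		return;
	reap_soon("buf_cache");
}

int
main(void)
{
	maybe_reap();	/* queues a reap */
	maybe_reap();	/* earlier reap still active: no-op */
	return (0);
}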
4088 /*
4089  * Threads can block in arc_get_data_impl() waiting for this thread to evict
4090  * enough data and signal them to proceed. When this happens, the threads in
4091  * arc_get_data_impl() are sleeping while holding the hash lock for their
4092  * particular arc header. Thus, we must be careful to never sleep on a
4093  * hash lock in this thread. This is to prevent the following deadlock:
4094  *
4095  *  - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L",
4096  *    waiting for the reclaim thread to signal it.
4097  *
4098  *  - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
4099  *    fails, and goes to sleep forever.
4100  *
4101  * This possible deadlock is avoided by always acquiring a hash lock
4102  * using mutex_tryenter() from arc_reclaim_thread().
4103  */
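The never-sleep rule described above has a direct user-space analogue in pthread_mutex_trylock(), sketched below under hypothetical names (compile with -lpthread): a busy lock is skipped rather than waited on, which is what breaks the deadlock cycle.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;

/* Non-blocking acquire: skip the buffer if its lock is held. */
static void
try_evict_one(void)
{
	if (pthread_mutex_trylock(&hash_lock) != 0) {
		(void) printf("hash lock busy, skipping buffer\n");
		return;
	}
	(void) printf("evicting buffer\n");
	(void) pthread_mutex_unlock(&hash_lock);
}

int
main(void)
{
	try_evict_one();			/* lock free: evicts */

	/* Simulate thread A sleeping while it holds the hash lock. */
	(void) pthread_mutex_lock(&hash_lock);
	try_evict_one();			/* busy: skipped, no sleep */
	(void) pthread_mutex_unlock(&hash_lock);
	return (0);
}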
4104 /* ARGSUSED */
4105 static void
4106 arc_reclaim_thread(void *unused)
4107 {
4108         hrtime_t                growtime = 0;
4109         hrtime_t                kmem_reap_time = 0;
4110         callb_cpr_t             cpr;
4111 
4112         CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
4113 
4114         mutex_enter(&arc_reclaim_lock);
4115         while (!arc_reclaim_thread_exit) {
4116                 uint64_t evicted = 0;
4117 
4118                 /*
4119                  * This is necessary in order for the mdb ::arc dcmd to
4120                  * show up to date information. Since the ::arc command
4121                  * does not call the kstat's update function, without
4122                  * this call, the command may show stale stats for the
4123                  * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
4124                  * with this change, the data might be up to 1 second
4125                  * out of date; but that should suffice. The arc_state_t
4126                  * structures can be queried directly if more accurate
4127                  * information is needed.
4128                  */
4129                 if (arc_ksp != NULL)
4130                         arc_ksp->ks_update(arc_ksp, KSTAT_READ);
4131 
4132                 mutex_exit(&arc_reclaim_lock);
4133 
4134                 /*
4135                  * We call arc_adjust() before (possibly) calling
4136                  * arc_kmem_reap_now(), so that we can wake up
4137                  * arc_get_data_impl() sooner.
4138                  */
4139                 evicted = arc_adjust();
4140 
4141                 int64_t free_memory = arc_available_memory();
4142                 if (free_memory < 0) {
4143                         hrtime_t curtime = gethrtime();
4144                         arc_no_grow = B_TRUE;
4145                         arc_warm = B_TRUE;
4146 
4147                         /*
4148                          * Wait at least zfs_grow_retry (default 60) seconds
4149                          * before considering growing.
4150                          */
4151                         growtime = curtime + SEC2NSEC(arc_grow_retry);
4152 
4153                         /*
4154                          * Wait at least arc_kmem_cache_reap_retry_ms
4155                          * between arc_kmem_reap_now() calls. Without
4156                          * this check it is possible to end up in a
4157                          * situation where we spend lots of time
4158                          * reaping caches, while we're near arc_c_min.
4159                          */
4160                         if (curtime >= kmem_reap_time) {
4161                                 arc_kmem_reap_now();
4162                                 kmem_reap_time = gethrtime() +
4163                                     MSEC2NSEC(arc_kmem_cache_reap_retry_ms);
4164                         }
4165 
4166                         /*
4167                          * If we are still low on memory, shrink the ARC
4168                          * so that we have arc_shrink_min free space.
4169                          */
4170                         free_memory = arc_available_memory();
4171 
4172                         int64_t to_free =
4173                             (arc_c >> arc_shrink_shift) - free_memory;
4174                         if (to_free > 0) {
4175 #ifdef _KERNEL
4176                                 to_free = MAX(to_free, ptob(needfree));
4177 #endif
4178                                 arc_shrink(to_free);
4179                         }
4180                 } else if (free_memory < arc_c >> arc_no_grow_shift) {
4181                         arc_no_grow = B_TRUE;
4182                 } else if (gethrtime() >= growtime) {
4183                         arc_no_grow = B_FALSE;
4184                 }
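The rate limit added at new lines 4153-4164 is a plain monotonic-clock gate. Below is a user-space sketch of the same pattern, substituting clock_gettime(CLOCK_MONOTONIC) for gethrtime() and defining MSEC2NSEC locally; names and values are illustrative, not the kernel interfaces.

#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define	MSEC2NSEC(m)	((uint64_t)(m) * 1000000ULL)

/* User-space stand-in for gethrtime(): monotonic nanoseconds. */
static uint64_t
now_ns(void)
{
	struct timespec ts;

	(void) clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec);
}

int
main(void)
{
	uint64_t next_reap = 0;
	int retry_ms = 1000;	/* cf. arc_kmem_cache_reap_retry_ms */

	for (int pass = 0; pass < 3; pass++) {
		uint64_t cur = now_ns();

		/* Reap at most once per retry_ms, however often we wake. */
		if (cur >= next_reap) {
			(void) printf("pass %d: reaping\n", pass);
			next_reap = now_ns() + MSEC2NSEC(retry_ms);
		} else {
			(void) printf("pass %d: too soon, skipping\n", pass);
		}
	}
	return (0);
}

Only the first pass reaps; later wakeups inside the retry window are skipped, which keeps the reclaim thread from spending all of its time reaping caches while free memory hovers near arc_c_min.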