 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2018, Joyent, Inc.
 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 *    Pages in its cache cannot be "locked" into memory. This makes
 *    the eviction algorithm simple: evict the last page in the list.
 *    This also makes the performance characteristics easy to reason
 *    about. Our cache is not so simple. At any given moment, some
 *    subset of the blocks in the cache are un-evictable because we
 *    have handed out a reference to them. Blocks are only evictable
 * ...
 */

static kmutex_t arc_reclaim_lock;
static kcondvar_t arc_reclaim_thread_cv;
static boolean_t arc_reclaim_thread_exit;
static kcondvar_t arc_reclaim_waiters_cv;

uint_t arc_reduce_dnlc_percent = 3;

/*
 * The number of headers to evict in arc_evict_state_impl() before
 * dropping the sublist lock and evicting from another sublist. A lower
 * value means we're more likely to evict the "correct" header (i.e. the
 * oldest header in the arc state), but comes with higher overhead
 * (i.e. more invocations of arc_evict_state_impl()).
 */
int zfs_arc_evict_batch_limit = 10;

/* number of seconds before growing cache again */
static int arc_grow_retry = 60;

/* number of milliseconds before attempting a kmem-cache-reap */
static int arc_kmem_cache_reap_retry_ms = 1000;

/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
int zfs_arc_overflow_shift = 8;
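
/*
 * Sketch of the overflow arithmetic (illustrative values, not part of
 * the source): with arc_c = 4 GiB, arc_c >> zfs_arc_overflow_shift =
 * 16 MiB, which is roughly how far past arc_c the cache may grow
 * before threads allocating in arc_get_data_impl() are made to wait
 * for eviction.
 */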

/* shift of arc_c for calculating both min and max arc_p */
static int arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
static int arc_shrink_shift = 7;

/*
 * log2(fraction of ARC which must be free to allow growing).
 * I.e., if there is less than arc_c >> arc_no_grow_shift free memory,
 * when reading a new block into the ARC, we will evict an equal-sized block
 * from the ARC.
 *
 * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 * we will still not allow it to grow.
 */
int arc_no_grow_shift = 5;
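
/*
 * Illustrative values for the relation above: with arc_c = 4 GiB,
 * growth is allowed only while at least arc_c >> arc_no_grow_shift =
 * 128 MiB is free, while each shrink pass targets arc_c >>
 * arc_shrink_shift = 32 MiB, so the no-grow threshold comfortably
 * exceeds one shrink step.
 */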

...

	extern kmem_cache_t *zio_data_buf_cache[];
	extern kmem_cache_t *range_seg_cache;
	extern kmem_cache_t *abd_chunk_cache;

#ifdef _KERNEL
	if (arc_meta_used >= arc_meta_limit) {
		/*
		 * We are exceeding our meta-data cache limit.
		 * Purge some DNLC entries to release holds on meta-data.
		 */
		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
	}
#if defined(__i386)
	/*
	 * Reclaim unused memory from all kmem caches.
	 */
	kmem_reap();
#endif
#endif

	/*
	 * If a kmem reap is already active, don't schedule more. We must
	 * check for this because kmem_cache_reap_soon() won't actually
	 * block on the cache being reaped (this is to prevent callers from
	 * becoming implicitly blocked by a system-wide kmem reap -- which,
	 * on a system with many, many full magazines, can take minutes).
	 */
	if (kmem_cache_reap_active())
		return;

	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
		if (zio_buf_cache[i] != prev_cache) {
			prev_cache = zio_buf_cache[i];
			kmem_cache_reap_soon(zio_buf_cache[i]);
		}
		if (zio_data_buf_cache[i] != prev_data_cache) {
			prev_data_cache = zio_data_buf_cache[i];
			kmem_cache_reap_soon(zio_data_buf_cache[i]);
		}
	}
	kmem_cache_reap_soon(abd_chunk_cache);
	kmem_cache_reap_soon(buf_cache);
	kmem_cache_reap_soon(hdr_full_cache);
	kmem_cache_reap_soon(hdr_l2only_cache);
	kmem_cache_reap_soon(range_seg_cache);

	if (zio_arena != NULL) {
		/*
		 * Ask the vmem arena to reclaim unused memory from its
		 * quantum caches.
		 */
		vmem_qcache_reap(zio_arena);
	}
}

/*
 * Threads can block in arc_get_data_impl() waiting for this thread to evict
 * enough data and signal them to proceed. When this happens, the threads in
 * arc_get_data_impl() are sleeping while holding the hash lock for their
 * particular arc header. Thus, we must be careful to never sleep on a
 * hash lock in this thread. This is to prevent the following deadlock:
 *
 *  - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L",
 *    waiting for the reclaim thread to signal it.
 *
 *  - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
 *    fails, and goes to sleep forever.
 *
 * This possible deadlock is avoided by always acquiring a hash lock
 * using mutex_tryenter() from arc_reclaim_thread().
 */
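
/*
 * A minimal sketch of that rule as it appears on the eviction path
 * (the pattern used by arc_evict_state_impl()):
 *
 *	if (!mutex_tryenter(hash_lock)) {
 *		ARCSTAT_BUMP(arcstat_mutex_miss);
 *		continue;
 *	}
 *
 * The header is skipped rather than slept on; a later eviction pass
 * can pick it up once the lock is free.
 */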
/* ARGSUSED */
static void
arc_reclaim_thread(void *unused)
{
	hrtime_t growtime = 0;
	hrtime_t kmem_reap_time = 0;
	callb_cpr_t cpr;

	CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);

	mutex_enter(&arc_reclaim_lock);
	while (!arc_reclaim_thread_exit) {
		uint64_t evicted = 0;

		/*
		 * This is necessary in order for the mdb ::arc dcmd to
		 * show up to date information. Since the ::arc command
		 * does not call the kstat's update function, without
		 * this call, the command may show stale stats for the
		 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
		 * with this change, the data might be up to 1 second
		 * out of date; but that should suffice. The arc_state_t
		 * structures can be queried directly if more accurate
		 * information is needed.
		 */
		if (arc_ksp != NULL)
			arc_ksp->ks_update(arc_ksp, KSTAT_READ);

		mutex_exit(&arc_reclaim_lock);

		/*
		 * We call arc_adjust() before (possibly) calling
		 * arc_kmem_reap_now(), so that we can wake up
		 * arc_get_data_impl() sooner.
		 */
		evicted = arc_adjust();

		int64_t free_memory = arc_available_memory();
		if (free_memory < 0) {
			hrtime_t curtime = gethrtime();
			arc_no_grow = B_TRUE;
			arc_warm = B_TRUE;

			/*
			 * Wait at least arc_grow_retry (default 60) seconds
			 * before considering growing.
			 */
			growtime = curtime + SEC2NSEC(arc_grow_retry);

			/*
			 * Wait at least arc_kmem_cache_reap_retry_ms
			 * between arc_kmem_reap_now() calls. Without
			 * this check it is possible to end up in a
			 * situation where we spend lots of time
			 * reaping caches, while we're near arc_c_min.
			 */
			if (curtime >= kmem_reap_time) {
				arc_kmem_reap_now();
				kmem_reap_time = gethrtime() +
				    MSEC2NSEC(arc_kmem_cache_reap_retry_ms);
			}

			/*
			 * If we are still low on memory, shrink the ARC
			 * so that we have arc_shrink_min free space.
			 */
			free_memory = arc_available_memory();

			int64_t to_free =
			    (arc_c >> arc_shrink_shift) - free_memory;
			if (to_free > 0) {
#ifdef _KERNEL
				to_free = MAX(to_free, ptob(needfree));
#endif
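				/*
				 * Worked example (illustrative numbers):
				 * with arc_c = 4 GiB and free_memory at
				 * -10 MiB, to_free = (4 GiB >> 7) + 10 MiB
				 * = 42 MiB, raised further if needfree
				 * demands more pages.
				 */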
				arc_shrink(to_free);
			}
		} else if (free_memory < arc_c >> arc_no_grow_shift) {
			arc_no_grow = B_TRUE;
		} else if (gethrtime() >= growtime) {
			arc_no_grow = B_FALSE;
		}