re #13729 assign each ARC hash bucket its own mutex
In ARC the number of buckets in the buffer header hash table is proportional
to the size of physical RAM, but the number of locks protecting the headers in
those buckets is fixed at 256.
Hence, on systems with large memory (>= 128GB) too many unrelated buffer
headers are protected by the same mutex: with ~2M buckets and 256 locks,
roughly 8K buckets end up sharing every lock.
When the memory in the system is fragmented this may cause a deadlock:
- An arc_read thread may be trying to allocate a 128k buffer while holding
a header lock.
- The allocation uses the KM_PUSHPAGE option, which blocks the thread if no
contiguous chunk of the requested size is available.
- The ARC eviction thread that is supposed to evict some buffers calls an
evict callback on one of the buffers.
- Before freeing the memory, the callback attempts to take the lock on the
buffer header.
- Incidentally, this buffer header is protected by the same lock as the one
held in the arc_read() thread, closing the cycle modeled in the sketch below.
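A minimal stand-alone model of this cycle, using generic POSIX threads rather
than the actual ZFS code paths (the names here are hypothetical and the
semaphore merely stands in for "a contiguous chunk of memory becomes free"):

/*
 * Model of the hang described above: one shared lock, as with the 256
 * shared ht_locks. Compile with -lpthread; it hangs by design.
 */
#include <pthread.h>
#include <semaphore.h>

static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;
static sem_t memory_available;

static void *
reader(void *arg)                       /* models the arc_read() thread */
{
	(void) arg;
	pthread_mutex_lock(&hash_lock);     /* holds a header lock ... */
	sem_wait(&memory_available);        /* ... while blocked in the KM_PUSHPAGE allocation */
	pthread_mutex_unlock(&hash_lock);
	return (NULL);
}

static void *
evictor(void *arg)                      /* models the ARC eviction thread */
{
	(void) arg;
	pthread_mutex_lock(&hash_lock);     /* evict callback needs the same lock ... */
	sem_post(&memory_available);        /* ... before any memory can be freed */
	pthread_mutex_unlock(&hash_lock);
	return (NULL);
}

int
main(void)
{
	pthread_t r, e;

	sem_init(&memory_available, 0, 0);
	pthread_create(&r, NULL, reader, NULL);
	pthread_create(&e, NULL, evictor, NULL);
	pthread_join(r, NULL);              /* never returns: both threads are stuck */
	pthread_join(e, NULL);
	return (0);
}

With a lock per bucket the two headers would almost never share a mutex, so
the evictor can proceed, memory gets freed, and the reader's allocation
eventually succeeds.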
The solution in this patch is not perfect: it still protects all headers in a
hash bucket by the same lock.
However, the probability of collision is very low and does not depend on
memory size.
By the same argument, padding the locks to a cacheline looks like a waste of
memory here, since the probability of contention on a cacheline is quite low
given the number of buckets, the number of locks per cacheline (4), and the
fact that the hash function (crc64 % hash table size) is supposed to be a very
good randomizer. (With n = 2M buckets, the chance that two given headers map
to the same cacheline of locks is about 4/n, i.e. on the order of 0.0002%.)
The effect on memory usage is as follows. For a hash table of size n:
- Original code uses 16K + 16 + n * 8 bytes of memory
- This fix uses 2 * n * 8 + 8 bytes of memory
- The net memory overhead is therefore n * 8 - 16K - 8 bytes
The value of n grows proportionally to physical memory size.
For 128GB of physical memory it is 2M, so the memory overhead is
16M - 16K - 8 bytes.
For smaller memory configurations the overhead is proportionally smaller, and
for larger memory configurations it is proportionally bigger (see the sketch
below).
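As a sanity check on these numbers, here is a small user-land sketch of the
arithmetic. It mirrors the hash table sizing loop from buf_init() (visible in
the diff below) and assumes the 8-byte pointer and 8-byte kmutex_t sizes used
in the estimate above, plus a 1 << 12 starting table size:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t physmem_bytes = 128ULL << 30;  /* 128GB; physmem * PAGESIZE in buf_init() */
	uint64_t hsize = 1ULL << 12;            /* assumed starting size of the table */
	uint64_t old_bytes, new_bytes;

	while (hsize * 65536 < physmem_bytes)   /* the sizing loop from buf_init() */
		hsize <<= 1;

	old_bytes = 16384 + 16 + hsize * 8;     /* 256 padded locks + mask/pointer + n bucket pointers */
	new_bytes = 2 * hsize * 8 + 8;          /* n (pointer + lock) entries + mask */

	printf("n = %lluK buckets, net overhead = %lld bytes\n",
	    (unsigned long long)(hsize >> 10),
	    (long long)(new_bytes - old_bytes));
	return (0);
}

For 128GB this prints n = 2048K buckets and a net overhead of 16760824 bytes,
i.e. 16M - 16K - 8, matching the figure above.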
The patch has been tested for 30+ hours using a vdbench script that reproduces
the hang with the original code 100% of the time within 20-30 minutes.
@@ -543,32 +543,25 @@
/*
* Hash table routines
*/
-#define HT_LOCK_PAD 64
-
-struct ht_lock {
- kmutex_t ht_lock;
-#ifdef _KERNEL
- unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
-#endif
+struct ht_table {
+ arc_buf_hdr_t *hdr;
+ kmutex_t lock;
};
-#define BUF_LOCKS 256
typedef struct buf_hash_table {
uint64_t ht_mask;
- arc_buf_hdr_t **ht_table;
- struct ht_lock ht_locks[BUF_LOCKS];
+ struct ht_table *ht_table;
} buf_hash_table_t;
static buf_hash_table_t buf_hash_table;
#define BUF_HASH_INDEX(spa, dva, birth) \
(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
-#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
-#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
+#define BUF_HASH_LOCK(idx) (&buf_hash_table.ht_table[idx].lock)
#define HDR_LOCK(hdr) \
(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
uint64_t zfs_crc64_table[256];
@@ -701,11 +694,11 @@
uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
arc_buf_hdr_t *buf;
mutex_enter(hash_lock);
- for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
+ for (buf = buf_hash_table.ht_table[idx].hdr; buf != NULL;
buf = buf->b_hash_next) {
if (BUF_EQUAL(spa, dva, birth, buf)) {
*lockp = hash_lock;
return (buf);
}
@@ -730,18 +723,18 @@
uint32_t i;
ASSERT(!HDR_IN_HASH_TABLE(buf));
*lockp = hash_lock;
mutex_enter(hash_lock);
- for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
+ for (fbuf = buf_hash_table.ht_table[idx].hdr, i = 0; fbuf != NULL;
fbuf = fbuf->b_hash_next, i++) {
if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
return (fbuf);
}
- buf->b_hash_next = buf_hash_table.ht_table[idx];
- buf_hash_table.ht_table[idx] = buf;
+ buf->b_hash_next = buf_hash_table.ht_table[idx].hdr;
+ buf_hash_table.ht_table[idx].hdr = buf;
buf->b_flags |= ARC_IN_HASH_TABLE;
/* collect some hash table performance data */
if (i > 0) {
ARCSTAT_BUMP(arcstat_hash_collisions);
@@ -764,11 +757,11 @@
uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
ASSERT(HDR_IN_HASH_TABLE(buf));
- bufp = &buf_hash_table.ht_table[idx];
+ bufp = &buf_hash_table.ht_table[idx].hdr;
while ((fbuf = *bufp) != buf) {
ASSERT(fbuf != NULL);
bufp = &fbuf->b_hash_next;
}
*bufp = buf->b_hash_next;
@@ -776,12 +769,12 @@
buf->b_flags &= ~ARC_IN_HASH_TABLE;
/* collect some hash table performance data */
ARCSTAT_BUMPDOWN(arcstat_hash_elements);
- if (buf_hash_table.ht_table[idx] &&
- buf_hash_table.ht_table[idx]->b_hash_next == NULL)
+ if (buf_hash_table.ht_table[idx].hdr &&
+ buf_hash_table.ht_table[idx].hdr->b_hash_next == NULL)
ARCSTAT_BUMPDOWN(arcstat_hash_chains);
}
/*
* Global data structures and functions for the buf kmem cache.
@@ -792,14 +785,14 @@
static void
buf_fini(void)
{
int i;
+ for (i = 0; i < buf_hash_table.ht_mask + 1; i++)
+ mutex_destroy(&buf_hash_table.ht_table[i].lock);
kmem_free(buf_hash_table.ht_table,
- (buf_hash_table.ht_mask + 1) * sizeof (void *));
- for (i = 0; i < BUF_LOCKS; i++)
- mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
+ (buf_hash_table.ht_mask + 1) * sizeof (struct ht_table));
kmem_cache_destroy(hdr_cache);
kmem_cache_destroy(buf_cache);
}
/*
@@ -892,11 +885,11 @@
while (hsize * 65536 < physmem * PAGESIZE)
hsize <<= 1;
retry:
buf_hash_table.ht_mask = hsize - 1;
buf_hash_table.ht_table =
- kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+ kmem_zalloc(hsize * sizeof (struct ht_table), KM_NOSLEEP);
if (buf_hash_table.ht_table == NULL) {
ASSERT(hsize > (1ULL << 8));
hsize >>= 1;
goto retry;
}
@@ -908,12 +901,12 @@
for (i = 0; i < 256; i++)
for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
- for (i = 0; i < BUF_LOCKS; i++) {
- mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
+ for (i = 0; i < hsize; i++) {
+ mutex_init(&buf_hash_table.ht_table[i].lock,
NULL, MUTEX_DEFAULT, NULL);
}
}
#define ARC_MINTIME (hz>>4) /* 62 ms */