re #13729 assign each ARC hash bucket its own mutex
In the ARC, the number of buckets in the buffer header hash table is
proportional to the size of physical RAM, but the number of locks protecting
the headers in those buckets is fixed at 256.
Hence, on systems with large memory (>= 128GB), too many unrelated buffer
headers are protected by the same mutex.
When the memory in the system is fragmented, this can cause a deadlock
(a small demonstration of the lock aliasing follows this list):
- An arc_read thread may be trying to allocate a 128k buffer while holding
a header lock.
- The allocation uses the KM_PUSHPAGE option, which blocks the thread if no
contiguous chunk of the requested size is available.
- The ARC eviction thread that is supposed to evict some buffers calls
an evict callback on one of the buffers.
- Before freeing the memory, the callback attempts to take the lock on the
buffer header.
- Incidentally, this buffer header is protected by the same lock as the one
held by the arc_read() thread.
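To make the aliasing concrete, here is a minimal userland sketch (not the
kernel code; the table size and bucket indices are made-up illustrations)
of how two unrelated buckets collapse onto the same mutex under the old
idx & (BUF_LOCKS-1) mapping:

/*
 * With BUF_LOCKS = 256, any two bucket indices that agree in their
 * low 8 bits are protected by the same mutex.
 */
#include <stdio.h>
#include <stdint.h>

#define	BUF_LOCKS	256

int
main(void)
{
	uint64_t ht_mask = (1ULL << 21) - 1;		/* 2M buckets */
	uint64_t idx_a = 0x12345 & ht_mask;		/* held by arc_read() */
	uint64_t idx_b = (idx_a + BUF_LOCKS) & ht_mask;	/* unrelated header */

	/* Different buckets ... */
	printf("bucket a = %llu, bucket b = %llu\n",
	    (unsigned long long)idx_a, (unsigned long long)idx_b);
	/* ... but the same lock, since only the low 8 bits pick the mutex. */
	printf("lock a = %llu, lock b = %llu\n",
	    (unsigned long long)(idx_a & (BUF_LOCKS - 1)),
	    (unsigned long long)(idx_b & (BUF_LOCKS - 1)));
	return (0);
}

Any two buckets whose indices agree in the low 8 bits share a lock, so the
chance of cross-bucket interference grows with the table size.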
The solution in this patch is not perfect in that it still protects all
headers in a given hash bucket with the same lock.
However, the probability of a collision is very low and does not depend on
memory size: with n buckets, two given headers share a lock with probability
1/n (about 5e-7 for the 2M-bucket table on a 128GB system), versus 1/256
before.
By the same argument, padding the locks to a cacheline looks like a waste of
memory here, since the probability of contention on a cacheline is quite low
given the number of buckets, the number of locks per cacheline (4), and the
fact that the hash function (crc64 % hash table size) is supposed to be a
very good randomizer.
The effect on memory usage is as follows, for a hash table of size n
(a worked example follows below):
- The original code uses 16K + 16 + n * 8 bytes of memory.
- This fix uses 2 * n * 8 + 8 bytes of memory.
- The net memory overhead is therefore n * 8 - 16K - 8 bytes.
The value of n grows proportionally to physical memory size.
For 128GB of physical memory it is 2M, so the memory overhead is
16M - 16K - 8 bytes.
For smaller memory configurations the overhead is proportionally smaller, and
for larger memory configurations it is proportionally bigger.
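As a sanity check of the arithmetic, here is a small userland sketch
(assuming 8-byte pointers and an 8-byte kmutex_t, as the figures above do)
that applies the same one-bucket-per-64K sizing rule as buf_init() and
prints the net overhead:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t physmem = 128ULL << 30;	/* 128GB, for illustration */
	uint64_t n = 1;

	/* Same sizing rule as buf_init(): one bucket per 64K of RAM. */
	while (n * 65536 < physmem)
		n <<= 1;

	uint64_t old_bytes = 16384 + 16 + n * 8;  /* 256 padded locks + table */
	uint64_t new_bytes = 2 * n * 8 + 8;       /* ptr + mutex per bucket */

	printf("n = %llu buckets\n", (unsigned long long)n);
	printf("net overhead = %lld bytes\n",	  /* 16M - 16K - 8 for 128GB */
	    (long long)(new_bytes - old_bytes));
	return (0);
}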
The patch has been tested for 30+ hours using a vdbench script that
reproduces the hang with the original code 100% of the time within
20-30 minutes.

          --- old/usr/src/uts/common/fs/zfs/arc.c
          +++ new/usr/src/uts/common/fs/zfs/arc.c
[... 537 lines elided ...]
 538  538   * Other sizes
 539  539   */
 540  540  
 541  541  #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 542  542  #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
 543  543  
 544  544  /*
 545  545   * Hash table routines
 546  546   */
 547  547  
 548      -#define HT_LOCK_PAD     64
 549      -
 550      -struct ht_lock {
 551      -        kmutex_t        ht_lock;
 552      -#ifdef _KERNEL
 553      -        unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
 554      -#endif
      548 +struct ht_table {
      549 +        arc_buf_hdr_t   *hdr;
      550 +        kmutex_t        lock;
 555  551  };
 556  552  
 557      -#define BUF_LOCKS 256
 558  553  typedef struct buf_hash_table {
 559  554          uint64_t ht_mask;
 560      -        arc_buf_hdr_t **ht_table;
 561      -        struct ht_lock ht_locks[BUF_LOCKS];
      555 +        struct ht_table *ht_table;
 562  556  } buf_hash_table_t;
 563  557  
 564  558  static buf_hash_table_t buf_hash_table;
 565  559  
 566  560  #define BUF_HASH_INDEX(spa, dva, birth) \
 567  561          (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 568      -#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 569      -#define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
      562 +#define BUF_HASH_LOCK(idx) (&buf_hash_table.ht_table[idx].lock)
 570  563  #define HDR_LOCK(hdr) \
 571  564          (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 572  565  
 573  566  uint64_t zfs_crc64_table[256];
 574  567  
 575  568  /*
 576  569   * Level 2 ARC
 577  570   */
 578  571  
 579  572  #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
[... 116 lines elided ...]
 696  689  }
 697  690  
 698  691  static arc_buf_hdr_t *
 699  692  buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 700  693  {
 701  694          uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 702  695          kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 703  696          arc_buf_hdr_t *buf;
 704  697  
 705  698          mutex_enter(hash_lock);
 706      -        for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
      699 +        for (buf = buf_hash_table.ht_table[idx].hdr; buf != NULL;
 707  700              buf = buf->b_hash_next) {
 708  701                  if (BUF_EQUAL(spa, dva, birth, buf)) {
 709  702                          *lockp = hash_lock;
 710  703                          return (buf);
 711  704                  }
 712  705          }
 713  706          mutex_exit(hash_lock);
 714  707          *lockp = NULL;
 715  708          return (NULL);
 716  709  }
[... 8 lines elided ...]
 725  718  buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
 726  719  {
 727  720          uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 728  721          kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 729  722          arc_buf_hdr_t *fbuf;
 730  723          uint32_t i;
 731  724  
 732  725          ASSERT(!HDR_IN_HASH_TABLE(buf));
 733  726          *lockp = hash_lock;
 734  727          mutex_enter(hash_lock);
 735      -        for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
      728 +        for (fbuf = buf_hash_table.ht_table[idx].hdr, i = 0; fbuf != NULL;
 736  729              fbuf = fbuf->b_hash_next, i++) {
 737  730                  if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
 738  731                          return (fbuf);
 739  732          }
 740  733  
 741      -        buf->b_hash_next = buf_hash_table.ht_table[idx];
 742      -        buf_hash_table.ht_table[idx] = buf;
      734 +        buf->b_hash_next = buf_hash_table.ht_table[idx].hdr;
      735 +        buf_hash_table.ht_table[idx].hdr = buf;
 743  736          buf->b_flags |= ARC_IN_HASH_TABLE;
 744  737  
 745  738          /* collect some hash table performance data */
 746  739          if (i > 0) {
 747  740                  ARCSTAT_BUMP(arcstat_hash_collisions);
 748  741                  if (i == 1)
 749  742                          ARCSTAT_BUMP(arcstat_hash_chains);
 750  743  
 751  744                  ARCSTAT_MAX(arcstat_hash_chain_max, i);
 752  745          }
[... 6 lines elided ...]
 759  752  
 760  753  static void
 761  754  buf_hash_remove(arc_buf_hdr_t *buf)
 762  755  {
 763  756          arc_buf_hdr_t *fbuf, **bufp;
 764  757          uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 765  758  
 766  759          ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 767  760          ASSERT(HDR_IN_HASH_TABLE(buf));
 768  761  
 769      -        bufp = &buf_hash_table.ht_table[idx];
      762 +        bufp = &buf_hash_table.ht_table[idx].hdr;
 770  763          while ((fbuf = *bufp) != buf) {
 771  764                  ASSERT(fbuf != NULL);
 772  765                  bufp = &fbuf->b_hash_next;
 773  766          }
 774  767          *bufp = buf->b_hash_next;
 775  768          buf->b_hash_next = NULL;
 776  769          buf->b_flags &= ~ARC_IN_HASH_TABLE;
 777  770  
 778  771          /* collect some hash table performance data */
 779  772          ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 780  773  
 781      -        if (buf_hash_table.ht_table[idx] &&
 782      -            buf_hash_table.ht_table[idx]->b_hash_next == NULL)
      774 +        if (buf_hash_table.ht_table[idx].hdr &&
      775 +            buf_hash_table.ht_table[idx].hdr->b_hash_next == NULL)
 783  776                  ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 784  777  }
 785  778  
 786  779  /*
 787  780   * Global data structures and functions for the buf kmem cache.
 788  781   */
 789  782  static kmem_cache_t *hdr_cache;
 790  783  static kmem_cache_t *buf_cache;
 791  784  
 792  785  static void
 793  786  buf_fini(void)
 794  787  {
 795  788          int i;
 796  789  
      790 +        for (i = 0; i < buf_hash_table.ht_mask + 1; i++)
      791 +                mutex_destroy(&buf_hash_table.ht_table[i].lock);
 797  792          kmem_free(buf_hash_table.ht_table,
 798      -            (buf_hash_table.ht_mask + 1) * sizeof (void *));
 799      -        for (i = 0; i < BUF_LOCKS; i++)
 800      -                mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
      793 +            (buf_hash_table.ht_mask + 1) * sizeof (struct ht_table));
 801  794          kmem_cache_destroy(hdr_cache);
 802  795          kmem_cache_destroy(buf_cache);
 803  796  }
 804  797  
 805  798  /*
 806  799   * Constructor callback - called when the cache is empty
 807  800   * and a new buf is requested.
 808  801   */
 809  802  /* ARGSUSED */
 810  803  static int
[... 76 lines elided ...]
 887  880          /*
 888  881           * The hash table is big enough to fill all of physical memory
 889  882           * with an average 64K block size.  The table will take up
 890  883           * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
 891  884           */
 892  885          while (hsize * 65536 < physmem * PAGESIZE)
 893  886                  hsize <<= 1;
 894  887  retry:
 895  888          buf_hash_table.ht_mask = hsize - 1;
 896  889          buf_hash_table.ht_table =
 897      -            kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
      890 +            kmem_zalloc(hsize * sizeof (struct ht_table), KM_NOSLEEP);
 898  891          if (buf_hash_table.ht_table == NULL) {
 899  892                  ASSERT(hsize > (1ULL << 8));
 900  893                  hsize >>= 1;
 901  894                  goto retry;
 902  895          }
 903  896  
 904  897          hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
 905  898              0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
 906  899          buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 907  900              0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 908  901  
 909  902          for (i = 0; i < 256; i++)
 910  903                  for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 911  904                          *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 912  905  
 913      -        for (i = 0; i < BUF_LOCKS; i++) {
 914      -                mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
      906 +        for (i = 0; i < hsize; i++) {
      907 +                mutex_init(&buf_hash_table.ht_table[i].lock,
 915  908                      NULL, MUTEX_DEFAULT, NULL);
 916  909          }
 917  910  }
 918  911  
 919  912  #define ARC_MINTIME     (hz>>4) /* 62 ms */
 920  913  
 921  914  static void
 922  915  arc_cksum_verify(arc_buf_t *buf)
 923  916  {
 924  917          zio_cksum_t zc;
[... 3859 lines elided ...]