Print this page
5219 l2arc_write_buffers() may write beyond target_sz
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Saso Kiselkov <skiselkov@gmail.com>
Reviewed by: George Wilson <george@delphix.com>
Reviewed by: Steven Hartland <steven.hartland@multiplay.co.uk>
Reviewed by: Justin Gibbs <gibbs@FreeBSD.org>
Approved by: Matthew Ahrens <mahrens@delphix.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/arc.c
          +++ new/usr/src/uts/common/fs/zfs/arc.c
↓ open down ↓ 5836 lines elided ↑ open up ↑
5837 5837   * state between calls to this function.
5838 5838   *
5839 5839   * Returns the number of bytes actually written (which may be smaller than
5840 5840   * the delta by which the device hand has changed due to alignment).
5841 5841   */
5842 5842  static uint64_t
5843 5843  l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
5844 5844      boolean_t *headroom_boost)
5845 5845  {
5846 5846          arc_buf_hdr_t *hdr, *hdr_prev, *head;
5847      -        uint64_t write_asize, write_psize, write_sz, headroom,
     5847 +        uint64_t write_asize, write_sz, headroom,
5848 5848              buf_compress_minsz;
5849 5849          void *buf_data;
5850 5850          boolean_t full;
5851 5851          l2arc_write_callback_t *cb;
5852 5852          zio_t *pio, *wzio;
5853 5853          uint64_t guid = spa_load_guid(spa);
5854 5854          const boolean_t do_headroom_boost = *headroom_boost;
5855 5855  
5856 5856          ASSERT(dev->l2ad_vdev != NULL);
5857 5857  
5858 5858          /* Lower the flag now, we might want to raise it again later. */
5859 5859          *headroom_boost = B_FALSE;
5860 5860  
5861 5861          pio = NULL;
5862      -        write_sz = write_asize = write_psize = 0;
     5862 +        write_sz = write_asize = 0;
5863 5863          full = B_FALSE;
5864 5864          head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
5865 5865          head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
5866 5866          head->b_flags |= ARC_FLAG_HAS_L2HDR;
5867 5867  
5868 5868          /*
5869 5869           * We will want to try to compress buffers that are at least 2x the
5870 5870           * device sector size.
5871 5871           */
5872 5872          buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
↓ open down ↓ 16 lines elided ↑ open up ↑
5889 5889                  else
5890 5890                          hdr = multilist_sublist_tail(mls);
5891 5891  
5892 5892                  headroom = target_sz * l2arc_headroom;
5893 5893                  if (do_headroom_boost)
5894 5894                          headroom = (headroom * l2arc_headroom_boost) / 100;
5895 5895  
5896 5896                  for (; hdr; hdr = hdr_prev) {
5897 5897                          kmutex_t *hash_lock;
5898 5898                          uint64_t buf_sz;
     5899 +                        uint64_t buf_a_sz;
5899 5900  
5900 5901                          if (arc_warm == B_FALSE)
5901 5902                                  hdr_prev = multilist_sublist_next(mls, hdr);
5902 5903                          else
5903 5904                                  hdr_prev = multilist_sublist_prev(mls, hdr);
5904 5905  
5905 5906                          hash_lock = HDR_LOCK(hdr);
5906 5907                          if (!mutex_tryenter(hash_lock)) {
5907 5908                                  /*
5908 5909                                   * Skip this buffer rather than waiting.
↓ open down ↓ 8 lines elided ↑ open up ↑
5917 5918                                   */
5918 5919                                  mutex_exit(hash_lock);
5919 5920                                  break;
5920 5921                          }
5921 5922  
5922 5923                          if (!l2arc_write_eligible(guid, hdr)) {
5923 5924                                  mutex_exit(hash_lock);
5924 5925                                  continue;
5925 5926                          }
5926 5927  
5927      -                        if ((write_sz + hdr->b_size) > target_sz) {
     5928 +                        /*
     5929 +                         * Assume that the buffer is not going to be compressed
     5930 +                         * and could take more space on disk because of a larger
     5931 +                         * disk block size.
     5932 +                         */
     5933 +                        buf_sz = hdr->b_size;
     5934 +                        buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
     5935 +
     5936 +                        if ((write_asize + buf_a_sz) > target_sz) {
5928 5937                                  full = B_TRUE;
5929 5938                                  mutex_exit(hash_lock);
5930 5939                                  break;
5931 5940                          }
5932 5941  
5933 5942                          if (pio == NULL) {
5934 5943                                  /*
5935 5944                                   * Insert a dummy header on the buflist so
5936 5945                                   * l2arc_write_done() can find where the
5937 5946                                   * write buffers begin without searching.
↓ open down ↓ 43 lines elided ↑ open up ↑
5981 5990                           * header's hash_lock below, in the second stage
5982 5991                           * of this function. Thus, we can't simply
5983 5992                           * change the b_flags field to denote that the
5984 5993                           * IO has been sent. We can change the b_daddr
5985 5994                           * field of the L2 portion, though, since we'll
5986 5995                           * be holding the l2ad_mtx; which is why we're
5987 5996                           * using it to denote the header's state change.
5988 5997                           */
5989 5998                          hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
5990 5999  
5991      -                        buf_sz = hdr->b_size;
5992 6000                          hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
5993 6001  
5994 6002                          mutex_enter(&dev->l2ad_mtx);
5995 6003                          list_insert_head(&dev->l2ad_buflist, hdr);
5996 6004                          mutex_exit(&dev->l2ad_mtx);
5997 6005  
5998 6006                          /*
5999 6007                           * Compute and store the buffer cksum before
6000 6008                           * writing.  On debug the cksum is verified first.
6001 6009                           */
6002 6010                          arc_cksum_verify(hdr->b_l1hdr.b_buf);
6003 6011                          arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
6004 6012  
6005 6013                          mutex_exit(hash_lock);
6006 6014  
6007 6015                          write_sz += buf_sz;
     6016 +                        write_asize += buf_a_sz;
6008 6017                  }
6009 6018  
6010 6019                  multilist_sublist_unlock(mls);
6011 6020  
6012 6021                  if (full == B_TRUE)
6013 6022                          break;
6014 6023          }
6015 6024  
6016 6025          /* No buffers selected for writing? */
6017 6026          if (pio == NULL) {
6018 6027                  ASSERT0(write_sz);
6019 6028                  ASSERT(!HDR_HAS_L1HDR(head));
6020 6029                  kmem_cache_free(hdr_l2only_cache, head);
6021 6030                  return (0);
6022 6031          }
6023 6032  
6024 6033          mutex_enter(&dev->l2ad_mtx);
6025 6034  
6026 6035          /*
     6036 +         * Note that elsewhere in this file arcstat_l2_asize
     6037 +         * and the used space on l2ad_vdev are updated using b_asize,
     6038 +         * which is not necessarily rounded up to the device block size.
     6039 +         * To keep accounting consistent we do the same here as well:
     6040 +         * stats_size accumulates the sum of b_asize of the written buffers,
     6041 +         * while write_asize accumulates the sum of b_asize rounded up
     6042 +         * to the device block size.
     6043 +         * The latter sum is used only to validate the correctness of the code.
     6044 +         */
     6045 +        uint64_t stats_size = 0;
     6046 +        write_asize = 0;
     6047 +
     6048 +        /*
6027 6049           * Now start writing the buffers. We're starting at the write head
6028 6050           * and work backwards, retracing the course of the buffer selector
6029 6051           * loop above.
6030 6052           */
6031 6053          for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
6032 6054              hdr = list_prev(&dev->l2ad_buflist, hdr)) {
6033 6055                  uint64_t buf_sz;
6034 6056  
6035 6057                  /*
6036 6058                   * We rely on the L1 portion of the header below, so
↓ open down ↓ 32 lines elided ↑ open up ↑
6069 6091  
6070 6092                  /*
6071 6093                   * We need to do this regardless if buf_sz is zero or
6072 6094                   * not, otherwise, when this l2hdr is evicted we'll
6073 6095                   * remove a reference that was never added.
6074 6096                   */
6075 6097                  (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);
6076 6098  
6077 6099                  /* Compression may have squashed the buffer to zero length. */
6078 6100                  if (buf_sz != 0) {
6079      -                        uint64_t buf_p_sz;
     6101 +                        uint64_t buf_a_sz;
6080 6102  
6081 6103                          wzio = zio_write_phys(pio, dev->l2ad_vdev,
6082 6104                              dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
6083 6105                              NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
6084 6106                              ZIO_FLAG_CANFAIL, B_FALSE);
6085 6107  
6086 6108                          DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6087 6109                              zio_t *, wzio);
6088 6110                          (void) zio_nowait(wzio);
6089 6111  
6090      -                        write_asize += buf_sz;
     6112 +                        stats_size += buf_sz;
6091 6113  
6092 6114                          /*
6093 6115                           * Keep the clock hand suitably device-aligned.
6094 6116                           */
6095      -                        buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
6096      -                        write_psize += buf_p_sz;
6097      -                        dev->l2ad_hand += buf_p_sz;
     6117 +                        buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
     6118 +                        write_asize += buf_a_sz;
     6119 +                        dev->l2ad_hand += buf_a_sz;
6098 6120                  }
6099 6121          }
6100 6122  
6101 6123          mutex_exit(&dev->l2ad_mtx);
6102 6124  
6103 6125          ASSERT3U(write_asize, <=, target_sz);
6104 6126          ARCSTAT_BUMP(arcstat_l2_writes_sent);
6105 6127          ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
6106 6128          ARCSTAT_INCR(arcstat_l2_size, write_sz);
6107      -        ARCSTAT_INCR(arcstat_l2_asize, write_asize);
6108      -        vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
     6129 +        ARCSTAT_INCR(arcstat_l2_asize, stats_size);
     6130 +        vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0);
6109 6131  
6110 6132          /*
6111 6133           * Bump device hand to the device start if it is approaching the end.
6112 6134           * l2arc_evict() will already have evicted ahead for this case.
6113 6135           */
6114 6136          if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
6115 6137                  dev->l2ad_hand = dev->l2ad_start;
6116 6138                  dev->l2ad_first = B_FALSE;
6117 6139          }
6118 6140  
↓ open down ↓ 443 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX