Print this page
5219 l2arc_write_buffers() may write beyond target_sz
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Saso Kiselkov <skiselkov@gmail.com>
Reviewed by: George Wilson <george@delphix.com>
Reviewed by: Steven Hartland <steven.hartland@multiplay.co.uk>
Reviewed by: Justin Gibbs <gibbs@FreeBSD.org>
Approved by: Matthew Ahrens <mahrens@delphix.com>


5827         }
5828         mutex_exit(&dev->l2ad_mtx);
5829 }
5830 
5831 /*
5832  * Find and write ARC buffers to the L2ARC device.
5833  *
5834  * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
5835  * for reading until they have completed writing.
5836  * The headroom_boost is an in-out parameter used to maintain headroom boost
5837  * state between calls to this function.
5838  *
5839  * Returns the number of bytes actually written (which may be smaller than
5840  * the delta by which the device hand has changed due to alignment).
5841  */
5842 static uint64_t
5843 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
5844     boolean_t *headroom_boost)
5845 {
5846         arc_buf_hdr_t *hdr, *hdr_prev, *head;
5847         uint64_t write_asize, write_psize, write_sz, headroom,
5848             buf_compress_minsz;
5849         void *buf_data;
5850         boolean_t full;
5851         l2arc_write_callback_t *cb;
5852         zio_t *pio, *wzio;
5853         uint64_t guid = spa_load_guid(spa);
5854         const boolean_t do_headroom_boost = *headroom_boost;
5855 
5856         ASSERT(dev->l2ad_vdev != NULL);
5857 
5858         /* Lower the flag now, we might want to raise it again later. */
5859         *headroom_boost = B_FALSE;
5860 
5861         pio = NULL;
5862         write_sz = write_asize = write_psize = 0;
5863         full = B_FALSE;
5864         head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
5865         head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
5866         head->b_flags |= ARC_FLAG_HAS_L2HDR;
5867 
5868         /*
5869          * We will want to try to compress buffers that are at least 2x the
5870          * device sector size.
5871          */
5872         buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
5873 
5874         /*
5875          * Copy buffers for L2ARC writing.
5876          */
5877         for (int try = 0; try <= 3; try++) {
5878                 multilist_sublist_t *mls = l2arc_sublist_lock(try);
5879                 uint64_t passed_sz = 0;
5880 
5881                 /*
5882                  * L2ARC fast warmup.
5883                  *
5884                  * Until the ARC is warm and starts to evict, read from the
5885                  * head of the ARC lists rather than the tail.
5886                  */
5887                 if (arc_warm == B_FALSE)
5888                         hdr = multilist_sublist_head(mls);
5889                 else
5890                         hdr = multilist_sublist_tail(mls);
5891 
5892                 headroom = target_sz * l2arc_headroom;
5893                 if (do_headroom_boost)
5894                         headroom = (headroom * l2arc_headroom_boost) / 100;
5895 
5896                 for (; hdr; hdr = hdr_prev) {
5897                         kmutex_t *hash_lock;
5898                         uint64_t buf_sz;

5899 
5900                         if (arc_warm == B_FALSE)
5901                                 hdr_prev = multilist_sublist_next(mls, hdr);
5902                         else
5903                                 hdr_prev = multilist_sublist_prev(mls, hdr);
5904 
5905                         hash_lock = HDR_LOCK(hdr);
5906                         if (!mutex_tryenter(hash_lock)) {
5907                                 /*
5908                                  * Skip this buffer rather than waiting.
5909                                  */
5910                                 continue;
5911                         }
5912 
5913                         passed_sz += hdr->b_size;
5914                         if (passed_sz > headroom) {
5915                                 /*
5916                                  * Searched too far.
5917                                  */
5918                                 mutex_exit(hash_lock);
5919                                 break;
5920                         }
5921 
5922                         if (!l2arc_write_eligible(guid, hdr)) {
5923                                 mutex_exit(hash_lock);
5924                                 continue;
5925                         }
5926 
5927                         if ((write_sz + hdr->b_size) > target_sz) {








5928                                 full = B_TRUE;
5929                                 mutex_exit(hash_lock);
5930                                 break;
5931                         }
5932 
5933                         if (pio == NULL) {
5934                                 /*
5935                                  * Insert a dummy header on the buflist so
5936                                  * l2arc_write_done() can find where the
5937                                  * write buffers begin without searching.
5938                                  */
5939                                 mutex_enter(&dev->l2ad_mtx);
5940                                 list_insert_head(&dev->l2ad_buflist, head);
5941                                 mutex_exit(&dev->l2ad_mtx);
5942 
5943                                 cb = kmem_alloc(
5944                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
5945                                 cb->l2wcb_dev = dev;
5946                                 cb->l2wcb_head = head;
5947                                 pio = zio_root(spa, l2arc_write_done, cb,


5971                          * enables us to differentiate which stage of
5972                          * l2arc_write_buffers() the particular header
5973                          * is in (e.g. this loop, or the one below).
5974                          * ARC_FLAG_L2_WRITING is not enough to make
5975                          * this distinction, and we need to know in
5976                          * order to do proper l2arc vdev accounting in
5977                          * arc_release() and arc_hdr_destroy().
5978                          *
5979                          * Note, we can't use a new flag to distinguish
5980                          * the two stages because we don't hold the
5981                          * header's hash_lock below, in the second stage
5982                          * of this function. Thus, we can't simply
5983                          * change the b_flags field to denote that the
5984                          * IO has been sent. We can change the b_daddr
5985                          * field of the L2 portion, though, since we'll
5986                          * be holding the l2ad_mtx; which is why we're
5987                          * using it to denote the header's state change.
5988                          */
5989                         hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
5990 
5991                         buf_sz = hdr->b_size;
5992                         hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
5993 
5994                         mutex_enter(&dev->l2ad_mtx);
5995                         list_insert_head(&dev->l2ad_buflist, hdr);
5996                         mutex_exit(&dev->l2ad_mtx);
5997 
5998                         /*
5999                          * Compute and store the buffer cksum before
6000                          * writing.  On debug the cksum is verified first.
6001                          */
6002                         arc_cksum_verify(hdr->b_l1hdr.b_buf);
6003                         arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
6004 
6005                         mutex_exit(hash_lock);
6006 
6007                         write_sz += buf_sz;

6008                 }
6009 
6010                 multilist_sublist_unlock(mls);
6011 
6012                 if (full == B_TRUE)
6013                         break;
6014         }
6015 
6016         /* No buffers selected for writing? */
6017         if (pio == NULL) {
6018                 ASSERT0(write_sz);
6019                 ASSERT(!HDR_HAS_L1HDR(head));
6020                 kmem_cache_free(hdr_l2only_cache, head);
6021                 return (0);
6022         }
6023 
6024         mutex_enter(&dev->l2ad_mtx);
6025 
6026         /*













6027          * Now start writing the buffers. We're starting at the write head
6028          * and work backwards, retracing the course of the buffer selector
6029          * loop above.
6030          */
6031         for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
6032             hdr = list_prev(&dev->l2ad_buflist, hdr)) {
6033                 uint64_t buf_sz;
6034 
6035                 /*
6036                  * We rely on the L1 portion of the header below, so
6037                  * it's invalid for this header to have been evicted out
6038                  * of the ghost cache, prior to being written out. The
6039                  * ARC_FLAG_L2_WRITING bit ensures this won't happen.
6040                  */
6041                 ASSERT(HDR_HAS_L1HDR(hdr));
6042 
6043                 /*
6044                  * We shouldn't need to lock the buffer here, since we flagged
6045                  * it as ARC_FLAG_L2_WRITING in the previous step, but we must
6046                  * take care to only access its L2 cache parameters. In


6059                                 *headroom_boost = B_TRUE;
6060                         }
6061                 }
6062 
6063                 /*
6064                  * Pick up the buffer data we had previously stashed away
6065                  * (and now potentially also compressed).
6066                  */
6067                 buf_data = hdr->b_l1hdr.b_tmp_cdata;
6068                 buf_sz = hdr->b_l2hdr.b_asize;
6069 
6070                 /*
6071                  * We need to do this regardless if buf_sz is zero or
6072                  * not, otherwise, when this l2hdr is evicted we'll
6073                  * remove a reference that was never added.
6074                  */
6075                 (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);
6076 
6077                 /* Compression may have squashed the buffer to zero length. */
6078                 if (buf_sz != 0) {
6079                         uint64_t buf_p_sz;
6080 
6081                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
6082                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
6083                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
6084                             ZIO_FLAG_CANFAIL, B_FALSE);
6085 
6086                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6087                             zio_t *, wzio);
6088                         (void) zio_nowait(wzio);
6089 
6090                         write_asize += buf_sz;
6091 
6092                         /*
6093                          * Keep the clock hand suitably device-aligned.
6094                          */
6095                         buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
6096                         write_psize += buf_p_sz;
6097                         dev->l2ad_hand += buf_p_sz;
6098                 }
6099         }
6100 
6101         mutex_exit(&dev->l2ad_mtx);
6102 
6103         ASSERT3U(write_asize, <=, target_sz);
6104         ARCSTAT_BUMP(arcstat_l2_writes_sent);
6105         ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
6106         ARCSTAT_INCR(arcstat_l2_size, write_sz);
6107         ARCSTAT_INCR(arcstat_l2_asize, write_asize);
6108         vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
6109 
6110         /*
6111          * Bump device hand to the device start if it is approaching the end.
6112          * l2arc_evict() will already have evicted ahead for this case.
6113          */
6114         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
6115                 dev->l2ad_hand = dev->l2ad_start;
6116                 dev->l2ad_first = B_FALSE;
6117         }
6118 
6119         dev->l2ad_writing = B_TRUE;
6120         (void) zio_wait(pio);
6121         dev->l2ad_writing = B_FALSE;
6122 
6123         return (write_asize);
6124 }
6125 
6126 /*
6127  * Compresses an L2ARC buffer.
6128  * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its




5827         }
5828         mutex_exit(&dev->l2ad_mtx);
5829 }
5830 
5831 /*
5832  * Find and write ARC buffers to the L2ARC device.
5833  *
5834  * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
5835  * for reading until they have completed writing.
5836  * The headroom_boost is an in-out parameter used to maintain headroom boost
5837  * state between calls to this function.
5838  *
5839  * Returns the number of bytes actually written (which may be smaller than
5840  * the delta by which the device hand has changed due to alignment).
5841  */
5842 static uint64_t
5843 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
5844     boolean_t *headroom_boost)
5845 {
5846         arc_buf_hdr_t *hdr, *hdr_prev, *head;
5847         uint64_t write_asize, write_sz, headroom,
5848             buf_compress_minsz;
5849         void *buf_data;
5850         boolean_t full;
5851         l2arc_write_callback_t *cb;
5852         zio_t *pio, *wzio;
5853         uint64_t guid = spa_load_guid(spa);
5854         const boolean_t do_headroom_boost = *headroom_boost;
5855 
5856         ASSERT(dev->l2ad_vdev != NULL);
5857 
5858         /* Lower the flag now, we might want to raise it again later. */
5859         *headroom_boost = B_FALSE;
5860 
5861         pio = NULL;
5862         write_sz = write_asize = 0;
5863         full = B_FALSE;
5864         head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
5865         head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
5866         head->b_flags |= ARC_FLAG_HAS_L2HDR;
5867 
5868         /*
5869          * We will want to try to compress buffers that are at least 2x the
5870          * device sector size.
5871          */
5872         buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
5873 
5874         /*
5875          * Copy buffers for L2ARC writing.
5876          */
5877         for (int try = 0; try <= 3; try++) {
5878                 multilist_sublist_t *mls = l2arc_sublist_lock(try);
5879                 uint64_t passed_sz = 0;
5880 
5881                 /*
5882                  * L2ARC fast warmup.
5883                  *
5884                  * Until the ARC is warm and starts to evict, read from the
5885                  * head of the ARC lists rather than the tail.
5886                  */
5887                 if (arc_warm == B_FALSE)
5888                         hdr = multilist_sublist_head(mls);
5889                 else
5890                         hdr = multilist_sublist_tail(mls);
5891 
5892                 headroom = target_sz * l2arc_headroom;
5893                 if (do_headroom_boost)
5894                         headroom = (headroom * l2arc_headroom_boost) / 100;
5895 
5896                 for (; hdr; hdr = hdr_prev) {
5897                         kmutex_t *hash_lock;
5898                         uint64_t buf_sz;
5899                         uint64_t buf_a_sz;
5900 
5901                         if (arc_warm == B_FALSE)
5902                                 hdr_prev = multilist_sublist_next(mls, hdr);
5903                         else
5904                                 hdr_prev = multilist_sublist_prev(mls, hdr);
5905 
5906                         hash_lock = HDR_LOCK(hdr);
5907                         if (!mutex_tryenter(hash_lock)) {
5908                                 /*
5909                                  * Skip this buffer rather than waiting.
5910                                  */
5911                                 continue;
5912                         }
5913 
5914                         passed_sz += hdr->b_size;
5915                         if (passed_sz > headroom) {
5916                                 /*
5917                                  * Searched too far.
5918                                  */
5919                                 mutex_exit(hash_lock);
5920                                 break;
5921                         }
5922 
5923                         if (!l2arc_write_eligible(guid, hdr)) {
5924                                 mutex_exit(hash_lock);
5925                                 continue;
5926                         }
5927 
5928                         /*
5929                          * Assume that the buffer is not going to be compressed
5930                          * and could take more space on disk because of a larger
5931                          * disk block size.
5932                          */
5933                         buf_sz = hdr->b_size;
5934                         buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5935 
5936                         if ((write_asize + buf_a_sz) > target_sz) {
5937                                 full = B_TRUE;
5938                                 mutex_exit(hash_lock);
5939                                 break;
5940                         }
5941 
5942                         if (pio == NULL) {
5943                                 /*
5944                                  * Insert a dummy header on the buflist so
5945                                  * l2arc_write_done() can find where the
5946                                  * write buffers begin without searching.
5947                                  */
5948                                 mutex_enter(&dev->l2ad_mtx);
5949                                 list_insert_head(&dev->l2ad_buflist, head);
5950                                 mutex_exit(&dev->l2ad_mtx);
5951 
5952                                 cb = kmem_alloc(
5953                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
5954                                 cb->l2wcb_dev = dev;
5955                                 cb->l2wcb_head = head;
5956                                 pio = zio_root(spa, l2arc_write_done, cb,


5980                          * enables us to differentiate which stage of
5981                          * l2arc_write_buffers() the particular header
5982                          * is in (e.g. this loop, or the one below).
5983                          * ARC_FLAG_L2_WRITING is not enough to make
5984                          * this distinction, and we need to know in
5985                          * order to do proper l2arc vdev accounting in
5986                          * arc_release() and arc_hdr_destroy().
5987                          *
5988                          * Note, we can't use a new flag to distinguish
5989                          * the two stages because we don't hold the
5990                          * header's hash_lock below, in the second stage
5991                          * of this function. Thus, we can't simply
5992                          * change the b_flags field to denote that the
5993                          * IO has been sent. We can change the b_daddr
5994                          * field of the L2 portion, though, since we'll
5995                          * be holding the l2ad_mtx; which is why we're
5996                          * using it to denote the header's state change.
5997                          */
5998                         hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
5999 

6000                         hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
6001 
6002                         mutex_enter(&dev->l2ad_mtx);
6003                         list_insert_head(&dev->l2ad_buflist, hdr);
6004                         mutex_exit(&dev->l2ad_mtx);
6005 
6006                         /*
6007                          * Compute and store the buffer cksum before
6008                          * writing.  On debug the cksum is verified first.
6009                          */
6010                         arc_cksum_verify(hdr->b_l1hdr.b_buf);
6011                         arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
6012 
6013                         mutex_exit(hash_lock);
6014 
6015                         write_sz += buf_sz;
6016                         write_asize += buf_a_sz;
6017                 }
6018 
6019                 multilist_sublist_unlock(mls);
6020 
6021                 if (full == B_TRUE)
6022                         break;
6023         }
6024 
6025         /* No buffers selected for writing? */
6026         if (pio == NULL) {
6027                 ASSERT0(write_sz);
6028                 ASSERT(!HDR_HAS_L1HDR(head));
6029                 kmem_cache_free(hdr_l2only_cache, head);
6030                 return (0);
6031         }
6032 
6033         mutex_enter(&dev->l2ad_mtx);
6034 
6035         /*
6036          * Note that elsewhere in this file arcstat_l2_asize
6037          * and the used space on l2ad_vdev are updated using b_asize,
6038          * which is not necessarily rounded up to the device block size.
6039          * To keep accounting consistent we do the same here as well:
6040          * stats_size accumulates the sum of b_asize of the written buffers,
6041          * while write_asize accumulates the sum of b_asize rounded up
6042          * to the device block size.
6043          * The latter sum is used only to validate the correctness of the code.
6044          */
6045         uint64_t stats_size = 0;
6046         write_asize = 0;
6047 
6048         /*
6049          * Now start writing the buffers. We're starting at the write head
6050          * and work backwards, retracing the course of the buffer selector
6051          * loop above.
6052          */
6053         for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
6054             hdr = list_prev(&dev->l2ad_buflist, hdr)) {
6055                 uint64_t buf_sz;
6056 
6057                 /*
6058                  * We rely on the L1 portion of the header below, so
6059                  * it's invalid for this header to have been evicted out
6060                  * of the ghost cache, prior to being written out. The
6061                  * ARC_FLAG_L2_WRITING bit ensures this won't happen.
6062                  */
6063                 ASSERT(HDR_HAS_L1HDR(hdr));
6064 
6065                 /*
6066                  * We shouldn't need to lock the buffer here, since we flagged
6067                  * it as ARC_FLAG_L2_WRITING in the previous step, but we must
6068                  * take care to only access its L2 cache parameters. In


6081                                 *headroom_boost = B_TRUE;
6082                         }
6083                 }
6084 
6085                 /*
6086                  * Pick up the buffer data we had previously stashed away
6087                  * (and now potentially also compressed).
6088                  */
6089                 buf_data = hdr->b_l1hdr.b_tmp_cdata;
6090                 buf_sz = hdr->b_l2hdr.b_asize;
6091 
6092                 /*
6093                  * We need to do this regardless if buf_sz is zero or
6094                  * not, otherwise, when this l2hdr is evicted we'll
6095                  * remove a reference that was never added.
6096                  */
6097                 (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);
6098 
6099                 /* Compression may have squashed the buffer to zero length. */
6100                 if (buf_sz != 0) {
6101                         uint64_t buf_a_sz;
6102 
6103                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
6104                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
6105                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
6106                             ZIO_FLAG_CANFAIL, B_FALSE);
6107 
6108                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6109                             zio_t *, wzio);
6110                         (void) zio_nowait(wzio);
6111 
6112                         stats_size += buf_sz;
6113 
6114                         /*
6115                          * Keep the clock hand suitably device-aligned.
6116                          */
6117                         buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
6118                         write_asize += buf_a_sz;
6119                         dev->l2ad_hand += buf_a_sz;
6120                 }
6121         }
6122 
6123         mutex_exit(&dev->l2ad_mtx);
6124 
6125         ASSERT3U(write_asize, <=, target_sz);
6126         ARCSTAT_BUMP(arcstat_l2_writes_sent);
6127         ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
6128         ARCSTAT_INCR(arcstat_l2_size, write_sz);
6129         ARCSTAT_INCR(arcstat_l2_asize, stats_size);
6130         vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0);
6131 
6132         /*
6133          * Bump device hand to the device start if it is approaching the end.
6134          * l2arc_evict() will already have evicted ahead for this case.
6135          */
6136         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
6137                 dev->l2ad_hand = dev->l2ad_start;
6138                 dev->l2ad_first = B_FALSE;
6139         }
6140 
6141         dev->l2ad_writing = B_TRUE;
6142         (void) zio_wait(pio);
6143         dev->l2ad_writing = B_FALSE;
6144 
6145         return (write_asize);
6146 }
6147 
6148 /*
6149  * Compresses an L2ARC buffer.
6150  * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its