Print this page
NEX-13140 DVA-throttle support for special-class
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-4620 ZFS autotrim triggering is unreliable
NEX-4622 On-demand TRIM code illogically enumerates metaslabs via mg_ms_tree
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Reviewed by: Hans Rosenfeld <hans.rosenfeld@nexenta.com>
NEX-3984 On-demand TRIM
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Conflicts:
        usr/src/common/zfs/zpool_prop.c
        usr/src/uts/common/sys/fs/zfs.h
NEX-3508 CLONE - Port NEX-2946 Add UNMAP/TRIM functionality to ZFS and illumos
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Conflicts:
    usr/src/uts/common/io/scsi/targets/sd.c
    usr/src/uts/common/sys/scsi/targets/sddef.h

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
          +++ new/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
↓ open down ↓ 17 lines elided ↑ open up ↑
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
  25   25  
  26   26  /*
  27   27   * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
       28 + * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
  28   29   */
  29   30  
  30   31  #ifndef _SYS_METASLAB_IMPL_H
  31   32  #define _SYS_METASLAB_IMPL_H
  32   33  
  33   34  #include <sys/metaslab.h>
  34   35  #include <sys/space_map.h>
  35   36  #include <sys/range_tree.h>
  36   37  #include <sys/vdev.h>
  37   38  #include <sys/txg.h>
↓ open down ↓ 143 lines elided ↑ open up ↑
 181  182          uint64_t                mc_alloc_max_slots;
 182  183          refcount_t              mc_alloc_slots;
 183  184  
 184  185          uint64_t                mc_alloc_groups; /* # of allocatable groups */
 185  186  
 186  187          uint64_t                mc_alloc;       /* total allocated space */
 187  188          uint64_t                mc_deferred;    /* total deferred frees */
 188  189          uint64_t                mc_space;       /* total space (alloc + free) */
 189  190          uint64_t                mc_dspace;      /* total deflated space */
 190  191          uint64_t                mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
      192 +
      193 +        kmutex_t                mc_alloc_lock;
      194 +        avl_tree_t              mc_alloc_tree;
 191  195  };
 192  196  
 193  197  /*
 194  198   * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
 195  199   * of a top-level vdev. They are linked together to form a circular linked
 196  200   * list and can belong to only one metaslab class. Metaslab groups may become
 197  201   * ineligible for allocations for a number of reasons such as limited free
 198  202   * space, fragmentation, or going offline. When this happens the allocator will
 199  203   * simply find the next metaslab group in the linked list and attempt
 200  204   * to allocate from that group instead.
↓ open down ↓ 38 lines elided ↑ open up ↑
 239  243           * groups.
 240  244           */
 241  245          boolean_t               mg_no_free_space;
 242  246  
 243  247          uint64_t                mg_allocations;
 244  248          uint64_t                mg_failed_allocations;
 245  249          uint64_t                mg_fragmentation;
 246  250          uint64_t                mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
 247  251  };
 248  252  
      253 +typedef struct {
      254 +        uint64_t        ts_birth;       /* TXG at which this trimset starts */
      255 +        range_tree_t    *ts_tree;       /* tree of extents in the trimset */
      256 +} metaslab_trimset_t;
      257 +
 249  258  /*
 250  259   * This value defines the number of elements in the ms_lbas array. The value
 251  260   * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX.
 252  261   * This is the equivalent of highbit(UINT64_MAX).
 253  262   */
 254  263  #define MAX_LBAS        64
 255  264  
 256  265  /*
 257  266   * Each metaslab maintains a set of in-core trees to track metaslab
 258  267   * operations.  The in-core free tree (ms_tree) contains the list of
 259  268   * free segments which are eligible for allocation.  As blocks are
 260      - * allocated, the allocated segment are removed from the ms_tree and
 261      - * added to a per txg allocation tree (ms_alloctree).  As blocks are
 262      - * freed, they are added to the free tree (ms_freeingtree).  These trees
 263      - * allow us to process all allocations and frees in syncing context
 264      - * where it is safe to update the on-disk space maps.  An additional set
 265      - * of in-core trees is maintained to track deferred frees
 266      - * (ms_defertree).  Once a block is freed it will move from the
      269 + * allocated, the allocated segments are removed from the ms_tree and
      270 + * added to a per txg allocation tree (ms_alloctree).  This allows us to
      271 + * process all allocations in syncing context where it is safe to update
      272 + * the on-disk space maps.  Frees are also processed in syncing context.
      273 + * Most frees are generated from syncing context, and those that are not
      274 + * are held in the spa_free_bplist for processing in syncing context.
      275 + * An additional set of in-core trees is maintained to track deferred
      276 + * frees (ms_defertree).  Once a block is freed it will move from the
 267  277   * ms_freedtree to the ms_defertree.  A deferred free means that a block
 268  278   * has been freed but cannot be used by the pool until TXG_DEFER_SIZE
 269  279   * transaction groups later.  For example, a block that is freed in txg
 270  280   * 50 will not be available for reallocation until txg 52 (50 +
 271  281   * TXG_DEFER_SIZE).  This provides a safety net for uberblock rollback.
 272  282   * A pool could be safely rolled back TXG_DEFER_SIZE transaction
 273  283   * groups and ensure that no block has been reallocated.
 274  284   *
 275  285   * The simplified transition diagram looks like this:
 276  286   *
↓ open down ↓ 25 lines elided ↑ open up ↑
 302  312   * As the space map grows (as a result of the appends) it will
 303  313   * eventually become space-inefficient.  When the metaslab's in-core
 304  314   * free tree is zfs_condense_pct/100 times the size of the minimal
 305  315   * on-disk representation, we rewrite it in its minimized form.  If a
 306  316   * metaslab needs to condense then we must set the ms_condensing flag to
 307  317   * ensure that allocations are not performed on the metaslab that is
 308  318   * being written.
 309  319   */
 310  320  struct metaslab {
 311  321          kmutex_t        ms_lock;
 312      -        kmutex_t        ms_sync_lock;
 313  322          kcondvar_t      ms_load_cv;
 314  323          space_map_t     *ms_sm;
 315  324          uint64_t        ms_id;
 316  325          uint64_t        ms_start;
 317  326          uint64_t        ms_size;
 318  327          uint64_t        ms_fragmentation;
 319  328  
 320  329          range_tree_t    *ms_alloctree[TXG_SIZE];
 321  330          range_tree_t    *ms_tree;
 322  331  
      332 +        metaslab_trimset_t      *ms_cur_ts; /* currently prepared trims */
      333 +        metaslab_trimset_t      *ms_prev_ts;  /* previous (aging) trims */
      334 +        kcondvar_t              ms_trim_cv;
      335 +        metaslab_trimset_t      *ms_trimming_ts;
      336 +
 323  337          /*
 324  338           * The following range trees are accessed only from syncing context.
 325  339           * ms_free*tree only have entries while syncing, and are empty
 326  340           * between syncs.
 327  341           */
 328  342          range_tree_t    *ms_freeingtree; /* to free this syncing txg */
 329  343          range_tree_t    *ms_freedtree; /* already freed this syncing txg */
 330  344          range_tree_t    *ms_defertree[TXG_DEFER_SIZE];
 331  345  
 332  346          boolean_t       ms_condensing;  /* condensing? */
↓ open down ↓ 43 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX