Print this page
10592 misc. metaslab and vdev related ZoL bug fixes
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Tony Hutter <hutter2@llnl.gov>
Reviewed by: Kody Kantor <kody.kantor@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/vdev.c
          +++ new/usr/src/uts/common/fs/zfs/vdev.c
↓ open down ↓ 493 lines elided ↑ open up ↑
 494  494          vic->vic_prev_indirect_vdev = UINT64_MAX;
 495  495  
 496  496          rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
 497  497          mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
 498  498          vd->vdev_obsolete_segments = range_tree_create(NULL, NULL);
 499  499  
 500  500          list_link_init(&vd->vdev_leaf_node);
 501  501          mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
 502  502          mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 503  503          mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
 504      -        mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
 505  504          mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
 506  505          mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
 507  506          cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
 508  507          cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
 509  508  
 510  509          for (int t = 0; t < DTL_TYPES; t++) {
 511  510                  vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
 512  511          }
 513  512          txg_list_create(&vd->vdev_ms_list, spa,
 514  513              offsetof(struct metaslab, ms_txg_node));
↓ open down ↓ 367 lines elided ↑ open up ↑
 882  881          if (vd->vdev_obsolete_sm != NULL) {
 883  882                  ASSERT(vd->vdev_removing ||
 884  883                      vd->vdev_ops == &vdev_indirect_ops);
 885  884                  space_map_close(vd->vdev_obsolete_sm);
 886  885                  vd->vdev_obsolete_sm = NULL;
 887  886          }
 888  887          range_tree_destroy(vd->vdev_obsolete_segments);
 889  888          rw_destroy(&vd->vdev_indirect_rwlock);
 890  889          mutex_destroy(&vd->vdev_obsolete_lock);
 891  890  
 892      -        mutex_destroy(&vd->vdev_queue_lock);
 893  891          mutex_destroy(&vd->vdev_dtl_lock);
 894  892          mutex_destroy(&vd->vdev_stat_lock);
 895  893          mutex_destroy(&vd->vdev_probe_lock);
 896  894          mutex_destroy(&vd->vdev_initialize_lock);
 897  895          mutex_destroy(&vd->vdev_initialize_io_lock);
 898  896          cv_destroy(&vd->vdev_initialize_io_cv);
 899  897          cv_destroy(&vd->vdev_initialize_cv);
 900  898  
 901  899          if (vd == spa->spa_root_vdev)
 902  900                  spa->spa_root_vdev = NULL;
↓ open down ↓ 341 lines elided ↑ open up ↑
1244 1242                   * pointer to NULL. The reason is that vdev_metaslab_fini()
1245 1243                   * may be called multiple times for certain operations
1246 1244                   * (e.g. when destroying a pool) so we need to ensure that
1247 1245                   * this clause never executes twice. This logic is similar
1248 1246                   * to the one used for the vdev_ms clause below.
1249 1247                   */
1250 1248                  vd->vdev_checkpoint_sm = NULL;
1251 1249          }
1252 1250  
1253 1251          if (vd->vdev_ms != NULL) {
1254      -                uint64_t count = vd->vdev_ms_count;
     1252 +                metaslab_group_t *mg = vd->vdev_mg;
     1253 +                metaslab_group_passivate(mg);
1255 1254  
1256      -                metaslab_group_passivate(vd->vdev_mg);
     1255 +                uint64_t count = vd->vdev_ms_count;
1257 1256                  for (uint64_t m = 0; m < count; m++) {
1258 1257                          metaslab_t *msp = vd->vdev_ms[m];
1259      -
1260 1258                          if (msp != NULL)
1261 1259                                  metaslab_fini(msp);
1262 1260                  }
1263 1261                  kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
1264 1262                  vd->vdev_ms = NULL;
1265 1263  
1266 1264                  vd->vdev_ms_count = 0;
     1265 +
     1266 +                for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
     1267 +                        ASSERT0(mg->mg_histogram[i]);
1267 1268          }
1268 1269          ASSERT0(vd->vdev_ms_count);
1269 1270  }
1270 1271  
1271 1272  typedef struct vdev_probe_stats {
1272 1273          boolean_t       vps_readable;
1273 1274          boolean_t       vps_writeable;
1274 1275          int             vps_flags;
1275 1276  } vdev_probe_stats_t;
1276 1277  
↓ open down ↓ 1265 lines elided ↑ open up ↑
2542 2543          if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
2543 2544                  ASSERT(vdev_is_concrete(vd));
2544 2545  
2545 2546                  error = space_map_open(&vd->vdev_dtl_sm, mos,
2546 2547                      vd->vdev_dtl_object, 0, -1ULL, 0);
2547 2548                  if (error)
2548 2549                          return (error);
2549 2550                  ASSERT(vd->vdev_dtl_sm != NULL);
2550 2551  
2551 2552                  mutex_enter(&vd->vdev_dtl_lock);
2552      -
2553      -                /*
2554      -                 * Now that we've opened the space_map we need to update
2555      -                 * the in-core DTL.
2556      -                 */
2557      -                space_map_update(vd->vdev_dtl_sm);
2558      -
2559 2553                  error = space_map_load(vd->vdev_dtl_sm,
2560 2554                      vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
2561 2555                  mutex_exit(&vd->vdev_dtl_lock);
2562 2556  
2563 2557                  return (error);
2564 2558          }
2565 2559  
2566 2560          for (int c = 0; c < vd->vdev_children; c++) {
2567 2561                  error = vdev_dtl_load(vd->vdev_child[c]);
2568 2562                  if (error != 0)
↓ open down ↓ 139 lines elided ↑ open up ↑
2708 2702           */
2709 2703          if (object != space_map_object(vd->vdev_dtl_sm)) {
2710 2704                  vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
2711 2705                      "new object %llu", (u_longlong_t)txg, spa_name(spa),
2712 2706                      (u_longlong_t)object,
2713 2707                      (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
2714 2708                  vdev_config_dirty(vd->vdev_top);
2715 2709          }
2716 2710  
2717 2711          dmu_tx_commit(tx);
2718      -
2719      -        mutex_enter(&vd->vdev_dtl_lock);
2720      -        space_map_update(vd->vdev_dtl_sm);
2721      -        mutex_exit(&vd->vdev_dtl_lock);
2722 2712  }
2723 2713  
2724 2714  /*
2725 2715   * Determine whether the specified vdev can be offlined/detached/removed
2726 2716   * without losing data.
2727 2717   */
2728 2718  boolean_t
2729 2719  vdev_dtl_required(vdev_t *vd)
2730 2720  {
2731 2721          spa_t *spa = vd->vdev_spa;
↓ open down ↓ 122 lines elided ↑ open up ↑
2854 2844          if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
2855 2845                  vdev_metaslab_group_create(vd);
2856 2846  
2857 2847                  if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
2858 2848                          vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2859 2849                              VDEV_AUX_CORRUPT_DATA);
2860 2850                          vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
2861 2851                              "asize=%llu", (u_longlong_t)vd->vdev_ashift,
2862 2852                              (u_longlong_t)vd->vdev_asize);
2863 2853                          return (SET_ERROR(ENXIO));
2864      -                } else if ((error = vdev_metaslab_init(vd, 0)) != 0) {
     2854 +                }
     2855 +
     2856 +                error = vdev_metaslab_init(vd, 0);
     2857 +                if (error != 0) {
2865 2858                          vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
2866 2859                              "[error=%d]", error);
2867 2860                          vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2868 2861                              VDEV_AUX_CORRUPT_DATA);
2869 2862                          return (error);
2870 2863                  }
2871 2864  
2872 2865                  uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd);
2873 2866                  if (checkpoint_sm_obj != 0) {
2874 2867                          objset_t *mos = spa_meta_objset(vd->vdev_spa);
2875 2868                          ASSERT(vd->vdev_asize != 0);
2876 2869                          ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
2877 2870  
2878      -                        if ((error = space_map_open(&vd->vdev_checkpoint_sm,
     2871 +                        error = space_map_open(&vd->vdev_checkpoint_sm,
2879 2872                              mos, checkpoint_sm_obj, 0, vd->vdev_asize,
2880      -                            vd->vdev_ashift))) {
     2873 +                            vd->vdev_ashift);
     2874 +                        if (error != 0) {
2881 2875                                  vdev_dbgmsg(vd, "vdev_load: space_map_open "
2882 2876                                      "failed for checkpoint spacemap (obj %llu) "
2883 2877                                      "[error=%d]",
2884 2878                                      (u_longlong_t)checkpoint_sm_obj, error);
2885 2879                                  return (error);
2886 2880                          }
2887 2881                          ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
2888      -                        space_map_update(vd->vdev_checkpoint_sm);
2889 2882  
2890 2883                          /*
2891 2884                           * Since the checkpoint_sm contains free entries
2892      -                         * exclusively we can use sm_alloc to indicate the
2893      -                         * culmulative checkpointed space that has been freed.
     2885 +                         * exclusively we can use space_map_allocated() to
     2886 +                         * indicate the cumulative checkpointed space that
     2887 +                         * has been freed.
2894 2888                           */
2895 2889                          vd->vdev_stat.vs_checkpoint_space =
2896      -                            -vd->vdev_checkpoint_sm->sm_alloc;
     2890 +                            -space_map_allocated(vd->vdev_checkpoint_sm);
2897 2891                          vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
2898 2892                              vd->vdev_stat.vs_checkpoint_space;
2899 2893                  }
2900 2894          }
2901 2895  
2902 2896          /*
2903 2897           * If this is a leaf vdev, load its DTL.
2904 2898           */
2905 2899          if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
2906 2900                  vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
↓ open down ↓ 11 lines elided ↑ open up ↑
2918 2912  
2919 2913                  if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
2920 2914                      obsolete_sm_object, 0, vd->vdev_asize, 0))) {
2921 2915                          vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2922 2916                              VDEV_AUX_CORRUPT_DATA);
2923 2917                          vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
2924 2918                              "obsolete spacemap (obj %llu) [error=%d]",
2925 2919                              (u_longlong_t)obsolete_sm_object, error);
2926 2920                          return (error);
2927 2921                  }
2928      -                space_map_update(vd->vdev_obsolete_sm);
2929 2922          }
2930 2923  
2931 2924          return (0);
2932 2925  }
2933 2926  
2934 2927  /*
2935 2928   * The special vdev case is used for hot spares and l2cache devices.  Its
2936 2929   * sole purpose is to set the vdev state for the associated vdev.  To do this,
2937 2930   * we make sure that we can open the underlying device, then try to read the
2938 2931   * label, and make sure that the label is sane and that it hasn't been
↓ open down ↓ 66 lines elided ↑ open up ↑
3005 2998  
3006 2999  static void
3007 3000  vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
3008 3001  {
3009 3002          spa_t *spa = vd->vdev_spa;
3010 3003  
3011 3004          ASSERT(vd->vdev_islog);
3012 3005          ASSERT(vd == vd->vdev_top);
3013 3006          ASSERT3U(txg, ==, spa_syncing_txg(spa));
3014 3007  
3015      -        if (vd->vdev_ms != NULL) {
3016      -                metaslab_group_t *mg = vd->vdev_mg;
3017      -
3018      -                metaslab_group_histogram_verify(mg);
3019      -                metaslab_class_histogram_verify(mg->mg_class);
3020      -
3021      -                for (int m = 0; m < vd->vdev_ms_count; m++) {
3022      -                        metaslab_t *msp = vd->vdev_ms[m];
3023      -
3024      -                        if (msp == NULL || msp->ms_sm == NULL)
3025      -                                continue;
3026      -
3027      -                        mutex_enter(&msp->ms_lock);
3028      -                        /*
3029      -                         * If the metaslab was not loaded when the vdev
3030      -                         * was removed then the histogram accounting may
3031      -                         * not be accurate. Update the histogram information
3032      -                         * here so that we ensure that the metaslab group
3033      -                         * and metaslab class are up-to-date.
3034      -                         */
3035      -                        metaslab_group_histogram_remove(mg, msp);
3036      -
3037      -                        VERIFY0(space_map_allocated(msp->ms_sm));
3038      -                        space_map_close(msp->ms_sm);
3039      -                        msp->ms_sm = NULL;
3040      -                        mutex_exit(&msp->ms_lock);
3041      -                }
3042      -
3043      -                if (vd->vdev_checkpoint_sm != NULL) {
3044      -                        ASSERT(spa_has_checkpoint(spa));
3045      -                        space_map_close(vd->vdev_checkpoint_sm);
3046      -                        vd->vdev_checkpoint_sm = NULL;
3047      -                }
3048      -
3049      -                metaslab_group_histogram_verify(mg);
3050      -                metaslab_class_histogram_verify(mg->mg_class);
3051      -
3052      -                for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
3053      -                        ASSERT0(mg->mg_histogram[i]);
3054      -        }
3055      -
3056 3008          dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
3057 3009  
3058 3010          vdev_destroy_spacemaps(vd, tx);
3059 3011          if (vd->vdev_top_zap != 0) {
3060 3012                  vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
3061 3013                  vd->vdev_top_zap = 0;
3062 3014          }
3063 3015  
3064 3016          dmu_tx_commit(tx);
3065 3017  }
↓ open down ↓ 13 lines elided ↑ open up ↑
3079 3031          if (reassess)
3080 3032                  metaslab_sync_reassess(vd->vdev_mg);
3081 3033  }
3082 3034  
3083 3035  void
3084 3036  vdev_sync(vdev_t *vd, uint64_t txg)
3085 3037  {
3086 3038          spa_t *spa = vd->vdev_spa;
3087 3039          vdev_t *lvd;
3088 3040          metaslab_t *msp;
3089      -        dmu_tx_t *tx;
3090 3041  
     3042 +        ASSERT3U(txg, ==, spa->spa_syncing_txg);
     3043 +        dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3091 3044          if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
3092      -                dmu_tx_t *tx;
3093      -
3094 3045                  ASSERT(vd->vdev_removing ||
3095 3046                      vd->vdev_ops == &vdev_indirect_ops);
3096 3047  
3097      -                tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3098 3048                  vdev_indirect_sync_obsolete(vd, tx);
3099      -                dmu_tx_commit(tx);
3100 3049  
3101 3050                  /*
3102 3051                   * If the vdev is indirect, it can't have dirty
3103 3052                   * metaslabs or DTLs.
3104 3053                   */
3105 3054                  if (vd->vdev_ops == &vdev_indirect_ops) {
3106 3055                          ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
3107 3056                          ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
     3057 +                        dmu_tx_commit(tx);
3108 3058                          return;
3109 3059                  }
3110 3060          }
3111 3061  
3112 3062          ASSERT(vdev_is_concrete(vd));
3113 3063  
3114 3064          if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
3115 3065              !vd->vdev_removing) {
3116 3066                  ASSERT(vd == vd->vdev_top);
3117 3067                  ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
3118      -                tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3119 3068                  vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
3120 3069                      DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
3121 3070                  ASSERT(vd->vdev_ms_array != 0);
3122 3071                  vdev_config_dirty(vd);
3123      -                dmu_tx_commit(tx);
3124 3072          }
3125 3073  
3126 3074          while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
3127 3075                  metaslab_sync(msp, txg);
3128 3076                  (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
3129 3077          }
3130 3078  
3131 3079          while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
3132 3080                  vdev_dtl_sync(lvd, txg);
3133 3081  
3134 3082          /*
3135 3083           * If this is an empty log device being removed, destroy the
3136 3084           * metadata associated with it.
3137 3085           */
3138 3086          if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
3139 3087                  vdev_remove_empty_log(vd, txg);
3140 3088  
3141 3089          (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
     3090 +        dmu_tx_commit(tx);
3142 3091  }
3143 3092  
3144 3093  uint64_t
3145 3094  vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
3146 3095  {
3147 3096          return (vd->vdev_ops->vdev_op_asize(vd, psize));
3148 3097  }
3149 3098  
3150 3099  /*
3151 3100   * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
↓ open down ↓ 209 lines elided ↑ open up ↑
3361 3310                          (void) spa_vdev_state_exit(spa, vd, 0);
3362 3311  
3363 3312                          error = spa_reset_logs(spa);
3364 3313  
3365 3314                          /*
3366 3315                           * If the log device was successfully reset but has
3367 3316                           * checkpointed data, do not offline it.
3368 3317                           */
3369 3318                          if (error == 0 &&
3370 3319                              tvd->vdev_checkpoint_sm != NULL) {
3371      -                                ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc,
3372      -                                    !=, 0);
3373 3320                                  error = ZFS_ERR_CHECKPOINT_EXISTS;
3374 3321                          }
3375 3322  
3376 3323                          spa_vdev_state_enter(spa, SCL_ALLOC);
3377 3324  
3378 3325                          /*
3379 3326                           * Check to see if the config has changed.
3380 3327                           */
3381 3328                          if (error || generation != spa->spa_config_generation) {
3382 3329                                  metaslab_group_activate(mg);
↓ open down ↓ 956 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX