Print this page
NEX-20260 NFS hung in transitional state when RSF marks it maintenance
NEX-20423 NFSv4 state database entry locking is not always used around reference count.
Reviewed by: Rick McNeal <rick.mcneal@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
NEX-16452 NFS server in a zone state database needs to be per zone
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-15279 support NFS server in zone
NEX-15520 online NFS shares cause zoneadm halt to hang in nfs_export_zone_fini
Portions contributed by: Dan Kruchinin <dan.kruchinin@nexenta.com>
Portions contributed by: Stepan Zastupov <stepan.zastupov@gmail.com>
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/nfs/nfs4_db.c
          +++ new/usr/src/uts/common/fs/nfs/nfs4_db.c
↓ open down ↓ 10 lines elided ↑ open up ↑
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
       21 +
  21   22  /*
  22   23   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  23   24   */
  24   25  
       26 +/*
       27 + * Copyright 2019 Nexenta Systems, Inc.
       28 + */
       29 +
  25   30  #include <sys/systm.h>
  26   31  #include <sys/cmn_err.h>
  27   32  #include <sys/kmem.h>
  28   33  #include <sys/disp.h>
  29   34  #include <sys/id_space.h>
  30      -#include <sys/atomic.h>
  31   35  #include <rpc/rpc.h>
  32   36  #include <nfs/nfs4.h>
  33   37  #include <nfs/nfs4_db_impl.h>
  34   38  #include <sys/sdt.h>
  35   39  
  36   40  static int rfs4_reap_interval = RFS4_REAP_INTERVAL;
  37   41  
  38   42  static void rfs4_dbe_reap(rfs4_table_t *, time_t, uint32_t);
  39   43  static void rfs4_dbe_destroy(rfs4_dbe_t *);
  40   44  static rfs4_dbe_t *rfs4_dbe_create(rfs4_table_t *, id_t, rfs4_entry_t);
↓ open down ↓ 12 lines elided ↑ open up ↑
  53   57  
  54   58  id_t
  55   59  rfs4_dbe_getid(rfs4_dbe_t *entry)
  56   60  {
  57   61          return (entry->dbe_id);
  58   62  }
  59   63  
  60   64  void
  61   65  rfs4_dbe_hold(rfs4_dbe_t *entry)
  62   66  {
  63      -        atomic_inc_32(&entry->dbe_refcnt);
       67 +        if (!MUTEX_HELD(entry->dbe_lock)) {
       68 +                mutex_enter(entry->dbe_lock);
       69 +                entry->dbe_refcnt++;
       70 +                mutex_exit(entry->dbe_lock);
       71 +        } else {
       72 +                entry->dbe_refcnt++;
       73 +        }
  64   74  }
  65   75  
  66   76  /*
  67   77   * rfs4_dbe_rele_nolock only decrements the reference count of the entry.
  68   78   */
  69   79  void
  70   80  rfs4_dbe_rele_nolock(rfs4_dbe_t *entry)
  71   81  {
  72      -        atomic_dec_32(&entry->dbe_refcnt);
       82 +        if (!MUTEX_HELD(entry->dbe_lock)) {
       83 +                ASSERT(entry->dbe_refcnt > 0);
       84 +                mutex_enter(entry->dbe_lock);
       85 +                entry->dbe_refcnt--;
       86 +                mutex_exit(entry->dbe_lock);
       87 +        } else {
       88 +                entry->dbe_refcnt--;
       89 +        }
  73   90  }
  74   91  
  75   92  
  76   93  uint32_t
  77   94  rfs4_dbe_refcnt(rfs4_dbe_t *entry)
  78   95  {
  79   96          return (entry->dbe_refcnt);
  80   97  }
  81   98  
  82   99  /*
  83  100   * Mark an entry such that the dbsearch will skip it.
  84  101   * Caller does not want this entry to be found any longer
  85  102   */
  86  103  void
  87  104  rfs4_dbe_invalidate(rfs4_dbe_t *entry)
  88  105  {
  89      -        entry->dbe_invalid = TRUE;
  90      -        entry->dbe_skipsearch = TRUE;
      106 +        if (!MUTEX_HELD(entry->dbe_lock)) {
      107 +                mutex_enter(entry->dbe_lock);
      108 +                entry->dbe_invalid = TRUE;
      109 +                entry->dbe_skipsearch = TRUE;
      110 +                mutex_exit(entry->dbe_lock);
      111 +        } else {
      112 +                entry->dbe_invalid = TRUE;
      113 +                entry->dbe_skipsearch = TRUE;
      114 +        }
  91  115  }
  92  116  
  93  117  /*
  94  118   * Is this entry invalid?
  95  119   */
  96  120  bool_t
  97  121  rfs4_dbe_is_invalid(rfs4_dbe_t *entry)
  98  122  {
  99  123          return (entry->dbe_invalid);
 100  124  }
↓ open down ↓ 21 lines elided ↑ open up ↑
 122  146          rfs4_dbe_lock(entry);
 123  147          entry->dbe_skipsearch = FALSE;
 124  148          rfs4_dbe_unlock(entry);
 125  149  }
 126  150  
 127  151  void
 128  152  rfs4_dbe_rele(rfs4_dbe_t *entry)
 129  153  {
 130  154          mutex_enter(entry->dbe_lock);
 131  155          ASSERT(entry->dbe_refcnt > 1);
 132      -        atomic_dec_32(&entry->dbe_refcnt);
      156 +        entry->dbe_refcnt--;
 133  157          entry->dbe_time_rele = gethrestime_sec();
 134  158          mutex_exit(entry->dbe_lock);
 135  159  }
 136  160  
 137  161  void
 138  162  rfs4_dbe_lock(rfs4_dbe_t *entry)
 139  163  {
 140  164          mutex_enter(entry->dbe_lock);
 141  165  }
 142  166  
↓ open down ↓ 99 lines elided ↑ open up ↑
 242  266          for (next = db->db_tables; next; ) {
 243  267                  tmp = next;
 244  268                  next = tmp->dbt_tnext;
 245  269                  rfs4_table_destroy(db, tmp);
 246  270          }
 247  271  
 248  272          mutex_destroy(db->db_lock);
 249  273          kmem_free(db, sizeof (rfs4_database_t));
 250  274  }
 251  275  
      276 +/*
      277 + * Used to get the correct kmem_cache database for the state table being
      278 + * created.
      279 + * Helper function for rfs4_table_create
      280 + */
      281 +static kmem_cache_t *
      282 +get_db_mem_cache(char *name)
      283 +{
      284 +        int i;
      285 +
      286 +        for (i = 0; i < RFS4_DB_MEM_CACHE_NUM; i++) {
      287 +                if (strcmp(name, rfs4_db_mem_cache_table[i].r_db_name) == 0)
      288 +                        return (rfs4_db_mem_cache_table[i].r_db_mem_cache);
      289 +        }
      290 +        /*
      291 +         * There is no associated kmem cache for this NFS4 server state
      292 +         * table name
      293 +         */
      294 +        return (NULL);
      295 +}
      296 +
      297 +/*
      298 + * Used to initialize the global NFSv4 server state database.
      299 + * Helper funtion for rfs4_state_g_init and called when module is loaded.
      300 + */
      301 +kmem_cache_t *
      302 +/* CSTYLED */
      303 +nfs4_init_mem_cache(char *cache_name, uint32_t idxcnt, uint32_t size, uint32_t idx)
      304 +{
      305 +        kmem_cache_t *mem_cache = kmem_cache_create(cache_name,
      306 +            sizeof (rfs4_dbe_t) + idxcnt * sizeof (rfs4_link_t) + size,
      307 +            0,
      308 +            rfs4_dbe_kmem_constructor,
      309 +            rfs4_dbe_kmem_destructor,
      310 +            NULL,
      311 +            NULL,
      312 +            NULL,
      313 +            0);
      314 +        (void) strlcpy(rfs4_db_mem_cache_table[idx].r_db_name, cache_name,
      315 +            strlen(cache_name) + 1);
      316 +        rfs4_db_mem_cache_table[idx].r_db_mem_cache = mem_cache;
      317 +        return (mem_cache);
      318 +}
      319 +
 252  320  rfs4_table_t *
 253  321  rfs4_table_create(rfs4_database_t *db, char *tabname, time_t max_cache_time,
 254  322      uint32_t idxcnt, bool_t (*create)(rfs4_entry_t, void *),
 255  323      void (*destroy)(rfs4_entry_t),
 256  324      bool_t (*expiry)(rfs4_entry_t),
 257  325      uint32_t size, uint32_t hashsize,
 258  326      uint32_t maxentries, id_t start)
 259  327  {
 260  328          rfs4_table_t    *table;
 261  329          int              len;
↓ open down ↓ 35 lines elided ↑ open up ↑
 297  365          ASSERT(t_lowat != 0);
 298  366          table->dbt_id_lwat = (maxentries * t_lowat) / 100;
 299  367          ASSERT(t_hiwat != 0);
 300  368          table->dbt_id_hwat = (maxentries * t_hiwat) / 100;
 301  369          table->dbt_id_reap = MIN(rfs4_reap_interval, max_cache_time);
 302  370          table->dbt_maxentries = maxentries;
 303  371          table->dbt_create = create;
 304  372          table->dbt_destroy = destroy;
 305  373          table->dbt_expiry = expiry;
 306  374  
 307      -        table->dbt_mem_cache = kmem_cache_create(cache_name,
 308      -            sizeof (rfs4_dbe_t) + idxcnt * sizeof (rfs4_link_t) + size,
 309      -            0,
 310      -            rfs4_dbe_kmem_constructor,
 311      -            rfs4_dbe_kmem_destructor,
 312      -            NULL,
 313      -            table,
 314      -            NULL,
 315      -            0);
      375 +        /*
      376 +         * get the correct kmem_cache for this table type based on the name.
      377 +         */
      378 +        table->dbt_mem_cache = get_db_mem_cache(cache_name);
      379 +
 316  380          kmem_free(cache_name, len+13);
 317  381  
 318  382          table->dbt_debug = db->db_debug_flags;
 319  383  
 320  384          mutex_enter(db->db_lock);
 321  385          table->dbt_tnext = db->db_tables;
 322  386          db->db_tables = table;
 323  387          mutex_exit(db->db_lock);
 324  388  
 325  389          rfs4_start_reaper(table);
↓ open down ↓ 31 lines elided ↑ open up ↑
 357  421          }
 358  422  
 359  423          rw_destroy(table->dbt_t_lock);
 360  424          mutex_destroy(table->dbt_lock);
 361  425          mutex_destroy(&table->dbt_reaper_cv_lock);
 362  426          cv_destroy(&table->dbt_reaper_wait);
 363  427  
 364  428          kmem_free(table->dbt_name, strlen(table->dbt_name) + 1);
 365  429          if (table->dbt_id_space)
 366  430                  id_space_destroy(table->dbt_id_space);
 367      -        kmem_cache_destroy(table->dbt_mem_cache);
      431 +        table->dbt_mem_cache = NULL;
 368  432          kmem_free(table, sizeof (rfs4_table_t));
 369  433  }
 370  434  
 371  435  rfs4_index_t *
 372  436  rfs4_index_create(rfs4_table_t *table, char *keyname,
 373  437      uint32_t (*hash)(void *),
 374  438      bool_t (compare)(rfs4_entry_t, void *),
 375  439      void *(*mkkey)(rfs4_entry_t),
 376  440      bool_t createable)
 377  441  {
↓ open down ↓ 298 lines elided ↑ open up ↑
 676  740              (CE_NOTE, "Entry %p created for %s = %p in table %s",
 677  741              (void*)entry, idx->dbi_keyname, (void*)key, table->dbt_name));
 678  742  
 679  743          return (entry->dbe_data);
 680  744  }
 681  745  
 682  746  /*ARGSUSED*/
 683  747  boolean_t
 684  748  rfs4_cpr_callb(void *arg, int code)
 685  749  {
 686      -        rfs4_table_t *table = rfs4_client_tab;
 687  750          rfs4_bucket_t *buckets, *bp;
 688  751          rfs4_link_t *l;
 689  752          rfs4_client_t *cp;
 690  753          int i;
 691  754  
      755 +        nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
      756 +        rfs4_table_t *table = nsrv4->rfs4_client_tab;
      757 +
 692  758          /*
 693  759           * We get called for Suspend and Resume events.
 694  760           * For the suspend case we simply don't care!  Nor do we care if
 695  761           * there are no clients.
 696  762           */
 697  763          if (code == CB_CODE_CPR_CHKPT || table == NULL) {
 698  764                  return (B_TRUE);
 699  765          }
 700  766  
 701  767          buckets = table->dbt_indices->dbi_buckets;
↓ open down ↓ 66 lines elided ↑ open up ↑
 768  834          bool_t found;
 769  835          int i;
 770  836          int count = 0;
 771  837  
 772  838          NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
 773  839              (CE_NOTE, "Reaping %d entries older than %ld seconds in table %s",
 774  840              desired, cache_time, table->dbt_name));
 775  841  
 776  842          /* Walk the buckets looking for entries to release/destroy */
 777  843          for (i = 0; i < table->dbt_len; i++) {
      844 +                int retries = 0;
 778  845                  bp = &buckets[i];
 779  846                  do {
 780  847                          found = FALSE;
 781  848                          rw_enter(bp->dbk_lock, RW_READER);
 782  849                          for (l = bp->dbk_head; l; l = l->next) {
 783  850                                  entry = l->entry;
      851 +                                mutex_enter(entry->dbe_lock);
      852 +                                ASSERT(entry->dbe_refcnt != 0);
 784  853                                  /*
 785  854                                   * Examine an entry.  Ref count of 1 means
 786  855                                   * that the only reference is for the hash
 787  856                                   * table reference.
 788  857                                   */
 789      -                                if (entry->dbe_refcnt != 1)
      858 +                                if (entry->dbe_refcnt != 1) {
      859 +#ifdef DEBUG
      860 +                                        rfs4_dbe_debug(entry);
      861 +#endif
      862 +                                        mutex_exit(entry->dbe_lock);
 790  863                                          continue;
 791      -                                mutex_enter(entry->dbe_lock);
      864 +                                }
 792  865                                  if ((entry->dbe_refcnt == 1) &&
 793  866                                      (table->dbt_reaper_shutdown ||
 794  867                                      table->dbt_expiry == NULL ||
 795  868                                      (*table->dbt_expiry)(entry->dbe_data))) {
 796      -                                        entry->dbe_refcnt--;
      869 +                                        rfs4_dbe_rele_nolock(entry);
 797  870                                          count++;
 798  871                                          found = TRUE;
 799  872                                  }
 800  873                                  mutex_exit(entry->dbe_lock);
 801  874                          }
 802  875                          if (found) {
 803  876                                  if (!rw_tryupgrade(bp->dbk_lock)) {
 804  877                                          rw_exit(bp->dbk_lock);
 805  878                                          rw_enter(bp->dbk_lock, RW_WRITER);
 806  879                                  }
 807  880  
 808  881                                  l = bp->dbk_head;
 809  882                                  while (l) {
 810  883                                          t = l;
 811  884                                          entry = t->entry;
 812  885                                          l = l->next;
      886 +                                        mutex_enter(entry->dbe_lock);
 813  887                                          if (entry->dbe_refcnt == 0) {
 814  888                                                  DEQUEUE(bp->dbk_head, t);
      889 +                                                mutex_exit(entry->dbe_lock);
 815  890                                                  t->next = NULL;
 816  891                                                  t->prev = NULL;
 817  892                                                  INVALIDATE_ADDR(t->entry);
 818  893                                                  rfs4_dbe_destroy(entry);
 819      -                                        }
      894 +                                        } else
      895 +                                                mutex_exit(entry->dbe_lock);
 820  896                                  }
 821  897                          }
 822  898                          rw_exit(bp->dbk_lock);
 823  899                          /*
 824  900                           * delay slightly if there is more work to do
 825  901                           * with the expectation that other reaper
 826  902                           * threads are freeing data structures as well
 827  903                           * and in turn will reduce ref counts on
 828  904                           * entries in this table allowing them to be
 829  905                           * released.  This is only done in the
 830  906                           * instance that the tables are being shut down.
 831  907                           */
 832      -                        if (table->dbt_reaper_shutdown && bp->dbk_head != NULL)
      908 +                        if (table->dbt_reaper_shutdown && bp->dbk_head != NULL) {
 833  909                                  delay(hz/100);
      910 +                                retries++;
      911 +                        }
 834  912                  /*
 835  913                   * If this is a table shutdown, keep going until
 836  914                   * everything is gone
 837  915                   */
 838      -                } while (table->dbt_reaper_shutdown && bp->dbk_head != NULL);
      916 +                } while (table->dbt_reaper_shutdown && bp->dbk_head != NULL && retries < 5);
 839  917  
 840  918                  if (!table->dbt_reaper_shutdown && desired && count >= desired)
 841  919                          break;
 842  920          }
 843  921  
 844  922          NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
 845  923              (CE_NOTE, "Reaped %d entries older than %ld seconds in table %s",
 846  924              count, cache_time, table->dbt_name));
 847  925  }
 848  926  
↓ open down ↓ 23 lines elided ↑ open up ↑
 872  950          CALLB_CPR_EXIT(&table->dbt_reaper_cpr_info);
 873  951  
 874  952          NFS4_DEBUG(table->dbt_debug,
 875  953              (CE_NOTE, "rfs4_reaper_thread exiting for %s", table->dbt_name));
 876  954  
 877  955          /* Notify the database shutdown processing that the table is shutdown */
 878  956          mutex_enter(table->dbt_db->db_lock);
 879  957          table->dbt_db->db_shutdown_count--;
 880  958          cv_signal(&table->dbt_db->db_shutdown_wait);
 881  959          mutex_exit(table->dbt_db->db_lock);
      960 +        zthread_exit();
 882  961  }
 883  962  
 884  963  static void
 885  964  rfs4_start_reaper(rfs4_table_t *table)
 886  965  {
 887  966          if (table->dbt_max_cache_time == 0)
 888  967                  return;
 889  968  
 890      -        (void) thread_create(NULL, 0, reaper_thread, table, 0, &p0, TS_RUN,
      969 +        (void) zthread_create(NULL, 0, reaper_thread, table, 0,
 891  970              minclsyspri);
 892  971  }
 893  972  
 894  973  #ifdef DEBUG
 895  974  void
 896  975  rfs4_dbe_debug(rfs4_dbe_t *entry)
 897  976  {
 898  977          cmn_err(CE_NOTE, "Entry %p from table %s",
 899  978              (void *)entry, entry->dbe_table->dbt_name);
 900  979          cmn_err(CE_CONT, "\trefcnt = %d id = %d",
 901  980              entry->dbe_refcnt, entry->dbe_id);
 902  981  }
 903  982  #endif
    
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX