1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  25  * Copyright 2017, OmniTI Computer Consulting, Inc. All rights reserved.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/stream.h>
  30 #include <sys/strsun.h>
  31 #include <sys/zone.h>
  32 #include <sys/ddi.h>
  33 #include <sys/disp.h>
  34 #include <sys/sunddi.h>
  35 #include <sys/cmn_err.h>
  36 #include <sys/debug.h>
  37 #include <sys/atomic.h>
  38 #include <sys/callb.h>
  39 #define _SUN_TPI_VERSION 2
  40 #include <sys/tihdr.h>
  41 
  42 #include <inet/common.h>
  43 #include <inet/mi.h>
  44 #include <inet/mib2.h>
  45 #include <inet/snmpcom.h>
  46 
  47 #include <netinet/ip6.h>
  48 #include <netinet/icmp6.h>
  49 
  50 #include <inet/ip.h>
  51 #include <inet/ip_impl.h>
  52 #include <inet/ip6.h>
  53 #include <inet/ip6_asp.h>
  54 #include <inet/ip_multi.h>
  55 #include <inet/ip_if.h>
  56 #include <inet/ip_ire.h>
  57 #include <inet/ip_ftable.h>
  58 #include <inet/ip_rts.h>
  59 #include <inet/ip_ndp.h>
  60 #include <inet/ipclassifier.h>
  61 #include <inet/ip_listutils.h>
  62 
  63 #include <sys/sunddi.h>
  64 
  65 /*
  66  * Routines for handling destination cache entries.
  67  * There is always one DCEF_DEFAULT for each ip_stack_t created at init time.
  68  * That entry holds both the IP ident value and the dce generation number.
  69  *
  70  * Any time a DCE is changed significantly (different path MTU, but NOT
  71  * different ULP info!), the dce_generation number is increased.
  72  * Also, when a new DCE is created, the dce_generation number in the default
  73  * DCE is bumped. That allows the dce_t information to be cached efficiently
  74  * as long as the entity caching the dce_t also caches the dce_generation,
  75  * and compares the cached generation to detect any changes.
  76  * Furthermore, when a DCE is deleted, if there are any outstanding references
  77  * to the DCE it will be marked as condemned. The condemned mark is
  78  * a designated generation number which is never otherwise used, hence
  79  * the single comparison with the generation number captures that as well.
  80  *
  81  * An example of code which caches is as follows:
  82  *
  83  *      if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
  84  *              The DCE has changed
  85  *              mystruct->my_dce = dce_lookup_pkt(mp, ixa,
  86  *                  &mystruct->my_dce_generation);
  87  *              Not needed in practice, since we have the default DCE:
  88  *              if (DCE_IS_CONDEMNED(mystruct->my_dce))
  89  *                      return failure;
  90  *      }
  91  *
  92  * Note that for IPv6 link-local addresses we record the ifindex since the
  93  * link-locals are not globally unique.
  94  *
  95  * DCEs can remain for an arbitrarily long time, until memory pressure or
  96  * too-deep hash buckets (see dce_lookup_and_add*()) enable the reclaim thread
  97  * to actually remove DCEs from the cache.
  98  */
  99 
 100 /*
 101  * Hash bucket structure for DCEs
 102  */
 103 typedef struct dcb_s {
 104         krwlock_t       dcb_lock;
 105         uint32_t        dcb_cnt;
 106         dce_t           *dcb_dce;
 107 } dcb_t;
 108 
/* Private helpers; see definitions below. */
static void	dce_delete_locked(dcb_t *, dce_t *);
static void	dce_make_condemned(dce_t *);

/* kmem cache from which every dce_t (including per-stack defaults) comes */
static kmem_cache_t *dce_cache;
/* Single global reclaim thread and its shutdown handshake state */
static kthread_t *dce_reclaim_thread;
static kmutex_t dce_reclaim_lock;
static kcondvar_t dce_reclaim_cv;
static int dce_reclaim_shutdown;

/* Global so it can be tuned in /etc/system. This must be a power of two. */
uint_t ip_dce_hash_size = 1024;

/* The time in seconds between executions of the IP DCE reclaim worker. */
uint_t ip_dce_reclaim_interval = 60;

/* The factor of the DCE threshold at which to start hard reclaims */
uint_t ip_dce_reclaim_threshold_hard = 2;

/* Operates on a uint64_t */
#define RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48))
 129 
 130 /*
 131  * Reclaim a fraction of dce's in the dcb.
 132  * For now we have a higher probability to delete DCEs without DCE_PMTU.
 133  */
 134 static void
 135 dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
 136 {
 137         uint_t  fraction_pmtu = fraction*4;
 138         uint_t  hash;
 139         dce_t   *dce, *nextdce;
 140         hrtime_t seed = gethrtime();
 141         uint_t  retained = 0;
 142         uint_t  max = ipst->ips_ip_dce_reclaim_threshold;
 143 
 144         max *= ip_dce_reclaim_threshold_hard;
 145 
 146         rw_enter(&dcb->dcb_lock, RW_WRITER);
 147         for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
 148                 nextdce = dce->dce_next;
 149                 /* Clear DCEF_PMTU if the pmtu is too old */
 150                 mutex_enter(&dce->dce_lock);
 151                 if ((dce->dce_flags & DCEF_PMTU) &&
 152                     TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
 153                     ipst->ips_ip_pathmtu_interval) {
 154                         dce->dce_flags &= ~DCEF_PMTU;
 155                         mutex_exit(&dce->dce_lock);
 156                         dce_increment_generation(dce);
 157                 } else {
 158                         mutex_exit(&dce->dce_lock);
 159                 }
 160 
 161                 if (max == 0 || retained < max) {
 162                         hash = RANDOM_HASH((uint64_t)((uintptr_t)dce | seed));
 163 
 164                         if (dce->dce_flags & DCEF_PMTU) {
 165                                 if (hash % fraction_pmtu != 0) {
 166                                         retained++;
 167                                         continue;
 168                                 }
 169                         } else {
 170                                 if (hash % fraction != 0) {
 171                                         retained++;
 172                                         continue;
 173                                 }
 174                         }
 175                 }
 176 
 177                 IP_STAT(ipst, ip_dce_reclaim_deleted);
 178                 dce_delete_locked(dcb, dce);
 179                 dce_refrele(dce);
 180         }
 181         rw_exit(&dcb->dcb_lock);
 182 }
 183 
 184 /*
 185  * kmem_cache callback to free up memory.
 186  *
 187  */
 188 static void
 189 ip_dce_reclaim_stack(ip_stack_t *ipst)
 190 {
 191         int     i;
 192 
 193         IP_STAT(ipst, ip_dce_reclaim_calls);
 194         for (i = 0; i < ipst->ips_dce_hashsize; i++) {
 195                 dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
 196                     ipst->ips_ip_dce_reclaim_fraction);
 197 
 198                 dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
 199                     ipst->ips_ip_dce_reclaim_fraction);
 200         }
 201 
 202         /*
 203          * Walk all CONNs that can have a reference on an ire, nce or dce.
 204          * Get them to update any stale references to drop any refholds they
 205          * have.
 206          */
 207         ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
 208 }
 209 
 210 /*
 211  * Called by dce_reclaim_worker() below, and no one else.  Typically this will
 212  * mean that the number of entries in the hash buckets has exceeded a tunable
 213  * threshold.
 214  */
 215 static void
 216 ip_dce_reclaim(void)
 217 {
 218         netstack_handle_t nh;
 219         netstack_t *ns;
 220         ip_stack_t *ipst;
 221 
 222         ASSERT(curthread == dce_reclaim_thread);
 223 
 224         netstack_next_init(&nh);
 225         while ((ns = netstack_next(&nh)) != NULL) {
 226                 /*
 227                  * netstack_next() can return a netstack_t with a NULL
 228                  * netstack_ip at boot time.
 229                  */
 230                 if ((ipst = ns->netstack_ip) == NULL) {
 231                         netstack_rele(ns);
 232                         continue;
 233                 }
 234                 if (atomic_swap_uint(&ipst->ips_dce_reclaim_needed, 0) != 0)
 235                         ip_dce_reclaim_stack(ipst);
 236                 netstack_rele(ns);
 237         }
 238         netstack_next_fini(&nh);
 239 }
 240 
/* ARGSUSED */
/*
 * Body of the global reclaim thread started by dce_g_init().  Wakes up
 * every ip_dce_reclaim_interval seconds (or early via dce_reclaim_cv)
 * and runs ip_dce_reclaim().  CPR-safe while waiting.  On shutdown it
 * clears dce_reclaim_thread and broadcasts so dce_g_destroy() can
 * finish its handshake.
 */
static void
dce_reclaim_worker(void *arg)
{
	callb_cpr_t	cprinfo;

	CALLB_CPR_INIT(&cprinfo, &dce_reclaim_lock, callb_generic_cpr,
	    "dce_reclaim_worker");

	mutex_enter(&dce_reclaim_lock);
	while (!dce_reclaim_shutdown) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		(void) cv_timedwait(&dce_reclaim_cv, &dce_reclaim_lock,
		    ddi_get_lbolt() + ip_dce_reclaim_interval * hz);
		CALLB_CPR_SAFE_END(&cprinfo, &dce_reclaim_lock);

		if (dce_reclaim_shutdown)
			break;

		/* Drop the lock while doing the actual work */
		mutex_exit(&dce_reclaim_lock);
		ip_dce_reclaim();
		mutex_enter(&dce_reclaim_lock);
	}

	ASSERT(MUTEX_HELD(&dce_reclaim_lock));
	dce_reclaim_thread = NULL;
	dce_reclaim_shutdown = 0;
	cv_broadcast(&dce_reclaim_cv);
	CALLB_CPR_EXIT(&cprinfo);   /* drops the lock */

	thread_exit();
}
 273 
/*
 * Global (module-wide, not per-stack) DCE initialization: create the
 * dce kmem cache (no constructor/destructor/reclaim callbacks) and
 * start the single reclaim worker thread.  The lock and cv must exist
 * before the thread runs, hence the ordering below.
 */
void
dce_g_init(void)
{
	dce_cache = kmem_cache_create("dce_cache",
	    sizeof (dce_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	mutex_init(&dce_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dce_reclaim_cv, NULL, CV_DEFAULT, NULL);

	dce_reclaim_thread = thread_create(NULL, 0, dce_reclaim_worker,
	    NULL, 0, &p0, TS_RUN, minclsyspri);
}
 286 
/*
 * Global DCE teardown.  Signals the reclaim worker to exit and waits
 * until it has cleared dce_reclaim_thread (see dce_reclaim_worker()),
 * then destroys the synchronization objects and the kmem cache.
 */
void
dce_g_destroy(void)
{
	mutex_enter(&dce_reclaim_lock);
	dce_reclaim_shutdown = 1;
	cv_signal(&dce_reclaim_cv);
	/* The worker broadcasts on dce_reclaim_cv after NULLing the thread */
	while (dce_reclaim_thread != NULL)
		cv_wait(&dce_reclaim_cv, &dce_reclaim_lock);
	mutex_exit(&dce_reclaim_lock);

	cv_destroy(&dce_reclaim_cv);
	mutex_destroy(&dce_reclaim_lock);

	kmem_cache_destroy(dce_cache);
}
 302 
 303 /*
 304  * Allocate a default DCE and a hash table for per-IP address DCEs
 305  */
 306 void
 307 dce_stack_init(ip_stack_t *ipst)
 308 {
 309         int     i;
 310 
 311         ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
 312         bzero(ipst->ips_dce_default, sizeof (dce_t));
 313         ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
 314         ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
 315         ipst->ips_dce_default->dce_last_change_time =
 316             TICK_TO_SEC(ddi_get_lbolt64());
 317         ipst->ips_dce_default->dce_refcnt = 1;    /* Should never go away */
 318         ipst->ips_dce_default->dce_ipst = ipst;
 319 
 320         /* This must be a power of two since we are using IRE_ADDR_HASH macro */
 321         ipst->ips_dce_hashsize = ip_dce_hash_size;
 322         ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
 323             sizeof (dcb_t), KM_SLEEP);
 324         ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
 325             sizeof (dcb_t), KM_SLEEP);
 326         for (i = 0; i < ipst->ips_dce_hashsize; i++) {
 327                 rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
 328                     NULL);
 329                 rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
 330                     NULL);
 331         }
 332 }
 333 
 334 /*
 335  * Given a DCE hash bucket, unlink DCE entries from it. Some callers need
 336  * ifindex-specific matching, others don't. Don't overload ifindex to indicate
 337  * specificity, just indicate so explicitly.
 338  */
 339 static void
 340 dce_bucket_clean(dcb_t *dcb, boolean_t specific_ifindex, uint_t ifindex)
 341 {
 342         dce_t   *dce, *nextdce;
 343 
 344         rw_enter(&dcb->dcb_lock, RW_WRITER);
 345 
 346         for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
 347                 nextdce = dce->dce_next;
 348                 if ((!specific_ifindex) || dce->dce_ifindex == ifindex) {
 349                         dce_delete_locked(dcb, dce);
 350                         dce_refrele(dce);
 351                 }
 352         }
 353 
 354         rw_exit(&dcb->dcb_lock);
 355 }
 356 
/*
 * Per-stack DCE teardown: empty and destroy both hash tables and free
 * the default DCE.  Counterpart of dce_stack_init().
 */
void
dce_stack_destroy(ip_stack_t *ipst)
{
	int i;
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dce_bucket_clean(&ipst->ips_dce_hash_v4[i], B_FALSE, 0);
		rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
		dce_bucket_clean(&ipst->ips_dce_hash_v6[i], B_FALSE, 0);
		rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
	}
	kmem_free(ipst->ips_dce_hash_v4,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v4 = NULL;
	kmem_free(ipst->ips_dce_hash_v6,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v6 = NULL;
	ipst->ips_dce_hashsize = 0;

	/* Only the initial reference from dce_stack_init() may remain */
	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
	kmem_cache_free(dce_cache, ipst->ips_dce_default);
	ipst->ips_dce_default = NULL;
}
 379 
 380 /* When any DCE is good enough */
 381 dce_t *
 382 dce_get_default(ip_stack_t *ipst)
 383 {
 384         dce_t           *dce;
 385 
 386         dce = ipst->ips_dce_default;
 387         dce_refhold(dce);
 388         return (dce);
 389 }
 390 
 391 /*
 392  * Generic for IPv4 and IPv6.
 393  *
 394  * Used by callers that need to cache e.g., the datapath
 395  * Returns the generation number in the last argument.
 396  */
 397 dce_t *
 398 dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
 399 {
 400         if (ixa->ixa_flags & IXAF_IS_IPV4) {
 401                 /*
 402                  * If we have a source route we need to look for the final
 403                  * destination in the source route option.
 404                  */
 405                 ipaddr_t final_dst;
 406                 ipha_t *ipha = (ipha_t *)mp->b_rptr;
 407 
 408                 final_dst = ip_get_dst(ipha);
 409                 return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
 410         } else {
 411                 uint_t ifindex;
 412                 /*
 413                  * If we have a routing header we need to look for the final
 414                  * destination in the routing extension header.
 415                  */
 416                 in6_addr_t final_dst;
 417                 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
 418 
 419                 final_dst = ip_get_dst_v6(ip6h, mp, NULL);
 420                 ifindex = 0;
 421                 if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
 422                         ifindex = ixa->ixa_nce->nce_common->ncec_ill->
 423                             ill_phyint->phyint_ifindex;
 424                 }
 425                 return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
 426                     generationp));
 427         }
 428 }
 429 
 430 /*
 431  * Used by callers that need to cache e.g., the datapath
 432  * Returns the generation number in the last argument.
 433  */
 434 dce_t *
 435 dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
 436 {
 437         uint_t          hash;
 438         dcb_t           *dcb;
 439         dce_t           *dce;
 440 
 441         /* Set *generationp before dropping the lock(s) that allow additions */
 442         if (generationp != NULL)
 443                 *generationp = ipst->ips_dce_default->dce_generation;
 444 
 445         hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
 446         dcb = &ipst->ips_dce_hash_v4[hash];
 447         rw_enter(&dcb->dcb_lock, RW_READER);
 448         for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
 449                 if (dce->dce_v4addr == dst) {
 450                         mutex_enter(&dce->dce_lock);
 451                         if (!DCE_IS_CONDEMNED(dce)) {
 452                                 dce_refhold(dce);
 453                                 if (generationp != NULL)
 454                                         *generationp = dce->dce_generation;
 455                                 mutex_exit(&dce->dce_lock);
 456                                 rw_exit(&dcb->dcb_lock);
 457                                 return (dce);
 458                         }
 459                         mutex_exit(&dce->dce_lock);
 460                 }
 461         }
 462         rw_exit(&dcb->dcb_lock);
 463         /* Not found */
 464         dce = ipst->ips_dce_default;
 465         dce_refhold(dce);
 466         return (dce);
 467 }
 468 
 469 /*
 470  * Used by callers that need to cache e.g., the datapath
 471  * Returns the generation number in the last argument.
 472  * ifindex should only be set for link-locals
 473  */
 474 dce_t *
 475 dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
 476     uint_t *generationp)
 477 {
 478         uint_t          hash;
 479         dcb_t           *dcb;
 480         dce_t           *dce;
 481 
 482         /* Set *generationp before dropping the lock(s) that allow additions */
 483         if (generationp != NULL)
 484                 *generationp = ipst->ips_dce_default->dce_generation;
 485 
 486         hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
 487         dcb = &ipst->ips_dce_hash_v6[hash];
 488         rw_enter(&dcb->dcb_lock, RW_READER);
 489         for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
 490                 if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
 491                     dce->dce_ifindex == ifindex) {
 492                         mutex_enter(&dce->dce_lock);
 493                         if (!DCE_IS_CONDEMNED(dce)) {
 494                                 dce_refhold(dce);
 495                                 if (generationp != NULL)
 496                                         *generationp = dce->dce_generation;
 497                                 mutex_exit(&dce->dce_lock);
 498                                 rw_exit(&dcb->dcb_lock);
 499                                 return (dce);
 500                         }
 501                         mutex_exit(&dce->dce_lock);
 502                 }
 503         }
 504         rw_exit(&dcb->dcb_lock);
 505         /* Not found */
 506         dce = ipst->ips_dce_default;
 507         dce_refhold(dce);
 508         return (dce);
 509 }
 510 
 511 /*
 512  * Atomically looks for a non-default DCE, and if not found tries to create one.
 513  * If there is no memory it returns NULL.
 514  * When an entry is created we increase the generation number on
 515  * the default DCE so that conn_ip_output will detect there is a new DCE.
 516  */
 517 dce_t *
 518 dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
 519 {
 520         uint_t          hash;
 521         dcb_t           *dcb;
 522         dce_t           *dce;
 523 
 524         hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
 525         dcb = &ipst->ips_dce_hash_v4[hash];
 526         /*
 527          * Assuming that we get fairly even distribution across all of the
 528          * buckets, once one bucket is overly full, prune the whole cache.
 529          */
 530         if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
 531                 atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
 532         rw_enter(&dcb->dcb_lock, RW_WRITER);
 533         for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
 534                 if (dce->dce_v4addr == dst) {
 535                         mutex_enter(&dce->dce_lock);
 536                         if (!DCE_IS_CONDEMNED(dce)) {
 537                                 dce_refhold(dce);
 538                                 mutex_exit(&dce->dce_lock);
 539                                 rw_exit(&dcb->dcb_lock);
 540                                 return (dce);
 541                         }
 542                         mutex_exit(&dce->dce_lock);
 543                 }
 544         }
 545         dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
 546         if (dce == NULL) {
 547                 rw_exit(&dcb->dcb_lock);
 548                 return (NULL);
 549         }
 550         bzero(dce, sizeof (dce_t));
 551         dce->dce_ipst = ipst;        /* No netstack_hold */
 552         dce->dce_v4addr = dst;
 553         dce->dce_generation = DCE_GENERATION_INITIAL;
 554         dce->dce_ipversion = IPV4_VERSION;
 555         dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
 556         dce_refhold(dce);       /* For the hash list */
 557 
 558         /* Link into list */
 559         if (dcb->dcb_dce != NULL)
 560                 dcb->dcb_dce->dce_ptpn = &dce->dce_next;
 561         dce->dce_next = dcb->dcb_dce;
 562         dce->dce_ptpn = &dcb->dcb_dce;
 563         dcb->dcb_dce = dce;
 564         dce->dce_bucket = dcb;
 565         atomic_inc_32(&dcb->dcb_cnt);
 566         dce_refhold(dce);       /* For the caller */
 567         rw_exit(&dcb->dcb_lock);
 568 
 569         /* Initialize dce_ident to be different than for the last packet */
 570         dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
 571 
 572         dce_increment_generation(ipst->ips_dce_default);
 573         return (dce);
 574 }
 575 
 576 /*
 577  * Atomically looks for a non-default DCE, and if not found tries to create one.
 578  * If there is no memory it returns NULL.
 579  * When an entry is created we increase the generation number on
 580  * the default DCE so that conn_ip_output will detect there is a new DCE.
 581  * ifindex should only be used with link-local addresses.
 582  */
 583 dce_t *
 584 dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
 585 {
 586         uint_t          hash;
 587         dcb_t           *dcb;
 588         dce_t           *dce;
 589 
 590         /* We should not create entries for link-locals w/o an ifindex */
 591         ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);
 592 
 593         hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
 594         dcb = &ipst->ips_dce_hash_v6[hash];
 595         /*
 596          * Assuming that we get fairly even distribution across all of the
 597          * buckets, once one bucket is overly full, prune the whole cache.
 598          */
 599         if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
 600                 atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
 601         rw_enter(&dcb->dcb_lock, RW_WRITER);
 602         for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
 603                 if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
 604                     dce->dce_ifindex == ifindex) {
 605                         mutex_enter(&dce->dce_lock);
 606                         if (!DCE_IS_CONDEMNED(dce)) {
 607                                 dce_refhold(dce);
 608                                 mutex_exit(&dce->dce_lock);
 609                                 rw_exit(&dcb->dcb_lock);
 610                                 return (dce);
 611                         }
 612                         mutex_exit(&dce->dce_lock);
 613                 }
 614         }
 615 
 616         dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
 617         if (dce == NULL) {
 618                 rw_exit(&dcb->dcb_lock);
 619                 return (NULL);
 620         }
 621         bzero(dce, sizeof (dce_t));
 622         dce->dce_ipst = ipst;        /* No netstack_hold */
 623         dce->dce_v6addr = *dst;
 624         dce->dce_ifindex = ifindex;
 625         dce->dce_generation = DCE_GENERATION_INITIAL;
 626         dce->dce_ipversion = IPV6_VERSION;
 627         dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
 628         dce_refhold(dce);       /* For the hash list */
 629 
 630         /* Link into list */
 631         if (dcb->dcb_dce != NULL)
 632                 dcb->dcb_dce->dce_ptpn = &dce->dce_next;
 633         dce->dce_next = dcb->dcb_dce;
 634         dce->dce_ptpn = &dcb->dcb_dce;
 635         dcb->dcb_dce = dce;
 636         dce->dce_bucket = dcb;
 637         atomic_inc_32(&dcb->dcb_cnt);
 638         dce_refhold(dce);       /* For the caller */
 639         rw_exit(&dcb->dcb_lock);
 640 
 641         /* Initialize dce_ident to be different than for the last packet */
 642         dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
 643         dce_increment_generation(ipst->ips_dce_default);
 644         return (dce);
 645 }
 646 
 647 /*
 648  * Set/update uinfo. Creates a per-destination dce if none exists.
 649  *
 650  * Note that we do not bump the generation number here.
 651  * New connections will find the new uinfo.
 652  *
 653  * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
 654  */
 655 static void
 656 dce_setuinfo(dce_t *dce, iulp_t *uinfo)
 657 {
 658         /*
 659          * Update the round trip time estimate and/or the max frag size
 660          * and/or the slow start threshold.
 661          *
 662          * We serialize multiple advises using dce_lock.
 663          */
 664         mutex_enter(&dce->dce_lock);
 665         /* Gard against setting to zero */
 666         if (uinfo->iulp_rtt != 0) {
 667                 /*
 668                  * If there is no old cached values, initialize them
 669                  * conservatively.  Set them to be (1.5 * new value).
 670                  */
 671                 if (dce->dce_uinfo.iulp_rtt != 0) {
 672                         dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
 673                             uinfo->iulp_rtt) >> 1;
 674                 } else {
 675                         dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
 676                             (uinfo->iulp_rtt >> 1);
 677                 }
 678                 if (dce->dce_uinfo.iulp_rtt_sd != 0) {
 679                         dce->dce_uinfo.iulp_rtt_sd =
 680                             (dce->dce_uinfo.iulp_rtt_sd +
 681                             uinfo->iulp_rtt_sd) >> 1;
 682                 } else {
 683                         dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
 684                             (uinfo->iulp_rtt_sd >> 1);
 685                 }
 686         }
 687         if (uinfo->iulp_mtu != 0) {
 688                 if (dce->dce_flags & DCEF_PMTU) {
 689                         dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
 690                 } else {
 691                         dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
 692                         dce->dce_flags |= DCEF_PMTU;
 693                 }
 694                 dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
 695         }
 696         if (uinfo->iulp_ssthresh != 0) {
 697                 if (dce->dce_uinfo.iulp_ssthresh != 0)
 698                         dce->dce_uinfo.iulp_ssthresh =
 699                             (uinfo->iulp_ssthresh +
 700                             dce->dce_uinfo.iulp_ssthresh) >> 1;
 701                 else
 702                         dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
 703         }
 704         /* We have uinfo for sure */
 705         dce->dce_flags |= DCEF_UINFO;
 706         mutex_exit(&dce->dce_lock);
 707 }
 708 
 709 
 710 int
 711 dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
 712 {
 713         dce_t *dce;
 714 
 715         dce = dce_lookup_and_add_v4(dst, ipst);
 716         if (dce == NULL)
 717                 return (ENOMEM);
 718 
 719         dce_setuinfo(dce, uinfo);
 720         dce_refrele(dce);
 721         return (0);
 722 }
 723 
 724 int
 725 dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
 726     ip_stack_t *ipst)
 727 {
 728         dce_t *dce;
 729 
 730         dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
 731         if (dce == NULL)
 732                 return (ENOMEM);
 733 
 734         dce_setuinfo(dce, uinfo);
 735         dce_refrele(dce);
 736         return (0);
 737 }
 738 
 739 /* Common routine for IPv4 and IPv6 */
 740 int
 741 dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
 742     ip_stack_t *ipst)
 743 {
 744         ipaddr_t dst4;
 745 
 746         if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
 747                 IN6_V4MAPPED_TO_IPADDR(dst, dst4);
 748                 return (dce_update_uinfo_v4(dst4, uinfo, ipst));
 749         } else {
 750                 return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
 751         }
 752 }
 753 
/*
 * Mark a dce as condemned by setting its generation to the reserved
 * DCE_GENERATION_CONDEMNED value; cached users comparing generations
 * will see the entry as changed.  Must not already be condemned.
 */
static void
dce_make_condemned(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	mutex_enter(&dce->dce_lock);
	ASSERT(!DCE_IS_CONDEMNED(dce));
	dce->dce_generation = DCE_GENERATION_CONDEMNED;
	mutex_exit(&dce->dce_lock);
	/* Count how many condemned dces for kmem_cache callback */
	atomic_inc_32(&ipst->ips_num_dce_condemned);
}
 766 
 767 /*
 768  * Increment the generation avoiding the special condemned value
 769  */
 770 void
 771 dce_increment_generation(dce_t *dce)
 772 {
 773         uint_t generation;
 774 
 775         mutex_enter(&dce->dce_lock);
 776         if (!DCE_IS_CONDEMNED(dce)) {
 777                 generation = dce->dce_generation + 1;
 778                 if (generation == DCE_GENERATION_CONDEMNED)
 779                         generation = DCE_GENERATION_INITIAL;
 780                 ASSERT(generation != DCE_GENERATION_VERIFY);
 781                 dce->dce_generation = generation;
 782         }
 783         mutex_exit(&dce->dce_lock);
 784 }
 785 
 786 /*
 787  * Increment the generation number on all dces that have a path MTU and
 788  * the default DCE. Used when ill_mtu or ill_mc_mtu changes.
 789  */
 790 void
 791 dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
 792 {
 793         int             i;
 794         dcb_t           *dcb;
 795         dce_t           *dce;
 796 
 797         for (i = 0; i < ipst->ips_dce_hashsize; i++) {
 798                 if (isv6)
 799                         dcb = &ipst->ips_dce_hash_v6[i];
 800                 else
 801                         dcb = &ipst->ips_dce_hash_v4[i];
 802                 rw_enter(&dcb->dcb_lock, RW_WRITER);
 803                 for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
 804                         if (DCE_IS_CONDEMNED(dce))
 805                                 continue;
 806                         dce_increment_generation(dce);
 807                 }
 808                 rw_exit(&dcb->dcb_lock);
 809         }
 810         dce_increment_generation(ipst->ips_dce_default);
 811 }
 812 
 813 /*
 814  * Caller needs to do a dce_refrele since we can't do the
 815  * dce_refrele under dcb_lock.
 816  */
 817 static void
 818 dce_delete_locked(dcb_t *dcb, dce_t *dce)
 819 {
 820         dce->dce_bucket = NULL;
 821         *dce->dce_ptpn = dce->dce_next;
 822         if (dce->dce_next != NULL)
 823                 dce->dce_next->dce_ptpn = dce->dce_ptpn;
 824         dce->dce_ptpn = NULL;
 825         dce->dce_next = NULL;
 826         atomic_dec_32(&dcb->dcb_cnt);
 827         dce_make_condemned(dce);
 828 }
 829 
 830 static void
 831 dce_inactive(dce_t *dce)
 832 {
 833         ip_stack_t      *ipst = dce->dce_ipst;
 834 
 835         ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
 836         ASSERT(dce->dce_ptpn == NULL);
 837         ASSERT(dce->dce_bucket == NULL);
 838 
 839         /* Count how many condemned dces for kmem_cache callback */
 840         if (DCE_IS_CONDEMNED(dce))
 841                 atomic_dec_32(&ipst->ips_num_dce_condemned);
 842 
 843         kmem_cache_free(dce_cache, dce);
 844 }
 845 
 846 void
 847 dce_refrele(dce_t *dce)
 848 {
 849         ASSERT(dce->dce_refcnt != 0);
 850         if (atomic_dec_32_nv(&dce->dce_refcnt) == 0)
 851                 dce_inactive(dce);
 852 }
 853 
 854 void
 855 dce_refhold(dce_t *dce)
 856 {
 857         atomic_inc_32(&dce->dce_refcnt);
 858         ASSERT(dce->dce_refcnt != 0);
 859 }
 860 
 861 /* No tracing support yet hence the same as the above functions */
 862 void
 863 dce_refrele_notr(dce_t *dce)
 864 {
 865         ASSERT(dce->dce_refcnt != 0);
 866         if (atomic_dec_32_nv(&dce->dce_refcnt) == 0)
 867                 dce_inactive(dce);
 868 }
 869 
 870 void
 871 dce_refhold_notr(dce_t *dce)
 872 {
 873         atomic_inc_32(&dce->dce_refcnt);
 874         ASSERT(dce->dce_refcnt != 0);
 875 }
 876 
 877 /* Report both the IPv4 and IPv6 DCEs. */
 878 mblk_t *
 879 ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
 880 {
 881         struct opthdr           *optp;
 882         mblk_t                  *mp2ctl;
 883         dest_cache_entry_t      dest_cache;
 884         mblk_t                  *mp_tail = NULL;
 885         dce_t                   *dce;
 886         dcb_t                   *dcb;
 887         int                     i;
 888         uint64_t                current_time;
 889 
 890         current_time = TICK_TO_SEC(ddi_get_lbolt64());
 891 
 892         /*
 893          * make a copy of the original message
 894          */
 895         mp2ctl = copymsg(mpctl);
 896 
 897         /* First we do IPv4 entries */
 898         optp = (struct opthdr *)&mpctl->b_rptr[
 899             sizeof (struct T_optmgmt_ack)];
 900         optp->level = MIB2_IP;
 901         optp->name = EXPER_IP_DCE;
 902 
 903         for (i = 0; i < ipst->ips_dce_hashsize; i++) {
 904                 dcb = &ipst->ips_dce_hash_v4[i];
 905                 rw_enter(&dcb->dcb_lock, RW_READER);
 906                 for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
 907                         dest_cache.DestIpv4Address = dce->dce_v4addr;
 908                         dest_cache.DestFlags = dce->dce_flags;
 909                         if (dce->dce_flags & DCEF_PMTU)
 910                                 dest_cache.DestPmtu = dce->dce_pmtu;
 911                         else
 912                                 dest_cache.DestPmtu = 0;
 913                         dest_cache.DestIdent = dce->dce_ident;
 914                         dest_cache.DestIfindex = 0;
 915                         dest_cache.DestAge = current_time -
 916                             dce->dce_last_change_time;
 917                         if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
 918                             (char *)&dest_cache, (int)sizeof (dest_cache))) {
 919                                 ip1dbg(("ip_snmp_get_mib2_ip_dce: "
 920                                     "failed to allocate %u bytes\n",
 921                                     (uint_t)sizeof (dest_cache)));
 922                         }
 923                 }
 924                 rw_exit(&dcb->dcb_lock);
 925         }
 926         optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
 927         ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
 928             (int)optp->level, (int)optp->name, (int)optp->len));
 929         qreply(q, mpctl);
 930 
 931         if (mp2ctl == NULL) {
 932                 /* Copymsg failed above */
 933                 return (NULL);
 934         }
 935 
 936         /* Now for IPv6 */
 937         mpctl = mp2ctl;
 938         mp_tail = NULL;
 939         mp2ctl = copymsg(mpctl);
 940         optp = (struct opthdr *)&mpctl->b_rptr[
 941             sizeof (struct T_optmgmt_ack)];
 942         optp->level = MIB2_IP6;
 943         optp->name = EXPER_IP_DCE;
 944 
 945         for (i = 0; i < ipst->ips_dce_hashsize; i++) {
 946                 dcb = &ipst->ips_dce_hash_v6[i];
 947                 rw_enter(&dcb->dcb_lock, RW_READER);
 948                 for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
 949                         dest_cache.DestIpv6Address = dce->dce_v6addr;
 950                         dest_cache.DestFlags = dce->dce_flags;
 951                         if (dce->dce_flags & DCEF_PMTU)
 952                                 dest_cache.DestPmtu = dce->dce_pmtu;
 953                         else
 954                                 dest_cache.DestPmtu = 0;
 955                         dest_cache.DestIdent = dce->dce_ident;
 956                         if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
 957                                 dest_cache.DestIfindex = dce->dce_ifindex;
 958                         else
 959                                 dest_cache.DestIfindex = 0;
 960                         dest_cache.DestAge = current_time -
 961                             dce->dce_last_change_time;
 962                         if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
 963                             (char *)&dest_cache, (int)sizeof (dest_cache))) {
 964                                 ip1dbg(("ip_snmp_get_mib2_ip_dce: "
 965                                     "failed to allocate %u bytes\n",
 966                                     (uint_t)sizeof (dest_cache)));
 967                         }
 968                 }
 969                 rw_exit(&dcb->dcb_lock);
 970         }
 971         optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
 972         ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
 973             (int)optp->level, (int)optp->name, (int)optp->len));
 974         qreply(q, mpctl);
 975 
 976         return (mp2ctl);
 977 }
 978 
 979 /*
 980  * Remove IPv6 DCEs which refer to an ifindex that is going away.
 981  * This is not required for correctness, but it avoids netstat -d
 982  * showing stale stuff that will never be used.
 983  */
 984 void
 985 dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
 986 {
 987         uint_t  i;
 988 
 989         for (i = 0; i < ipst->ips_dce_hashsize; i++)
 990                 dce_bucket_clean(&ipst->ips_dce_hash_v6[i], B_TRUE, ifindex);
 991 }