Print this page
Warning about DCE lifetimes
copyright fix
gdamore's feedback
7185 IP DCEs leak from halted non-global zones
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/inet/ip/ip_dce.c
+++ new/usr/src/uts/common/inet/ip/ip_dce.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
↓ open down ↓ |
14 lines elided |
↑ open up ↑ |
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
25 + * Copyright 2017, OmniTI Computer Consulting, Inc. All rights reserved.
25 26 */
26 27
27 28 #include <sys/types.h>
28 29 #include <sys/stream.h>
29 30 #include <sys/strsun.h>
30 31 #include <sys/zone.h>
31 32 #include <sys/ddi.h>
32 33 #include <sys/disp.h>
33 34 #include <sys/sunddi.h>
34 35 #include <sys/cmn_err.h>
35 36 #include <sys/debug.h>
36 37 #include <sys/atomic.h>
37 38 #include <sys/callb.h>
38 39 #define _SUN_TPI_VERSION 2
39 40 #include <sys/tihdr.h>
40 41
41 42 #include <inet/common.h>
42 43 #include <inet/mi.h>
43 44 #include <inet/mib2.h>
44 45 #include <inet/snmpcom.h>
45 46
46 47 #include <netinet/ip6.h>
47 48 #include <netinet/icmp6.h>
48 49
49 50 #include <inet/ip.h>
50 51 #include <inet/ip_impl.h>
51 52 #include <inet/ip6.h>
52 53 #include <inet/ip6_asp.h>
53 54 #include <inet/ip_multi.h>
54 55 #include <inet/ip_if.h>
55 56 #include <inet/ip_ire.h>
56 57 #include <inet/ip_ftable.h>
57 58 #include <inet/ip_rts.h>
58 59 #include <inet/ip_ndp.h>
59 60 #include <inet/ipclassifier.h>
60 61 #include <inet/ip_listutils.h>
61 62
62 63 #include <sys/sunddi.h>
63 64
64 65 /*
65 66 * Routines for handling destination cache entries.
66 67 * There is always one DCEF_DEFAULT for each ip_stack_t created at init time.
67 68 * That entry holds both the IP ident value and the dce generation number.
68 69 *
69 70 * Any time a DCE is changed significantly (different path MTU, but NOT
70 71 * different ULP info!), the dce_generation number is increased.
71 72 * Also, when a new DCE is created, the dce_generation number in the default
72 73 * DCE is bumped. That allows the dce_t information to be cached efficiently
73 74 * as long as the entity caching the dce_t also caches the dce_generation,
74 75 * and compares the cached generation to detect any changes.
75 76 * Furthermore, when a DCE is deleted, if there are any outstanding references
76 77 * to the DCE it will be marked as condemned. The condemned mark is
77 78 * a designated generation number which is never otherwise used, hence
78 79 * the single comparison with the generation number captures that as well.
79 80 *
80 81 * An example of code which caches is as follows:
81 82 *
82 83 * if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
|
↓ open down ↓ |
48 lines elided |
↑ open up ↑ |
83 84 * The DCE has changed
84 85 * mystruct->my_dce = dce_lookup_pkt(mp, ixa,
85 86 * &mystruct->my_dce_generation);
86 87 * Not needed in practice, since we have the default DCE:
87 88 * if (DCE_IS_CONDEMNED(mystruct->my_dce))
88 89 * return failure;
89 90 * }
90 91 *
91 92 * Note that for IPv6 link-local addresses we record the ifindex since the
92 93 * link-locals are not globally unique.
94 + *
95 + * DCEs can remain for an arbitrarily long time, until memory pressure or
96 + * too-deep hash buckets (see dce_lookup_and_add*()) enable the reclaim thread
97 + * to actually remove DCEs from the cache.
93 98 */
94 99
95 100 /*
96 101 * Hash bucket structure for DCEs
97 102 */
98 103 typedef struct dcb_s {
99 104 krwlock_t dcb_lock;
100 105 uint32_t dcb_cnt;
101 106 dce_t *dcb_dce;
102 107 } dcb_t;
103 108
104 109 static void dce_delete_locked(dcb_t *, dce_t *);
105 110 static void dce_make_condemned(dce_t *);
106 111
107 112 static kmem_cache_t *dce_cache;
108 113 static kthread_t *dce_reclaim_thread;
109 114 static kmutex_t dce_reclaim_lock;
110 115 static kcondvar_t dce_reclaim_cv;
111 116 static int dce_reclaim_shutdown;
112 117
113 118 /* Global so it can be tuned in /etc/system. This must be a power of two. */
114 119 uint_t ip_dce_hash_size = 1024;
115 120
116 121 /* The time in seconds between executions of the IP DCE reclaim worker. */
117 122 uint_t ip_dce_reclaim_interval = 60;
118 123
119 124 /* The factor of the DCE threshold at which to start hard reclaims */
120 125 uint_t ip_dce_reclaim_threshold_hard = 2;
121 126
122 127 /* Operates on a uint64_t */
123 128 #define RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48))
124 129
125 130 /*
126 131 * Reclaim a fraction of dce's in the dcb.
127 132 * For now we have a higher probability to delete DCEs without DCE_PMTU.
128 133 */
129 134 static void
130 135 dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
131 136 {
132 137 uint_t fraction_pmtu = fraction*4;
133 138 uint_t hash;
134 139 dce_t *dce, *nextdce;
135 140 hrtime_t seed = gethrtime();
136 141 uint_t retained = 0;
137 142 uint_t max = ipst->ips_ip_dce_reclaim_threshold;
138 143
139 144 max *= ip_dce_reclaim_threshold_hard;
140 145
141 146 rw_enter(&dcb->dcb_lock, RW_WRITER);
142 147 for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
143 148 nextdce = dce->dce_next;
144 149 /* Clear DCEF_PMTU if the pmtu is too old */
145 150 mutex_enter(&dce->dce_lock);
146 151 if ((dce->dce_flags & DCEF_PMTU) &&
147 152 TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
148 153 ipst->ips_ip_pathmtu_interval) {
149 154 dce->dce_flags &= ~DCEF_PMTU;
150 155 mutex_exit(&dce->dce_lock);
151 156 dce_increment_generation(dce);
152 157 } else {
153 158 mutex_exit(&dce->dce_lock);
154 159 }
155 160
156 161 if (max == 0 || retained < max) {
157 162 hash = RANDOM_HASH((uint64_t)((uintptr_t)dce | seed));
158 163
159 164 if (dce->dce_flags & DCEF_PMTU) {
160 165 if (hash % fraction_pmtu != 0) {
161 166 retained++;
162 167 continue;
163 168 }
164 169 } else {
165 170 if (hash % fraction != 0) {
166 171 retained++;
167 172 continue;
168 173 }
169 174 }
170 175 }
171 176
172 177 IP_STAT(ipst, ip_dce_reclaim_deleted);
173 178 dce_delete_locked(dcb, dce);
174 179 dce_refrele(dce);
175 180 }
176 181 rw_exit(&dcb->dcb_lock);
177 182 }
178 183
179 184 /*
180 185 * kmem_cache callback to free up memory.
181 186 *
182 187 */
183 188 static void
184 189 ip_dce_reclaim_stack(ip_stack_t *ipst)
185 190 {
186 191 int i;
187 192
188 193 IP_STAT(ipst, ip_dce_reclaim_calls);
189 194 for (i = 0; i < ipst->ips_dce_hashsize; i++) {
190 195 dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
191 196 ipst->ips_ip_dce_reclaim_fraction);
192 197
193 198 dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
194 199 ipst->ips_ip_dce_reclaim_fraction);
195 200 }
196 201
197 202 /*
198 203 * Walk all CONNs that can have a reference on an ire, nce or dce.
199 204 * Get them to update any stale references to drop any refholds they
200 205 * have.
201 206 */
202 207 ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
203 208 }
204 209
205 210 /*
206 211 * Called by dce_reclaim_worker() below, and no one else. Typically this will
207 212 * mean that the number of entries in the hash buckets has exceeded a tunable
208 213 * threshold.
209 214 */
210 215 static void
211 216 ip_dce_reclaim(void)
212 217 {
213 218 netstack_handle_t nh;
214 219 netstack_t *ns;
215 220 ip_stack_t *ipst;
216 221
217 222 ASSERT(curthread == dce_reclaim_thread);
218 223
219 224 netstack_next_init(&nh);
220 225 while ((ns = netstack_next(&nh)) != NULL) {
221 226 /*
222 227 * netstack_next() can return a netstack_t with a NULL
223 228 * netstack_ip at boot time.
224 229 */
225 230 if ((ipst = ns->netstack_ip) == NULL) {
226 231 netstack_rele(ns);
227 232 continue;
228 233 }
229 234 if (atomic_swap_uint(&ipst->ips_dce_reclaim_needed, 0) != 0)
230 235 ip_dce_reclaim_stack(ipst);
231 236 netstack_rele(ns);
232 237 }
233 238 netstack_next_fini(&nh);
234 239 }
235 240
236 241 /* ARGSUSED */
237 242 static void
238 243 dce_reclaim_worker(void *arg)
239 244 {
240 245 callb_cpr_t cprinfo;
241 246
242 247 CALLB_CPR_INIT(&cprinfo, &dce_reclaim_lock, callb_generic_cpr,
243 248 "dce_reclaim_worker");
244 249
245 250 mutex_enter(&dce_reclaim_lock);
246 251 while (!dce_reclaim_shutdown) {
247 252 CALLB_CPR_SAFE_BEGIN(&cprinfo);
248 253 (void) cv_timedwait(&dce_reclaim_cv, &dce_reclaim_lock,
249 254 ddi_get_lbolt() + ip_dce_reclaim_interval * hz);
250 255 CALLB_CPR_SAFE_END(&cprinfo, &dce_reclaim_lock);
251 256
252 257 if (dce_reclaim_shutdown)
253 258 break;
254 259
255 260 mutex_exit(&dce_reclaim_lock);
256 261 ip_dce_reclaim();
257 262 mutex_enter(&dce_reclaim_lock);
258 263 }
259 264
260 265 ASSERT(MUTEX_HELD(&dce_reclaim_lock));
261 266 dce_reclaim_thread = NULL;
262 267 dce_reclaim_shutdown = 0;
263 268 cv_broadcast(&dce_reclaim_cv);
264 269 CALLB_CPR_EXIT(&cprinfo); /* drops the lock */
265 270
266 271 thread_exit();
267 272 }
268 273
269 274 void
270 275 dce_g_init(void)
271 276 {
272 277 dce_cache = kmem_cache_create("dce_cache",
273 278 sizeof (dce_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
274 279
275 280 mutex_init(&dce_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
276 281 cv_init(&dce_reclaim_cv, NULL, CV_DEFAULT, NULL);
277 282
278 283 dce_reclaim_thread = thread_create(NULL, 0, dce_reclaim_worker,
279 284 NULL, 0, &p0, TS_RUN, minclsyspri);
280 285 }
281 286
282 287 void
283 288 dce_g_destroy(void)
284 289 {
285 290 mutex_enter(&dce_reclaim_lock);
286 291 dce_reclaim_shutdown = 1;
287 292 cv_signal(&dce_reclaim_cv);
288 293 while (dce_reclaim_thread != NULL)
289 294 cv_wait(&dce_reclaim_cv, &dce_reclaim_lock);
290 295 mutex_exit(&dce_reclaim_lock);
291 296
292 297 cv_destroy(&dce_reclaim_cv);
293 298 mutex_destroy(&dce_reclaim_lock);
294 299
295 300 kmem_cache_destroy(dce_cache);
296 301 }
297 302
298 303 /*
299 304 * Allocate a default DCE and a hash table for per-IP address DCEs
300 305 */
301 306 void
302 307 dce_stack_init(ip_stack_t *ipst)
303 308 {
304 309 int i;
305 310
306 311 ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
307 312 bzero(ipst->ips_dce_default, sizeof (dce_t));
308 313 ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
309 314 ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
310 315 ipst->ips_dce_default->dce_last_change_time =
311 316 TICK_TO_SEC(ddi_get_lbolt64());
312 317 ipst->ips_dce_default->dce_refcnt = 1; /* Should never go away */
313 318 ipst->ips_dce_default->dce_ipst = ipst;
314 319
315 320 /* This must be a power of two since we are using IRE_ADDR_HASH macro */
316 321 ipst->ips_dce_hashsize = ip_dce_hash_size;
317 322 ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
318 323 sizeof (dcb_t), KM_SLEEP);
|
↓ open down ↓ |
216 lines elided |
↑ open up ↑ |
319 324 ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
320 325 sizeof (dcb_t), KM_SLEEP);
321 326 for (i = 0; i < ipst->ips_dce_hashsize; i++) {
322 327 rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
323 328 NULL);
324 329 rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
325 330 NULL);
326 331 }
327 332 }
328 333
334 +/*
335 + * Given a DCE hash bucket, unlink DCE entries from it. Some callers need
336 + * ifindex-specific matching, others don't. Don't overload ifindex to indicate
337 + * specificity, just indicate so explicitly.
338 + */
339 +static void
340 +dce_bucket_clean(dcb_t *dcb, boolean_t specific_ifindex, uint_t ifindex)
341 +{
342 + dce_t *dce, *nextdce;
343 +
344 + rw_enter(&dcb->dcb_lock, RW_WRITER);
345 +
346 + for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
347 + nextdce = dce->dce_next;
348 + if ((!specific_ifindex) || dce->dce_ifindex == ifindex) {
349 + dce_delete_locked(dcb, dce);
350 + dce_refrele(dce);
351 + }
352 + }
353 +
354 + rw_exit(&dcb->dcb_lock);
355 +}
356 +
329 357 void
330 358 dce_stack_destroy(ip_stack_t *ipst)
331 359 {
332 360 int i;
333 361 for (i = 0; i < ipst->ips_dce_hashsize; i++) {
362 + dce_bucket_clean(&ipst->ips_dce_hash_v4[i], B_FALSE, 0);
334 363 rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
364 + dce_bucket_clean(&ipst->ips_dce_hash_v6[i], B_FALSE, 0);
335 365 rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
336 366 }
337 367 kmem_free(ipst->ips_dce_hash_v4,
338 368 ipst->ips_dce_hashsize * sizeof (dcb_t));
339 369 ipst->ips_dce_hash_v4 = NULL;
340 370 kmem_free(ipst->ips_dce_hash_v6,
341 371 ipst->ips_dce_hashsize * sizeof (dcb_t));
342 372 ipst->ips_dce_hash_v6 = NULL;
343 373 ipst->ips_dce_hashsize = 0;
344 374
345 375 ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
346 376 kmem_cache_free(dce_cache, ipst->ips_dce_default);
347 377 ipst->ips_dce_default = NULL;
348 378 }
349 379
350 380 /* When any DCE is good enough */
351 381 dce_t *
352 382 dce_get_default(ip_stack_t *ipst)
353 383 {
354 384 dce_t *dce;
355 385
356 386 dce = ipst->ips_dce_default;
357 387 dce_refhold(dce);
358 388 return (dce);
359 389 }
360 390
361 391 /*
362 392 * Generic for IPv4 and IPv6.
363 393 *
364 394 * Used by callers that need to cache e.g., the datapath
365 395 * Returns the generation number in the last argument.
366 396 */
367 397 dce_t *
368 398 dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
369 399 {
370 400 if (ixa->ixa_flags & IXAF_IS_IPV4) {
371 401 /*
372 402 * If we have a source route we need to look for the final
373 403 * destination in the source route option.
374 404 */
375 405 ipaddr_t final_dst;
376 406 ipha_t *ipha = (ipha_t *)mp->b_rptr;
377 407
378 408 final_dst = ip_get_dst(ipha);
379 409 return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
380 410 } else {
381 411 uint_t ifindex;
382 412 /*
383 413 * If we have a routing header we need to look for the final
384 414 * destination in the routing extension header.
385 415 */
386 416 in6_addr_t final_dst;
387 417 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
388 418
389 419 final_dst = ip_get_dst_v6(ip6h, mp, NULL);
390 420 ifindex = 0;
391 421 if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
392 422 ifindex = ixa->ixa_nce->nce_common->ncec_ill->
393 423 ill_phyint->phyint_ifindex;
394 424 }
395 425 return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
396 426 generationp));
397 427 }
398 428 }
399 429
400 430 /*
401 431 * Used by callers that need to cache e.g., the datapath
402 432 * Returns the generation number in the last argument.
403 433 */
404 434 dce_t *
405 435 dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
406 436 {
407 437 uint_t hash;
408 438 dcb_t *dcb;
409 439 dce_t *dce;
410 440
411 441 /* Set *generationp before dropping the lock(s) that allow additions */
412 442 if (generationp != NULL)
413 443 *generationp = ipst->ips_dce_default->dce_generation;
414 444
415 445 hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
416 446 dcb = &ipst->ips_dce_hash_v4[hash];
417 447 rw_enter(&dcb->dcb_lock, RW_READER);
418 448 for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
419 449 if (dce->dce_v4addr == dst) {
420 450 mutex_enter(&dce->dce_lock);
421 451 if (!DCE_IS_CONDEMNED(dce)) {
422 452 dce_refhold(dce);
423 453 if (generationp != NULL)
424 454 *generationp = dce->dce_generation;
425 455 mutex_exit(&dce->dce_lock);
426 456 rw_exit(&dcb->dcb_lock);
427 457 return (dce);
428 458 }
429 459 mutex_exit(&dce->dce_lock);
430 460 }
431 461 }
432 462 rw_exit(&dcb->dcb_lock);
433 463 /* Not found */
434 464 dce = ipst->ips_dce_default;
435 465 dce_refhold(dce);
436 466 return (dce);
437 467 }
438 468
439 469 /*
440 470 * Used by callers that need to cache e.g., the datapath
441 471 * Returns the generation number in the last argument.
442 472 * ifindex should only be set for link-locals
443 473 */
444 474 dce_t *
445 475 dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
446 476 uint_t *generationp)
447 477 {
448 478 uint_t hash;
449 479 dcb_t *dcb;
450 480 dce_t *dce;
451 481
452 482 /* Set *generationp before dropping the lock(s) that allow additions */
453 483 if (generationp != NULL)
454 484 *generationp = ipst->ips_dce_default->dce_generation;
455 485
456 486 hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
457 487 dcb = &ipst->ips_dce_hash_v6[hash];
458 488 rw_enter(&dcb->dcb_lock, RW_READER);
459 489 for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
460 490 if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
461 491 dce->dce_ifindex == ifindex) {
462 492 mutex_enter(&dce->dce_lock);
463 493 if (!DCE_IS_CONDEMNED(dce)) {
464 494 dce_refhold(dce);
465 495 if (generationp != NULL)
466 496 *generationp = dce->dce_generation;
467 497 mutex_exit(&dce->dce_lock);
468 498 rw_exit(&dcb->dcb_lock);
469 499 return (dce);
470 500 }
471 501 mutex_exit(&dce->dce_lock);
472 502 }
473 503 }
474 504 rw_exit(&dcb->dcb_lock);
475 505 /* Not found */
476 506 dce = ipst->ips_dce_default;
477 507 dce_refhold(dce);
478 508 return (dce);
479 509 }
480 510
481 511 /*
482 512 * Atomically looks for a non-default DCE, and if not found tries to create one.
483 513 * If there is no memory it returns NULL.
484 514 * When an entry is created we increase the generation number on
485 515 * the default DCE so that conn_ip_output will detect there is a new DCE.
486 516 */
487 517 dce_t *
488 518 dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
489 519 {
490 520 uint_t hash;
491 521 dcb_t *dcb;
492 522 dce_t *dce;
493 523
494 524 hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
495 525 dcb = &ipst->ips_dce_hash_v4[hash];
496 526 /*
497 527 * Assuming that we get fairly even distribution across all of the
498 528 * buckets, once one bucket is overly full, prune the whole cache.
499 529 */
500 530 if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
501 531 atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
502 532 rw_enter(&dcb->dcb_lock, RW_WRITER);
503 533 for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
504 534 if (dce->dce_v4addr == dst) {
505 535 mutex_enter(&dce->dce_lock);
506 536 if (!DCE_IS_CONDEMNED(dce)) {
507 537 dce_refhold(dce);
508 538 mutex_exit(&dce->dce_lock);
509 539 rw_exit(&dcb->dcb_lock);
510 540 return (dce);
511 541 }
512 542 mutex_exit(&dce->dce_lock);
513 543 }
514 544 }
515 545 dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
516 546 if (dce == NULL) {
517 547 rw_exit(&dcb->dcb_lock);
518 548 return (NULL);
519 549 }
520 550 bzero(dce, sizeof (dce_t));
521 551 dce->dce_ipst = ipst; /* No netstack_hold */
522 552 dce->dce_v4addr = dst;
523 553 dce->dce_generation = DCE_GENERATION_INITIAL;
524 554 dce->dce_ipversion = IPV4_VERSION;
525 555 dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
526 556 dce_refhold(dce); /* For the hash list */
527 557
528 558 /* Link into list */
529 559 if (dcb->dcb_dce != NULL)
530 560 dcb->dcb_dce->dce_ptpn = &dce->dce_next;
531 561 dce->dce_next = dcb->dcb_dce;
532 562 dce->dce_ptpn = &dcb->dcb_dce;
533 563 dcb->dcb_dce = dce;
534 564 dce->dce_bucket = dcb;
535 565 atomic_inc_32(&dcb->dcb_cnt);
536 566 dce_refhold(dce); /* For the caller */
537 567 rw_exit(&dcb->dcb_lock);
538 568
539 569 /* Initialize dce_ident to be different than for the last packet */
540 570 dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
541 571
542 572 dce_increment_generation(ipst->ips_dce_default);
543 573 return (dce);
544 574 }
545 575
546 576 /*
547 577 * Atomically looks for a non-default DCE, and if not found tries to create one.
548 578 * If there is no memory it returns NULL.
549 579 * When an entry is created we increase the generation number on
550 580 * the default DCE so that conn_ip_output will detect there is a new DCE.
551 581 * ifindex should only be used with link-local addresses.
552 582 */
553 583 dce_t *
554 584 dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
555 585 {
556 586 uint_t hash;
557 587 dcb_t *dcb;
558 588 dce_t *dce;
559 589
560 590 /* We should not create entries for link-locals w/o an ifindex */
561 591 ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);
562 592
563 593 hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
564 594 dcb = &ipst->ips_dce_hash_v6[hash];
565 595 /*
566 596 * Assuming that we get fairly even distribution across all of the
567 597 * buckets, once one bucket is overly full, prune the whole cache.
568 598 */
569 599 if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
570 600 atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
571 601 rw_enter(&dcb->dcb_lock, RW_WRITER);
572 602 for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
573 603 if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
574 604 dce->dce_ifindex == ifindex) {
575 605 mutex_enter(&dce->dce_lock);
576 606 if (!DCE_IS_CONDEMNED(dce)) {
577 607 dce_refhold(dce);
578 608 mutex_exit(&dce->dce_lock);
579 609 rw_exit(&dcb->dcb_lock);
580 610 return (dce);
581 611 }
582 612 mutex_exit(&dce->dce_lock);
583 613 }
584 614 }
585 615
586 616 dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
587 617 if (dce == NULL) {
588 618 rw_exit(&dcb->dcb_lock);
589 619 return (NULL);
590 620 }
591 621 bzero(dce, sizeof (dce_t));
592 622 dce->dce_ipst = ipst; /* No netstack_hold */
593 623 dce->dce_v6addr = *dst;
594 624 dce->dce_ifindex = ifindex;
595 625 dce->dce_generation = DCE_GENERATION_INITIAL;
596 626 dce->dce_ipversion = IPV6_VERSION;
597 627 dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
598 628 dce_refhold(dce); /* For the hash list */
599 629
600 630 /* Link into list */
601 631 if (dcb->dcb_dce != NULL)
602 632 dcb->dcb_dce->dce_ptpn = &dce->dce_next;
603 633 dce->dce_next = dcb->dcb_dce;
604 634 dce->dce_ptpn = &dcb->dcb_dce;
605 635 dcb->dcb_dce = dce;
606 636 dce->dce_bucket = dcb;
607 637 atomic_inc_32(&dcb->dcb_cnt);
608 638 dce_refhold(dce); /* For the caller */
609 639 rw_exit(&dcb->dcb_lock);
610 640
611 641 /* Initialize dce_ident to be different than for the last packet */
612 642 dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
613 643 dce_increment_generation(ipst->ips_dce_default);
614 644 return (dce);
615 645 }
616 646
617 647 /*
618 648 * Set/update uinfo. Creates a per-destination dce if none exists.
619 649 *
620 650 * Note that we do not bump the generation number here.
621 651 * New connections will find the new uinfo.
622 652 *
623 653 * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
624 654 */
625 655 static void
626 656 dce_setuinfo(dce_t *dce, iulp_t *uinfo)
627 657 {
628 658 /*
629 659 * Update the round trip time estimate and/or the max frag size
630 660 * and/or the slow start threshold.
631 661 *
632 662 * We serialize multiple advises using dce_lock.
633 663 */
634 664 mutex_enter(&dce->dce_lock);
 635 665 	/* Guard against setting to zero */
636 666 if (uinfo->iulp_rtt != 0) {
637 667 /*
 638 668 	 * If there are no old cached values, initialize them
639 669 * conservatively. Set them to be (1.5 * new value).
640 670 */
641 671 if (dce->dce_uinfo.iulp_rtt != 0) {
642 672 dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
643 673 uinfo->iulp_rtt) >> 1;
644 674 } else {
645 675 dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
646 676 (uinfo->iulp_rtt >> 1);
647 677 }
648 678 if (dce->dce_uinfo.iulp_rtt_sd != 0) {
649 679 dce->dce_uinfo.iulp_rtt_sd =
650 680 (dce->dce_uinfo.iulp_rtt_sd +
651 681 uinfo->iulp_rtt_sd) >> 1;
652 682 } else {
653 683 dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
654 684 (uinfo->iulp_rtt_sd >> 1);
655 685 }
656 686 }
657 687 if (uinfo->iulp_mtu != 0) {
658 688 if (dce->dce_flags & DCEF_PMTU) {
659 689 dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
660 690 } else {
661 691 dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
662 692 dce->dce_flags |= DCEF_PMTU;
663 693 }
664 694 dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
665 695 }
666 696 if (uinfo->iulp_ssthresh != 0) {
667 697 if (dce->dce_uinfo.iulp_ssthresh != 0)
668 698 dce->dce_uinfo.iulp_ssthresh =
669 699 (uinfo->iulp_ssthresh +
670 700 dce->dce_uinfo.iulp_ssthresh) >> 1;
671 701 else
672 702 dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
673 703 }
674 704 /* We have uinfo for sure */
675 705 dce->dce_flags |= DCEF_UINFO;
676 706 mutex_exit(&dce->dce_lock);
677 707 }
678 708
679 709
680 710 int
681 711 dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
682 712 {
683 713 dce_t *dce;
684 714
685 715 dce = dce_lookup_and_add_v4(dst, ipst);
686 716 if (dce == NULL)
687 717 return (ENOMEM);
688 718
689 719 dce_setuinfo(dce, uinfo);
690 720 dce_refrele(dce);
691 721 return (0);
692 722 }
693 723
694 724 int
695 725 dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
696 726 ip_stack_t *ipst)
697 727 {
698 728 dce_t *dce;
699 729
700 730 dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
701 731 if (dce == NULL)
702 732 return (ENOMEM);
703 733
704 734 dce_setuinfo(dce, uinfo);
705 735 dce_refrele(dce);
706 736 return (0);
707 737 }
708 738
709 739 /* Common routine for IPv4 and IPv6 */
710 740 int
711 741 dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
712 742 ip_stack_t *ipst)
713 743 {
714 744 ipaddr_t dst4;
715 745
716 746 if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
717 747 IN6_V4MAPPED_TO_IPADDR(dst, dst4);
718 748 return (dce_update_uinfo_v4(dst4, uinfo, ipst));
719 749 } else {
720 750 return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
721 751 }
722 752 }
723 753
724 754 static void
725 755 dce_make_condemned(dce_t *dce)
726 756 {
727 757 ip_stack_t *ipst = dce->dce_ipst;
728 758
729 759 mutex_enter(&dce->dce_lock);
730 760 ASSERT(!DCE_IS_CONDEMNED(dce));
731 761 dce->dce_generation = DCE_GENERATION_CONDEMNED;
732 762 mutex_exit(&dce->dce_lock);
733 763 /* Count how many condemned dces for kmem_cache callback */
734 764 atomic_inc_32(&ipst->ips_num_dce_condemned);
735 765 }
736 766
737 767 /*
738 768 * Increment the generation avoiding the special condemned value
739 769 */
740 770 void
741 771 dce_increment_generation(dce_t *dce)
742 772 {
743 773 uint_t generation;
744 774
745 775 mutex_enter(&dce->dce_lock);
746 776 if (!DCE_IS_CONDEMNED(dce)) {
747 777 generation = dce->dce_generation + 1;
748 778 if (generation == DCE_GENERATION_CONDEMNED)
749 779 generation = DCE_GENERATION_INITIAL;
750 780 ASSERT(generation != DCE_GENERATION_VERIFY);
751 781 dce->dce_generation = generation;
752 782 }
753 783 mutex_exit(&dce->dce_lock);
754 784 }
755 785
756 786 /*
757 787 * Increment the generation number on all dces that have a path MTU and
758 788 * the default DCE. Used when ill_mtu or ill_mc_mtu changes.
759 789 */
760 790 void
761 791 dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
762 792 {
763 793 int i;
764 794 dcb_t *dcb;
765 795 dce_t *dce;
766 796
767 797 for (i = 0; i < ipst->ips_dce_hashsize; i++) {
768 798 if (isv6)
769 799 dcb = &ipst->ips_dce_hash_v6[i];
770 800 else
771 801 dcb = &ipst->ips_dce_hash_v4[i];
772 802 rw_enter(&dcb->dcb_lock, RW_WRITER);
773 803 for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
774 804 if (DCE_IS_CONDEMNED(dce))
775 805 continue;
776 806 dce_increment_generation(dce);
777 807 }
778 808 rw_exit(&dcb->dcb_lock);
779 809 }
780 810 dce_increment_generation(ipst->ips_dce_default);
781 811 }
782 812
783 813 /*
784 814 * Caller needs to do a dce_refrele since we can't do the
785 815 * dce_refrele under dcb_lock.
786 816 */
787 817 static void
788 818 dce_delete_locked(dcb_t *dcb, dce_t *dce)
789 819 {
790 820 dce->dce_bucket = NULL;
791 821 *dce->dce_ptpn = dce->dce_next;
792 822 if (dce->dce_next != NULL)
793 823 dce->dce_next->dce_ptpn = dce->dce_ptpn;
794 824 dce->dce_ptpn = NULL;
795 825 dce->dce_next = NULL;
796 826 atomic_dec_32(&dcb->dcb_cnt);
797 827 dce_make_condemned(dce);
798 828 }
799 829
800 830 static void
801 831 dce_inactive(dce_t *dce)
802 832 {
803 833 ip_stack_t *ipst = dce->dce_ipst;
804 834
805 835 ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
806 836 ASSERT(dce->dce_ptpn == NULL);
807 837 ASSERT(dce->dce_bucket == NULL);
808 838
809 839 /* Count how many condemned dces for kmem_cache callback */
810 840 if (DCE_IS_CONDEMNED(dce))
811 841 atomic_dec_32(&ipst->ips_num_dce_condemned);
812 842
813 843 kmem_cache_free(dce_cache, dce);
814 844 }
815 845
816 846 void
817 847 dce_refrele(dce_t *dce)
818 848 {
819 849 ASSERT(dce->dce_refcnt != 0);
820 850 if (atomic_dec_32_nv(&dce->dce_refcnt) == 0)
821 851 dce_inactive(dce);
822 852 }
823 853
824 854 void
825 855 dce_refhold(dce_t *dce)
826 856 {
827 857 atomic_inc_32(&dce->dce_refcnt);
828 858 ASSERT(dce->dce_refcnt != 0);
829 859 }
830 860
831 861 /* No tracing support yet hence the same as the above functions */
832 862 void
833 863 dce_refrele_notr(dce_t *dce)
834 864 {
835 865 ASSERT(dce->dce_refcnt != 0);
836 866 if (atomic_dec_32_nv(&dce->dce_refcnt) == 0)
837 867 dce_inactive(dce);
838 868 }
839 869
840 870 void
841 871 dce_refhold_notr(dce_t *dce)
842 872 {
843 873 atomic_inc_32(&dce->dce_refcnt);
844 874 ASSERT(dce->dce_refcnt != 0);
845 875 }
846 876
847 877 /* Report both the IPv4 and IPv6 DCEs. */
848 878 mblk_t *
849 879 ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
850 880 {
851 881 struct opthdr *optp;
852 882 mblk_t *mp2ctl;
853 883 dest_cache_entry_t dest_cache;
854 884 mblk_t *mp_tail = NULL;
855 885 dce_t *dce;
856 886 dcb_t *dcb;
857 887 int i;
858 888 uint64_t current_time;
859 889
860 890 current_time = TICK_TO_SEC(ddi_get_lbolt64());
861 891
862 892 /*
863 893 * make a copy of the original message
864 894 */
865 895 mp2ctl = copymsg(mpctl);
866 896
867 897 /* First we do IPv4 entries */
868 898 optp = (struct opthdr *)&mpctl->b_rptr[
869 899 sizeof (struct T_optmgmt_ack)];
870 900 optp->level = MIB2_IP;
871 901 optp->name = EXPER_IP_DCE;
872 902
873 903 for (i = 0; i < ipst->ips_dce_hashsize; i++) {
874 904 dcb = &ipst->ips_dce_hash_v4[i];
875 905 rw_enter(&dcb->dcb_lock, RW_READER);
876 906 for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
877 907 dest_cache.DestIpv4Address = dce->dce_v4addr;
878 908 dest_cache.DestFlags = dce->dce_flags;
879 909 if (dce->dce_flags & DCEF_PMTU)
880 910 dest_cache.DestPmtu = dce->dce_pmtu;
881 911 else
882 912 dest_cache.DestPmtu = 0;
883 913 dest_cache.DestIdent = dce->dce_ident;
884 914 dest_cache.DestIfindex = 0;
885 915 dest_cache.DestAge = current_time -
886 916 dce->dce_last_change_time;
887 917 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
888 918 (char *)&dest_cache, (int)sizeof (dest_cache))) {
889 919 ip1dbg(("ip_snmp_get_mib2_ip_dce: "
890 920 "failed to allocate %u bytes\n",
891 921 (uint_t)sizeof (dest_cache)));
892 922 }
893 923 }
894 924 rw_exit(&dcb->dcb_lock);
895 925 }
896 926 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
897 927 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
898 928 (int)optp->level, (int)optp->name, (int)optp->len));
899 929 qreply(q, mpctl);
900 930
901 931 if (mp2ctl == NULL) {
902 932 /* Copymsg failed above */
903 933 return (NULL);
904 934 }
905 935
906 936 /* Now for IPv6 */
907 937 mpctl = mp2ctl;
908 938 mp_tail = NULL;
909 939 mp2ctl = copymsg(mpctl);
910 940 optp = (struct opthdr *)&mpctl->b_rptr[
911 941 sizeof (struct T_optmgmt_ack)];
912 942 optp->level = MIB2_IP6;
913 943 optp->name = EXPER_IP_DCE;
914 944
915 945 for (i = 0; i < ipst->ips_dce_hashsize; i++) {
916 946 dcb = &ipst->ips_dce_hash_v6[i];
917 947 rw_enter(&dcb->dcb_lock, RW_READER);
918 948 for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
919 949 dest_cache.DestIpv6Address = dce->dce_v6addr;
920 950 dest_cache.DestFlags = dce->dce_flags;
921 951 if (dce->dce_flags & DCEF_PMTU)
922 952 dest_cache.DestPmtu = dce->dce_pmtu;
923 953 else
924 954 dest_cache.DestPmtu = 0;
925 955 dest_cache.DestIdent = dce->dce_ident;
926 956 if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
927 957 dest_cache.DestIfindex = dce->dce_ifindex;
928 958 else
929 959 dest_cache.DestIfindex = 0;
930 960 dest_cache.DestAge = current_time -
931 961 dce->dce_last_change_time;
932 962 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
933 963 (char *)&dest_cache, (int)sizeof (dest_cache))) {
934 964 ip1dbg(("ip_snmp_get_mib2_ip_dce: "
935 965 "failed to allocate %u bytes\n",
936 966 (uint_t)sizeof (dest_cache)));
937 967 }
938 968 }
939 969 rw_exit(&dcb->dcb_lock);
940 970 }
941 971 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
942 972 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
943 973 (int)optp->level, (int)optp->name, (int)optp->len));
944 974 qreply(q, mpctl);
945 975
946 976 return (mp2ctl);
947 977 }
|
↓ open down ↓ |
603 lines elided |
↑ open up ↑ |
948 978
949 979 /*
950 980 * Remove IPv6 DCEs which refer to an ifindex that is going away.
951 981 * This is not required for correctness, but it avoids netstat -d
952 982 * showing stale stuff that will never be used.
953 983 */
954 984 void
955 985 dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
956 986 {
957 987 uint_t i;
958 - dcb_t *dcb;
959 - dce_t *dce, *nextdce;
960 988
961 - for (i = 0; i < ipst->ips_dce_hashsize; i++) {
962 - dcb = &ipst->ips_dce_hash_v6[i];
963 - rw_enter(&dcb->dcb_lock, RW_WRITER);
964 -
965 - for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
966 - nextdce = dce->dce_next;
967 - if (dce->dce_ifindex == ifindex) {
968 - dce_delete_locked(dcb, dce);
969 - dce_refrele(dce);
970 - }
971 - }
972 - rw_exit(&dcb->dcb_lock);
973 - }
989 + for (i = 0; i < ipst->ips_dce_hashsize; i++)
990 + dce_bucket_clean(&ipst->ips_dce_hash_v6[i], B_TRUE, ifindex);
974 991 }
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX