Print this page
re #13613 rb4516 Tunables needs volatile keyword
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/inet/tcp/tcp_misc.c
+++ new/usr/src/uts/common/inet/tcp/tcp_misc.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
|
↓ open down ↓ |
13 lines elided |
↑ open up ↑ |
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 + * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
24 25 */
25 26
26 27 #include <sys/types.h>
27 28 #include <sys/strlog.h>
28 29 #include <sys/policy.h>
29 30 #include <sys/strsun.h>
30 31 #include <sys/squeue_impl.h>
31 32 #include <sys/squeue.h>
33 +#include <sys/vmsystm.h>
32 34
33 35 #include <inet/common.h>
34 36 #include <inet/ip.h>
35 37 #include <inet/tcp.h>
36 38 #include <inet/tcp_impl.h>
37 39
38 40 /* Control whether TCP can enter defensive mode when under memory pressure. When this is B_FALSE, tcp_conn_reclaim() returns immediately without examining any stack. */
39 41 static boolean_t tcp_do_reclaim = B_TRUE;
40 42
41 43 /*
42 44 * Routines related to the TCP_IOC_ABORT_CONN ioctl command.
43 45 *
44 46 * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
45 47 * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
46 48 * (defined in tcp.h) needs to be filled in and passed into the kernel
47 49 * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t
48 50 * structure contains the four-tuple of a TCP connection and a range of TCP
49 51 * states (specified by ac_start and ac_end). The use of wildcard addresses
50 52 * and ports is allowed. Connections with a matching four tuple and a state
51 53 * within the specified range will be aborted. The valid states for the
52 54 * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT,
53 55 * inclusive.
54 56 *
55 57 * An application which has its connection aborted by this ioctl will receive
56 58 * an error that is dependent on the connection state at the time of the abort.
57 59 * If the connection state is < TCPS_TIME_WAIT, an application should behave as
58 60 * though a RST packet has been received. If the connection state is equal to
59 61 * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel
60 62 * and all resources associated with the connection will be freed.
61 63 */
62 64 static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
63 65 static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
64 66 static void tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2,
65 67 ip_recv_attr_t *dummy);
66 68 static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps);
67 69 void tcp_ioctl_abort_conn(queue_t *, mblk_t *);
68 70 static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
69 71 boolean_t, tcp_stack_t *);
70 72
/*
 * Accessors for the two endpoints stored in a tcp_ioc_abort_conn_t.
 * ac_local and ac_remote are viewed as sin_t by the V4 macros and as
 * sin6_t by the V6 macros.  The *LADDR/*RADDR macros yield a pointer to
 * the socket address, *LOCAL/*REMOTE yield its address field and
 * *LPORT/*RPORT yield its port field.
 */
#define	TCP_AC_V4LADDR(acp)	((sin_t *)&(acp)->ac_local)
#define	TCP_AC_V4RADDR(acp)	((sin_t *)&(acp)->ac_remote)
#define	TCP_AC_V4LOCAL(acp)	(TCP_AC_V4LADDR(acp)->sin_addr.s_addr)
#define	TCP_AC_V4REMOTE(acp)	(TCP_AC_V4RADDR(acp)->sin_addr.s_addr)
#define	TCP_AC_V4LPORT(acp)	(TCP_AC_V4LADDR(acp)->sin_port)
#define	TCP_AC_V4RPORT(acp)	(TCP_AC_V4RADDR(acp)->sin_port)

#define	TCP_AC_V6LADDR(acp)	((sin6_t *)&(acp)->ac_local)
#define	TCP_AC_V6RADDR(acp)	((sin6_t *)&(acp)->ac_remote)
#define	TCP_AC_V6LOCAL(acp)	(TCP_AC_V6LADDR(acp)->sin6_addr)
#define	TCP_AC_V6REMOTE(acp)	(TCP_AC_V6RADDR(acp)->sin6_addr)
#define	TCP_AC_V6LPORT(acp)	(TCP_AC_V6LADDR(acp)->sin6_port)
#define	TCP_AC_V6RPORT(acp)	(TCP_AC_V6RADDR(acp)->sin6_port)
87 89
/*
 * Store in err the error code that mimics the behavior of a connection
 * reset for a connection in the given TCP state:
 *
 *	SYN_SENT, SYN_RCVD				ECONNREFUSED
 *	ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT	ECONNRESET
 *	CLOSING, LAST_ACK, TIME_WAIT			0
 *	any other state					ENXIO
 *
 * The body is wrapped in do { } while (0) so that the macro followed by a
 * semicolon forms exactly one statement and can be used safely in an
 * unbraced if/else.
 */
#define	TCP_AC_GET_ERRCODE(state, err)	do {				\
	switch ((state)) {						\
	case TCPS_SYN_SENT:						\
	case TCPS_SYN_RCVD:						\
		(err) = ECONNREFUSED;					\
		break;							\
	case TCPS_ESTABLISHED:						\
	case TCPS_FIN_WAIT_1:						\
	case TCPS_FIN_WAIT_2:						\
	case TCPS_CLOSE_WAIT:						\
		(err) = ECONNRESET;					\
		break;							\
	case TCPS_CLOSING:						\
	case TCPS_LAST_ACK:						\
	case TCPS_TIME_WAIT:						\
		(err) = 0;						\
		break;							\
	default:							\
		(err) = ENXIO;						\
		break;							\
	}								\
} while (0)
113 115
/*
 * Evaluate to non-zero when the connection described by connp/tcp matches
 * the request in acp.  Depending on the address family of acp's local
 * address, the IPv4 or the IPv6 four-tuple is compared; an unspecified
 * address or a zero port in acp acts as a wildcard.  In addition the
 * connection's state must lie within [ac_start, ac_end].
 */
#define	TCP_AC_ADDR_MATCH(acp, connp, tcp)				\
	((((acp)->ac_local.ss_family == AF_INET) ?			\
	((TCP_AC_V4LOCAL((acp)) == INADDR_ANY ||			\
	TCP_AC_V4LOCAL((acp)) == (connp)->conn_laddr_v4) &&		\
	(TCP_AC_V4REMOTE((acp)) == INADDR_ANY ||			\
	TCP_AC_V4REMOTE((acp)) == (connp)->conn_faddr_v4) &&		\
	(TCP_AC_V4LPORT((acp)) == 0 ||					\
	TCP_AC_V4LPORT((acp)) == (connp)->conn_lport) &&		\
	(TCP_AC_V4RPORT((acp)) == 0 ||					\
	TCP_AC_V4RPORT((acp)) == (connp)->conn_fport)) :		\
	((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) ||		\
	IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)),			\
	&(connp)->conn_laddr_v6)) &&					\
	(IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) ||		\
	IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)),			\
	&(connp)->conn_faddr_v6)) &&					\
	(TCP_AC_V6LPORT((acp)) == 0 ||					\
	TCP_AC_V6LPORT((acp)) == (connp)->conn_lport) &&		\
	(TCP_AC_V6RPORT((acp)) == 0 ||					\
	TCP_AC_V6RPORT((acp)) == (connp)->conn_fport))) &&		\
	(acp)->ac_start <= (tcp)->tcp_state &&				\
	(acp)->ac_end >= (tcp)->tcp_state)
141 143
/*
 * Zone-aware match: the connection qualifies only when acp targets all
 * zones or the connection's own zone, and TCP_AC_ADDR_MATCH() also holds.
 */
#define	TCP_AC_MATCH(acp, connp, tcp)					\
	(((acp)->ac_zoneid == ALL_ZONES ||				\
	(acp)->ac_zoneid == (connp)->conn_zoneid) &&			\
	TCP_AC_ADDR_MATCH(acp, connp, tcp))
146 148
147 149 /*
148 150 * Build a message containing a tcp_ioc_abort_conn_t structure
149 151 * which is filled in with information from acp and tp.
150 152 */
151 153 static mblk_t *
152 154 tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp)
153 155 {
154 156 mblk_t *mp;
155 157 tcp_ioc_abort_conn_t *tacp;
156 158
157 159 mp = allocb(sizeof (uint32_t) + sizeof (*acp), BPRI_LO);
158 160 if (mp == NULL)
159 161 return (NULL);
160 162
161 163 *((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN;
162 164 tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr +
163 165 sizeof (uint32_t));
164 166
165 167 tacp->ac_start = acp->ac_start;
166 168 tacp->ac_end = acp->ac_end;
167 169 tacp->ac_zoneid = acp->ac_zoneid;
168 170
169 171 if (acp->ac_local.ss_family == AF_INET) {
170 172 tacp->ac_local.ss_family = AF_INET;
171 173 tacp->ac_remote.ss_family = AF_INET;
172 174 TCP_AC_V4LOCAL(tacp) = tp->tcp_connp->conn_laddr_v4;
173 175 TCP_AC_V4REMOTE(tacp) = tp->tcp_connp->conn_faddr_v4;
174 176 TCP_AC_V4LPORT(tacp) = tp->tcp_connp->conn_lport;
175 177 TCP_AC_V4RPORT(tacp) = tp->tcp_connp->conn_fport;
176 178 } else {
177 179 tacp->ac_local.ss_family = AF_INET6;
178 180 tacp->ac_remote.ss_family = AF_INET6;
179 181 TCP_AC_V6LOCAL(tacp) = tp->tcp_connp->conn_laddr_v6;
180 182 TCP_AC_V6REMOTE(tacp) = tp->tcp_connp->conn_faddr_v6;
181 183 TCP_AC_V6LPORT(tacp) = tp->tcp_connp->conn_lport;
182 184 TCP_AC_V6RPORT(tacp) = tp->tcp_connp->conn_fport;
183 185 }
184 186 mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp);
185 187 return (mp);
186 188 }
187 189
188 190 /*
189 191 * Print a tcp_ioc_abort_conn_t structure.
190 192 */
191 193 static void
192 194 tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp)
193 195 {
194 196 char lbuf[128];
195 197 char rbuf[128];
196 198 sa_family_t af;
197 199 in_port_t lport, rport;
198 200 ushort_t logflags;
199 201
200 202 af = acp->ac_local.ss_family;
201 203
202 204 if (af == AF_INET) {
203 205 (void) inet_ntop(af, (const void *)&TCP_AC_V4LOCAL(acp),
204 206 lbuf, 128);
205 207 (void) inet_ntop(af, (const void *)&TCP_AC_V4REMOTE(acp),
206 208 rbuf, 128);
207 209 lport = ntohs(TCP_AC_V4LPORT(acp));
208 210 rport = ntohs(TCP_AC_V4RPORT(acp));
209 211 } else {
210 212 (void) inet_ntop(af, (const void *)&TCP_AC_V6LOCAL(acp),
211 213 lbuf, 128);
212 214 (void) inet_ntop(af, (const void *)&TCP_AC_V6REMOTE(acp),
213 215 rbuf, 128);
214 216 lport = ntohs(TCP_AC_V6LPORT(acp));
215 217 rport = ntohs(TCP_AC_V6RPORT(acp));
216 218 }
217 219
218 220 logflags = SL_TRACE | SL_NOTE;
219 221 /*
220 222 * Don't print this message to the console if the operation was done
221 223 * to a non-global zone.
222 224 */
223 225 if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES)
224 226 logflags |= SL_CONSOLE;
225 227 (void) strlog(TCP_MOD_ID, 0, 1, logflags,
226 228 "TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, "
227 229 "start = %d, end = %d\n", lbuf, lport, rbuf, rport,
228 230 acp->ac_start, acp->ac_end);
229 231 }
230 232
231 233 /*
232 234 * Called using SQ_FILL when a message built using
233 235 * tcp_ioctl_abort_build_msg is put into a queue.
234 236 * Note that when we get here there is no wildcard in acp any more.
235 237 */
236 238 /* ARGSUSED2 */
237 239 static void
238 240 tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2,
239 241 ip_recv_attr_t *dummy)
240 242 {
241 243 conn_t *connp = (conn_t *)arg;
242 244 tcp_t *tcp = connp->conn_tcp;
243 245 tcp_ioc_abort_conn_t *acp;
244 246
245 247 /*
246 248 * Don't accept any input on a closed tcp as this TCP logically does
247 249 * not exist on the system. Don't proceed further with this TCP.
248 250 * For eg. this packet could trigger another close of this tcp
249 251 * which would be disastrous for tcp_refcnt. tcp_close_detached /
250 252 * tcp_clean_death / tcp_closei_local must be called at most once
251 253 * on a TCP.
252 254 */
253 255 if (tcp->tcp_state == TCPS_CLOSED ||
254 256 tcp->tcp_state == TCPS_BOUND) {
255 257 freemsg(mp);
256 258 return;
257 259 }
258 260
259 261 acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t));
260 262 if (tcp->tcp_state <= acp->ac_end) {
261 263 /*
262 264 * If we get here, we are already on the correct
263 265 * squeue. This ioctl follows the following path
264 266 * tcp_wput -> tcp_wput_ioctl -> tcp_ioctl_abort_conn
265 267 * ->tcp_ioctl_abort->squeue_enter (if on a
266 268 * different squeue)
267 269 */
268 270 int errcode;
269 271
270 272 TCP_AC_GET_ERRCODE(tcp->tcp_state, errcode);
271 273 (void) tcp_clean_death(tcp, errcode);
272 274 }
273 275 freemsg(mp);
274 276 }
275 277
276 278 /*
277 279 * Abort all matching connections on a hash chain.
278 280 */
279 281 static int
280 282 tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *acp, int index, int *count,
281 283 boolean_t exact, tcp_stack_t *tcps)
282 284 {
283 285 int nmatch, err = 0;
284 286 tcp_t *tcp;
285 287 MBLKP mp, last, listhead = NULL;
286 288 conn_t *tconnp;
287 289 connf_t *connfp;
288 290 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
289 291
290 292 connfp = &ipst->ips_ipcl_conn_fanout[index];
291 293
292 294 startover:
293 295 nmatch = 0;
294 296
295 297 mutex_enter(&connfp->connf_lock);
296 298 for (tconnp = connfp->connf_head; tconnp != NULL;
297 299 tconnp = tconnp->conn_next) {
298 300 tcp = tconnp->conn_tcp;
299 301 /*
300 302 * We are missing a check on sin6_scope_id for linklocals here,
301 303 * but current usage is just for aborting based on zoneid
302 304 * for shared-IP zones.
303 305 */
304 306 if (TCP_AC_MATCH(acp, tconnp, tcp)) {
305 307 CONN_INC_REF(tconnp);
306 308 mp = tcp_ioctl_abort_build_msg(acp, tcp);
307 309 if (mp == NULL) {
308 310 err = ENOMEM;
309 311 CONN_DEC_REF(tconnp);
310 312 break;
311 313 }
312 314 mp->b_prev = (mblk_t *)tcp;
313 315
314 316 if (listhead == NULL) {
315 317 listhead = mp;
316 318 last = mp;
317 319 } else {
318 320 last->b_next = mp;
319 321 last = mp;
320 322 }
321 323 nmatch++;
322 324 if (exact)
323 325 break;
324 326 }
325 327
326 328 /* Avoid holding lock for too long. */
327 329 if (nmatch >= 500)
328 330 break;
329 331 }
330 332 mutex_exit(&connfp->connf_lock);
331 333
332 334 /* Pass mp into the correct tcp */
333 335 while ((mp = listhead) != NULL) {
334 336 listhead = listhead->b_next;
335 337 tcp = (tcp_t *)mp->b_prev;
336 338 mp->b_next = mp->b_prev = NULL;
337 339 SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp,
338 340 tcp_ioctl_abort_handler, tcp->tcp_connp, NULL,
339 341 SQ_FILL, SQTAG_TCP_ABORT_BUCKET);
340 342 }
341 343
342 344 *count += nmatch;
343 345 if (nmatch >= 500 && err == 0)
344 346 goto startover;
345 347 return (err);
346 348 }
347 349
348 350 /*
349 351 * Abort all connections that matches the attributes specified in acp.
350 352 */
351 353 static int
352 354 tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp, tcp_stack_t *tcps)
353 355 {
354 356 sa_family_t af;
355 357 uint32_t ports;
356 358 uint16_t *pports;
357 359 int err = 0, count = 0;
358 360 boolean_t exact = B_FALSE; /* set when there is no wildcard */
359 361 int index = -1;
360 362 ushort_t logflags;
361 363 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
362 364
363 365 af = acp->ac_local.ss_family;
364 366
365 367 if (af == AF_INET) {
366 368 if (TCP_AC_V4REMOTE(acp) != INADDR_ANY &&
367 369 TCP_AC_V4LPORT(acp) != 0 && TCP_AC_V4RPORT(acp) != 0) {
368 370 pports = (uint16_t *)&ports;
369 371 pports[1] = TCP_AC_V4LPORT(acp);
370 372 pports[0] = TCP_AC_V4RPORT(acp);
371 373 exact = (TCP_AC_V4LOCAL(acp) != INADDR_ANY);
372 374 }
373 375 } else {
374 376 if (!IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE(acp)) &&
375 377 TCP_AC_V6LPORT(acp) != 0 && TCP_AC_V6RPORT(acp) != 0) {
376 378 pports = (uint16_t *)&ports;
377 379 pports[1] = TCP_AC_V6LPORT(acp);
378 380 pports[0] = TCP_AC_V6RPORT(acp);
379 381 exact = !IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL(acp));
380 382 }
381 383 }
382 384
383 385 /*
384 386 * For cases where remote addr, local port, and remote port are non-
385 387 * wildcards, tcp_ioctl_abort_bucket will only be called once.
386 388 */
387 389 if (index != -1) {
388 390 err = tcp_ioctl_abort_bucket(acp, index,
389 391 &count, exact, tcps);
390 392 } else {
391 393 /*
392 394 * loop through all entries for wildcard case
393 395 */
394 396 for (index = 0;
395 397 index < ipst->ips_ipcl_conn_fanout_size;
396 398 index++) {
397 399 err = tcp_ioctl_abort_bucket(acp, index,
398 400 &count, exact, tcps);
399 401 if (err != 0)
400 402 break;
401 403 }
402 404 }
403 405
404 406 logflags = SL_TRACE | SL_NOTE;
405 407 /*
406 408 * Don't print this message to the console if the operation was done
407 409 * to a non-global zone.
408 410 */
409 411 if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES)
410 412 logflags |= SL_CONSOLE;
411 413 (void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: "
412 414 "aborted %d connection%c\n", count, ((count > 1) ? 's' : ' '));
413 415 if (err == 0 && count == 0)
414 416 err = ENOENT;
415 417 return (err);
416 418 }
417 419
418 420 /*
419 421 * Process the TCP_IOC_ABORT_CONN ioctl request.
420 422 */
421 423 void
422 424 tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp)
423 425 {
424 426 int err;
425 427 IOCP iocp;
426 428 MBLKP mp1;
427 429 sa_family_t laf, raf;
428 430 tcp_ioc_abort_conn_t *acp;
429 431 zone_t *zptr;
430 432 conn_t *connp = Q_TO_CONN(q);
431 433 zoneid_t zoneid = connp->conn_zoneid;
432 434 tcp_t *tcp = connp->conn_tcp;
433 435 tcp_stack_t *tcps = tcp->tcp_tcps;
434 436
435 437 iocp = (IOCP)mp->b_rptr;
436 438
437 439 if ((mp1 = mp->b_cont) == NULL ||
438 440 iocp->ioc_count != sizeof (tcp_ioc_abort_conn_t)) {
439 441 err = EINVAL;
440 442 goto out;
441 443 }
442 444
443 445 /* check permissions */
444 446 if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) {
445 447 err = EPERM;
446 448 goto out;
447 449 }
448 450
449 451 if (mp1->b_cont != NULL) {
450 452 freemsg(mp1->b_cont);
451 453 mp1->b_cont = NULL;
452 454 }
453 455
454 456 acp = (tcp_ioc_abort_conn_t *)mp1->b_rptr;
455 457 laf = acp->ac_local.ss_family;
456 458 raf = acp->ac_remote.ss_family;
457 459
458 460 /* check that a zone with the supplied zoneid exists */
459 461 if (acp->ac_zoneid != GLOBAL_ZONEID && acp->ac_zoneid != ALL_ZONES) {
460 462 zptr = zone_find_by_id(zoneid);
461 463 if (zptr != NULL) {
462 464 zone_rele(zptr);
463 465 } else {
464 466 err = EINVAL;
465 467 goto out;
466 468 }
467 469 }
468 470
469 471 /*
470 472 * For exclusive stacks we set the zoneid to zero
471 473 * to make TCP operate as if in the global zone.
472 474 */
473 475 if (tcps->tcps_netstack->netstack_stackid != GLOBAL_NETSTACKID)
474 476 acp->ac_zoneid = GLOBAL_ZONEID;
475 477
476 478 if (acp->ac_start < TCPS_SYN_SENT || acp->ac_end > TCPS_TIME_WAIT ||
477 479 acp->ac_start > acp->ac_end || laf != raf ||
478 480 (laf != AF_INET && laf != AF_INET6)) {
479 481 err = EINVAL;
480 482 goto out;
481 483 }
482 484
483 485 tcp_ioctl_abort_dump(acp);
484 486 err = tcp_ioctl_abort(acp, tcps);
485 487
486 488 out:
487 489 if (mp1 != NULL) {
488 490 freemsg(mp1);
489 491 mp->b_cont = NULL;
490 492 }
491 493
492 494 if (err != 0)
493 495 miocnak(q, mp, 0, err);
494 496 else
495 497 miocack(q, mp, 0, 0);
496 498 }
|
↓ open down ↓ |
455 lines elided |
↑ open up ↑ |
497 499
498 500 /*
499 501 * Timeout function to reset the TCP stack variable tcps_reclaim to false.
500 502 */
501 503 void
502 504 tcp_reclaim_timer(void *arg)
503 505 {
504 506 tcp_stack_t *tcps = (tcp_stack_t *)arg;
505 507 int64_t tot_conn = 0;
506 508 int i;
507 - extern pgcnt_t lotsfree, needfree;
508 509
509 510 for (i = 0; i < tcps->tcps_sc_cnt; i++)
510 511 tot_conn += tcps->tcps_sc[i]->tcp_sc_conn_cnt;
511 512
512 513 /*
513 514 * This happens only when a stack is going away. tcps_reclaim_tid
514 515 * should not be reset to 0 when returning in this case.
515 516 */
516 517 mutex_enter(&tcps->tcps_reclaim_lock);
517 518 if (!tcps->tcps_reclaim) {
518 519 mutex_exit(&tcps->tcps_reclaim_lock);
519 520 return;
520 521 }
521 522
522 523 if ((freemem >= lotsfree + needfree) || tot_conn < maxusers) {
523 524 tcps->tcps_reclaim = B_FALSE;
524 525 tcps->tcps_reclaim_tid = 0;
525 526 } else {
526 527 /* Stay in defensive mode and restart the timer */
527 528 tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer,
528 529 tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period));
529 530 }
530 531 mutex_exit(&tcps->tcps_reclaim_lock);
531 532 }
532 533
533 534 /*
534 535 * Kmem reclaim call back function. When the system is under memory
535 536 * pressure, we set the TCP stack variable tcps_reclaim to true. This
536 537 * variable is reset to false after tcps_reclaim_period msecs. During this
537 538 * period, TCP will be more aggressive in aborting connections not making
538 539 * progress, meaning retransmitting for some time (tcp_early_abort seconds).
|
↓ open down ↓ |
21 lines elided |
↑ open up ↑ |
539 540 * TCP will also not accept new connection request for those listeners whose
540 541 * q or q0 is not empty.
541 542 */
542 543 /* ARGSUSED */
543 544 void
544 545 tcp_conn_reclaim(void *arg)
545 546 {
546 547 netstack_handle_t nh;
547 548 netstack_t *ns;
548 549 tcp_stack_t *tcps;
549 - extern pgcnt_t lotsfree, needfree;
550 550
551 551 if (!tcp_do_reclaim)
552 552 return;
553 553
554 554 /*
555 555 * The reclaim function may be called even when the system is not
556 556 * really under memory pressure.
557 557 */
558 558 if (freemem >= lotsfree + needfree)
559 559 return;
560 560
561 561 netstack_next_init(&nh);
562 562 while ((ns = netstack_next(&nh)) != NULL) {
563 563 int i;
564 564 int64_t tot_conn = 0;
565 565
566 566 /*
567 567 * During boot time, the first netstack_t is created and
568 568 * initialized before TCP has registered with the netstack
569 569 * framework. If this reclaim function is called before TCP
570 570 * has finished its initialization, netstack_next() will
571 571 * return the first netstack_t (since its netstack_flags is
572 572 * not NSF_UNINIT). And its netstack_tcp will be NULL. We
573 573 * need to catch it.
574 574 *
575 575 * All subsequent netstack_t creation will not have this
576 576 * problem since the initialization is not finished until TCP
577 577 * has finished its own tcp_stack_t initialization. Hence
578 578 * netstack_next() will not return one with NULL netstack_tcp.
579 579 */
580 580 if ((tcps = ns->netstack_tcp) == NULL) {
581 581 netstack_rele(ns);
582 582 continue;
583 583 }
584 584
585 585 /*
586 586 * Even if the system is under memory pressure, the reason may
587 587 * not be because of TCP activity. Check the number of
588 588 * connections in each stack. If the number exceeds the
589 589 * threshold (maxusers), turn on defensive mode.
590 590 */
591 591 for (i = 0; i < tcps->tcps_sc_cnt; i++)
592 592 tot_conn += tcps->tcps_sc[i]->tcp_sc_conn_cnt;
593 593 if (tot_conn < maxusers) {
594 594 netstack_rele(ns);
595 595 continue;
596 596 }
597 597
598 598 mutex_enter(&tcps->tcps_reclaim_lock);
599 599 if (!tcps->tcps_reclaim) {
600 600 tcps->tcps_reclaim = B_TRUE;
601 601 tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer,
602 602 tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period));
603 603 TCP_STAT(tcps, tcp_reclaim_cnt);
604 604 }
605 605 mutex_exit(&tcps->tcps_reclaim_lock);
606 606 netstack_rele(ns);
607 607 }
608 608 netstack_next_fini(&nh);
609 609 }
610 610
611 611 /*
612 612 * Given a tcp_stack_t and a port (in host byte order), find a listener
613 613 * configuration for that port and return the ratio.
614 614 */
615 615 uint32_t
616 616 tcp_find_listener_conf(tcp_stack_t *tcps, in_port_t port)
617 617 {
618 618 tcp_listener_t *tl;
619 619 uint32_t ratio = 0;
620 620
621 621 mutex_enter(&tcps->tcps_listener_conf_lock);
622 622 for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
623 623 tl = list_next(&tcps->tcps_listener_conf, tl)) {
624 624 if (tl->tl_port == port) {
625 625 ratio = tl->tl_ratio;
626 626 break;
627 627 }
628 628 }
629 629 mutex_exit(&tcps->tcps_listener_conf_lock);
630 630 return (ratio);
631 631 }
632 632
633 633 /*
634 634 * To remove all listener limit configuration in a tcp_stack_t.
635 635 */
636 636 void
637 637 tcp_listener_conf_cleanup(tcp_stack_t *tcps)
638 638 {
639 639 tcp_listener_t *tl;
640 640
641 641 mutex_enter(&tcps->tcps_listener_conf_lock);
642 642 while ((tl = list_head(&tcps->tcps_listener_conf)) != NULL) {
643 643 list_remove(&tcps->tcps_listener_conf, tl);
644 644 kmem_free(tl, sizeof (tcp_listener_t));
645 645 }
646 646 mutex_destroy(&tcps->tcps_listener_conf_lock);
647 647 list_destroy(&tcps->tcps_listener_conf);
648 648 }
649 649
650 650 /*
651 651 * When a CPU is added, we need to allocate the per CPU stats struct.
652 652 */
653 653 void
654 654 tcp_stack_cpu_add(tcp_stack_t *tcps, processorid_t cpu_seqid)
655 655 {
656 656 int i;
657 657
658 658 if (cpu_seqid < tcps->tcps_sc_cnt)
659 659 return;
660 660 for (i = tcps->tcps_sc_cnt; i <= cpu_seqid; i++) {
661 661 ASSERT(tcps->tcps_sc[i] == NULL);
662 662 tcps->tcps_sc[i] = kmem_zalloc(sizeof (tcp_stats_cpu_t),
663 663 KM_SLEEP);
664 664 }
665 665 membar_producer();
666 666 tcps->tcps_sc_cnt = cpu_seqid + 1;
667 667 }
668 668
669 669 /*
670 670 * Diagnostic routine used to return a string associated with the tcp state.
671 671 * Note that if the caller does not supply a buffer, it will use an internal
672 672 * static string. This means that if multiple threads call this function at
673 673 * the same time, output can be corrupted... Note also that this function
674 674 * does not check the size of the supplied buffer. The caller has to make
675 675 * sure that it is big enough.
676 676 */
677 677 char *
678 678 tcp_display(tcp_t *tcp, char *sup_buf, char format)
679 679 {
680 680 char buf1[30];
681 681 static char priv_buf[INET6_ADDRSTRLEN * 2 + 80];
682 682 char *buf;
683 683 char *cp;
684 684 in6_addr_t local, remote;
685 685 char local_addrbuf[INET6_ADDRSTRLEN];
686 686 char remote_addrbuf[INET6_ADDRSTRLEN];
687 687 conn_t *connp;
688 688
689 689 if (sup_buf != NULL)
690 690 buf = sup_buf;
691 691 else
692 692 buf = priv_buf;
693 693
694 694 if (tcp == NULL)
695 695 return ("NULL_TCP");
696 696
697 697 connp = tcp->tcp_connp;
698 698 switch (tcp->tcp_state) {
699 699 case TCPS_CLOSED:
700 700 cp = "TCP_CLOSED";
701 701 break;
702 702 case TCPS_IDLE:
703 703 cp = "TCP_IDLE";
704 704 break;
705 705 case TCPS_BOUND:
706 706 cp = "TCP_BOUND";
707 707 break;
708 708 case TCPS_LISTEN:
709 709 cp = "TCP_LISTEN";
710 710 break;
711 711 case TCPS_SYN_SENT:
712 712 cp = "TCP_SYN_SENT";
713 713 break;
714 714 case TCPS_SYN_RCVD:
715 715 cp = "TCP_SYN_RCVD";
716 716 break;
717 717 case TCPS_ESTABLISHED:
718 718 cp = "TCP_ESTABLISHED";
719 719 break;
720 720 case TCPS_CLOSE_WAIT:
721 721 cp = "TCP_CLOSE_WAIT";
722 722 break;
723 723 case TCPS_FIN_WAIT_1:
724 724 cp = "TCP_FIN_WAIT_1";
725 725 break;
726 726 case TCPS_CLOSING:
727 727 cp = "TCP_CLOSING";
728 728 break;
729 729 case TCPS_LAST_ACK:
730 730 cp = "TCP_LAST_ACK";
731 731 break;
732 732 case TCPS_FIN_WAIT_2:
733 733 cp = "TCP_FIN_WAIT_2";
734 734 break;
735 735 case TCPS_TIME_WAIT:
736 736 cp = "TCP_TIME_WAIT";
737 737 break;
738 738 default:
739 739 (void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state);
740 740 cp = buf1;
741 741 break;
742 742 }
743 743 switch (format) {
744 744 case DISP_ADDR_AND_PORT:
745 745 if (connp->conn_ipversion == IPV4_VERSION) {
746 746 /*
747 747 * Note that we use the remote address in the tcp_b
748 748 * structure. This means that it will print out
749 749 * the real destination address, not the next hop's
750 750 * address if source routing is used.
751 751 */
752 752 IN6_IPADDR_TO_V4MAPPED(connp->conn_laddr_v4, &local);
753 753 IN6_IPADDR_TO_V4MAPPED(connp->conn_faddr_v4, &remote);
754 754
755 755 } else {
756 756 local = connp->conn_laddr_v6;
757 757 remote = connp->conn_faddr_v6;
758 758 }
759 759 (void) inet_ntop(AF_INET6, &local, local_addrbuf,
760 760 sizeof (local_addrbuf));
761 761 (void) inet_ntop(AF_INET6, &remote, remote_addrbuf,
762 762 sizeof (remote_addrbuf));
763 763 (void) mi_sprintf(buf, "[%s.%u, %s.%u] %s",
764 764 local_addrbuf, ntohs(connp->conn_lport), remote_addrbuf,
765 765 ntohs(connp->conn_fport), cp);
766 766 break;
767 767 case DISP_PORT_ONLY:
768 768 default:
769 769 (void) mi_sprintf(buf, "[%u, %u] %s",
770 770 ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp);
771 771 break;
772 772 }
773 773
774 774 return (buf);
775 775 }
|
↓ open down ↓ |
216 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX