1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
24 */
25
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #define _SUN_TPI_VERSION 2
29 #include <sys/tihdr.h>
30 #include <sys/socket.h>
31 #include <sys/xti_xtiopt.h>
32 #include <sys/xti_inet.h>
33 #include <sys/policy.h>
34
35 #include <inet/common.h>
36 #include <netinet/ip6.h>
37 #include <inet/ip.h>
38
39 #include <netinet/in.h>
40 #include <netinet/tcp.h>
41 #include <inet/optcom.h>
42 #include <inet/proto_set.h>
43 #include <inet/tcp_impl.h>
44
45 /*
46 * Table of all known options handled on a TCP protocol stack.
47 *
48 * Note: This table contains options processed by both TCP and IP levels
49 * and is the superset of options that can be performed on a TCP over IP
50 * stack.
51 */
52 opdes_t tcp_opt_arr[] = {
53
54 { SO_LINGER, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
55 sizeof (struct linger), 0 },
56
57 { SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
58 { SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
59 { SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
60 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
61 },
62 { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
63 { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
64 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
65 { SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
66 { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
67 { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
68 { SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
69 sizeof (struct timeval), 0 },
70 { SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
71 sizeof (struct timeval), 0 },
72 { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
73 },
74 { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
75 { SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
76 0 },
77 { SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
78 0 },
79 { SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
80 0 },
81 { SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
82 0 },
83 { SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
84
85 { SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
86
87 { SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
88
89 { TCP_NODELAY, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
90 },
91 { TCP_MAXSEG, IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
92 536 },
93
94 { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
95 OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
96
97 { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
98 OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
99
100 { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
101 OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
102
103 { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
104 OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
105
106 { TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
107 0 },
108
109 { TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
110 sizeof (int), 0 },
111
112 { TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
113 },
114
115 { TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
116 sizeof (int), 0 },
117
118 { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
119 sizeof (int), 0 },
120
121 { TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
122
123 { TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
124
125 { TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
126
127 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
128 sizeof (int), 0 },
129
130 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
131
132 { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
133
134 { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
135
136 { TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
137
138 { TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
139
140 { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
141 (OP_VARLEN|OP_NODEFAULT),
142 IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
143 { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
144 (OP_VARLEN|OP_NODEFAULT),
145 IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
146
147 { IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
148 { T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
149 { IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
150 sizeof (int), -1 /* not initialized */ },
151
152 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
153 sizeof (ipsec_req_t), -1 /* not initialized */ },
154
155 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
156 sizeof (int), 0 /* no ifindex */ },
157
158 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
159 sizeof (int), 0 },
160
161 { IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
162 sizeof (int), -1 /* not initialized */ },
163
164 { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
165 sizeof (int), 0 /* no ifindex */ },
166
167 { IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
168
169 { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
170 sizeof (in_addr_t), -1 /* not initialized */ },
171
172 { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
173 sizeof (int), 0 },
174
175 { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
176 (OP_NODEFAULT|OP_VARLEN),
177 sizeof (struct in6_pktinfo), -1 /* not initialized */ },
178 { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
179 OP_NODEFAULT,
180 sizeof (sin6_t), -1 /* not initialized */ },
181 { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
182 (OP_VARLEN|OP_NODEFAULT), 255*8,
183 -1 /* not initialized */ },
184 { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
185 (OP_VARLEN|OP_NODEFAULT), 255*8,
186 -1 /* not initialized */ },
187 { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
188 (OP_VARLEN|OP_NODEFAULT), 255*8,
189 -1 /* not initialized */ },
190 { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
191 (OP_VARLEN|OP_NODEFAULT), 255*8,
192 -1 /* not initialized */ },
193 { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
194 OP_NODEFAULT,
195 sizeof (int), -1 /* not initialized */ },
196 { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
197 OP_NODEFAULT,
198 sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
199 { IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
200 sizeof (int), 0 },
201 { IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
202 sizeof (int), 0 },
203 { IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
204 sizeof (int), 0 },
205
206 /* Enable receipt of ancillary data */
207 { IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
208 sizeof (int), 0 },
209 { IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
210 sizeof (int), 0 },
211 { IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
212 sizeof (int), 0 },
213 { _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
214 sizeof (int), 0 },
215 { IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
216 sizeof (int), 0 },
217 { IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
218 sizeof (int), 0 },
219 { IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
220 sizeof (int), 0 },
221 { IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
222 sizeof (int), 0 },
223
224 { IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
225 sizeof (ipsec_req_t), -1 /* not initialized */ },
226 { IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
227 sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
228 };
229
230 /*
231 * Table of all supported levels
232 * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
233 * any supported options so we need this info separately.
234 *
235 * This is needed only for topmost tpi providers and is used only by
236 * XTI interfaces.
237 */
238 optlevel_t tcp_valid_levels_arr[] = {
239 XTI_GENERIC,
240 SOL_SOCKET,
241 IPPROTO_TCP,
242 IPPROTO_IP,
243 IPPROTO_IPV6
244 };
245
246
247 #define TCP_OPT_ARR_CNT A_CNT(tcp_opt_arr)
248 #define TCP_VALID_LEVELS_CNT A_CNT(tcp_valid_levels_arr)
249
250 uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
251
252 /*
253 * Initialize option database object for TCP
254 *
255 * This object represents database of options to search passed to
256 * {sock,tpi}optcom_req() interface routine to take care of option
257 * management and associated methods.
258 */
259
260 optdb_obj_t tcp_opt_obj = {
261 tcp_opt_default, /* TCP default value function pointer */
262 tcp_tpi_opt_get, /* TCP get function pointer */
263 tcp_tpi_opt_set, /* TCP set function pointer */
264 TCP_OPT_ARR_CNT, /* TCP option database count of entries */
265 tcp_opt_arr, /* TCP option database */
266 TCP_VALID_LEVELS_CNT, /* TCP valid level count of entries */
267 tcp_valid_levels_arr /* TCP valid level array */
268 };
269
/*
 * Largest TCP initial cwnd (start/restart) that TCP_INIT_CWND may request.
 */
#define	TCP_MAX_INIT_CWND	16

static int	tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
274
275 /*
276 * Some TCP options can be "set" by requesting them in the option
277 * buffer. This is needed for XTI feature test though we do not
278 * allow it in general. We interpret that this mechanism is more
279 * applicable to OSI protocols and need not be allowed in general.
280 * This routine filters out options for which it is not allowed (most)
281 * and lets through those (few) for which it is. [ The XTI interface
282 * test suite specifics will imply that any XTI_GENERIC level XTI_* if
283 * ever implemented will have to be allowed here ].
284 */
285 static boolean_t
286 tcp_allow_connopt_set(int level, int name)
287 {
288
289 switch (level) {
290 case IPPROTO_TCP:
291 switch (name) {
292 case TCP_NODELAY:
293 return (B_TRUE);
294 default:
295 return (B_FALSE);
296 }
297 /*NOTREACHED*/
298 default:
299 return (B_FALSE);
300 }
301 /*NOTREACHED*/
302 }
303
304 /*
305 * This routine gets default values of certain options whose default
306 * values are maintained by protocol specific code
307 */
308 /* ARGSUSED */
309 int
310 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
311 {
312 int32_t *i1 = (int32_t *)ptr;
313 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
314
315 switch (level) {
316 case IPPROTO_TCP:
317 switch (name) {
318 case TCP_NOTIFY_THRESHOLD:
319 *i1 = tcps->tcps_ip_notify_interval;
320 break;
321 case TCP_ABORT_THRESHOLD:
322 *i1 = tcps->tcps_ip_abort_interval;
323 break;
324 case TCP_CONN_NOTIFY_THRESHOLD:
325 *i1 = tcps->tcps_ip_notify_cinterval;
326 break;
327 case TCP_CONN_ABORT_THRESHOLD:
328 *i1 = tcps->tcps_ip_abort_cinterval;
329 break;
330 default:
331 return (-1);
332 }
333 break;
334 case IPPROTO_IP:
335 switch (name) {
336 case IP_TTL:
337 *i1 = tcps->tcps_ipv4_ttl;
338 break;
339 default:
340 return (-1);
341 }
342 break;
343 case IPPROTO_IPV6:
344 switch (name) {
345 case IPV6_UNICAST_HOPS:
346 *i1 = tcps->tcps_ipv6_hoplimit;
347 break;
348 default:
349 return (-1);
350 }
351 break;
352 default:
353 return (-1);
354 }
355 return (sizeof (int));
356 }
357
358 /*
359 * TCP routine to get the values of options.
360 */
361 int
362 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
363 {
364 int *i1 = (int *)ptr;
365 tcp_t *tcp = connp->conn_tcp;
366 conn_opt_arg_t coas;
367 int retval;
368
369 coas.coa_connp = connp;
370 coas.coa_ixa = connp->conn_ixa;
371 coas.coa_ipp = &connp->conn_xmit_ipp;
372 coas.coa_ancillary = B_FALSE;
373 coas.coa_changed = 0;
374
375 switch (level) {
376 case SOL_SOCKET:
377 switch (name) {
378 case SO_SND_COPYAVOID:
379 *i1 = tcp->tcp_snd_zcopy_on ?
380 SO_SND_COPYAVOID : 0;
381 return (sizeof (int));
382 case SO_ACCEPTCONN:
383 *i1 = (tcp->tcp_state == TCPS_LISTEN);
384 return (sizeof (int));
385 }
386 break;
387 case IPPROTO_TCP:
388 switch (name) {
389 case TCP_NODELAY:
390 *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
391 return (sizeof (int));
392 case TCP_MAXSEG:
393 *i1 = tcp->tcp_mss;
394 return (sizeof (int));
395 case TCP_NOTIFY_THRESHOLD:
396 *i1 = (int)tcp->tcp_first_timer_threshold;
397 return (sizeof (int));
398 case TCP_ABORT_THRESHOLD:
399 *i1 = tcp->tcp_second_timer_threshold;
400 return (sizeof (int));
401 case TCP_CONN_NOTIFY_THRESHOLD:
402 *i1 = tcp->tcp_first_ctimer_threshold;
403 return (sizeof (int));
404 case TCP_CONN_ABORT_THRESHOLD:
405 *i1 = tcp->tcp_second_ctimer_threshold;
406 return (sizeof (int));
407 case TCP_INIT_CWND:
408 *i1 = tcp->tcp_init_cwnd;
409 return (sizeof (int));
410 case TCP_KEEPALIVE_THRESHOLD:
411 *i1 = tcp->tcp_ka_interval;
412 return (sizeof (int));
413
414 /*
415 * TCP_KEEPIDLE expects value in seconds, but
416 * tcp_ka_interval is in milliseconds.
417 */
418 case TCP_KEEPIDLE:
419 *i1 = tcp->tcp_ka_interval / 1000;
420 return (sizeof (int));
421 case TCP_KEEPCNT:
422 *i1 = tcp->tcp_ka_cnt;
423 return (sizeof (int));
424
425 /*
426 * TCP_KEEPINTVL expects value in seconds, but
427 * tcp_ka_rinterval is in milliseconds.
428 */
429 case TCP_KEEPINTVL:
430 *i1 = tcp->tcp_ka_rinterval / 1000;
431 return (sizeof (int));
432 case TCP_KEEPALIVE_ABORT_THRESHOLD:
433 *i1 = tcp->tcp_ka_abort_thres;
434 return (sizeof (int));
435 case TCP_CORK:
436 *i1 = tcp->tcp_cork;
437 return (sizeof (int));
438 case TCP_RTO_INITIAL:
439 *i1 = tcp->tcp_rto_initial;
440 return (sizeof (uint32_t));
441 case TCP_RTO_MIN:
442 *i1 = tcp->tcp_rto_min;
443 return (sizeof (uint32_t));
444 case TCP_RTO_MAX:
445 *i1 = tcp->tcp_rto_max;
446 return (sizeof (uint32_t));
447 case TCP_LINGER2:
448 *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
449 return (sizeof (int));
450 }
451 break;
452 case IPPROTO_IP:
453 if (connp->conn_family != AF_INET)
454 return (-1);
455 switch (name) {
456 case IP_OPTIONS:
457 case T_IP_OPTIONS:
458 /* Caller ensures enough space */
459 return (ip_opt_get_user(connp, ptr));
460 default:
461 break;
462 }
463 break;
464
465 case IPPROTO_IPV6:
466 /*
467 * IPPROTO_IPV6 options are only supported for sockets
468 * that are using IPv6 on the wire.
469 */
470 if (connp->conn_ipversion != IPV6_VERSION) {
471 return (-1);
472 }
473 switch (name) {
474 case IPV6_PATHMTU:
475 if (tcp->tcp_state < TCPS_ESTABLISHED)
476 return (-1);
477 break;
478 }
479 break;
480 }
481 mutex_enter(&connp->conn_lock);
482 retval = conn_opt_get(&coas, level, name, ptr);
483 mutex_exit(&connp->conn_lock);
484 return (retval);
485 }
486
487 /*
488 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
489 * Parameters are assumed to be verified by the caller.
490 */
491 /* ARGSUSED */
492 int
493 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
494 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
495 void *thisdg_attrs, cred_t *cr)
496 {
497 tcp_t *tcp = connp->conn_tcp;
498 int *i1 = (int *)invalp;
499 boolean_t onoff = (*i1 == 0) ? 0 : 1;
500 boolean_t checkonly;
501 int reterr;
502 tcp_stack_t *tcps = tcp->tcp_tcps;
503 conn_opt_arg_t coas;
504 uint32_t val = *((uint32_t *)invalp);
505
506 coas.coa_connp = connp;
507 coas.coa_ixa = connp->conn_ixa;
508 coas.coa_ipp = &connp->conn_xmit_ipp;
509 coas.coa_ancillary = B_FALSE;
510 coas.coa_changed = 0;
511
512 switch (optset_context) {
513 case SETFN_OPTCOM_CHECKONLY:
514 checkonly = B_TRUE;
515 /*
516 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
517 * inlen != 0 implies value supplied and
518 * we have to "pretend" to set it.
519 * inlen == 0 implies that there is no
520 * value part in T_CHECK request and just validation
521 * done elsewhere should be enough, we just return here.
522 */
523 if (inlen == 0) {
524 *outlenp = 0;
525 return (0);
526 }
527 break;
528 case SETFN_OPTCOM_NEGOTIATE:
529 checkonly = B_FALSE;
530 break;
531 case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
532 case SETFN_CONN_NEGOTIATE:
533 checkonly = B_FALSE;
534 /*
535 * Negotiating local and "association-related" options
536 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
537 * primitives is allowed by XTI, but we choose
538 * to not implement this style negotiation for Internet
539 * protocols (We interpret it is a must for OSI world but
540 * optional for Internet protocols) for all options.
541 * [ Will do only for the few options that enable test
542 * suites that our XTI implementation of this feature
543 * works for transports that do allow it ]
544 */
545 if (!tcp_allow_connopt_set(level, name)) {
546 *outlenp = 0;
547 return (EINVAL);
548 }
549 break;
550 default:
551 /*
552 * We should never get here
553 */
554 *outlenp = 0;
555 return (EINVAL);
556 }
557
558 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
559 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
560
561 /*
562 * For TCP, we should have no ancillary data sent down
563 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
564 * has to be zero.
565 */
566 ASSERT(thisdg_attrs == NULL);
567
568 /*
569 * For fixed length options, no sanity check
570 * of passed in length is done. It is assumed *_optcom_req()
571 * routines do the right thing.
572 */
573 switch (level) {
574 case SOL_SOCKET:
575 switch (name) {
576 case SO_KEEPALIVE:
577 if (checkonly) {
578 /* check only case */
579 break;
580 }
581
582 if (!onoff) {
583 if (connp->conn_keepalive) {
584 if (tcp->tcp_ka_tid != 0) {
585 (void) TCP_TIMER_CANCEL(tcp,
586 tcp->tcp_ka_tid);
587 tcp->tcp_ka_tid = 0;
588 }
589 connp->conn_keepalive = 0;
590 }
591 break;
592 }
593 if (!connp->conn_keepalive) {
594 /* Crank up the keepalive timer */
595 tcp->tcp_ka_last_intrvl = 0;
596 tcp->tcp_ka_tid = TCP_TIMER(tcp,
597 tcp_keepalive_timer, tcp->tcp_ka_interval);
598 connp->conn_keepalive = 1;
599 }
600 break;
601 case SO_SNDBUF: {
602 if (*i1 > tcps->tcps_max_buf) {
603 *outlenp = 0;
604 return (ENOBUFS);
605 }
606 if (checkonly)
607 break;
608
609 connp->conn_sndbuf = *i1;
610 if (tcps->tcps_snd_lowat_fraction != 0) {
611 connp->conn_sndlowat = connp->conn_sndbuf /
612 tcps->tcps_snd_lowat_fraction;
613 }
614 (void) tcp_maxpsz_set(tcp, B_TRUE);
615 /*
616 * If we are flow-controlled, recheck the condition.
617 * There are apps that increase SO_SNDBUF size when
618 * flow-controlled (EWOULDBLOCK), and expect the flow
619 * control condition to be lifted right away.
620 */
621 mutex_enter(&tcp->tcp_non_sq_lock);
622 if (tcp->tcp_flow_stopped &&
623 TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
624 tcp_clrqfull(tcp);
625 }
626 mutex_exit(&tcp->tcp_non_sq_lock);
627 *outlenp = inlen;
628 return (0);
629 }
630 case SO_RCVBUF:
631 if (*i1 > tcps->tcps_max_buf) {
632 *outlenp = 0;
633 return (ENOBUFS);
634 }
635 /* Silently ignore zero */
636 if (!checkonly && *i1 != 0) {
637 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
638 (void) tcp_rwnd_set(tcp, *i1);
639 }
640 /*
641 * XXX should we return the rwnd here
642 * and tcp_opt_get ?
643 */
644 *outlenp = inlen;
645 return (0);
646 case SO_SND_COPYAVOID:
647 if (!checkonly) {
648 if (tcp->tcp_loopback ||
649 (onoff != 1) || !tcp_zcopy_check(tcp)) {
650 *outlenp = 0;
651 return (EOPNOTSUPP);
652 }
653 tcp->tcp_snd_zcopy_aware = 1;
654 }
655 *outlenp = inlen;
656 return (0);
657 }
658 break;
659 case IPPROTO_TCP:
660 switch (name) {
661 case TCP_NODELAY:
662 if (!checkonly)
663 tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
664 break;
665 case TCP_NOTIFY_THRESHOLD:
666 if (!checkonly)
667 tcp->tcp_first_timer_threshold = *i1;
668 break;
669 case TCP_ABORT_THRESHOLD:
670 if (!checkonly)
671 tcp->tcp_second_timer_threshold = *i1;
672 break;
673 case TCP_CONN_NOTIFY_THRESHOLD:
674 if (!checkonly)
675 tcp->tcp_first_ctimer_threshold = *i1;
676 break;
677 case TCP_CONN_ABORT_THRESHOLD:
678 if (!checkonly)
679 tcp->tcp_second_ctimer_threshold = *i1;
680 break;
681 case TCP_RECVDSTADDR:
682 if (tcp->tcp_state > TCPS_LISTEN) {
683 *outlenp = 0;
684 return (EOPNOTSUPP);
685 }
686 /* Setting done in conn_opt_set */
687 break;
688 case TCP_INIT_CWND:
689 if (checkonly)
690 break;
691
692 /*
693 * Only allow socket with network configuration
694 * privilege to set the initial cwnd to be larger
695 * than allowed by RFC 3390.
696 */
697 if (val <= MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
698 tcp->tcp_init_cwnd = val;
699 break;
700 }
701 if ((reterr = secpolicy_ip_config(cr, B_TRUE)) != 0) {
702 *outlenp = 0;
703 return (reterr);
704 }
705 if (val > tcp_max_init_cwnd) {
706 *outlenp = 0;
707 return (EINVAL);
708 }
709 tcp->tcp_init_cwnd = val;
710 break;
711
712 /*
713 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
714 * is in milliseconds. TCP_KEEPIDLE is introduced for
715 * compatibility with other Unix flavors.
716 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
717 * converting the input to milliseconds.
718 */
719 case TCP_KEEPIDLE:
720 *i1 *= 1000;
721 /* FALLTHRU */
722
723 case TCP_KEEPALIVE_THRESHOLD:
724 if (checkonly)
725 break;
726
727 if (*i1 < tcps->tcps_keepalive_interval_low ||
728 *i1 > tcps->tcps_keepalive_interval_high) {
729 *outlenp = 0;
730 return (EINVAL);
731 }
732 if (*i1 != tcp->tcp_ka_interval) {
733 tcp->tcp_ka_interval = *i1;
734 /*
735 * Check if we need to restart the
736 * keepalive timer.
737 */
738 if (tcp->tcp_ka_tid != 0) {
739 ASSERT(connp->conn_keepalive);
740 (void) TCP_TIMER_CANCEL(tcp,
741 tcp->tcp_ka_tid);
742 tcp->tcp_ka_last_intrvl = 0;
743 tcp->tcp_ka_tid = TCP_TIMER(tcp,
744 tcp_keepalive_timer,
745 tcp->tcp_ka_interval);
746 }
747 }
748 break;
749
750 /*
751 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
752 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
753 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
754 * tcp_ka_cnt.
755 */
756 case TCP_KEEPCNT:
757 if (checkonly)
758 break;
759
760 if (*i1 == 0) {
761 return (EINVAL);
762 } else if (tcp->tcp_ka_rinterval == 0) {
763 if ((tcp->tcp_ka_abort_thres / *i1) <
764 tcp->tcp_rto_min ||
765 (tcp->tcp_ka_abort_thres / *i1) >
766 tcp->tcp_rto_max)
767 return (EINVAL);
768
769 tcp->tcp_ka_rinterval =
770 tcp->tcp_ka_abort_thres / *i1;
771 } else {
772 if ((*i1 * tcp->tcp_ka_rinterval) <
773 tcps->tcps_keepalive_abort_interval_low ||
774 (*i1 * tcp->tcp_ka_rinterval) >
775 tcps->tcps_keepalive_abort_interval_high)
776 return (EINVAL);
777 tcp->tcp_ka_abort_thres =
778 (*i1 * tcp->tcp_ka_rinterval);
779 }
780 tcp->tcp_ka_cnt = *i1;
781 break;
782 case TCP_KEEPINTVL:
783 /*
784 * TCP_KEEPINTVL is specified in seconds, but
785 * tcp_ka_rinterval is in milliseconds.
786 */
787
788 if (checkonly)
789 break;
790
791 if ((*i1 * 1000) < tcp->tcp_rto_min ||
792 (*i1 * 1000) > tcp->tcp_rto_max)
793 return (EINVAL);
794
795 if (tcp->tcp_ka_cnt == 0) {
796 tcp->tcp_ka_cnt =
797 tcp->tcp_ka_abort_thres / (*i1 * 1000);
798 } else {
799 if ((*i1 * tcp->tcp_ka_cnt * 1000) <
800 tcps->tcps_keepalive_abort_interval_low ||
801 (*i1 * tcp->tcp_ka_cnt * 1000) >
802 tcps->tcps_keepalive_abort_interval_high)
803 return (EINVAL);
804 tcp->tcp_ka_abort_thres =
805 (*i1 * tcp->tcp_ka_cnt * 1000);
806 }
807 tcp->tcp_ka_rinterval = *i1 * 1000;
808 break;
809 case TCP_KEEPALIVE_ABORT_THRESHOLD:
810 if (!checkonly) {
811 if (*i1 <
812 tcps->tcps_keepalive_abort_interval_low ||
813 *i1 >
814 tcps->tcps_keepalive_abort_interval_high) {
815 *outlenp = 0;
816 return (EINVAL);
817 }
818 tcp->tcp_ka_abort_thres = *i1;
819 tcp->tcp_ka_cnt = 0;
820 tcp->tcp_ka_rinterval = 0;
821 }
822 break;
823 case TCP_CORK:
824 if (!checkonly) {
825 /*
826 * if tcp->tcp_cork was set and is now
827 * being unset, we have to make sure that
828 * the remaining data gets sent out. Also
829 * unset tcp->tcp_cork so that tcp_wput_data()
830 * can send data even if it is less than mss
831 */
832 if (tcp->tcp_cork && onoff == 0 &&
833 tcp->tcp_unsent > 0) {
834 tcp->tcp_cork = B_FALSE;
835 tcp_wput_data(tcp, NULL, B_FALSE);
836 }
837 tcp->tcp_cork = onoff;
838 }
839 break;
840 case TCP_RTO_INITIAL: {
841 clock_t rto;
842
843 if (checkonly || val == 0)
844 break;
845
846 /*
847 * Sanity checks
848 *
849 * The initial RTO should be bounded by the minimum
850 * and maximum RTO. And it should also be smaller
851 * than the connect attempt abort timeout. Otherwise,
852 * the connection won't be aborted in a period
853 * reasonably close to that timeout.
854 */
855 if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
856 val > tcp->tcp_second_ctimer_threshold ||
857 val < tcps->tcps_rexmit_interval_initial_low ||
858 val > tcps->tcps_rexmit_interval_initial_high) {
859 *outlenp = 0;
860 return (EINVAL);
861 }
862 tcp->tcp_rto_initial = val;
863
864 /*
865 * If TCP has not sent anything, need to re-calculate
866 * tcp_rto. Otherwise, this option change does not
867 * really affect anything.
868 */
869 if (tcp->tcp_state >= TCPS_SYN_SENT)
870 break;
871
872 tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
873 tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
874 rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
875 tcps->tcps_rexmit_interval_extra +
876 (tcp->tcp_rtt_sa >> 5) +
877 tcps->tcps_conn_grace_period;
878 TCP_SET_RTO(tcp, rto);
879 break;
880 }
881 case TCP_RTO_MIN:
882 if (checkonly || val == 0)
883 break;
884
885 if (val < tcps->tcps_rexmit_interval_min_low ||
886 val > tcps->tcps_rexmit_interval_min_high ||
887 val > tcp->tcp_rto_max) {
888 *outlenp = 0;
889 return (EINVAL);
890 }
891 tcp->tcp_rto_min = val;
892 if (tcp->tcp_rto < val)
893 tcp->tcp_rto = val;
894 break;
895 case TCP_RTO_MAX:
896 if (checkonly || val == 0)
897 break;
898
899 /*
900 * Sanity checks
901 *
902 * The maximum RTO should not be larger than the
903 * connection abort timeout. Otherwise, the
904 * connection won't be aborted in a period reasonably
905 * close to that timeout.
906 */
907 if (val < tcps->tcps_rexmit_interval_max_low ||
908 val > tcps->tcps_rexmit_interval_max_high ||
909 val < tcp->tcp_rto_min ||
910 val > tcp->tcp_second_timer_threshold) {
911 *outlenp = 0;
912 return (EINVAL);
913 }
914 tcp->tcp_rto_max = val;
915 if (tcp->tcp_rto > val)
916 tcp->tcp_rto = val;
917 break;
918 case TCP_LINGER2:
919 if (checkonly || *i1 == 0)
920 break;
921
922 /*
923 * Note that the option value's unit is second. And
924 * the value should be bigger than the private
925 * parameter tcp_fin_wait_2_flush_interval's lower
926 * bound and smaller than the current value of that
927 * parameter. It should be smaller than the current
928 * value to avoid an app setting TCP_LINGER2 to a big
929 * value, causing resource to be held up too long in
930 * FIN-WAIT-2 state.
931 */
932 if (*i1 < 0 ||
933 tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
934 *i1 ||
935 tcps->tcps_fin_wait_2_flush_interval/SECONDS <
936 *i1) {
937 *outlenp = 0;
938 return (EINVAL);
939 }
940 tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
941 break;
942 default:
943 break;
944 }
945 break;
946 case IPPROTO_IP:
947 if (connp->conn_family != AF_INET) {
948 *outlenp = 0;
949 return (EINVAL);
950 }
951 switch (name) {
952 case IP_SEC_OPT:
953 /*
954 * We should not allow policy setting after
955 * we start listening for connections.
956 */
957 if (tcp->tcp_state == TCPS_LISTEN) {
958 return (EINVAL);
959 }
960 break;
961 }
962 break;
963 case IPPROTO_IPV6:
964 /*
965 * IPPROTO_IPV6 options are only supported for sockets
966 * that are using IPv6 on the wire.
967 */
968 if (connp->conn_ipversion != IPV6_VERSION) {
969 *outlenp = 0;
970 return (EINVAL);
971 }
972
973 switch (name) {
974 case IPV6_RECVPKTINFO:
975 if (!checkonly) {
976 /* Force it to be sent up with the next msg */
977 tcp->tcp_recvifindex = 0;
978 }
979 break;
980 case IPV6_RECVTCLASS:
981 if (!checkonly) {
982 /* Force it to be sent up with the next msg */
983 tcp->tcp_recvtclass = 0xffffffffU;
984 }
985 break;
986 case IPV6_RECVHOPLIMIT:
987 if (!checkonly) {
988 /* Force it to be sent up with the next msg */
989 tcp->tcp_recvhops = 0xffffffffU;
990 }
991 break;
992 case IPV6_PKTINFO:
993 /* This is an extra check for TCP */
994 if (inlen == sizeof (struct in6_pktinfo)) {
995 struct in6_pktinfo *pkti;
996
997 pkti = (struct in6_pktinfo *)invalp;
998 /*
999 * RFC 3542 states that ipi6_addr must be
1000 * the unspecified address when setting the
1001 * IPV6_PKTINFO sticky socket option on a
1002 * TCP socket.
1003 */
1004 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
1005 return (EINVAL);
1006 }
1007 break;
1008 case IPV6_SEC_OPT:
1009 /*
1010 * We should not allow policy setting after
1011 * we start listening for connections.
1012 */
1013 if (tcp->tcp_state == TCPS_LISTEN) {
1014 return (EINVAL);
1015 }
1016 break;
1017 }
1018 break;
1019 }
1020 reterr = conn_opt_set(&coas, level, name, inlen, invalp,
1021 checkonly, cr);
1022 if (reterr != 0) {
1023 *outlenp = 0;
1024 return (reterr);
1025 }
1026
1027 /*
1028 * Common case of OK return with outval same as inval
1029 */
1030 if (invalp != outvalp) {
1031 /* don't trust bcopy for identical src/dst */
1032 (void) bcopy(invalp, outvalp, inlen);
1033 }
1034 *outlenp = inlen;
1035
1036 if (coas.coa_changed & COA_HEADER_CHANGED) {
1037 /* If we are connected we rebuilt the headers */
1038 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1039 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1040 reterr = tcp_build_hdrs(tcp);
1041 if (reterr != 0)
1042 return (reterr);
1043 }
1044 }
1045 if (coas.coa_changed & COA_ROUTE_CHANGED) {
1046 in6_addr_t nexthop;
1047
1048 /*
1049 * If we are connected we re-cache the information.
1050 * We ignore errors to preserve BSD behavior.
1051 * Note that we don't redo IPsec policy lookup here
1052 * since the final destination (or source) didn't change.
1053 */
1054 ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
1055 &connp->conn_faddr_v6, &nexthop);
1056
1057 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1058 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1059 (void) ip_attr_connect(connp, connp->conn_ixa,
1060 &connp->conn_laddr_v6, &connp->conn_faddr_v6,
1061 &nexthop, connp->conn_fport, NULL, NULL,
1062 IPDF_VERIFY_DST);
1063 }
1064 }
1065 if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
1066 connp->conn_wq->q_hiwat = connp->conn_sndbuf;
1067 }
1068 if (coas.coa_changed & COA_WROFF_CHANGED) {
1069 connp->conn_wroff = connp->conn_ht_iphc_allocated +
1070 tcps->tcps_wroff_xtra;
1071 (void) proto_set_tx_wroff(connp->conn_rq, connp,
1072 connp->conn_wroff);
1073 }
1074 if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
1075 if (IPCL_IS_NONSTR(connp))
1076 proto_set_rx_oob_opt(connp, onoff);
1077 }
1078 return (0);
1079 }