Print this page
OS-3752 Increase IOV_MAX to at least 1024
OS-3404 lx brand must support sendmsg() with IOV_MAX of 1024
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/fs/sockfs/socksyscalls.c
+++ new/usr/src/uts/common/fs/sockfs/socksyscalls.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
|
↓ open down ↓ |
13 lines elided |
↑ open up ↑ |
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24 + * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
25 + * Copyright 2015, Joyent, Inc. All rights reserved.
24 26 */
25 27
26 28 /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */
27 29 /*
28 30 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
29 31 */
30 32
31 33 #include <sys/types.h>
32 34 #include <sys/t_lock.h>
33 35 #include <sys/param.h>
34 36 #include <sys/systm.h>
35 37 #include <sys/buf.h>
36 38 #include <sys/conf.h>
37 39 #include <sys/cred.h>
38 40 #include <sys/kmem.h>
39 41 #include <sys/sysmacros.h>
40 42 #include <sys/vfs.h>
41 43 #include <sys/vnode.h>
42 44 #include <sys/debug.h>
43 45 #include <sys/errno.h>
44 46 #include <sys/time.h>
45 47 #include <sys/file.h>
46 48 #include <sys/user.h>
|
↓ open down ↓ |
13 lines elided |
↑ open up ↑ |
47 49 #include <sys/stream.h>
48 50 #include <sys/strsubr.h>
49 51 #include <sys/strsun.h>
50 52 #include <sys/sunddi.h>
51 53 #include <sys/esunddi.h>
52 54 #include <sys/flock.h>
53 55 #include <sys/modctl.h>
54 56 #include <sys/cmn_err.h>
55 57 #include <sys/vmsystm.h>
56 58 #include <sys/policy.h>
59 +#include <sys/limits.h>
57 60
58 61 #include <sys/socket.h>
59 62 #include <sys/socketvar.h>
60 63
61 64 #include <sys/isa_defs.h>
62 65 #include <sys/inttypes.h>
63 66 #include <sys/systm.h>
64 67 #include <sys/cpuvar.h>
65 68 #include <sys/filio.h>
66 69 #include <sys/sendfile.h>
67 70 #include <sys/ddi.h>
68 71 #include <vm/seg.h>
69 72 #include <vm/seg_map.h>
70 73 #include <vm/seg_kpm.h>
71 74
72 75 #include <fs/sockfs/nl7c.h>
73 76 #include <fs/sockfs/sockcommon.h>
74 77 #include <fs/sockfs/sockfilter_impl.h>
75 78 #include <fs/sockfs/socktpi.h>
76 79
77 80 #ifdef SOCK_TEST
78 81 int do_useracc = 1; /* Controlled by setting SO_DEBUG to 4 */
|
↓ open down ↓ |
12 lines elided |
↑ open up ↑ |
79 82 #else
80 83 #define do_useracc 1
81 84 #endif /* SOCK_TEST */
82 85
83 86 extern int xnet_truncate_print;
84 87
85 88 extern void nl7c_init(void);
86 89 extern int sockfs_defer_nl7c_init;
87 90
88 91 /*
89 - * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c"
90 - * as there isn't a formal definition of IOV_MAX ???
91 - */
92 -#define MSG_MAXIOVLEN 16
93 -
94 -/*
95 92 * Kernel component of socket creation.
96 93 *
97 94 * The socket library determines which version number to use.
98 95 * First the library calls this with a NULL devpath. If this fails
99 96 * to find a transport (using solookup) the library will look in /etc/netconfig
100 97 * for the appropriate transport. If one is found it will pass in the
101 98 * devpath for the kernel to use.
102 99 */
103 100 int
104 101 so_socket(int family, int type_w_flags, int protocol, char *devpath,
105 102 int version)
106 103 {
107 104 struct sonode *so;
108 105 vnode_t *vp;
109 106 struct file *fp;
110 107 int fd;
111 108 int error;
112 109 int type;
113 110
114 111 type = type_w_flags & SOCK_TYPE_MASK;
115 112 type_w_flags &= ~SOCK_TYPE_MASK;
116 113 if (type_w_flags & ~(SOCK_CLOEXEC|SOCK_NDELAY|SOCK_NONBLOCK))
117 114 return (set_errno(EINVAL));
118 115
119 116 if (devpath != NULL) {
120 117 char *buf;
121 118 size_t kdevpathlen = 0;
122 119
123 120 buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
124 121 if ((error = copyinstr(devpath, buf,
125 122 MAXPATHLEN, &kdevpathlen)) != 0) {
126 123 kmem_free(buf, MAXPATHLEN);
127 124 return (set_errno(error));
128 125 }
129 126 so = socket_create(family, type, protocol, buf, NULL,
130 127 SOCKET_SLEEP, version, CRED(), &error);
131 128 kmem_free(buf, MAXPATHLEN);
132 129 } else {
133 130 so = socket_create(family, type, protocol, NULL, NULL,
134 131 SOCKET_SLEEP, version, CRED(), &error);
135 132 }
136 133 if (so == NULL)
137 134 return (set_errno(error));
138 135
139 136 /* Allocate a file descriptor for the socket */
140 137 vp = SOTOV(so);
141 138 if (error = falloc(vp, FWRITE|FREAD, &fp, &fd)) {
142 139 (void) socket_close(so, 0, CRED());
143 140 socket_destroy(so);
144 141 return (set_errno(error));
145 142 }
146 143
147 144 /*
148 145 * Now fill in the entries that falloc reserved
149 146 */
150 147 if (type_w_flags & SOCK_NDELAY) {
151 148 so->so_state |= SS_NDELAY;
152 149 fp->f_flag |= FNDELAY;
153 150 }
154 151 if (type_w_flags & SOCK_NONBLOCK) {
155 152 so->so_state |= SS_NONBLOCK;
156 153 fp->f_flag |= FNONBLOCK;
157 154 }
158 155 mutex_exit(&fp->f_tlock);
159 156 setf(fd, fp);
160 157 if ((type_w_flags & SOCK_CLOEXEC) != 0) {
161 158 f_setfd(fd, FD_CLOEXEC);
162 159 }
163 160
164 161 return (fd);
165 162 }
166 163
167 164 /*
168 165 * Map from a file descriptor to a socket node.
169 166 * Returns with the file descriptor held i.e. the caller has to
170 167 * use releasef when done with the file descriptor.
171 168 */
172 169 struct sonode *
173 170 getsonode(int sock, int *errorp, file_t **fpp)
174 171 {
175 172 file_t *fp;
176 173 vnode_t *vp;
177 174 struct sonode *so;
178 175
179 176 if ((fp = getf(sock)) == NULL) {
180 177 *errorp = EBADF;
181 178 eprintline(*errorp);
182 179 return (NULL);
183 180 }
184 181 vp = fp->f_vnode;
185 182 /* Check if it is a socket */
186 183 if (vp->v_type != VSOCK) {
187 184 releasef(sock);
188 185 *errorp = ENOTSOCK;
189 186 eprintline(*errorp);
190 187 return (NULL);
191 188 }
192 189 /*
193 190 * Use the stream head to find the real socket vnode.
194 191 * This is needed when namefs sits above sockfs.
195 192 */
196 193 if (vp->v_stream) {
197 194 ASSERT(vp->v_stream->sd_vnode);
198 195 vp = vp->v_stream->sd_vnode;
199 196
200 197 so = VTOSO(vp);
201 198 if (so->so_version == SOV_STREAM) {
202 199 releasef(sock);
203 200 *errorp = ENOTSOCK;
204 201 eprintsoline(so, *errorp);
205 202 return (NULL);
206 203 }
207 204 } else {
208 205 so = VTOSO(vp);
209 206 }
210 207 if (fpp)
211 208 *fpp = fp;
212 209 return (so);
213 210 }
214 211
215 212 /*
216 213 * Allocate and copyin a sockaddr.
217 214 * Ensures NULL termination for AF_UNIX addresses by extending them
218 215 * with one NULL byte if need be. Verifies that the length is not
219 216 * excessive to prevent an application from consuming all of kernel
220 217 * memory. Returns NULL when an error occurred.
221 218 */
222 219 static struct sockaddr *
223 220 copyin_name(struct sonode *so, struct sockaddr *name, socklen_t *namelenp,
224 221 int *errorp)
225 222 {
226 223 char *faddr;
227 224 size_t namelen = (size_t)*namelenp;
228 225
229 226 ASSERT(namelen != 0);
230 227 if (namelen > SO_MAXARGSIZE) {
231 228 *errorp = EINVAL;
232 229 eprintsoline(so, *errorp);
233 230 return (NULL);
234 231 }
235 232
236 233 faddr = (char *)kmem_alloc(namelen, KM_SLEEP);
237 234 if (copyin(name, faddr, namelen)) {
238 235 kmem_free(faddr, namelen);
239 236 *errorp = EFAULT;
240 237 eprintsoline(so, *errorp);
241 238 return (NULL);
242 239 }
243 240
244 241 /*
245 242 * Add space for NULL termination if needed.
246 243 * Do a quick check if the last byte is NUL.
247 244 */
248 245 if (so->so_family == AF_UNIX && faddr[namelen - 1] != '\0') {
249 246 /* Check if there is any NULL termination */
250 247 size_t i;
251 248 int foundnull = 0;
252 249
253 250 for (i = sizeof (name->sa_family); i < namelen; i++) {
254 251 if (faddr[i] == '\0') {
255 252 foundnull = 1;
256 253 break;
257 254 }
258 255 }
259 256 if (!foundnull) {
260 257 /* Add extra byte for NUL padding */
261 258 char *nfaddr;
262 259
263 260 nfaddr = (char *)kmem_alloc(namelen + 1, KM_SLEEP);
264 261 bcopy(faddr, nfaddr, namelen);
265 262 kmem_free(faddr, namelen);
266 263
267 264 /* NUL terminate */
268 265 nfaddr[namelen] = '\0';
269 266 namelen++;
270 267 ASSERT((socklen_t)namelen == namelen);
271 268 *namelenp = (socklen_t)namelen;
272 269 faddr = nfaddr;
273 270 }
274 271 }
275 272 return ((struct sockaddr *)faddr);
276 273 }
277 274
278 275 /*
279 276 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
280 277 */
281 278 static int
282 279 copyout_arg(void *uaddr, socklen_t ulen, void *ulenp,
283 280 void *kaddr, socklen_t klen)
284 281 {
285 282 if (uaddr != NULL) {
286 283 if (ulen > klen)
287 284 ulen = klen;
288 285
289 286 if (ulen != 0) {
290 287 if (copyout(kaddr, uaddr, ulen))
291 288 return (EFAULT);
292 289 }
293 290 } else
294 291 ulen = 0;
295 292
296 293 if (ulenp != NULL) {
297 294 if (copyout(&ulen, ulenp, sizeof (ulen)))
298 295 return (EFAULT);
299 296 }
300 297 return (0);
301 298 }
302 299
303 300 /*
304 301 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
305 302 * If klen is greater than ulen it still uses the non-truncated
306 303 * klen to update ulenp.
307 304 */
308 305 static int
309 306 copyout_name(void *uaddr, socklen_t ulen, void *ulenp,
310 307 void *kaddr, socklen_t klen)
311 308 {
312 309 if (uaddr != NULL) {
313 310 if (ulen >= klen)
314 311 ulen = klen;
315 312 else if (ulen != 0 && xnet_truncate_print) {
316 313 printf("sockfs: truncating copyout of address using "
317 314 "XNET semantics for pid = %d. Lengths %d, %d\n",
318 315 curproc->p_pid, klen, ulen);
319 316 }
320 317
321 318 if (ulen != 0) {
322 319 if (copyout(kaddr, uaddr, ulen))
323 320 return (EFAULT);
324 321 } else
325 322 klen = 0;
326 323 } else
327 324 klen = 0;
328 325
329 326 if (ulenp != NULL) {
330 327 if (copyout(&klen, ulenp, sizeof (klen)))
331 328 return (EFAULT);
332 329 }
333 330 return (0);
334 331 }
335 332
336 333 /*
337 334 * The socketpair() code in libsocket creates two sockets (using
338 335 * the /etc/netconfig fallback if needed) before calling this routine
339 336 * to connect the two sockets together.
340 337 *
341 338 * For a SOCK_STREAM socketpair a listener is needed - in that case this
342 339 * routine will create a new file descriptor as part of accepting the
343 340 * connection. The library socketpair() will check if svs[2] has changed
344 341 * in which case it will close the changed fd.
345 342 *
346 343 * Note that this code could use the TPI feature of accepting the connection
347 344 * on the listening endpoint. However, that would require significant changes
348 345 * to soaccept.
349 346 */
350 347 int
351 348 so_socketpair(int sv[2])
352 349 {
353 350 int svs[2];
354 351 struct sonode *so1, *so2;
355 352 int error;
356 353 int orig_flags;
357 354 struct sockaddr_ux *name;
358 355 size_t namelen;
359 356 sotpi_info_t *sti1;
360 357 sotpi_info_t *sti2;
361 358
362 359 dprint(1, ("so_socketpair(%p)\n", (void *)sv));
363 360
364 361 error = useracc(sv, sizeof (svs), B_WRITE);
365 362 if (error && do_useracc)
366 363 return (set_errno(EFAULT));
367 364
368 365 if (copyin(sv, svs, sizeof (svs)))
369 366 return (set_errno(EFAULT));
370 367
371 368 if ((so1 = getsonode(svs[0], &error, NULL)) == NULL)
372 369 return (set_errno(error));
373 370
374 371 if ((so2 = getsonode(svs[1], &error, NULL)) == NULL) {
375 372 releasef(svs[0]);
376 373 return (set_errno(error));
377 374 }
378 375
379 376 if (so1->so_family != AF_UNIX || so2->so_family != AF_UNIX) {
380 377 error = EOPNOTSUPP;
381 378 goto done;
382 379 }
383 380
384 381 sti1 = SOTOTPI(so1);
385 382 sti2 = SOTOTPI(so2);
386 383
387 384 /*
388 385 * The code below makes assumptions about the "sockfs" implementation.
389 386 * So make sure that the correct implementation is really used.
390 387 */
391 388 ASSERT(so1->so_ops == &sotpi_sonodeops);
392 389 ASSERT(so2->so_ops == &sotpi_sonodeops);
393 390
394 391 if (so1->so_type == SOCK_DGRAM) {
395 392 /*
396 393 * Bind both sockets and connect them with each other.
397 394 * Need to allocate name/namelen for soconnect.
398 395 */
399 396 error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC, CRED());
400 397 if (error) {
401 398 eprintsoline(so1, error);
402 399 goto done;
403 400 }
404 401 error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
405 402 if (error) {
406 403 eprintsoline(so2, error);
407 404 goto done;
408 405 }
409 406 namelen = sizeof (struct sockaddr_ux);
410 407 name = kmem_alloc(namelen, KM_SLEEP);
411 408 name->sou_family = AF_UNIX;
412 409 name->sou_addr = sti2->sti_ux_laddr;
413 410 error = socket_connect(so1,
414 411 (struct sockaddr *)name,
415 412 (socklen_t)namelen,
416 413 0, _SOCONNECT_NOXLATE, CRED());
417 414 if (error) {
418 415 kmem_free(name, namelen);
419 416 eprintsoline(so1, error);
420 417 goto done;
421 418 }
422 419 name->sou_addr = sti1->sti_ux_laddr;
423 420 error = socket_connect(so2,
424 421 (struct sockaddr *)name,
425 422 (socklen_t)namelen,
426 423 0, _SOCONNECT_NOXLATE, CRED());
427 424 kmem_free(name, namelen);
428 425 if (error) {
429 426 eprintsoline(so2, error);
430 427 goto done;
431 428 }
432 429 releasef(svs[0]);
433 430 releasef(svs[1]);
434 431 } else {
435 432 /*
436 433 * Bind both sockets, with so1 being a listener.
437 434 * Connect so2 to so1 - nonblocking to avoid waiting for
438 435 * soaccept to complete.
439 436 * Accept a connection on so1. Pass out the new fd as sv[0].
440 437 * The library will detect the changed fd and close
441 438 * the original one.
442 439 */
443 440 struct sonode *nso;
444 441 struct vnode *nvp;
445 442 struct file *nfp;
446 443 int nfd;
447 444
448 445 /*
449 446 * We could simply call socket_listen() here (which would do the
450 447 * binding automatically) if the code didn't rely on passing
451 448 * _SOBIND_NOXLATE to the TPI implementation of socket_bind().
452 449 */
453 450 error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC|
454 451 _SOBIND_NOXLATE|_SOBIND_LISTEN|_SOBIND_SOCKETPAIR,
455 452 CRED());
456 453 if (error) {
457 454 eprintsoline(so1, error);
458 455 goto done;
459 456 }
460 457 error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
461 458 if (error) {
462 459 eprintsoline(so2, error);
463 460 goto done;
464 461 }
465 462
466 463 namelen = sizeof (struct sockaddr_ux);
467 464 name = kmem_alloc(namelen, KM_SLEEP);
468 465 name->sou_family = AF_UNIX;
469 466 name->sou_addr = sti1->sti_ux_laddr;
470 467 error = socket_connect(so2,
471 468 (struct sockaddr *)name,
472 469 (socklen_t)namelen,
473 470 FNONBLOCK, _SOCONNECT_NOXLATE, CRED());
474 471 kmem_free(name, namelen);
475 472 if (error) {
476 473 if (error != EINPROGRESS) {
477 474 eprintsoline(so2, error); goto done;
478 475 }
479 476 }
480 477
481 478 error = socket_accept(so1, 0, CRED(), &nso);
482 479 if (error) {
483 480 eprintsoline(so1, error);
484 481 goto done;
485 482 }
486 483
487 484 /* wait for so2 being SS_CONNECTED ignoring signals */
488 485 mutex_enter(&so2->so_lock);
489 486 error = sowaitconnected(so2, 0, 1);
490 487 mutex_exit(&so2->so_lock);
491 488 if (error != 0) {
492 489 (void) socket_close(nso, 0, CRED());
493 490 socket_destroy(nso);
494 491 eprintsoline(so2, error);
495 492 goto done;
496 493 }
497 494
498 495 nvp = SOTOV(nso);
499 496 if (error = falloc(nvp, FWRITE|FREAD, &nfp, &nfd)) {
500 497 (void) socket_close(nso, 0, CRED());
501 498 socket_destroy(nso);
502 499 eprintsoline(nso, error);
503 500 goto done;
504 501 }
505 502 /*
506 503 * copy over FNONBLOCK and FNDELAY flags should they exist
507 504 */
508 505 if (so1->so_state & SS_NONBLOCK)
509 506 nfp->f_flag |= FNONBLOCK;
510 507 if (so1->so_state & SS_NDELAY)
511 508 nfp->f_flag |= FNDELAY;
512 509
513 510 /*
514 511 * fill in the entries that falloc reserved
515 512 */
516 513 mutex_exit(&nfp->f_tlock);
517 514 setf(nfd, nfp);
518 515
519 516 /*
520 517 * get the original flags before we release
521 518 */
522 519 VERIFY(f_getfd_error(svs[0], &orig_flags) == 0);
523 520
524 521 releasef(svs[0]);
525 522 releasef(svs[1]);
526 523
527 524 /*
528 525 * If FD_CLOEXEC was set on the filedescriptor we're
529 526 * swapping out, we should set it on the new one too.
530 527 */
531 528 if (orig_flags & FD_CLOEXEC) {
532 529 f_setfd(nfd, FD_CLOEXEC);
533 530 }
534 531
535 532 /*
536 533 * The socketpair library routine will close the original
537 534 * svs[0] when this code passes out a different file
538 535 * descriptor.
539 536 */
540 537 svs[0] = nfd;
541 538
542 539 if (copyout(svs, sv, sizeof (svs))) {
543 540 (void) closeandsetf(nfd, NULL);
544 541 eprintline(EFAULT);
545 542 return (set_errno(EFAULT));
546 543 }
547 544 }
548 545 return (0);
549 546
550 547 done:
551 548 releasef(svs[0]);
552 549 releasef(svs[1]);
553 550 return (set_errno(error));
554 551 }
555 552
556 553 int
557 554 bind(int sock, struct sockaddr *name, socklen_t namelen, int version)
558 555 {
559 556 struct sonode *so;
560 557 int error;
561 558
562 559 dprint(1, ("bind(%d, %p, %d)\n",
563 560 sock, (void *)name, namelen));
564 561
565 562 if ((so = getsonode(sock, &error, NULL)) == NULL)
566 563 return (set_errno(error));
567 564
568 565 /* Allocate and copyin name */
569 566 /*
570 567 * X/Open test does not expect EFAULT with NULL name and non-zero
571 568 * namelen.
572 569 */
573 570 if (name != NULL && namelen != 0) {
574 571 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
575 572 name = copyin_name(so, name, &namelen, &error);
576 573 if (name == NULL) {
577 574 releasef(sock);
578 575 return (set_errno(error));
579 576 }
580 577 } else {
581 578 name = NULL;
582 579 namelen = 0;
583 580 }
584 581
585 582 switch (version) {
586 583 default:
587 584 error = socket_bind(so, name, namelen, 0, CRED());
588 585 break;
589 586 case SOV_XPG4_2:
590 587 error = socket_bind(so, name, namelen, _SOBIND_XPG4_2, CRED());
591 588 break;
592 589 case SOV_SOCKBSD:
593 590 error = socket_bind(so, name, namelen, _SOBIND_SOCKBSD, CRED());
594 591 break;
595 592 }
596 593 done:
597 594 releasef(sock);
598 595 if (name != NULL)
599 596 kmem_free(name, (size_t)namelen);
600 597
601 598 if (error)
602 599 return (set_errno(error));
603 600 return (0);
604 601 }
605 602
606 603 /* ARGSUSED2 */
607 604 int
608 605 listen(int sock, int backlog, int version)
609 606 {
610 607 struct sonode *so;
611 608 int error;
612 609
613 610 dprint(1, ("listen(%d, %d)\n",
614 611 sock, backlog));
615 612
616 613 if ((so = getsonode(sock, &error, NULL)) == NULL)
617 614 return (set_errno(error));
618 615
619 616 error = socket_listen(so, backlog, CRED());
620 617
621 618 releasef(sock);
622 619 if (error)
623 620 return (set_errno(error));
624 621 return (0);
625 622 }
626 623
627 624 /*ARGSUSED3*/
628 625 int
629 626 accept(int sock, struct sockaddr *name, socklen_t *namelenp, int version,
630 627 int flags)
631 628 {
632 629 struct sonode *so;
633 630 file_t *fp;
634 631 int error;
635 632 socklen_t namelen;
636 633 struct sonode *nso;
637 634 struct vnode *nvp;
638 635 struct file *nfp;
639 636 int nfd;
640 637 int ssflags;
641 638 struct sockaddr *addrp;
642 639 socklen_t addrlen;
643 640
644 641 dprint(1, ("accept(%d, %p, %p)\n",
645 642 sock, (void *)name, (void *)namelenp));
646 643
647 644 if (flags & ~(SOCK_CLOEXEC|SOCK_NONBLOCK|SOCK_NDELAY)) {
648 645 return (set_errno(EINVAL));
649 646 }
650 647
651 648 /* Translate SOCK_ flags to their SS_ variant */
652 649 ssflags = 0;
653 650 if (flags & SOCK_NONBLOCK)
654 651 ssflags |= SS_NONBLOCK;
655 652 if (flags & SOCK_NDELAY)
656 653 ssflags |= SS_NDELAY;
657 654
658 655 if ((so = getsonode(sock, &error, &fp)) == NULL)
659 656 return (set_errno(error));
660 657
661 658 if (name != NULL) {
662 659 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
663 660 if (copyin(namelenp, &namelen, sizeof (namelen))) {
664 661 releasef(sock);
665 662 return (set_errno(EFAULT));
666 663 }
667 664 if (namelen != 0) {
668 665 error = useracc(name, (size_t)namelen, B_WRITE);
669 666 if (error && do_useracc) {
670 667 releasef(sock);
671 668 return (set_errno(EFAULT));
672 669 }
673 670 } else
674 671 name = NULL;
675 672 } else {
676 673 namelen = 0;
677 674 }
678 675
679 676 /*
680 677 * Allocate the user fd before socket_accept() in order to
681 678 * catch EMFILE errors before calling socket_accept().
682 679 */
683 680 if ((nfd = ufalloc(0)) == -1) {
684 681 eprintsoline(so, EMFILE);
685 682 releasef(sock);
686 683 return (set_errno(EMFILE));
687 684 }
688 685 error = socket_accept(so, fp->f_flag, CRED(), &nso);
689 686 if (error) {
690 687 setf(nfd, NULL);
691 688 releasef(sock);
692 689 return (set_errno(error));
693 690 }
694 691
695 692 nvp = SOTOV(nso);
696 693
697 694 ASSERT(MUTEX_NOT_HELD(&nso->so_lock));
698 695 if (namelen != 0) {
699 696 addrlen = so->so_max_addr_len;
700 697 addrp = (struct sockaddr *)kmem_alloc(addrlen, KM_SLEEP);
701 698
702 699 if ((error = socket_getpeername(nso, (struct sockaddr *)addrp,
703 700 &addrlen, B_TRUE, CRED())) == 0) {
704 701 error = copyout_name(name, namelen, namelenp,
705 702 addrp, addrlen);
706 703 } else {
707 704 ASSERT(error == EINVAL || error == ENOTCONN);
708 705 error = ECONNABORTED;
709 706 }
710 707 kmem_free(addrp, so->so_max_addr_len);
711 708 }
712 709
713 710 if (error) {
714 711 setf(nfd, NULL);
715 712 (void) socket_close(nso, 0, CRED());
716 713 socket_destroy(nso);
717 714 releasef(sock);
718 715 return (set_errno(error));
719 716 }
720 717 if (error = falloc(NULL, FWRITE|FREAD, &nfp, NULL)) {
721 718 setf(nfd, NULL);
722 719 (void) socket_close(nso, 0, CRED());
723 720 socket_destroy(nso);
724 721 eprintsoline(so, error);
725 722 releasef(sock);
726 723 return (set_errno(error));
727 724 }
728 725 /*
729 726 * fill in the entries that falloc reserved
730 727 */
731 728 nfp->f_vnode = nvp;
732 729 mutex_exit(&nfp->f_tlock);
733 730 setf(nfd, nfp);
734 731
735 732 /*
736 733 * Act on SOCK_CLOEXEC from flags
737 734 */
738 735 if (flags & SOCK_CLOEXEC) {
739 736 f_setfd(nfd, FD_CLOEXEC);
740 737 }
741 738
742 739 /*
743 740 * Copy FNDELAY and FNONBLOCK from listener to acceptor
744 741 * and from ssflags
745 742 */
746 743 if ((ssflags | so->so_state) & (SS_NDELAY|SS_NONBLOCK)) {
747 744 uint_t oflag = nfp->f_flag;
748 745 int arg = 0;
749 746
750 747 if ((ssflags | so->so_state) & SS_NONBLOCK)
751 748 arg |= FNONBLOCK;
752 749 else if ((ssflags | so->so_state) & SS_NDELAY)
753 750 arg |= FNDELAY;
754 751
755 752 /*
756 753 * This code is a simplification of the F_SETFL code in fcntl()
757 754 * Ignore any errors from VOP_SETFL.
758 755 */
759 756 if ((error = VOP_SETFL(nvp, oflag, arg, nfp->f_cred, NULL))
760 757 != 0) {
761 758 eprintsoline(so, error);
762 759 error = 0;
763 760 } else {
764 761 mutex_enter(&nfp->f_tlock);
765 762 nfp->f_flag &= ~FMASK | (FREAD|FWRITE);
766 763 nfp->f_flag |= arg;
767 764 mutex_exit(&nfp->f_tlock);
768 765 }
769 766 }
770 767 releasef(sock);
771 768 return (nfd);
772 769 }
773 770
774 771 int
775 772 connect(int sock, struct sockaddr *name, socklen_t namelen, int version)
776 773 {
777 774 struct sonode *so;
778 775 file_t *fp;
779 776 int error;
780 777
781 778 dprint(1, ("connect(%d, %p, %d)\n",
782 779 sock, (void *)name, namelen));
783 780
784 781 if ((so = getsonode(sock, &error, &fp)) == NULL)
785 782 return (set_errno(error));
786 783
787 784 /* Allocate and copyin name */
788 785 if (namelen != 0) {
789 786 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
790 787 name = copyin_name(so, name, &namelen, &error);
791 788 if (name == NULL) {
792 789 releasef(sock);
793 790 return (set_errno(error));
794 791 }
795 792 } else
796 793 name = NULL;
797 794
798 795 error = socket_connect(so, name, namelen, fp->f_flag,
799 796 (version != SOV_XPG4_2) ? 0 : _SOCONNECT_XPG4_2, CRED());
800 797 releasef(sock);
801 798 if (name)
802 799 kmem_free(name, (size_t)namelen);
803 800 if (error)
804 801 return (set_errno(error));
805 802 return (0);
806 803 }
807 804
808 805 /*ARGSUSED2*/
809 806 int
810 807 shutdown(int sock, int how, int version)
811 808 {
812 809 struct sonode *so;
813 810 int error;
814 811
815 812 dprint(1, ("shutdown(%d, %d)\n",
816 813 sock, how));
817 814
818 815 if ((so = getsonode(sock, &error, NULL)) == NULL)
819 816 return (set_errno(error));
820 817
821 818 error = socket_shutdown(so, how, CRED());
822 819
823 820 releasef(sock);
824 821 if (error)
825 822 return (set_errno(error));
826 823 return (0);
827 824 }
828 825
829 826 /*
830 827 * Common receive routine.
831 828 */
832 829 static ssize_t
833 830 recvit(int sock,
834 831 struct nmsghdr *msg,
835 832 struct uio *uiop,
836 833 int flags,
837 834 socklen_t *namelenp,
838 835 socklen_t *controllenp,
839 836 int *flagsp)
840 837 {
841 838 struct sonode *so;
842 839 file_t *fp;
843 840 void *name;
844 841 socklen_t namelen;
845 842 void *control;
846 843 socklen_t controllen;
847 844 ssize_t len;
848 845 int error;
849 846
850 847 if ((so = getsonode(sock, &error, &fp)) == NULL)
851 848 return (set_errno(error));
852 849
853 850 len = uiop->uio_resid;
854 851 uiop->uio_fmode = fp->f_flag;
855 852 uiop->uio_extflg = UIO_COPY_CACHED;
856 853
857 854 name = msg->msg_name;
858 855 namelen = msg->msg_namelen;
859 856 control = msg->msg_control;
860 857 controllen = msg->msg_controllen;
861 858
862 859 msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
863 860 MSG_DONTWAIT | MSG_XPG4_2);
864 861
865 862 error = socket_recvmsg(so, msg, uiop, CRED());
866 863 if (error) {
867 864 releasef(sock);
868 865 return (set_errno(error));
869 866 }
870 867 lwp_stat_update(LWP_STAT_MSGRCV, 1);
871 868 releasef(sock);
872 869
873 870 error = copyout_name(name, namelen, namelenp,
874 871 msg->msg_name, msg->msg_namelen);
875 872 if (error)
876 873 goto err;
877 874
878 875 if (flagsp != NULL) {
879 876 /*
880 877 * Clear internal flag.
881 878 */
882 879 msg->msg_flags &= ~MSG_XPG4_2;
883 880
884 881 /*
885 882 * Determine MSG_CTRUNC. sorecvmsg sets MSG_CTRUNC only
886 883 * when controllen is zero and there is control data to
887 884 * copy out.
888 885 */
889 886 if (controllen != 0 &&
890 887 (msg->msg_controllen > controllen || control == NULL)) {
891 888 dprint(1, ("recvit: CTRUNC %d %d %p\n",
892 889 msg->msg_controllen, controllen, control));
893 890
894 891 msg->msg_flags |= MSG_CTRUNC;
895 892 }
896 893 if (copyout(&msg->msg_flags, flagsp,
897 894 sizeof (msg->msg_flags))) {
898 895 error = EFAULT;
899 896 goto err;
900 897 }
901 898 }
902 899 /*
903 900 * Note: This MUST be done last. There can be no "goto err" after this
904 901 * point since it could make so_closefds run twice on some part
905 902 * of the file descriptor array.
906 903 */
907 904 if (controllen != 0) {
908 905 if (!(flags & MSG_XPG4_2)) {
909 906 /*
910 907 * Good old msg_accrights can only return a multiple
911 908 * of 4 bytes.
912 909 */
913 910 controllen &= ~((int)sizeof (uint32_t) - 1);
914 911 }
915 912 error = copyout_arg(control, controllen, controllenp,
916 913 msg->msg_control, msg->msg_controllen);
917 914 if (error)
918 915 goto err;
919 916
920 917 if (msg->msg_controllen > controllen || control == NULL) {
921 918 if (control == NULL)
922 919 controllen = 0;
923 920 so_closefds(msg->msg_control, msg->msg_controllen,
924 921 !(flags & MSG_XPG4_2), controllen);
925 922 }
926 923 }
927 924 if (msg->msg_namelen != 0)
928 925 kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
929 926 if (msg->msg_controllen != 0)
930 927 kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
931 928 return (len - uiop->uio_resid);
932 929
933 930 err:
934 931 /*
935 932 * If we fail and the control part contains file descriptors
936 933 * we have to close the fd's.
937 934 */
938 935 if (msg->msg_controllen != 0)
939 936 so_closefds(msg->msg_control, msg->msg_controllen,
940 937 !(flags & MSG_XPG4_2), 0);
941 938 if (msg->msg_namelen != 0)
942 939 kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
943 940 if (msg->msg_controllen != 0)
944 941 kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
945 942 return (set_errno(error));
946 943 }
947 944
948 945 /*
949 946 * Native system call
950 947 */
951 948 ssize_t
952 949 recv(int sock, void *buffer, size_t len, int flags)
953 950 {
954 951 struct nmsghdr lmsg;
955 952 struct uio auio;
956 953 struct iovec aiov[1];
957 954
958 955 dprint(1, ("recv(%d, %p, %ld, %d)\n",
959 956 sock, buffer, len, flags));
960 957
961 958 if ((ssize_t)len < 0) {
962 959 return (set_errno(EINVAL));
963 960 }
964 961
965 962 aiov[0].iov_base = buffer;
966 963 aiov[0].iov_len = len;
967 964 auio.uio_loffset = 0;
968 965 auio.uio_iov = aiov;
969 966 auio.uio_iovcnt = 1;
970 967 auio.uio_resid = len;
971 968 auio.uio_segflg = UIO_USERSPACE;
972 969 auio.uio_limit = 0;
973 970
974 971 lmsg.msg_namelen = 0;
975 972 lmsg.msg_controllen = 0;
976 973 lmsg.msg_flags = 0;
977 974 return (recvit(sock, &lmsg, &auio, flags, NULL, NULL, NULL));
978 975 }
979 976
980 977 ssize_t
981 978 recvfrom(int sock, void *buffer, size_t len, int flags,
982 979 struct sockaddr *name, socklen_t *namelenp)
983 980 {
984 981 struct nmsghdr lmsg;
985 982 struct uio auio;
986 983 struct iovec aiov[1];
987 984
988 985 dprint(1, ("recvfrom(%d, %p, %ld, %d, %p, %p)\n",
989 986 sock, buffer, len, flags, (void *)name, (void *)namelenp));
990 987
991 988 if ((ssize_t)len < 0) {
992 989 return (set_errno(EINVAL));
993 990 }
994 991
995 992 aiov[0].iov_base = buffer;
996 993 aiov[0].iov_len = len;
997 994 auio.uio_loffset = 0;
998 995 auio.uio_iov = aiov;
999 996 auio.uio_iovcnt = 1;
1000 997 auio.uio_resid = len;
1001 998 auio.uio_segflg = UIO_USERSPACE;
1002 999 auio.uio_limit = 0;
1003 1000
1004 1001 lmsg.msg_name = (char *)name;
1005 1002 if (namelenp != NULL) {
1006 1003 if (copyin(namelenp, &lmsg.msg_namelen,
1007 1004 sizeof (lmsg.msg_namelen)))
1008 1005 return (set_errno(EFAULT));
1009 1006 } else {
1010 1007 lmsg.msg_namelen = 0;
1011 1008 }
1012 1009 lmsg.msg_controllen = 0;
1013 1010 lmsg.msg_flags = 0;
1014 1011
1015 1012 return (recvit(sock, &lmsg, &auio, flags, namelenp, NULL, NULL));
1016 1013 }
1017 1014
1018 1015 /*
|
↓ open down ↓ |
914 lines elided |
↑ open up ↑ |
1019 1016 * Uses the MSG_XPG4_2 flag to determine if the caller is using
1020 1017 * struct omsghdr or struct nmsghdr.
1021 1018 */
ssize_t
recvmsg(int sock, struct nmsghdr *msg, int flags)
{
	STRUCT_DECL(nmsghdr, u_lmsg);
	STRUCT_HANDLE(nmsghdr, umsgptr);
	struct nmsghdr lmsg;
	struct uio auio;
	/*
	 * Small iovec counts live on the stack; anything larger than
	 * IOV_MAX_STACK is kmem_alloc'd (iovsize != 0 tracks that case
	 * and doubles as the free size on every exit path below).
	 */
	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
	ssize_t iovsize = 0;
	int iovcnt;
	ssize_t len, rval;
	int i;
	int *flagsp;
	model_t model;

	dprint(1, ("recvmsg(%d, %p, %d)\n",
	    sock, (void *)msg, flags));

	model = get_udatamodel();
	STRUCT_INIT(u_lmsg, model);
	STRUCT_SET_HANDLE(umsgptr, model, msg);

	if (flags & MSG_XPG4_2) {
		if (copyin(msg, STRUCT_BUF(u_lmsg), STRUCT_SIZE(u_lmsg)))
			return (set_errno(EFAULT));
		flagsp = STRUCT_FADDR(umsgptr, msg_flags);
	} else {
		/*
		 * Assumes that nmsghdr and omsghdr are identically shaped
		 * except for the added msg_flags field.
		 */
		if (copyin(msg, STRUCT_BUF(u_lmsg),
		    SIZEOF_STRUCT(omsghdr, model)))
			return (set_errno(EFAULT));
		STRUCT_FSET(u_lmsg, msg_flags, 0);
		flagsp = NULL;
	}

	/*
	 * Code below us will kmem_alloc memory and hang it
	 * off msg_control and msg_name fields. This forces
	 * us to copy the structure to its native form.
	 */
	lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
	lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
	lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
	lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
	lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
	lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
	lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);

	iovcnt = lmsg.msg_iovlen;

	if (iovcnt <= 0 || iovcnt > IOV_MAX) {
		return (set_errno(EMSGSIZE));
	}

	/* Fall back to the heap for large iovec arrays. */
	if (iovcnt > IOV_MAX_STACK) {
		iovsize = iovcnt * sizeof (struct iovec);
		aiov = kmem_alloc(iovsize, KM_SLEEP);
	}

#ifdef _SYSCALL32_IMPL
	/*
	 * 32-bit callers need to have their iovec expanded, while ensuring
	 * that they can't move more than 2Gbytes of data in a single call.
	 */
	if (model == DATAMODEL_ILP32) {
		/* aiov32 mirrors aiov's stack-or-heap choice. */
		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
		ssize_t iov32size;
		ssize32_t count32;

		iov32size = iovcnt * sizeof (struct iovec32);
		if (iovsize != 0)
			aiov32 = kmem_alloc(iov32size, KM_SLEEP);

		if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) {
			if (iovsize != 0) {
				kmem_free(aiov32, iov32size);
				kmem_free(aiov, iovsize);
			}

			return (set_errno(EFAULT));
		}

		/*
		 * count32 is a signed 32-bit accumulator: wrapping
		 * negative detects a total transfer over 2Gbytes.
		 */
		count32 = 0;
		for (i = 0; i < iovcnt; i++) {
			ssize32_t iovlen32;

			iovlen32 = aiov32[i].iov_len;
			count32 += iovlen32;
			if (iovlen32 < 0 || count32 < 0) {
				if (iovsize != 0) {
					kmem_free(aiov32, iov32size);
					kmem_free(aiov, iovsize);
				}

				return (set_errno(EINVAL));
			}

			aiov[i].iov_len = iovlen32;
			aiov[i].iov_base =
			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
		}

		if (iovsize != 0)
			kmem_free(aiov32, iov32size);
	} else
#endif /* _SYSCALL32_IMPL */
	if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) {
		if (iovsize != 0)
			kmem_free(aiov, iovsize);

		return (set_errno(EFAULT));
	}
	/* Validate each length and guard the signed total against wrap. */
	len = 0;
	for (i = 0; i < iovcnt; i++) {
		ssize_t iovlen = aiov[i].iov_len;
		len += iovlen;
		if (iovlen < 0 || len < 0) {
			if (iovsize != 0)
				kmem_free(aiov, iovsize);

			return (set_errno(EINVAL));
		}
	}
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	/*
	 * Pre-verify writability of the control buffer (when do_useracc
	 * is enabled) so recvit doesn't fail after the fact.
	 */
	if (lmsg.msg_control != NULL &&
	    (do_useracc == 0 ||
	    useracc(lmsg.msg_control, lmsg.msg_controllen,
	    B_WRITE) != 0)) {
		if (iovsize != 0)
			kmem_free(aiov, iovsize);

		return (set_errno(EFAULT));
	}

	rval = recvit(sock, &lmsg, &auio, flags,
	    STRUCT_FADDR(umsgptr, msg_namelen),
	    STRUCT_FADDR(umsgptr, msg_controllen), flagsp);

	if (iovsize != 0)
		kmem_free(aiov, iovsize);

	return (rval);
}
1134 1171
1135 1172 /*
1136 1173 * Common send function.
1137 1174 */
/*
 * Copy in the user-supplied destination name and control data (if any),
 * hand the message to the protocol via socket_sendmsg(), then free the
 * kernel copies.  Returns the number of bytes sent, or sets errno.
 * The cleanup labels must stay in done1 -> done2 -> done3 order: each
 * earlier label frees a resource acquired after the later one's.
 */
static ssize_t
sendit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags)
{
	struct sonode *so;
	file_t *fp;
	void *name;
	socklen_t namelen;
	void *control;
	socklen_t controllen;
	ssize_t len;
	int error;

	if ((so = getsonode(sock, &error, &fp)) == NULL)
		return (set_errno(error));

	uiop->uio_fmode = fp->f_flag;

	/* AF_UNIX traffic stays in-kernel, so cached copy is preferred. */
	if (so->so_family == AF_UNIX)
		uiop->uio_extflg = UIO_COPY_CACHED;
	else
		uiop->uio_extflg = UIO_COPY_DEFAULT;

	/* Allocate and copyin name and control */
	name = msg->msg_name;
	namelen = msg->msg_namelen;
	if (name != NULL && namelen != 0) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		name = copyin_name(so,
		    (struct sockaddr *)name,
		    &namelen, &error);
		if (name == NULL)
			goto done3;
		/* copyin_name null terminates addresses for AF_UNIX */
		msg->msg_namelen = namelen;
		msg->msg_name = name;
	} else {
		msg->msg_name = name = NULL;
		msg->msg_namelen = namelen = 0;
	}

	control = msg->msg_control;
	controllen = msg->msg_controllen;
	if ((control != NULL) && (controllen != 0)) {
		/*
		 * Verify that the length is not excessive to prevent
		 * an application from consuming all of kernel memory.
		 */
		if (controllen > SO_MAXARGSIZE) {
			error = EINVAL;
			goto done2;
		}
		control = kmem_alloc(controllen, KM_SLEEP);

		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		if (copyin(msg->msg_control, control, controllen)) {
			error = EFAULT;
			goto done1;
		}
		msg->msg_control = control;
	} else {
		msg->msg_control = control = NULL;
		msg->msg_controllen = controllen = 0;
	}

	/* Remember the requested length to compute bytes sent below. */
	len = uiop->uio_resid;
	msg->msg_flags = flags;

	error = socket_sendmsg(so, msg, uiop, CRED());
done1:
	if (control != NULL)
		kmem_free(control, controllen);
done2:
	if (name != NULL)
		kmem_free(name, namelen);
done3:
	if (error != 0) {
		releasef(sock);
		return (set_errno(error));
	}
	lwp_stat_update(LWP_STAT_MSGSND, 1);
	releasef(sock);
	/* Partial sends are possible; report what was consumed. */
	return (len - uiop->uio_resid);
}
1221 1258
1222 1259 /*
1223 1260 * Native system call
1224 1261 */
1225 1262 ssize_t
1226 1263 send(int sock, void *buffer, size_t len, int flags)
1227 1264 {
1228 1265 struct nmsghdr lmsg;
1229 1266 struct uio auio;
1230 1267 struct iovec aiov[1];
1231 1268
1232 1269 dprint(1, ("send(%d, %p, %ld, %d)\n",
1233 1270 sock, buffer, len, flags));
1234 1271
1235 1272 if ((ssize_t)len < 0) {
1236 1273 return (set_errno(EINVAL));
1237 1274 }
1238 1275
1239 1276 aiov[0].iov_base = buffer;
1240 1277 aiov[0].iov_len = len;
1241 1278 auio.uio_loffset = 0;
1242 1279 auio.uio_iov = aiov;
1243 1280 auio.uio_iovcnt = 1;
1244 1281 auio.uio_resid = len;
1245 1282 auio.uio_segflg = UIO_USERSPACE;
1246 1283 auio.uio_limit = 0;
1247 1284
1248 1285 lmsg.msg_name = NULL;
1249 1286 lmsg.msg_control = NULL;
1250 1287 if (!(flags & MSG_XPG4_2)) {
1251 1288 /*
1252 1289 * In order to be compatible with the libsocket/sockmod
1253 1290 * implementation we set EOR for all send* calls.
1254 1291 */
1255 1292 flags |= MSG_EOR;
1256 1293 }
1257 1294 return (sendit(sock, &lmsg, &auio, flags));
1258 1295 }
1259 1296
|
↓ open down ↓ |
117 lines elided |
↑ open up ↑ |
1260 1297 /*
1261 1298 * Uses the MSG_XPG4_2 flag to determine if the caller is using
1262 1299 * struct omsghdr or struct nmsghdr.
1263 1300 */
ssize_t
sendmsg(int sock, struct nmsghdr *msg, int flags)
{
	struct nmsghdr lmsg;
	STRUCT_DECL(nmsghdr, u_lmsg);
	struct uio auio;
	/*
	 * Small iovec counts live on the stack; anything larger than
	 * IOV_MAX_STACK is kmem_alloc'd (iovsize != 0 tracks that case
	 * and doubles as the free size on every exit path below).
	 */
	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
	ssize_t iovsize = 0;
	int iovcnt;
	ssize_t len, rval;
	int i;
	model_t model;

	dprint(1, ("sendmsg(%d, %p, %d)\n", sock, (void *)msg, flags));

	model = get_udatamodel();
	STRUCT_INIT(u_lmsg, model);

	if (flags & MSG_XPG4_2) {
		if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
		    STRUCT_SIZE(u_lmsg)))
			return (set_errno(EFAULT));
	} else {
		/*
		 * Assumes that nmsghdr and omsghdr are identically shaped
		 * except for the added msg_flags field.
		 */
		if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
		    SIZEOF_STRUCT(omsghdr, model)))
			return (set_errno(EFAULT));
		/*
		 * In order to be compatible with the libsocket/sockmod
		 * implementation we set EOR for all send* calls.
		 */
		flags |= MSG_EOR;
	}

	/*
	 * Code below us will kmem_alloc memory and hang it
	 * off msg_control and msg_name fields. This forces
	 * us to copy the structure to its native form.
	 */
	lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
	lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
	lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
	lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
	lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
	lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
	lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);

	iovcnt = lmsg.msg_iovlen;

	if (iovcnt <= 0 || iovcnt > IOV_MAX) {
		/*
		 * Unless this is XPG 4.2 we allow iovcnt == 0 to
		 * be compatible with SunOS 4.X and 4.4BSD.
		 */
		if (iovcnt != 0 || (flags & MSG_XPG4_2))
			return (set_errno(EMSGSIZE));
	}

	/* Fall back to the heap for large iovec arrays. */
	if (iovcnt > IOV_MAX_STACK) {
		iovsize = iovcnt * sizeof (struct iovec);
		aiov = kmem_alloc(iovsize, KM_SLEEP);
	}

#ifdef _SYSCALL32_IMPL
	/*
	 * 32-bit callers need to have their iovec expanded, while ensuring
	 * that they can't move more than 2Gbytes of data in a single call.
	 */
	if (model == DATAMODEL_ILP32) {
		/* aiov32 mirrors aiov's stack-or-heap choice. */
		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
		ssize_t iov32size;
		ssize32_t count32;

		iov32size = iovcnt * sizeof (struct iovec32);
		if (iovsize != 0)
			aiov32 = kmem_alloc(iov32size, KM_SLEEP);

		if (iovcnt != 0 &&
		    copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) {
			if (iovsize != 0) {
				kmem_free(aiov32, iov32size);
				kmem_free(aiov, iovsize);
			}

			return (set_errno(EFAULT));
		}

		/*
		 * count32 is a signed 32-bit accumulator: wrapping
		 * negative detects a total transfer over 2Gbytes.
		 */
		count32 = 0;
		for (i = 0; i < iovcnt; i++) {
			ssize32_t iovlen32;

			iovlen32 = aiov32[i].iov_len;
			count32 += iovlen32;
			if (iovlen32 < 0 || count32 < 0) {
				if (iovsize != 0) {
					kmem_free(aiov32, iov32size);
					kmem_free(aiov, iovsize);
				}

				return (set_errno(EINVAL));
			}

			aiov[i].iov_len = iovlen32;
			aiov[i].iov_base =
			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
		}

		if (iovsize != 0)
			kmem_free(aiov32, iov32size);
	} else
#endif /* _SYSCALL32_IMPL */
	if (iovcnt != 0 &&
	    copyin(lmsg.msg_iov, aiov,
	    (unsigned)iovcnt * sizeof (struct iovec))) {
		if (iovsize != 0)
			kmem_free(aiov, iovsize);

		return (set_errno(EFAULT));
	}
	/* Validate each length and guard the signed total against wrap. */
	len = 0;
	for (i = 0; i < iovcnt; i++) {
		ssize_t iovlen = aiov[i].iov_len;
		len += iovlen;
		if (iovlen < 0 || len < 0) {
			if (iovsize != 0)
				kmem_free(aiov, iovsize);

			return (set_errno(EINVAL));
		}
	}
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	rval = sendit(sock, &lmsg, &auio, flags);

	if (iovsize != 0)
		kmem_free(aiov, iovsize);

	return (rval);
}
1374 1448
1375 1449 ssize_t
1376 1450 sendto(int sock, void *buffer, size_t len, int flags,
1377 1451 struct sockaddr *name, socklen_t namelen)
1378 1452 {
1379 1453 struct nmsghdr lmsg;
1380 1454 struct uio auio;
1381 1455 struct iovec aiov[1];
1382 1456
1383 1457 dprint(1, ("sendto(%d, %p, %ld, %d, %p, %d)\n",
1384 1458 sock, buffer, len, flags, (void *)name, namelen));
1385 1459
1386 1460 if ((ssize_t)len < 0) {
1387 1461 return (set_errno(EINVAL));
1388 1462 }
1389 1463
1390 1464 aiov[0].iov_base = buffer;
1391 1465 aiov[0].iov_len = len;
1392 1466 auio.uio_loffset = 0;
1393 1467 auio.uio_iov = aiov;
1394 1468 auio.uio_iovcnt = 1;
1395 1469 auio.uio_resid = len;
1396 1470 auio.uio_segflg = UIO_USERSPACE;
1397 1471 auio.uio_limit = 0;
1398 1472
1399 1473 lmsg.msg_name = (char *)name;
1400 1474 lmsg.msg_namelen = namelen;
1401 1475 lmsg.msg_control = NULL;
1402 1476 if (!(flags & MSG_XPG4_2)) {
1403 1477 /*
1404 1478 * In order to be compatible with the libsocket/sockmod
1405 1479 * implementation we set EOR for all send* calls.
1406 1480 */
1407 1481 flags |= MSG_EOR;
1408 1482 }
1409 1483 return (sendit(sock, &lmsg, &auio, flags));
1410 1484 }
1411 1485
/*ARGSUSED3*/
/*
 * Return the address of the peer connected to 'sock'.  The kernel-side
 * address is fetched into a scratch buffer sized by so_max_addr_len and
 * then trimmed/copied out by copyout_name().  'version' is unused.
 */
int
getpeername(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
{
	struct sonode *so;
	int error;
	socklen_t namelen;
	socklen_t sock_addrlen;
	struct sockaddr *sock_addrp;

	dprint(1, ("getpeername(%d, %p, %p)\n",
	    sock, (void *)name, (void *)namelenp));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		goto bad;

	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
	/* A NULL name buffer is only acceptable with a zero length. */
	if (copyin(namelenp, &namelen, sizeof (namelen)) ||
	    (name == NULL && namelen != 0)) {
		error = EFAULT;
		goto rel_out;
	}
	sock_addrlen = so->so_max_addr_len;
	sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);

	if ((error = socket_getpeername(so, sock_addrp, &sock_addrlen,
	    B_FALSE, CRED())) == 0) {
		ASSERT(sock_addrlen <= so->so_max_addr_len);
		error = copyout_name(name, namelen, namelenp,
		    (void *)sock_addrp, sock_addrlen);
	}
	/* Free using the original allocation size, not the returned one. */
	kmem_free(sock_addrp, so->so_max_addr_len);
rel_out:
	releasef(sock);
bad:	return (error != 0 ? set_errno(error) : 0);
}
1448 1522
/*ARGSUSED3*/
/*
 * Return the local address bound to 'sock'.  Mirrors getpeername():
 * fetch into a so_max_addr_len-sized scratch buffer, then copy out
 * via copyout_name().  'version' is unused.
 */
int
getsockname(int sock, struct sockaddr *name,
    socklen_t *namelenp, int version)
{
	struct sonode *so;
	int error;
	socklen_t namelen, sock_addrlen;
	struct sockaddr *sock_addrp;

	dprint(1, ("getsockname(%d, %p, %p)\n",
	    sock, (void *)name, (void *)namelenp));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		goto bad;

	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
	/* A NULL name buffer is only acceptable with a zero length. */
	if (copyin(namelenp, &namelen, sizeof (namelen)) ||
	    (name == NULL && namelen != 0)) {
		error = EFAULT;
		goto rel_out;
	}

	sock_addrlen = so->so_max_addr_len;
	sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
	if ((error = socket_getsockname(so, sock_addrp, &sock_addrlen,
	    CRED())) == 0) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		ASSERT(sock_addrlen <= so->so_max_addr_len);
		error = copyout_name(name, namelen, namelenp,
		    (void *)sock_addrp, sock_addrlen);
	}
	/* Free using the original allocation size, not the returned one. */
	kmem_free(sock_addrp, so->so_max_addr_len);
rel_out:
	releasef(sock);
bad:	return (error != 0 ? set_errno(error) : 0);
}
1486 1560
/*ARGSUSED5*/
/*
 * Fetch a socket option into a kernel scratch buffer sized by the
 * caller-supplied *option_lenp, then copy the (possibly shorter)
 * result out with copyout_arg().  'version' selects XPG4.2 semantics
 * for the underlying socket_getsockopt() call.
 */
int
getsockopt(int sock,
	int level,
	int option_name,
	void *option_value,
	socklen_t *option_lenp,
	int version)
{
	struct sonode *so;
	socklen_t optlen, optlen_res;
	void *optval;
	int error;

	dprint(1, ("getsockopt(%d, %d, %d, %p, %p)\n",
	    sock, level, option_name, option_value, (void *)option_lenp));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		return (set_errno(error));

	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
	if (copyin(option_lenp, &optlen, sizeof (optlen))) {
		releasef(sock);
		return (set_errno(EFAULT));
	}
	/*
	 * Verify that the length is not excessive to prevent
	 * an application from consuming all of kernel memory.
	 */
	if (optlen > SO_MAXARGSIZE) {
		error = EINVAL;
		releasef(sock);
		return (set_errno(error));
	}
	optval = kmem_alloc(optlen, KM_SLEEP);
	/* optlen_res comes back as the actual length of the option. */
	optlen_res = optlen;
	error = socket_getsockopt(so, level, option_name, optval,
	    &optlen_res, (version != SOV_XPG4_2) ? 0 : _SOGETSOCKOPT_XPG4_2,
	    CRED());
	releasef(sock);
	if (error) {
		kmem_free(optval, optlen);
		return (set_errno(error));
	}
	error = copyout_arg(option_value, optlen, option_lenp,
	    optval, optlen_res);
	kmem_free(optval, optlen);
	if (error)
		return (set_errno(error));
	return (0);
}
1538 1612
/*ARGSUSED5*/
/*
 * Set a socket option.  Small option values are staged in the on-stack
 * 'buffer'; larger ones are kmem_alloc'd.  'version' is unused.
 */
int
setsockopt(int sock,
	int level,
	int option_name,
	void *option_value,
	socklen_t option_len,
	int version)
{
	struct sonode *so;
	intptr_t buffer[2];
	void *optval = NULL;
	int error;

	dprint(1, ("setsockopt(%d, %d, %d, %p, %d)\n",
	    sock, level, option_name, option_value, option_len));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		return (set_errno(error));

	if (option_value != NULL) {
		if (option_len != 0) {
			/*
			 * Verify that the length is not excessive to prevent
			 * an application from consuming all of kernel memory.
			 */
			if (option_len > SO_MAXARGSIZE) {
				error = EINVAL;
				goto done2;
			}
			/* Use the stack buffer when the value fits. */
			optval = option_len <= sizeof (buffer) ?
			    &buffer : kmem_alloc((size_t)option_len, KM_SLEEP);
			ASSERT(MUTEX_NOT_HELD(&so->so_lock));
			if (copyin(option_value, optval, (size_t)option_len)) {
				error = EFAULT;
				goto done1;
			}
		}
	} else
		option_len = 0;

	error = socket_setsockopt(so, level, option_name, optval,
	    (t_uscalar_t)option_len, CRED());
done1:
	/*
	 * NOTE(review): when no value was staged, optval is NULL and this
	 * reaches kmem_free(NULL, 0) — presumably a no-op here; confirm
	 * against the kmem_free contract.
	 */
	if (optval != buffer)
		kmem_free(optval, (size_t)option_len);
done2:
	releasef(sock);
	if (error)
		return (set_errno(error));
	return (0);
}
1591 1665
/*
 * Register a new socket configuration entry <family, type, protocol>
 * backed either by a STREAMS device (paths under /dev) or by a socket
 * module (any other name).  The copied-in name is handed off to
 * sockparams_create(), which owns it from then on.
 */
static int
sockconf_add_sock(int family, int type, int protocol, char *name)
{
	int error = 0;
	char *kdevpath = NULL;	/* set for the device case only */
	char *kmodule = NULL;	/* set for the module case only */
	char *buf = NULL;
	size_t pathlen = 0;
	struct sockparams *sp;

	if (name == NULL)
		return (EINVAL);
	/*
	 * Copyin the name.
	 * This also makes it possible to check for too long pathnames.
	 * Compress the space needed for the name before passing it
	 * to soconfig - soconfig will store the string until
	 * the configuration is removed.
	 */
	buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) {
		kmem_free(buf, MAXPATHLEN);
		return (error);
	}
	if (strncmp(buf, "/dev", strlen("/dev")) == 0) {
		/* For device */

		/*
		 * Special handling for NCA:
		 *
		 * DEV_NCA is never opened even if an application
		 * requests for AF_NCA. The device opened is instead a
		 * predefined AF_INET transport (NCA_INET_DEV).
		 *
		 * Prior to Volo (PSARC/2007/587) NCA would determine
		 * the device using a lookup, which worked then because
		 * all protocols were based on TPI. Since TPI is no
		 * longer the default, we have to explicitly state
		 * which device to use.
		 */
		if (strcmp(buf, NCA_DEV) == 0) {
			/* only support entry <28, 2, 0> */
			if (family != AF_NCA || type != SOCK_STREAM ||
			    protocol != 0) {
				kmem_free(buf, MAXPATHLEN);
				return (EINVAL);
			}

			pathlen = strlen(NCA_INET_DEV) + 1;
			kdevpath = kmem_alloc(pathlen, KM_SLEEP);
			bcopy(NCA_INET_DEV, kdevpath, pathlen);
			kdevpath[pathlen - 1] = '\0';
		} else {
			kdevpath = kmem_alloc(pathlen, KM_SLEEP);
			bcopy(buf, kdevpath, pathlen);
			kdevpath[pathlen - 1] = '\0';
		}
	} else {
		/* For socket module */
		kmodule = kmem_alloc(pathlen, KM_SLEEP);
		bcopy(buf, kmodule, pathlen);
		kmodule[pathlen - 1] = '\0';
		/* pathlen only describes kdevpath; clear it for modules. */
		pathlen = 0;
	}
	kmem_free(buf, MAXPATHLEN);

	/* sockparams_create frees mod name and devpath upon failure */
	sp = sockparams_create(family, type, protocol, kmodule,
	    kdevpath, pathlen, 0, KM_SLEEP, &error);
	if (sp != NULL) {
		error = sockparams_add(sp);
		if (error != 0)
			sockparams_destroy(sp);
	}

	return (error);
}
1669 1743
/*
 * Remove the socket configuration entry <family, type, protocol>.
 */
static int
sockconf_remove_sock(int family, int type, int protocol)
{
	int error;

	error = sockparams_delete(family, type, protocol);
	return (error);
}
1675 1749
/*
 * Unregister the socket filter named 'uname'.  The entry is unlinked
 * immediately; if sockets still reference it, it is flagged CONDEMED
 * and the last socket to drop its reference frees it.
 */
static int
sockconfig_remove_filter(const char *uname)
{
	char kname[SOF_MAXNAMELEN];
	size_t len;
	int error;
	sof_entry_t *ent;

	if ((error = copyinstr(uname, kname, SOF_MAXNAMELEN, &len)) != 0)
		return (error);

	ent = sof_entry_remove_by_name(kname);
	if (ent == NULL)
		return (ENXIO);

	mutex_enter(&ent->sofe_lock);
	ASSERT(!(ent->sofe_flags & SOFEF_CONDEMED));
	if (ent->sofe_refcnt == 0) {
		/* No socket is using the filter; free it right away. */
		mutex_exit(&ent->sofe_lock);
		sof_entry_free(ent);
	} else {
		/* let the last socket free the filter */
		ent->sofe_flags |= SOFEF_CONDEMED;
		mutex_exit(&ent->sofe_lock);
	}

	return (0);
}
1704 1778
1705 1779 static int
1706 1780 sockconfig_add_filter(const char *uname, void *ufilpropp)
1707 1781 {
1708 1782 struct sockconfig_filter_props filprop;
1709 1783 sof_entry_t *ent;
1710 1784 int error;
1711 1785 size_t tuplesz, len;
1712 1786 char hintbuf[SOF_MAXNAMELEN];
1713 1787
1714 1788 ent = kmem_zalloc(sizeof (sof_entry_t), KM_SLEEP);
1715 1789 mutex_init(&ent->sofe_lock, NULL, MUTEX_DEFAULT, NULL);
1716 1790
1717 1791 if ((error = copyinstr(uname, ent->sofe_name, SOF_MAXNAMELEN,
1718 1792 &len)) != 0) {
1719 1793 sof_entry_free(ent);
1720 1794 return (error);
1721 1795 }
1722 1796
1723 1797 if (get_udatamodel() == DATAMODEL_NATIVE) {
1724 1798 if (copyin(ufilpropp, &filprop, sizeof (filprop)) != 0) {
1725 1799 sof_entry_free(ent);
1726 1800 return (EFAULT);
1727 1801 }
1728 1802 }
1729 1803 #ifdef _SYSCALL32_IMPL
1730 1804 else {
1731 1805 struct sockconfig_filter_props32 filprop32;
1732 1806
1733 1807 if (copyin(ufilpropp, &filprop32, sizeof (filprop32)) != 0) {
1734 1808 sof_entry_free(ent);
1735 1809 return (EFAULT);
1736 1810 }
1737 1811 filprop.sfp_modname = (char *)(uintptr_t)filprop32.sfp_modname;
1738 1812 filprop.sfp_autoattach = filprop32.sfp_autoattach;
1739 1813 filprop.sfp_hint = filprop32.sfp_hint;
1740 1814 filprop.sfp_hintarg = (char *)(uintptr_t)filprop32.sfp_hintarg;
1741 1815 filprop.sfp_socktuple_cnt = filprop32.sfp_socktuple_cnt;
1742 1816 filprop.sfp_socktuple =
1743 1817 (sof_socktuple_t *)(uintptr_t)filprop32.sfp_socktuple;
1744 1818 }
1745 1819 #endif /* _SYSCALL32_IMPL */
1746 1820
1747 1821 if ((error = copyinstr(filprop.sfp_modname, ent->sofe_modname,
1748 1822 sizeof (ent->sofe_modname), &len)) != 0) {
1749 1823 sof_entry_free(ent);
1750 1824 return (error);
1751 1825 }
1752 1826
1753 1827 /*
1754 1828 * A filter must specify at least one socket tuple.
1755 1829 */
1756 1830 if (filprop.sfp_socktuple_cnt == 0 ||
1757 1831 filprop.sfp_socktuple_cnt > SOF_MAXSOCKTUPLECNT) {
1758 1832 sof_entry_free(ent);
1759 1833 return (EINVAL);
1760 1834 }
1761 1835 ent->sofe_flags = filprop.sfp_autoattach ? SOFEF_AUTO : SOFEF_PROG;
1762 1836 ent->sofe_hint = filprop.sfp_hint;
1763 1837
1764 1838 /*
1765 1839 * Verify the hint, and copy in the hint argument, if necessary.
1766 1840 */
1767 1841 switch (ent->sofe_hint) {
1768 1842 case SOF_HINT_BEFORE:
1769 1843 case SOF_HINT_AFTER:
1770 1844 if ((error = copyinstr(filprop.sfp_hintarg, hintbuf,
1771 1845 sizeof (hintbuf), &len)) != 0) {
1772 1846 sof_entry_free(ent);
1773 1847 return (error);
1774 1848 }
1775 1849 ent->sofe_hintarg = kmem_alloc(len, KM_SLEEP);
1776 1850 bcopy(hintbuf, ent->sofe_hintarg, len);
1777 1851 /* FALLTHRU */
1778 1852 case SOF_HINT_TOP:
1779 1853 case SOF_HINT_BOTTOM:
1780 1854 /* hints cannot be used with programmatic filters */
1781 1855 if (ent->sofe_flags & SOFEF_PROG) {
1782 1856 sof_entry_free(ent);
1783 1857 return (EINVAL);
1784 1858 }
1785 1859 break;
1786 1860 case SOF_HINT_NONE:
1787 1861 break;
1788 1862 default:
1789 1863 /* bad hint value */
1790 1864 sof_entry_free(ent);
1791 1865 return (EINVAL);
1792 1866 }
1793 1867
1794 1868 ent->sofe_socktuple_cnt = filprop.sfp_socktuple_cnt;
1795 1869 tuplesz = sizeof (sof_socktuple_t) * ent->sofe_socktuple_cnt;
1796 1870 ent->sofe_socktuple = kmem_alloc(tuplesz, KM_SLEEP);
1797 1871
1798 1872 if (get_udatamodel() == DATAMODEL_NATIVE) {
1799 1873 if (copyin(filprop.sfp_socktuple, ent->sofe_socktuple,
1800 1874 tuplesz)) {
1801 1875 sof_entry_free(ent);
1802 1876 return (EFAULT);
1803 1877 }
1804 1878 }
1805 1879 #ifdef _SYSCALL32_IMPL
1806 1880 else {
1807 1881 int i;
1808 1882 caddr_t data = (caddr_t)filprop.sfp_socktuple;
1809 1883 sof_socktuple_t *tup = ent->sofe_socktuple;
1810 1884 sof_socktuple32_t tup32;
1811 1885
1812 1886 tup = ent->sofe_socktuple;
1813 1887 for (i = 0; i < ent->sofe_socktuple_cnt; i++, tup++) {
1814 1888 ASSERT(tup < ent->sofe_socktuple + tuplesz);
1815 1889
1816 1890 if (copyin(data, &tup32, sizeof (tup32)) != 0) {
1817 1891 sof_entry_free(ent);
1818 1892 return (EFAULT);
1819 1893 }
1820 1894 tup->sofst_family = tup32.sofst_family;
1821 1895 tup->sofst_type = tup32.sofst_type;
1822 1896 tup->sofst_protocol = tup32.sofst_protocol;
1823 1897
1824 1898 data += sizeof (tup32);
1825 1899 }
1826 1900 }
1827 1901 #endif /* _SYSCALL32_IMPL */
1828 1902
1829 1903 /* Sockets can start using the filter as soon as the filter is added */
1830 1904 if ((error = sof_entry_add(ent)) != 0)
1831 1905 sof_entry_free(ent);
1832 1906
1833 1907 return (error);
1834 1908 }
1835 1909
1836 1910 /*
1837 1911 * Socket configuration system call. It is used to add and remove
1838 1912 * socket types.
1839 1913 */
1840 1914 int
1841 1915 sockconfig(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
1842 1916 {
1843 1917 int error = 0;
1844 1918
1845 1919 if (secpolicy_net_config(CRED(), B_FALSE) != 0)
1846 1920 return (set_errno(EPERM));
1847 1921
1848 1922 if (sockfs_defer_nl7c_init) {
1849 1923 nl7c_init();
1850 1924 sockfs_defer_nl7c_init = 0;
1851 1925 }
1852 1926
1853 1927 switch (cmd) {
1854 1928 case SOCKCONFIG_ADD_SOCK:
1855 1929 error = sockconf_add_sock((int)(uintptr_t)arg1,
1856 1930 (int)(uintptr_t)arg2, (int)(uintptr_t)arg3, arg4);
1857 1931 break;
1858 1932 case SOCKCONFIG_REMOVE_SOCK:
1859 1933 error = sockconf_remove_sock((int)(uintptr_t)arg1,
1860 1934 (int)(uintptr_t)arg2, (int)(uintptr_t)arg3);
1861 1935 break;
1862 1936 case SOCKCONFIG_ADD_FILTER:
1863 1937 error = sockconfig_add_filter((const char *)arg1, arg2);
1864 1938 break;
1865 1939 case SOCKCONFIG_REMOVE_FILTER:
1866 1940 error = sockconfig_remove_filter((const char *)arg1);
1867 1941 break;
1868 1942 case SOCKCONFIG_GET_SOCKTABLE:
1869 1943 error = sockparams_copyout_socktable((int)(uintptr_t)arg1);
1870 1944 break;
1871 1945 default:
1872 1946 #ifdef DEBUG
1873 1947 cmn_err(CE_NOTE, "sockconfig: unkonwn subcommand %d", cmd);
1874 1948 #endif
1875 1949 error = EINVAL;
1876 1950 break;
1877 1951 }
1878 1952
1879 1953 if (error != 0) {
1880 1954 eprintline(error);
1881 1955 return (set_errno(error));
1882 1956 }
1883 1957 return (0);
1884 1958 }
1885 1959
1886 1960
1887 1961 /*
1888 1962 * Sendfile is implemented through two schemes, direct I/O or by
1889 1963 * caching in the filesystem page cache. We cache the input file by
1890 1964 * default and use direct I/O only if sendfile_max_size is set
1891 1965 * appropriately as explained below. Note that this logic is consistent
1892 1966 * with other filesystems where caching is turned on by default
1893 1967 * unless explicitly turned off by using the DIRECTIO ioctl.
1894 1968 *
1895 1969 * We choose a slightly different scheme here. One can turn off
1896 1970 * caching by setting sendfile_max_size to 0. One can also enable
1897 1971 * caching of files <= sendfile_max_size by setting sendfile_max_size
1898 1972 * to an appropriate value. By default sendfile_max_size is set to the
1899 1973 * maximum value so that all files are cached. In future, we may provide
1900 1974 * better interfaces for caching the file.
1901 1975 *
1902 1976 * Sendfile through Direct I/O (Zero copy)
1903 1977 * --------------------------------------
1904 1978 *
1905 1979 * As disks are normally slower than the network, we can't have a
1906 1980 * single thread that reads the disk and writes to the network. We
1907 1981 * need to have parallelism. This is done by having the sendfile
1908 1982 * thread create another thread that reads from the filesystem
1909 1983 * and queues it for network processing. In this scheme, the data
1910 1984 * is never copied anywhere i.e it is zero copy unlike the other
1911 1985 * scheme.
1912 1986 *
1913 1987 * We have a sendfile queue (snfq) where each sendfile
1914 1988 * request (snf_req_t) is queued for processing by a thread. Number
1915 1989 * of threads is dynamically allocated and they exit if they are idling
1916 1990 * beyond a specified amount of time. When each request (snf_req_t) is
1917 1991 * processed by a thread, it produces a number of mblk_t structures to
1918 1992 * be consumed by the sendfile thread. snf_deque and snf_enque are
1919 1993 * used for consuming and producing mblks. Size of the filesystem
1920 1994 * read is determined by the tunable (sendfile_read_size). A single
1921 1995 * mblk holds sendfile_read_size worth of data (except the last
1922 1996 * read of the file) which is sent down as a whole to the network.
1923 1997 * sendfile_read_size is set to 1 MB as this seems to be the optimal
1924 1998 * value for the UFS filesystem backed by a striped storage array.
1925 1999 *
1926 2000 * Synchronisation between read (producer) and write (consumer) threads.
1927 2001 * --------------------------------------------------------------------
1928 2002 *
1929 2003 * sr_lock protects sr_ib_head and sr_ib_tail. The lock is held while
1930 2004 * adding and deleting items in this list. Error can happen anytime
1931 2005 * during read or write. There could be unprocessed mblks in the
1932 2006 * sr_ib_XXX list when a read or write error occurs. Whenever error
1933 2007 * is encountered, we need two things to happen :
1934 2008 *
1935 2009 * a) One of the threads need to clean the mblks.
1936 2010 * b) When one thread encounters an error, the other should stop.
1937 2011 *
1938 2012 * For (a), we don't want to penalize the reader thread as it could do
1939 2013 * some useful work processing other requests. For (b), the error can
1940 2014 * be detected by examining sr_read_error or sr_write_error.
1941 2015 * sr_lock protects sr_read_error and sr_write_error. If both reader and
 * writer encounter errors, we need to report the write error back to
1943 2017 * the application as that's what would have happened if the operations
1944 2018 * were done sequentially. With this in mind, following should work :
1945 2019 *
1946 2020 * - Check for errors before read or write.
1947 2021 * - If the reader encounters error, set the error in sr_read_error.
1948 2022 * Check sr_write_error, if it is set, send cv_signal as it is
1949 2023 * waiting for reader to complete. If it is not set, the writer
1950 2024 * is either running sinking data to the network or blocked
1951 2025 * because of flow control. For handling the latter case, we
1952 2026 * always send a signal. In any case, it will examine sr_read_error
1953 2027 * and return. sr_read_error is marked with SR_READ_DONE to tell
1954 2028 * the writer that the reader is done in all the cases.
1955 2029 * - If the writer encounters error, set the error in sr_write_error.
1956 2030 * The reader thread is either blocked because of flow control or
1957 2031 * running reading data from the disk. For the former, we need to
1958 2032 * wakeup the thread. Again to keep it simple, we always wake up
1959 2033 * the reader thread. Then, wait for the read thread to complete
1960 2034 * if it is not done yet. Cleanup and return.
1961 2035 *
1962 2036 * High and low water marks for the read thread.
1963 2037 * --------------------------------------------
1964 2038 *
1965 2039 * If sendfile() is used to send data over a slow network, we need to
1966 2040 * make sure that the read thread does not produce data at a faster
1967 2041 * rate than the network. This can happen if the disk is faster than
1968 2042 * the network. In such a case, we don't want to build a very large queue.
1969 2043 * But we would still like to get all of the network throughput possible.
1970 2044 * This implies that network should never block waiting for data.
 * As there are a lot of disk throughput/network throughput combinations
1972 2046 * possible, it is difficult to come up with an accurate number.
1973 2047 * A typical 10K RPM disk has a max seek latency 17ms and rotational
1974 2048 * latency of 3ms for reading a disk block. Thus, the total latency to
1975 2049 * initiate a new read, transfer data from the disk and queue for
 * transmission would take about a max of 25ms. Today's max transfer rate
1977 2051 * for network is 100MB/sec. If the thread is blocked because of flow
1978 2052 * control, it would take 25ms to get new data ready for transmission.
1979 2053 * We have to make sure that network is not idling, while we are initiating
1980 2054 * new transfers. So, at 100MB/sec, to keep network busy we would need
1981 2055 * 2.5MB of data. Rounding off, we keep the low water mark to be 3MB of data.
1982 2056 * We need to pick a high water mark so that the woken up thread would
1983 2057 * do considerable work before blocking again to prevent thrashing. Currently,
1984 2058 * we pick this to be 10 times that of the low water mark.
1985 2059 *
1986 2060 * Sendfile with segmap caching (One copy from page cache to mblks).
1987 2061 * ----------------------------------------------------------------
1988 2062 *
1989 2063 * We use the segmap cache for caching the file, if the size of file
1990 2064 * is <= sendfile_max_size. In this case we don't use threads as VM
1991 2065 * is reasonably fast enough to keep up with the network. If the underlying
1992 2066 * transport allows, we call segmap_getmapflt() to map MAXBSIZE (8K) worth
1993 2067 * of data into segmap space, and use the virtual address from segmap
1994 2068 * directly through desballoc() to avoid copy. Once the transport is done
1995 2069 * with the data, the mapping will be released through segmap_release()
1996 2070 * called by the call-back routine.
1997 2071 *
1998 2072 * If zero-copy is not allowed by the transport, we simply call VOP_READ()
1999 2073 * to copy the data from the filesystem into our temporary network buffer.
2000 2074 *
2001 2075 * To disable caching, set sendfile_max_size to 0.
2002 2076 */
2003 2077
/*
 * Tunables and global state for the sendfile() direct I/O path; see the
 * big theory statement above for how the read size and water marks were
 * derived.
 */
uint_t sendfile_read_size = 1024 * 1024;	/* per-mblk filesystem read size */
#define SENDFILE_REQ_LOWAT 3 * 1024 * 1024
uint_t sendfile_req_lowat = SENDFILE_REQ_LOWAT;	/* wake reader below this */
uint_t sendfile_req_hiwat = 10 * SENDFILE_REQ_LOWAT;	/* throttle reader above this */
struct sendfile_stats sf_stats;		/* global sendfile statistics */
struct sendfile_queue *snfq;		/* request queue for reader threads */
clock_t snfq_timeout;			/* idle time before a reader thread exits */
off64_t sendfile_max_size;		/* cache files <= this size; 0 disables caching */

static void snf_enque(snf_req_t *, mblk_t *);
static mblk_t *snf_deque(snf_req_t *);
2015 2089
2016 2090 void
2017 2091 sendfile_init(void)
2018 2092 {
2019 2093 snfq = kmem_zalloc(sizeof (struct sendfile_queue), KM_SLEEP);
2020 2094
2021 2095 mutex_init(&snfq->snfq_lock, NULL, MUTEX_DEFAULT, NULL);
2022 2096 cv_init(&snfq->snfq_cv, NULL, CV_DEFAULT, NULL);
2023 2097 snfq->snfq_max_threads = max_ncpus;
2024 2098 snfq_timeout = SNFQ_TIMEOUT;
2025 2099 /* Cache all files by default. */
2026 2100 sendfile_max_size = MAXOFFSET_T;
2027 2101 }
2028 2102
/*
 * Queues a mblk_t for network processing.
 *
 * Producer side of the reader/writer pipe: append mp to sr's list, wake
 * the consumer if the list was empty, then throttle ourselves (block on
 * sr_cv) while more than sr_hiwat bytes are queued, unless the writer has
 * already reported an error.
 */
static void
snf_enque(snf_req_t *sr, mblk_t *mp)
{
	mp->b_next = NULL;
	mutex_enter(&sr->sr_lock);
	if (sr->sr_mp_head == NULL) {
		/* List was empty: the consumer may be asleep in snf_deque(). */
		sr->sr_mp_head = sr->sr_mp_tail = mp;
		cv_signal(&sr->sr_cv);
	} else {
		sr->sr_mp_tail->b_next = mp;
		sr->sr_mp_tail = mp;
	}
	sr->sr_qlen += MBLKL(mp);
	/*
	 * Flow control: don't run ahead of the (slower) network.  The
	 * consumer signals sr_cv when sr_qlen drops below sr_lowat, and
	 * after setting sr_write_error.
	 */
	while ((sr->sr_qlen > sr->sr_hiwat) &&
	    (sr->sr_write_error == 0)) {
		sf_stats.ss_full_waits++;
		cv_wait(&sr->sr_cv, &sr->sr_lock);
	}
	mutex_exit(&sr->sr_lock);
}
2052 2126
/*
 * De-queues a mblk_t for network processing.
 *
 * Consumer side of the reader/writer pipe: returns the next queued mblk,
 * blocking until the reader produces one.  Returns NULL once the reader
 * has finished (SR_READ_DONE set and the queue drained) or has failed
 * with an error.
 */
static mblk_t *
snf_deque(snf_req_t *sr)
{
	mblk_t *mp;

	mutex_enter(&sr->sr_lock);
	/*
	 * If we have encountered an error on read or read is
	 * completed and no more mblks, return NULL.
	 * We need to check for NULL sr_mp_head also as
	 * the reads could have completed and there is
	 * nothing more to come.
	 */
	if (((sr->sr_read_error & ~SR_READ_DONE) != 0) ||
	    ((sr->sr_read_error & SR_READ_DONE) &&
	    sr->sr_mp_head == NULL)) {
		mutex_exit(&sr->sr_lock);
		return (NULL);
	}
	/*
	 * To start with neither SR_READ_DONE is marked nor
	 * the error is set. When we wake up from cv_wait,
	 * following are the possibilities :
	 *
	 *	a) sr_read_error is zero and mblks are queued.
	 *	b) sr_read_error is set to SR_READ_DONE
	 *	   and mblks are queued.
	 *	c) sr_read_error is set to SR_READ_DONE
	 *	   and no mblks.
	 *	d) sr_read_error is set to some error other
	 *	   than SR_READ_DONE.
	 */

	while ((sr->sr_read_error == 0) && (sr->sr_mp_head == NULL)) {
		sf_stats.ss_empty_waits++;
		cv_wait(&sr->sr_cv, &sr->sr_lock);
	}
	/* Handle (a) and (b) first  -  the normal case. */
	if (((sr->sr_read_error & ~SR_READ_DONE) == 0) &&
	    (sr->sr_mp_head != NULL)) {
		mp = sr->sr_mp_head;
		sr->sr_mp_head = mp->b_next;
		sr->sr_qlen -= MBLKL(mp);
		/* Un-throttle the reader once we drain below the low water mark. */
		if (sr->sr_qlen < sr->sr_lowat)
			cv_signal(&sr->sr_cv);
		mutex_exit(&sr->sr_lock);
		mp->b_next = NULL;
		return (mp);
	}
	/* Handle (c) and (d). */
	mutex_exit(&sr->sr_lock);
	return (NULL);
}
2109 2183
2110 2184 /*
2111 2185 * Reads data from the filesystem and queues it for network processing.
2112 2186 */
2113 2187 void
2114 2188 snf_async_read(snf_req_t *sr)
2115 2189 {
2116 2190 size_t iosize;
2117 2191 u_offset_t fileoff;
2118 2192 u_offset_t size;
2119 2193 int ret_size;
2120 2194 int error;
2121 2195 file_t *fp;
2122 2196 mblk_t *mp;
2123 2197 struct vnode *vp;
2124 2198 int extra = 0;
2125 2199 int maxblk = 0;
2126 2200 int wroff = 0;
2127 2201 struct sonode *so;
2128 2202
2129 2203 fp = sr->sr_fp;
2130 2204 size = sr->sr_file_size;
2131 2205 fileoff = sr->sr_file_off;
2132 2206
2133 2207 /*
2134 2208 * Ignore the error for filesystems that doesn't support DIRECTIO.
2135 2209 */
2136 2210 (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON, 0,
2137 2211 kcred, NULL, NULL);
2138 2212
2139 2213 vp = sr->sr_vp;
2140 2214 if (vp->v_type == VSOCK) {
2141 2215 stdata_t *stp;
2142 2216
2143 2217 /*
2144 2218 * Get the extra space to insert a header and a trailer.
2145 2219 */
2146 2220 so = VTOSO(vp);
2147 2221 stp = vp->v_stream;
2148 2222 if (stp == NULL) {
2149 2223 wroff = so->so_proto_props.sopp_wroff;
2150 2224 maxblk = so->so_proto_props.sopp_maxblk;
2151 2225 extra = wroff + so->so_proto_props.sopp_tail;
2152 2226 } else {
2153 2227 wroff = (int)(stp->sd_wroff);
2154 2228 maxblk = (int)(stp->sd_maxblk);
2155 2229 extra = wroff + (int)(stp->sd_tail);
2156 2230 }
2157 2231 }
2158 2232
2159 2233 while ((size != 0) && (sr->sr_write_error == 0)) {
2160 2234
2161 2235 iosize = (int)MIN(sr->sr_maxpsz, size);
2162 2236
2163 2237 /*
2164 2238 * Socket filters can limit the mblk size,
2165 2239 * so limit reads to maxblk if there are
2166 2240 * filters present.
2167 2241 */
2168 2242 if (vp->v_type == VSOCK &&
2169 2243 so->so_filter_active > 0 && maxblk != INFPSZ)
2170 2244 iosize = (int)MIN(iosize, maxblk);
2171 2245
2172 2246 if (is_system_labeled()) {
2173 2247 mp = allocb_cred(iosize + extra, CRED(),
2174 2248 curproc->p_pid);
2175 2249 } else {
2176 2250 mp = allocb(iosize + extra, BPRI_MED);
2177 2251 }
2178 2252 if (mp == NULL) {
2179 2253 error = EAGAIN;
2180 2254 break;
2181 2255 }
2182 2256
2183 2257 mp->b_rptr += wroff;
2184 2258
2185 2259 ret_size = soreadfile(fp, mp->b_rptr, fileoff, &error, iosize);
2186 2260
2187 2261 /* Error or Reached EOF ? */
2188 2262 if ((error != 0) || (ret_size == 0)) {
2189 2263 freeb(mp);
2190 2264 break;
2191 2265 }
2192 2266 mp->b_wptr = mp->b_rptr + ret_size;
2193 2267
2194 2268 snf_enque(sr, mp);
2195 2269 size -= ret_size;
2196 2270 fileoff += ret_size;
2197 2271 }
2198 2272 (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_OFF, 0,
2199 2273 kcred, NULL, NULL);
2200 2274 mutex_enter(&sr->sr_lock);
2201 2275 sr->sr_read_error = error;
2202 2276 sr->sr_read_error |= SR_READ_DONE;
2203 2277 cv_signal(&sr->sr_cv);
2204 2278 mutex_exit(&sr->sr_lock);
2205 2279 }
2206 2280
/*
 * Service thread for the sendfile queue (snfq): repeatedly pulls a queued
 * snf_req_t and runs snf_async_read() on it.  Threads are created on
 * demand by create_thread() and retire after idling for snfq_timeout.
 */
void
snf_async_thread(void)
{
	snf_req_t *sr;
	callb_cpr_t cprinfo;
	clock_t time_left = 1;

	/* Register with CPR so we can be suspended safely while idle. */
	CALLB_CPR_INIT(&cprinfo, &snfq->snfq_lock, callb_generic_cpr, "snfq");

	mutex_enter(&snfq->snfq_lock);
	for (;;) {
		/*
		 * If we didn't find an entry, then block until woken up
		 * again and then look through the queues again.
		 */
		while ((sr = snfq->snfq_req_head) == NULL) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			/* Timed out with no work: retire this thread. */
			if (time_left <= 0) {
				snfq->snfq_svc_threads--;
				CALLB_CPR_EXIT(&cprinfo);
				thread_exit();
				/* NOTREACHED */
			}
			snfq->snfq_idle_cnt++;

			time_left = cv_reltimedwait(&snfq->snfq_cv,
			    &snfq->snfq_lock, snfq_timeout, TR_CLOCK_TICK);
			snfq->snfq_idle_cnt--;

			CALLB_CPR_SAFE_END(&cprinfo, &snfq->snfq_lock);
		}
		/* Unlink the request and service it with the lock dropped. */
		snfq->snfq_req_head = sr->sr_next;
		snfq->snfq_req_cnt--;
		mutex_exit(&snfq->snfq_lock);
		snf_async_read(sr);
		mutex_enter(&snfq->snfq_lock);
	}
}
2245 2319
2246 2320
/*
 * Allocates and queues a sendfile read request (snf_req_t) covering
 * [fileoff, fileoff + size) of fp's file, spawning an additional service
 * thread if the existing ones are all busy (bounded by snfq_max_threads).
 * Returns the queued request; the caller consumes the produced mblks via
 * snf_deque() and is responsible for freeing the request.
 */
snf_req_t *
create_thread(int operation, struct vnode *vp, file_t *fp,
    u_offset_t fileoff, u_offset_t size)
{
	snf_req_t *sr;
	stdata_t *stp;

	sr = (snf_req_t *)kmem_zalloc(sizeof (snf_req_t), KM_SLEEP);

	sr->sr_vp = vp;
	sr->sr_fp = fp;
	stp = vp->v_stream;

	/*
	 * store sd_qn_maxpsz into sr_maxpsz while we have stream head.
	 * stream might be closed before thread returns from snf_async_read.
	 */
	if (stp != NULL && stp->sd_qn_maxpsz > 0) {
		sr->sr_maxpsz = MIN(MAXBSIZE, stp->sd_qn_maxpsz);
	} else {
		sr->sr_maxpsz = MAXBSIZE;
	}

	sr->sr_operation = operation;
	sr->sr_file_off = fileoff;
	sr->sr_file_size = size;
	sr->sr_hiwat = sendfile_req_hiwat;
	sr->sr_lowat = sendfile_req_lowat;
	mutex_init(&sr->sr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&sr->sr_cv, NULL, CV_DEFAULT, NULL);
	/*
	 * See whether we need another thread for servicing this
	 * request. If there are already enough requests queued
	 * for the threads, create one if not exceeding
	 * snfq_max_threads.
	 */
	mutex_enter(&snfq->snfq_lock);
	if (snfq->snfq_req_cnt >= snfq->snfq_idle_cnt &&
	    snfq->snfq_svc_threads < snfq->snfq_max_threads) {
		(void) thread_create(NULL, 0, &snf_async_thread, 0, 0, &p0,
		    TS_RUN, minclsyspri);
		snfq->snfq_svc_threads++;
	}
	/* Append to the queue, waking an idle service thread if it was empty. */
	if (snfq->snfq_req_head == NULL) {
		snfq->snfq_req_head = snfq->snfq_req_tail = sr;
		cv_signal(&snfq->snfq_cv);
	} else {
		snfq->snfq_req_tail->sr_next = sr;
		snfq->snfq_req_tail = sr;
	}
	snfq->snfq_req_cnt++;
	mutex_exit(&snfq->snfq_lock);
	return (sr);
}
2301 2375
/*
 * Direct I/O (zero copy) sendfile: hand the file range to an async reader
 * (create_thread()) and sink the mblks it produces into the socket via
 * socket_sendmblk().  Always waits for the reader to finish before
 * returning, freeing any mblks left queued on error.  *count is set to
 * the number of bytes handed to the transport.  A write error takes
 * precedence over a read error (see the big theory statement above).
 */
int
snf_direct_io(file_t *fp, file_t *rfp, u_offset_t fileoff, u_offset_t size,
    ssize_t *count)
{
	snf_req_t *sr;
	mblk_t *mp;
	int iosize;
	int error = 0;
	short fflag;
	struct vnode *vp;
	int ksize;
	struct nmsghdr msg;

	ksize = 0;
	*count = 0;
	bzero(&msg, sizeof (msg));

	vp = fp->f_vnode;
	fflag = fp->f_flag;
	if ((sr = create_thread(READ_OP, vp, rfp, fileoff, size)) == NULL)
		return (EAGAIN);

	/*
	 * We check for read error in snf_deque. It has to check
	 * for successful READ_DONE and return NULL, and we might
	 * as well make an additional check there.
	 */
	while ((mp = snf_deque(sr)) != NULL) {

		if (ISSIG(curthread, JUSTLOOKING)) {
			freeb(mp);
			error = EINTR;
			break;
		}
		/* Remember the length now: socket_sendmblk() consumes mp. */
		iosize = MBLKL(mp);

		error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);

		if (error != 0) {
			if (mp != NULL)
				freeb(mp);
			break;
		}
		ksize += iosize;
	}
	*count = ksize;

	mutex_enter(&sr->sr_lock);
	sr->sr_write_error = error;
	/* Look at the big comments on why we cv_signal here. */
	cv_signal(&sr->sr_cv);

	/* Wait for the reader to complete always. */
	while (!(sr->sr_read_error & SR_READ_DONE)) {
		cv_wait(&sr->sr_cv, &sr->sr_lock);
	}
	/* If there is no write error, check for read error. */
	if (error == 0)
		error = (sr->sr_read_error & ~SR_READ_DONE);

	if (error != 0) {
		mblk_t *next_mp;

		/* Free any mblks the reader produced that were never sent. */
		mp = sr->sr_mp_head;
		while (mp != NULL) {
			next_mp = mp->b_next;
			mp->b_next = NULL;
			freeb(mp);
			mp = next_mp;
		}
	}
	mutex_exit(&sr->sr_lock);
	kmem_free(sr, sizeof (snf_req_t));
	return (error);
}
2377 2451
/* Maximum no.of pages allocated by vpm for sendfile at a time */
#define SNF_VPMMAXPGS (VPMMAXPGS/2)

/*
 * Maximum no.of elements in the list returned by vpm, including
 * NULL for the last entry
 */
#define SNF_MAXVMAPS (SNF_VPMMAXPGS + 1)

/*
 * Free-routine state shared by all mblks built from one vpm_map_pages()
 * call; the mappings are torn down when the last reference is dropped
 * (see snf_vmap_desbfree()).
 */
typedef struct {
	unsigned int snfv_ref;		/* one reference per mblk in the chain */
	frtn_t snfv_frtn;		/* esballoca() free routine and arg */
	vnode_t *snfv_vp;		/* held file vnode */
	struct vmap snfv_vml[SNF_MAXVMAPS];	/* vpm mappings, NULL-terminated */
} snf_vmap_desbinfo;

/*
 * Free-routine state for a single segmap-backed mblk (see
 * snf_smap_desbfree()).
 */
typedef struct {
	frtn_t snfi_frtn;		/* esballoca() free routine and arg */
	caddr_t snfi_base;		/* base address of the segmap slot */
	uint_t snfi_mapoff;		/* offset of the data within the slot */
	size_t snfi_len;		/* softlocked length, page granularity */
	vnode_t *snfi_vp;		/* held file vnode */
} snf_smap_desbinfo;
2401 2475
2402 2476 /*
2403 2477 * The callback function used for vpm mapped mblks called when the last ref of
2404 2478 * the mblk is dropped which normally occurs when TCP receives the ack. But it
2405 2479 * can be the driver too due to lazy reclaim.
2406 2480 */
2407 2481 void
2408 2482 snf_vmap_desbfree(snf_vmap_desbinfo *snfv)
2409 2483 {
2410 2484 ASSERT(snfv->snfv_ref != 0);
2411 2485 if (atomic_dec_32_nv(&snfv->snfv_ref) == 0) {
2412 2486 vpm_unmap_pages(snfv->snfv_vml, S_READ);
2413 2487 VN_RELE(snfv->snfv_vp);
2414 2488 kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2415 2489 }
2416 2490 }
2417 2491
2418 2492 /*
2419 2493 * The callback function used for segmap'ped mblks called when the last ref of
2420 2494 * the mblk is dropped which normally occurs when TCP receives the ack. But it
2421 2495 * can be the driver too due to lazy reclaim.
2422 2496 */
2423 2497 void
2424 2498 snf_smap_desbfree(snf_smap_desbinfo *snfi)
2425 2499 {
2426 2500 if (! IS_KPM_ADDR(snfi->snfi_base)) {
2427 2501 /*
2428 2502 * We don't need to call segmap_fault(F_SOFTUNLOCK) for
2429 2503 * segmap_kpm as long as the latter never falls back to
2430 2504 * "use_segmap_range". (See segmap_getmapflt().)
2431 2505 *
2432 2506 * Using S_OTHER saves an redundant hat_setref() in
2433 2507 * segmap_unlock()
2434 2508 */
2435 2509 (void) segmap_fault(kas.a_hat, segkmap,
2436 2510 (caddr_t)(uintptr_t)(((uintptr_t)snfi->snfi_base +
2437 2511 snfi->snfi_mapoff) & PAGEMASK), snfi->snfi_len,
2438 2512 F_SOFTUNLOCK, S_OTHER);
2439 2513 }
2440 2514 (void) segmap_release(segkmap, snfi->snfi_base, SM_DONTNEED);
2441 2515 VN_RELE(snfi->snfi_vp);
2442 2516 kmem_free(snfi, sizeof (*snfi));
2443 2517 }
2444 2518
2445 2519 /*
2446 2520 * Use segmap or vpm instead of bcopy to send down a desballoca'ed, mblk.
2447 2521 * When segmap is used, the mblk contains a segmap slot of no more
2448 2522 * than MAXBSIZE.
2449 2523 *
2450 2524 * With vpm, a maximum of SNF_MAXVMAPS page-sized mappings can be obtained
2451 2525 * in each iteration and sent by socket_sendmblk until an error occurs or
2452 2526 * the requested size has been transferred. An mblk is esballoca'ed from
2453 2527 * each mapped page and a chain of these mblk is sent to the transport layer.
2454 2528 * vpm will be called to unmap the pages when all mblks have been freed by
2455 2529 * free_func.
2456 2530 *
2457 2531 * At the end of the whole sendfile() operation, we wait till the data from
2458 2532 * the last mblk is ack'ed by the transport before returning so that the
2459 2533 * caller of sendfile() can safely modify the file content.
2460 2534 */
2461 2535 int
2462 2536 snf_segmap(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t total_size,
2463 2537 ssize_t *count, boolean_t nowait)
2464 2538 {
2465 2539 caddr_t base;
2466 2540 int mapoff;
2467 2541 vnode_t *vp;
2468 2542 mblk_t *mp = NULL;
2469 2543 int chain_size;
2470 2544 int error;
2471 2545 clock_t deadlk_wait;
2472 2546 short fflag;
2473 2547 int ksize;
2474 2548 struct vattr va;
2475 2549 boolean_t dowait = B_FALSE;
2476 2550 struct nmsghdr msg;
2477 2551
2478 2552 vp = fp->f_vnode;
2479 2553 fflag = fp->f_flag;
2480 2554 ksize = 0;
2481 2555 bzero(&msg, sizeof (msg));
2482 2556
2483 2557 for (;;) {
2484 2558 if (ISSIG(curthread, JUSTLOOKING)) {
2485 2559 error = EINTR;
2486 2560 break;
2487 2561 }
2488 2562
2489 2563 if (vpm_enable) {
2490 2564 snf_vmap_desbinfo *snfv;
2491 2565 mblk_t *nmp;
2492 2566 int mblk_size;
2493 2567 int maxsize;
2494 2568 int i;
2495 2569
2496 2570 mapoff = fileoff & PAGEOFFSET;
2497 2571 maxsize = MIN((SNF_VPMMAXPGS * PAGESIZE), total_size);
2498 2572
2499 2573 snfv = kmem_zalloc(sizeof (snf_vmap_desbinfo),
2500 2574 KM_SLEEP);
2501 2575
2502 2576 /*
2503 2577 * Get vpm mappings for maxsize with read access.
2504 2578 * If the pages aren't available yet, we get
2505 2579 * DEADLK, so wait and try again a little later using
2506 2580 * an increasing wait. We might be here a long time.
2507 2581 *
2508 2582 * If delay_sig returns EINTR, be sure to exit and
2509 2583 * pass it up to the caller.
2510 2584 */
2511 2585 deadlk_wait = 0;
2512 2586 while ((error = vpm_map_pages(fvp, fileoff,
2513 2587 (size_t)maxsize, (VPM_FETCHPAGE), snfv->snfv_vml,
2514 2588 SNF_MAXVMAPS, NULL, S_READ)) == EDEADLK) {
2515 2589 deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2516 2590 if ((error = delay_sig(deadlk_wait)) != 0) {
2517 2591 break;
2518 2592 }
2519 2593 }
2520 2594 if (error != 0) {
2521 2595 kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2522 2596 error = (error == EINTR) ? EINTR : EIO;
2523 2597 goto out;
2524 2598 }
2525 2599 snfv->snfv_frtn.free_func = snf_vmap_desbfree;
2526 2600 snfv->snfv_frtn.free_arg = (caddr_t)snfv;
2527 2601
2528 2602 /* Construct the mblk chain from the page mappings */
2529 2603 chain_size = 0;
2530 2604 for (i = 0; (snfv->snfv_vml[i].vs_addr != NULL) &&
2531 2605 total_size > 0; i++) {
2532 2606 ASSERT(chain_size < maxsize);
2533 2607 mblk_size = MIN(snfv->snfv_vml[i].vs_len -
2534 2608 mapoff, total_size);
2535 2609 nmp = esballoca(
2536 2610 (uchar_t *)snfv->snfv_vml[i].vs_addr +
2537 2611 mapoff, mblk_size, BPRI_HI,
2538 2612 &snfv->snfv_frtn);
2539 2613
2540 2614 /*
2541 2615 * We return EAGAIN after unmapping the pages
2542 2616 * if we cannot allocate the the head of the
2543 2617 * chain. Otherwise, we continue sending the
2544 2618 * mblks constructed so far.
2545 2619 */
2546 2620 if (nmp == NULL) {
2547 2621 if (i == 0) {
2548 2622 vpm_unmap_pages(snfv->snfv_vml,
2549 2623 S_READ);
2550 2624 kmem_free(snfv,
2551 2625 sizeof (snf_vmap_desbinfo));
2552 2626 error = EAGAIN;
2553 2627 goto out;
2554 2628 }
2555 2629 break;
2556 2630 }
2557 2631 /* Mark this dblk with the zero-copy flag */
2558 2632 nmp->b_datap->db_struioflag |= STRUIO_ZC;
2559 2633 nmp->b_wptr += mblk_size;
2560 2634 chain_size += mblk_size;
2561 2635 fileoff += mblk_size;
2562 2636 total_size -= mblk_size;
2563 2637 snfv->snfv_ref++;
2564 2638 mapoff = 0;
2565 2639 if (i > 0)
2566 2640 linkb(mp, nmp);
2567 2641 else
2568 2642 mp = nmp;
2569 2643 }
2570 2644 VN_HOLD(fvp);
2571 2645 snfv->snfv_vp = fvp;
2572 2646 } else {
2573 2647 /* vpm not supported. fallback to segmap */
2574 2648 snf_smap_desbinfo *snfi;
2575 2649
2576 2650 mapoff = fileoff & MAXBOFFSET;
2577 2651 chain_size = MAXBSIZE - mapoff;
2578 2652 if (chain_size > total_size)
2579 2653 chain_size = total_size;
2580 2654 /*
2581 2655 * we don't forcefault because we'll call
2582 2656 * segmap_fault(F_SOFTLOCK) next.
2583 2657 *
2584 2658 * S_READ will get the ref bit set (by either
2585 2659 * segmap_getmapflt() or segmap_fault()) and page
2586 2660 * shared locked.
2587 2661 */
2588 2662 base = segmap_getmapflt(segkmap, fvp, fileoff,
2589 2663 chain_size, segmap_kpm ? SM_FAULT : 0, S_READ);
2590 2664
2591 2665 snfi = kmem_alloc(sizeof (*snfi), KM_SLEEP);
2592 2666 snfi->snfi_len = (size_t)roundup(mapoff+chain_size,
2593 2667 PAGESIZE)- (mapoff & PAGEMASK);
2594 2668 /*
2595 2669 * We must call segmap_fault() even for segmap_kpm
2596 2670 * because that's how error gets returned.
2597 2671 * (segmap_getmapflt() never fails but segmap_fault()
2598 2672 * does.)
2599 2673 *
2600 2674 * If the pages aren't available yet, we get
2601 2675 * DEADLK, so wait and try again a little later using
2602 2676 * an increasing wait. We might be here a long time.
2603 2677 *
2604 2678 * If delay_sig returns EINTR, be sure to exit and
2605 2679 * pass it up to the caller.
2606 2680 */
2607 2681 deadlk_wait = 0;
2608 2682 while ((error = FC_ERRNO(segmap_fault(kas.a_hat,
2609 2683 segkmap, (caddr_t)(uintptr_t)(((uintptr_t)base +
2610 2684 mapoff) & PAGEMASK), snfi->snfi_len, F_SOFTLOCK,
2611 2685 S_READ))) == EDEADLK) {
2612 2686 deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2613 2687 if ((error = delay_sig(deadlk_wait)) != 0) {
2614 2688 break;
2615 2689 }
2616 2690 }
2617 2691 if (error != 0) {
2618 2692 (void) segmap_release(segkmap, base, 0);
2619 2693 kmem_free(snfi, sizeof (*snfi));
2620 2694 error = (error == EINTR) ? EINTR : EIO;
2621 2695 goto out;
2622 2696 }
2623 2697 snfi->snfi_frtn.free_func = snf_smap_desbfree;
2624 2698 snfi->snfi_frtn.free_arg = (caddr_t)snfi;
2625 2699 snfi->snfi_base = base;
2626 2700 snfi->snfi_mapoff = mapoff;
2627 2701 mp = esballoca((uchar_t *)base + mapoff, chain_size,
2628 2702 BPRI_HI, &snfi->snfi_frtn);
2629 2703
2630 2704 if (mp == NULL) {
2631 2705 (void) segmap_fault(kas.a_hat, segkmap,
2632 2706 (caddr_t)(uintptr_t)(((uintptr_t)base +
2633 2707 mapoff) & PAGEMASK), snfi->snfi_len,
2634 2708 F_SOFTUNLOCK, S_OTHER);
2635 2709 (void) segmap_release(segkmap, base, 0);
2636 2710 kmem_free(snfi, sizeof (*snfi));
2637 2711 freemsg(mp);
2638 2712 error = EAGAIN;
2639 2713 goto out;
2640 2714 }
2641 2715 VN_HOLD(fvp);
2642 2716 snfi->snfi_vp = fvp;
2643 2717 mp->b_wptr += chain_size;
2644 2718
2645 2719 /* Mark this dblk with the zero-copy flag */
2646 2720 mp->b_datap->db_struioflag |= STRUIO_ZC;
2647 2721 fileoff += chain_size;
2648 2722 total_size -= chain_size;
2649 2723 }
2650 2724
2651 2725 if (total_size == 0 && !nowait) {
2652 2726 ASSERT(!dowait);
2653 2727 dowait = B_TRUE;
2654 2728 mp->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
2655 2729 }
2656 2730 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2657 2731 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2658 2732 if (error != 0) {
2659 2733 /*
2660 2734 * mp contains the mblks that were not sent by
2661 2735 * socket_sendmblk. Use its size to update *count
2662 2736 */
2663 2737 *count = ksize + (chain_size - msgdsize(mp));
2664 2738 if (mp != NULL)
2665 2739 freemsg(mp);
2666 2740 return (error);
2667 2741 }
2668 2742 ksize += chain_size;
2669 2743 if (total_size == 0)
2670 2744 goto done;
2671 2745
2672 2746 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2673 2747 va.va_mask = AT_SIZE;
2674 2748 error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2675 2749 if (error)
2676 2750 break;
2677 2751 /* Read as much as possible. */
2678 2752 if (fileoff >= va.va_size)
2679 2753 break;
2680 2754 if (total_size + fileoff > va.va_size)
2681 2755 total_size = va.va_size - fileoff;
2682 2756 }
2683 2757 out:
2684 2758 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2685 2759 done:
2686 2760 *count = ksize;
2687 2761 if (dowait) {
2688 2762 stdata_t *stp;
2689 2763
2690 2764 stp = vp->v_stream;
2691 2765 if (stp == NULL) {
2692 2766 struct sonode *so;
2693 2767 so = VTOSO(vp);
2694 2768 error = so_zcopy_wait(so);
2695 2769 } else {
2696 2770 mutex_enter(&stp->sd_lock);
2697 2771 while (!(stp->sd_flag & STZCNOTIFY)) {
2698 2772 if (cv_wait_sig(&stp->sd_zcopy_wait,
2699 2773 &stp->sd_lock) == 0) {
2700 2774 error = EINTR;
2701 2775 break;
2702 2776 }
2703 2777 }
2704 2778 stp->sd_flag &= ~STZCNOTIFY;
2705 2779 mutex_exit(&stp->sd_lock);
2706 2780 }
2707 2781 }
2708 2782 return (error);
2709 2783 }
2710 2784
/*
 * sendfile() via the page cache: read up to maxpsz bytes at a time from
 * fvp with VOP_READ() into a freshly allocated mblk and send it down the
 * socket with socket_sendmblk(), revalidating the file size after every
 * chunk.  *count is set to the number of bytes sent.  The caller is
 * expected to hold fvp's rwlock on entry (we unlock before the first
 * send); it is released by the time this function returns.
 */
int
snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
    uint_t maxpsz, ssize_t *count)
{
	struct vnode *vp;
	mblk_t *mp;
	int iosize;
	int extra = 0;
	int error;
	short fflag;
	int ksize;
	int ioflag;
	struct uio auio;
	struct iovec aiov;
	struct vattr va;
	int maxblk = 0;
	int wroff = 0;
	struct sonode *so;
	struct nmsghdr msg;

	vp = fp->f_vnode;
	if (vp->v_type == VSOCK) {
		stdata_t *stp;

		/*
		 * Get the extra space to insert a header and a trailer.
		 */
		so = VTOSO(vp);
		stp = vp->v_stream;
		if (stp == NULL) {
			wroff = so->so_proto_props.sopp_wroff;
			maxblk = so->so_proto_props.sopp_maxblk;
			extra = wroff + so->so_proto_props.sopp_tail;
		} else {
			wroff = (int)(stp->sd_wroff);
			maxblk = (int)(stp->sd_maxblk);
			extra = wroff + (int)(stp->sd_tail);
		}
	}
	bzero(&msg, sizeof (msg));
	fflag = fp->f_flag;
	ksize = 0;
	/* Single-iovec kernel uio reused for every VOP_READ below. */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_llimit = MAXOFFSET_T;
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_CACHED;
	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
	/* If read sync is not asked for, filter sync flags */
	if ((ioflag & FRSYNC) == 0)
		ioflag &= ~(FSYNC|FDSYNC);
	for (;;) {
		if (ISSIG(curthread, JUSTLOOKING)) {
			error = EINTR;
			break;
		}
		iosize = (int)MIN(maxpsz, size);

		/*
		 * Socket filters can limit the mblk size,
		 * so limit reads to maxblk if there are
		 * filters present.
		 */
		if (vp->v_type == VSOCK &&
		    so->so_filter_active > 0 && maxblk != INFPSZ)
			iosize = (int)MIN(iosize, maxblk);

		if (is_system_labeled()) {
			mp = allocb_cred(iosize + extra, CRED(),
			    curproc->p_pid);
		} else {
			mp = allocb(iosize + extra, BPRI_MED);
		}
		if (mp == NULL) {
			error = EAGAIN;
			break;
		}

		/* Leave room for a protocol header. */
		mp->b_rptr += wroff;

		aiov.iov_base = (caddr_t)mp->b_rptr;
		aiov.iov_len = iosize;
		auio.uio_loffset = fileoff;
		auio.uio_resid = iosize;

		error = VOP_READ(fvp, &auio, ioflag, fp->f_cred, NULL);
		iosize -= auio.uio_resid;

		/* An interrupted but partially successful read still counts. */
		if (error == EINTR && iosize != 0)
			error = 0;

		if (error != 0 || iosize == 0) {
			freeb(mp);
			break;
		}
		mp->b_wptr = mp->b_rptr + iosize;

		VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);

		error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);

		if (error != 0) {
			*count = ksize;
			if (mp != NULL)
				freeb(mp);
			return (error);
		}
		ksize += iosize;
		size -= iosize;
		if (size == 0)
			goto done;

		fileoff += iosize;
		(void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
		va.va_mask = AT_SIZE;
		error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
		if (error)
			break;
		/* Read as much as possible. */
		if (fileoff >= va.va_size)
			size = 0;
		else if (size + fileoff > va.va_size)
			size = va.va_size - fileoff;
	}
	VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
done:
	*count = ksize;
	return (error);
}
2841 2915
2842 2916 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
2843 2917 /*
2844 2918 * Largefile support for 32 bit applications only.
2845 2919 */
/*
 * Largefile sendfile support for 32-bit callers: transmit sfv->sfv_len
 * bytes of the file underlying rfp, starting at 64-bit offset
 * sfv->sfv_off, to the socket/stream behind fp.
 *
 * Returns 0 or an errno; the byte count actually transferred is stored
 * in *count32.  On every path the hold on sfv->sfv_fd is dropped via
 * releasef() before returning.
 */
int
sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv,
    ssize32_t *count32)
{
	ssize32_t sfv_len;
	u_offset_t sfv_off, va_size;
	struct vnode *vp, *fvp, *realvp;
	struct vattr va;
	stdata_t *stp;
	ssize_t count = 0;
	int error = 0;
	boolean_t dozcopy = B_FALSE;
	uint_t maxpsz;

	sfv_len = (ssize32_t)sfv->sfv_len;
	if (sfv_len < 0) {
		error = EINVAL;
		goto out;
	}

	/* Zero-length request: nothing to do, succeed with count 0. */
	if (sfv_len == 0) goto out;

	sfv_off = (u_offset_t)sfv->sfv_off;

	/* Same checks as in pread */
	if (sfv_off > MAXOFFSET_T) {
		error = EINVAL;
		goto out;
	}
	/* Clamp the request so offset + length cannot pass MAXOFFSET_T. */
	if (sfv_off + sfv_len > MAXOFFSET_T)
		sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

	/*
	 * There are no more checks on sfv_len. So, we cast it to
	 * u_offset_t and share the snf_direct_io/snf_cache code between
	 * 32 bit and 64 bit.
	 *
	 * TODO: should do nbl_need_check() like read()?
	 */
	if (sfv_len > sendfile_max_size) {
		/*
		 * Request too large to go through the cached path;
		 * hand it to the direct-I/O implementation instead.
		 */
		sf_stats.ss_file_not_cached++;
		error = snf_direct_io(fp, rfp, sfv_off, (u_offset_t)sfv_len,
		    &count);
		goto out;
	}
	/* Operate on the real underlying vnode (e.g. through lofs). */
	fvp = rfp->f_vnode;
	if (VOP_REALVP(fvp, &realvp, NULL) == 0)
		fvp = realvp;
	/*
	 * Grab the lock as a reader to prevent the file size
	 * from changing underneath.
	 */
	(void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
	va.va_mask = AT_SIZE;
	error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
	va_size = va.va_size;
	if ((error != 0) || (va_size == 0) || (sfv_off >= va_size)) {
		VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
		goto out;
	}
	/* Read as much as possible. */
	if (sfv_off + sfv_len > va_size)
		sfv_len = va_size - sfv_off;

	vp = fp->f_vnode;
	stp = vp->v_stream;
	/*
	 * When the NOWAIT flag is not set, we enable zero-copy only if the
	 * transfer size is large enough. This prevents performance loss
	 * when the caller sends the file piece by piece.
	 */
	if (sfv_len >= MAXBSIZE && (sfv_len >= (va_size >> 1) ||
	    (sfv->sfv_flag & SFV_NOWAIT) || sfv_len >= 0x1000000) &&
	    !vn_has_flocks(fvp) && !(fvp->v_flag & VNOMAP)) {
		uint_t copyflag;
		copyflag = stp != NULL ? stp->sd_copyflag :
		    VTOSO(vp)->so_proto_props.sopp_zcopyflag;
		if ((copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) {
			/*
			 * Zero-copy capability not yet negotiated with the
			 * transport; ask it to avoid copies and enable
			 * zero-copy only if that succeeds.
			 */
			int on = 1;

			if (socket_setsockopt(VTOSO(vp), SOL_SOCKET,
			    SO_SND_COPYAVOID, &on, sizeof (on), CRED()) == 0)
				dozcopy = B_TRUE;
		} else {
			dozcopy = copyflag & STZCVMSAFE;
		}
	}
	/*
	 * NOTE(review): the reader rwlock taken above is still held here;
	 * it appears to be released inside snf_segmap()/snf_cache() —
	 * confirm against those implementations before changing lock flow.
	 */
	if (dozcopy) {
		sf_stats.ss_file_segmap++;
		error = snf_segmap(fp, fvp, sfv_off, (u_offset_t)sfv_len,
		    &count, ((sfv->sfv_flag & SFV_NOWAIT) != 0));
	} else {
		/* Pick a chunk size from the socket/stream max packet size. */
		if (vp->v_type == VSOCK && stp == NULL) {
			sonode_t *so = VTOSO(vp);
			maxpsz = so->so_proto_props.sopp_maxpsz;
		} else if (stp != NULL) {
			maxpsz = stp->sd_qn_maxpsz;
		} else {
			maxpsz = maxphys;
		}

		if (maxpsz == INFPSZ)
			maxpsz = maxphys;
		else
			maxpsz = roundup(maxpsz, MAXBSIZE);
		sf_stats.ss_file_cached++;
		error = snf_cache(fp, fvp, sfv_off, (u_offset_t)sfv_len,
		    maxpsz, &count);
	}
out:
	releasef(sfv->sfv_fd);
	*count32 = (ssize32_t)count;
	return (error);
}
2960 3034 #endif
2961 3035
2962 3036 #ifdef _SYSCALL32_IMPL
2963 3037 /*
2964 3038 * recv32(), recvfrom32(), send32(), sendto32(): intentionally return a
2965 3039 * ssize_t rather than ssize32_t; see the comments above read32 for details.
2966 3040 */
2967 3041
2968 3042 ssize_t
2969 3043 recv32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
2970 3044 {
2971 3045 return (recv(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
2972 3046 }
2973 3047
2974 3048 ssize_t
2975 3049 recvfrom32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
2976 3050 caddr32_t name, caddr32_t namelenp)
2977 3051 {
2978 3052 return (recvfrom(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
2979 3053 (void *)(uintptr_t)name, (void *)(uintptr_t)namelenp));
2980 3054 }
2981 3055
2982 3056 ssize_t
2983 3057 send32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
2984 3058 {
2985 3059 return (send(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
2986 3060 }
2987 3061
2988 3062 ssize_t
2989 3063 sendto32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
2990 3064 caddr32_t name, socklen_t namelen)
2991 3065 {
2992 3066 return (sendto(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
2993 3067 (void *)(uintptr_t)name, namelen));
2994 3068 }
2995 3069 #endif /* _SYSCALL32_IMPL */
2996 3070
2997 3071 /*
2998 3072 * Function wrappers (mostly around the sonode switch) for
2999 3073 * backward compatibility.
3000 3074 */
3001 3075
3002 3076 int
3003 3077 soaccept(struct sonode *so, int fflag, struct sonode **nsop)
3004 3078 {
3005 3079 return (socket_accept(so, fflag, CRED(), nsop));
3006 3080 }
3007 3081
3008 3082 int
3009 3083 sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3010 3084 int backlog, int flags)
3011 3085 {
3012 3086 int error;
3013 3087
3014 3088 error = socket_bind(so, name, namelen, flags, CRED());
3015 3089 if (error == 0 && backlog != 0)
3016 3090 return (socket_listen(so, backlog, CRED()));
3017 3091
3018 3092 return (error);
3019 3093 }
3020 3094
3021 3095 int
3022 3096 solisten(struct sonode *so, int backlog)
3023 3097 {
3024 3098 return (socket_listen(so, backlog, CRED()));
3025 3099 }
3026 3100
3027 3101 int
3028 3102 soconnect(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3029 3103 int fflag, int flags)
3030 3104 {
3031 3105 return (socket_connect(so, name, namelen, fflag, flags, CRED()));
3032 3106 }
3033 3107
3034 3108 int
3035 3109 sorecvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
3036 3110 {
3037 3111 return (socket_recvmsg(so, msg, uiop, CRED()));
3038 3112 }
3039 3113
3040 3114 int
3041 3115 sosendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
3042 3116 {
3043 3117 return (socket_sendmsg(so, msg, uiop, CRED()));
3044 3118 }
3045 3119
3046 3120 int
3047 3121 soshutdown(struct sonode *so, int how)
3048 3122 {
3049 3123 return (socket_shutdown(so, how, CRED()));
3050 3124 }
3051 3125
3052 3126 int
3053 3127 sogetsockopt(struct sonode *so, int level, int option_name, void *optval,
3054 3128 socklen_t *optlenp, int flags)
3055 3129 {
3056 3130 return (socket_getsockopt(so, level, option_name, optval, optlenp,
3057 3131 flags, CRED()));
3058 3132 }
3059 3133
3060 3134 int
3061 3135 sosetsockopt(struct sonode *so, int level, int option_name, const void *optval,
3062 3136 t_uscalar_t optlen)
3063 3137 {
3064 3138 return (socket_setsockopt(so, level, option_name, optval, optlen,
3065 3139 CRED()));
3066 3140 }
3067 3141
3068 3142 /*
3069 3143 * Because this is backward compatibility interface it only needs to be
3070 3144 * able to handle the creation of TPI sockfs sockets.
3071 3145 */
3072 3146 struct sonode *
3073 3147 socreate(struct sockparams *sp, int family, int type, int protocol, int version,
3074 3148 int *errorp)
3075 3149 {
3076 3150 struct sonode *so;
3077 3151
3078 3152 ASSERT(sp != NULL);
3079 3153
3080 3154 so = sp->sp_smod_info->smod_sock_create_func(sp, family, type, protocol,
3081 3155 version, SOCKET_SLEEP, errorp, CRED());
3082 3156 if (so == NULL) {
3083 3157 SOCKPARAMS_DEC_REF(sp);
3084 3158 } else {
3085 3159 if ((*errorp = SOP_INIT(so, NULL, CRED(), SOCKET_SLEEP)) == 0) {
3086 3160 /* Cannot fail, only bumps so_count */
3087 3161 (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, CRED(), NULL);
3088 3162 } else {
3089 3163 socket_destroy(so);
3090 3164 so = NULL;
3091 3165 }
3092 3166 }
3093 3167 return (so);
3094 3168 }
|
↓ open down ↓ |
1712 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX