io-lx-public-vs-joyent Wdiff usr/src/uts/common/fs/sockfs/socksyscalls.c

Print this page

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/sockfs/socksyscalls.c
          +++ new/usr/src/uts/common/fs/sockfs/socksyscalls.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
  25   25   * Copyright 2015, Joyent, Inc.  All rights reserved.
  26   26   */
  27   27  
  28   28  /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */
  29   29  /*
  30   30   * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  31   31   */
  32   32  
  33   33  #include <sys/types.h>
  34   34  #include <sys/t_lock.h>
  35   35  #include <sys/param.h>
  36   36  #include <sys/systm.h>
  37   37  #include <sys/buf.h>
  38   38  #include <sys/conf.h>
  39   39  #include <sys/cred.h>
  40   40  #include <sys/kmem.h>
  41   41  #include <sys/sysmacros.h>
  42   42  #include <sys/vfs.h>
  43   43  #include <sys/vnode.h>
  44   44  #include <sys/debug.h>
  45   45  #include <sys/errno.h>
  46   46  #include <sys/time.h>
  47   47  #include <sys/file.h>
  48   48  #include <sys/user.h>
  49   49  #include <sys/stream.h>
  50   50  #include <sys/strsubr.h>
  51   51  #include <sys/strsun.h>
  52   52  #include <sys/sunddi.h>
  53   53  #include <sys/esunddi.h>
  54   54  #include <sys/flock.h>
  55   55  #include <sys/modctl.h>
  56   56  #include <sys/cmn_err.h>
  57   57  #include <sys/vmsystm.h>
  58   58  #include <sys/policy.h>
  59   59  #include <sys/limits.h>
  60   60  
  61   61  #include <sys/socket.h>
  62   62  #include <sys/socketvar.h>
  63   63  
  64   64  #include <sys/isa_defs.h>
  65   65  #include <sys/inttypes.h>
  66   66  #include <sys/systm.h>
  67   67  #include <sys/cpuvar.h>
  68   68  #include <sys/filio.h>
  69   69  #include <sys/sendfile.h>
  70   70  #include <sys/ddi.h>
  71   71  #include <vm/seg.h>
  72   72  #include <vm/seg_map.h>
  73   73  #include <vm/seg_kpm.h>
  74   74  
  75   75  #include <fs/sockfs/nl7c.h>
  76   76  #include <fs/sockfs/sockcommon.h>
  77   77  #include <fs/sockfs/sockfilter_impl.h>
  78   78  #include <fs/sockfs/socktpi.h>
  79   79  
  80   80  #ifdef SOCK_TEST
           /* Test builds: do_useracc is a variable that can be changed at run time. */
  81   81  int do_useracc = 1;             /* Controlled by setting SO_DEBUG to 4 */
  82   82  #else
           /*
            * Otherwise it is the constant 1, so a failing useracc() check in
            * so_socketpair() and accept() always results in EFAULT.
            */
  83   83  #define do_useracc      1
  84   84  #endif /* SOCK_TEST */
  85   85  
           /*
            * When non-zero, copyout_name() prints a message each time it hands
            * back an address that did not fit in the caller's buffer.
            */
  86   86  extern int      xnet_truncate_print;
  87   87  
  88   88  extern void     nl7c_init(void);
  89   89  extern int      sockfs_defer_nl7c_init;
  90   90  
  91   91  /*
  92   92   * Kernel component of socket creation.
  93   93   *
  94   94   * The socket library determines which version number to use.
  95   95   * First the library calls this with a NULL devpath. If this fails
  96   96   * to find a transport (using solookup) the library will look in /etc/netconfig
  97   97   * for the appropriate transport. If one is found it will pass in the
  98   98   * devpath for the kernel to use.
  99   99   */
 100  100  int
 101  101  so_socket(int family, int type_w_flags, int protocol, char *devpath,
 102  102      int version)
 103  103  {
 104  104          struct sonode *so;
 105  105          vnode_t *vp;
 106  106          struct file *fp;
 107  107          int fd;
 108  108          int error;
 109  109          int type;
 110  110  
 111  111          type = type_w_flags & SOCK_TYPE_MASK;
 112  112          type_w_flags &= ~SOCK_TYPE_MASK;
 113  113          if (type_w_flags & ~(SOCK_CLOEXEC|SOCK_NDELAY|SOCK_NONBLOCK))
 114  114                  return (set_errno(EINVAL));
 115  115  
 116  116          if (devpath != NULL) {
 117  117                  char *buf;
 118  118                  size_t kdevpathlen = 0;
 119  119  
 120  120                  buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 121  121                  if ((error = copyinstr(devpath, buf,
 122  122                      MAXPATHLEN, &kdevpathlen)) != 0) {
 123  123                          kmem_free(buf, MAXPATHLEN);
 124  124                          return (set_errno(error));
 125  125                  }
 126  126                  so = socket_create(family, type, protocol, buf, NULL,
 127  127                      SOCKET_SLEEP, version, CRED(), &error);
 128  128                  kmem_free(buf, MAXPATHLEN);
 129  129          } else {
 130  130                  so = socket_create(family, type, protocol, NULL, NULL,
 131  131                      SOCKET_SLEEP, version, CRED(), &error);
 132  132          }
 133  133          if (so == NULL)
 134  134                  return (set_errno(error));
 135  135  
 136  136          /* Allocate a file descriptor for the socket */
 137  137          vp = SOTOV(so);
 138  138          if (error = falloc(vp, FWRITE|FREAD, &fp, &fd)) {
 139  139                  (void) socket_close(so, 0, CRED());
 140  140                  socket_destroy(so);
 141  141                  return (set_errno(error));
 142  142          }
 143  143  
 144  144          /*
 145  145           * Now fill in the entries that falloc reserved
 146  146           */
 147  147          if (type_w_flags & SOCK_NDELAY) {
 148  148                  so->so_state |= SS_NDELAY;
 149  149                  fp->f_flag |= FNDELAY;
 150  150          }
 151  151          if (type_w_flags & SOCK_NONBLOCK) {
 152  152                  so->so_state |= SS_NONBLOCK;
 153  153                  fp->f_flag |= FNONBLOCK;
 154  154          }
 155  155          mutex_exit(&fp->f_tlock);
 156  156          setf(fd, fp);
 157  157          if ((type_w_flags & SOCK_CLOEXEC) != 0) {
 158  158                  f_setfd(fd, FD_CLOEXEC);
 159  159          }
 160  160  
 161  161          return (fd);
 162  162  }
 163  163  
 164  164  /*
 165  165   * Map from a file descriptor to a socket node.
 166  166   * Returns with the file descriptor held i.e. the caller has to
 167  167   * use releasef when done with the file descriptor.
 168  168   */
 169  169  struct sonode *
 170  170  getsonode(int sock, int *errorp, file_t **fpp)
 171  171  {
 172  172          file_t *fp;
 173  173          vnode_t *vp;
 174  174          struct sonode *so;
 175  175  
 176  176          if ((fp = getf(sock)) == NULL) {
 177  177                  *errorp = EBADF;
 178  178                  eprintline(*errorp);
 179  179                  return (NULL);
 180  180          }
 181  181          vp = fp->f_vnode;
 182  182          /* Check if it is a socket */
 183  183          if (vp->v_type != VSOCK) {
 184  184                  releasef(sock);
 185  185                  *errorp = ENOTSOCK;
 186  186                  eprintline(*errorp);
 187  187                  return (NULL);
 188  188          }
 189  189          /*
 190  190           * Use the stream head to find the real socket vnode.
 191  191           * This is needed when namefs sits above sockfs.
 192  192           */
 193  193          if (vp->v_stream) {
 194  194                  ASSERT(vp->v_stream->sd_vnode);
 195  195                  vp = vp->v_stream->sd_vnode;
 196  196  
 197  197                  so = VTOSO(vp);
 198  198                  if (so->so_version == SOV_STREAM) {
 199  199                          releasef(sock);
 200  200                          *errorp = ENOTSOCK;
 201  201                          eprintsoline(so, *errorp);
 202  202                          return (NULL);
 203  203                  }
 204  204          } else {
 205  205                  so = VTOSO(vp);
 206  206          }
 207  207          if (fpp)
 208  208                  *fpp = fp;
 209  209          return (so);
 210  210  }
 211  211  
 212  212  /*
 213  213   * Allocate and copyin a sockaddr.
 214  214   * Ensures NULL termination for AF_UNIX addresses by extending them
 215  215   * with one NULL byte if need be. Verifies that the length is not
 216  216   * excessive to prevent an application from consuming all of kernel
 217  217   * memory. Returns NULL when an error occurred.
 218  218   */
 219  219  static struct sockaddr *
 220  220  copyin_name(struct sonode *so, struct sockaddr *name, socklen_t *namelenp,
 221  221              int *errorp)
 222  222  {
 223  223          char    *faddr;
 224  224          size_t  namelen = (size_t)*namelenp;
 225  225  
 226  226          ASSERT(namelen != 0);
 227  227          if (namelen > SO_MAXARGSIZE) {
 228  228                  *errorp = EINVAL;
 229  229                  eprintsoline(so, *errorp);
 230  230                  return (NULL);
 231  231          }
 232  232  
 233  233          faddr = (char *)kmem_alloc(namelen, KM_SLEEP);
 234  234          if (copyin(name, faddr, namelen)) {
 235  235                  kmem_free(faddr, namelen);
 236  236                  *errorp = EFAULT;
 237  237                  eprintsoline(so, *errorp);
 238  238                  return (NULL);
 239  239          }
 240  240  
 241  241          /*
 242  242           * Add space for NULL termination if needed.
 243  243           * Do a quick check if the last byte is NUL.
 244  244           */
 245  245          if (so->so_family == AF_UNIX && faddr[namelen - 1] != '\0') {
 246  246                  /* Check if there is any NULL termination */
 247  247                  size_t  i;
 248  248                  int foundnull = 0;
 249  249  
 250  250                  for (i = sizeof (name->sa_family); i < namelen; i++) {
 251  251                          if (faddr[i] == '\0') {
 252  252                                  foundnull = 1;
 253  253                                  break;
 254  254                          }
 255  255                  }
 256  256                  if (!foundnull) {
 257  257                          /* Add extra byte for NUL padding */
 258  258                          char *nfaddr;
 259  259  
 260  260                          nfaddr = (char *)kmem_alloc(namelen + 1, KM_SLEEP);
 261  261                          bcopy(faddr, nfaddr, namelen);
 262  262                          kmem_free(faddr, namelen);
 263  263  
 264  264                          /* NUL terminate */
 265  265                          nfaddr[namelen] = '\0';
 266  266                          namelen++;
 267  267                          ASSERT((socklen_t)namelen == namelen);
 268  268                          *namelenp = (socklen_t)namelen;
 269  269                          faddr = nfaddr;
 270  270                  }
 271  271          }
 272  272          return ((struct sockaddr *)faddr);
 273  273  }
 274  274  
 275  275  /*
 276  276   * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
 277  277   */
 278  278  static int
 279  279  copyout_arg(void *uaddr, socklen_t ulen, void *ulenp,
 280  280                  void *kaddr, socklen_t klen)
 281  281  {
 282  282          if (uaddr != NULL) {
 283  283                  if (ulen > klen)
 284  284                          ulen = klen;
 285  285  
 286  286                  if (ulen != 0) {
 287  287                          if (copyout(kaddr, uaddr, ulen))
 288  288                                  return (EFAULT);
 289  289                  }
 290  290          } else
 291  291                  ulen = 0;
 292  292  
 293  293          if (ulenp != NULL) {
 294  294                  if (copyout(&ulen, ulenp, sizeof (ulen)))
 295  295                          return (EFAULT);
 296  296          }
 297  297          return (0);
 298  298  }
 299  299  
 300  300  /*
 301  301   * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
 302  302   * If klen is greater than ulen it still uses the non-truncated
 303  303   * klen to update ulenp.
 304  304   */
 305  305  static int
 306  306  copyout_name(void *uaddr, socklen_t ulen, void *ulenp,
 307  307                  void *kaddr, socklen_t klen)
 308  308  {
 309  309          if (uaddr != NULL) {
 310  310                  if (ulen >= klen)
 311  311                          ulen = klen;
 312  312                  else if (ulen != 0 && xnet_truncate_print) {
 313  313                          printf("sockfs: truncating copyout of address using "
 314  314                              "XNET semantics for pid = %d. Lengths %d, %d\n",
 315  315                              curproc->p_pid, klen, ulen);
 316  316                  }
 317  317  
 318  318                  if (ulen != 0) {
 319  319                          if (copyout(kaddr, uaddr, ulen))
 320  320                                  return (EFAULT);
 321  321                  } else
 322  322                          klen = 0;
 323  323          } else
 324  324                  klen = 0;
 325  325  
 326  326          if (ulenp != NULL) {
 327  327                  if (copyout(&klen, ulenp, sizeof (klen)))
 328  328                          return (EFAULT);
 329  329          }
 330  330          return (0);
 331  331  }
 332  332  
 333  333  /*
 334  334   * The socketpair() code in libsocket creates two sockets (using
 335  335   * the /etc/netconfig fallback if needed) before calling this routine
 336  336   * to connect the two sockets together.
 337  337   *
 338  338   * For a SOCK_STREAM socketpair a listener is needed - in that case this
 339  339   * routine will create a new file descriptor as part of accepting the
 340  340   * connection. The library socketpair() will check if svs[2] has changed
 341  341   * in which case it will close the changed fd.
 342  342   *
 343  343   * Note that this code could use the TPI feature of accepting the connection
 344  344   * on the listening endpoint. However, that would require significant changes
 345  345   * to soaccept.
 346  346   */
 347  347  int
 348  348  so_socketpair(int sv[2])
 349  349  {
 350  350          int svs[2];
 351  351          struct sonode *so1, *so2;
 352  352          int error;
 353  353          int orig_flags;
 354  354          struct sockaddr_ux *name;
 355  355          size_t namelen;
 356  356          sotpi_info_t *sti1;
 357  357          sotpi_info_t *sti2;
 358  358  
 359  359          dprint(1, ("so_socketpair(%p)\n", (void *)sv));
 360  360  
 361  361          error = useracc(sv, sizeof (svs), B_WRITE);
 362  362          if (error && do_useracc)
 363  363                  return (set_errno(EFAULT));
 364  364  
 365  365          if (copyin(sv, svs, sizeof (svs)))
 366  366                  return (set_errno(EFAULT));
 367  367  
 368  368          if ((so1 = getsonode(svs[0], &error, NULL)) == NULL)
 369  369                  return (set_errno(error));
 370  370  
 371  371          if ((so2 = getsonode(svs[1], &error, NULL)) == NULL) {
 372  372                  releasef(svs[0]);
 373  373                  return (set_errno(error));
 374  374          }
 375  375  
 376  376          if (so1->so_family != AF_UNIX || so2->so_family != AF_UNIX) {
 377  377                  error = EOPNOTSUPP;
 378  378                  goto done;
 379  379          }
 380  380  
 381  381          sti1 = SOTOTPI(so1);
 382  382          sti2 = SOTOTPI(so2);
 383  383  
 384  384          /*
 385  385           * The code below makes assumptions about the "sockfs" implementation.
 386  386           * So make sure that the correct implementation is really used.
 387  387           */
 388  388          ASSERT(so1->so_ops == &sotpi_sonodeops);
 389  389          ASSERT(so2->so_ops == &sotpi_sonodeops);
 390  390  
 391  391          if (so1->so_type == SOCK_DGRAM) {
 392  392                  /*
 393  393                   * Bind both sockets and connect them with each other.
 394  394                   * Need to allocate name/namelen for soconnect.
 395  395                   */
 396  396                  error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC, CRED());
 397  397                  if (error) {
 398  398                          eprintsoline(so1, error);
 399  399                          goto done;
 400  400                  }
 401  401                  error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
 402  402                  if (error) {
 403  403                          eprintsoline(so2, error);
 404  404                          goto done;
 405  405                  }
 406  406                  namelen = sizeof (struct sockaddr_ux);
 407  407                  name = kmem_alloc(namelen, KM_SLEEP);
 408  408                  name->sou_family = AF_UNIX;
 409  409                  name->sou_addr = sti2->sti_ux_laddr;
 410  410                  error = socket_connect(so1,
 411  411                      (struct sockaddr *)name,
 412  412                      (socklen_t)namelen,
 413  413                      0, _SOCONNECT_NOXLATE, CRED());
 414  414                  if (error) {
 415  415                          kmem_free(name, namelen);
 416  416                          eprintsoline(so1, error);
 417  417                          goto done;
 418  418                  }
 419  419                  name->sou_addr = sti1->sti_ux_laddr;
 420  420                  error = socket_connect(so2,
 421  421                      (struct sockaddr *)name,
 422  422                      (socklen_t)namelen,
 423  423                      0, _SOCONNECT_NOXLATE, CRED());
 424  424                  kmem_free(name, namelen);
 425  425                  if (error) {
 426  426                          eprintsoline(so2, error);
 427  427                          goto done;
 428  428                  }
 429  429                  releasef(svs[0]);
 430  430                  releasef(svs[1]);
 431  431          } else {
 432  432                  /*
 433  433                   * Bind both sockets, with so1 being a listener.
 434  434                   * Connect so2 to so1 - nonblocking to avoid waiting for
 435  435                   * soaccept to complete.
 436  436                   * Accept a connection on so1. Pass out the new fd as sv[0].
 437  437                   * The library will detect the changed fd and close
 438  438                   * the original one.
 439  439                   */
 440  440                  struct sonode *nso;
 441  441                  struct vnode *nvp;
 442  442                  struct file *nfp;
 443  443                  int nfd;
 444  444  
 445  445                  /*
 446  446                   * We could simply call socket_listen() here (which would do the
 447  447                   * binding automatically) if the code didn't rely on passing
 448  448                   * _SOBIND_NOXLATE to the TPI implementation of socket_bind().
 449  449                   */
 450  450                  error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC|
 451  451                      _SOBIND_NOXLATE|_SOBIND_LISTEN|_SOBIND_SOCKETPAIR,
 452  452                      CRED());
 453  453                  if (error) {
 454  454                          eprintsoline(so1, error);
 455  455                          goto done;
 456  456                  }
 457  457                  error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
 458  458                  if (error) {
 459  459                          eprintsoline(so2, error);
 460  460                          goto done;
 461  461                  }
 462  462  
 463  463                  namelen = sizeof (struct sockaddr_ux);
 464  464                  name = kmem_alloc(namelen, KM_SLEEP);
 465  465                  name->sou_family = AF_UNIX;
 466  466                  name->sou_addr = sti1->sti_ux_laddr;
 467  467                  error = socket_connect(so2,
 468  468                      (struct sockaddr *)name,
 469  469                      (socklen_t)namelen,
 470  470                      FNONBLOCK, _SOCONNECT_NOXLATE, CRED());
 471  471                  kmem_free(name, namelen);
 472  472                  if (error) {
 473  473                          if (error != EINPROGRESS) {
 474  474                                  eprintsoline(so2, error); goto done;
 475  475                          }
 476  476                  }
 477  477  
 478  478                  error = socket_accept(so1, 0, CRED(), &nso);
 479  479                  if (error) {
 480  480                          eprintsoline(so1, error);
 481  481                          goto done;
 482  482                  }
 483  483  
 484  484                  /* wait for so2 being SS_CONNECTED ignoring signals */
 485  485                  mutex_enter(&so2->so_lock);
 486  486                  error = sowaitconnected(so2, 0, 1);
 487  487                  mutex_exit(&so2->so_lock);
 488  488                  if (error != 0) {
 489  489                          (void) socket_close(nso, 0, CRED());
 490  490                          socket_destroy(nso);
 491  491                          eprintsoline(so2, error);
 492  492                          goto done;
 493  493                  }
 494  494  
 495  495                  nvp = SOTOV(nso);
 496  496                  if (error = falloc(nvp, FWRITE|FREAD, &nfp, &nfd)) {
 497  497                          (void) socket_close(nso, 0, CRED());
 498  498                          socket_destroy(nso);
 499  499                          eprintsoline(nso, error);
 500  500                          goto done;
 501  501                  }
 502  502                  /*
 503  503                   * copy over FNONBLOCK and FNDELAY flags should they exist
 504  504                   */
 505  505                  if (so1->so_state & SS_NONBLOCK)
 506  506                          nfp->f_flag |= FNONBLOCK;
 507  507                  if (so1->so_state & SS_NDELAY)
 508  508                          nfp->f_flag |= FNDELAY;
 509  509  
 510  510                  /*
 511  511                   * fill in the entries that falloc reserved
 512  512                   */
 513  513                  mutex_exit(&nfp->f_tlock);
 514  514                  setf(nfd, nfp);
 515  515  
 516  516                  /*
 517  517                   * get the original flags before we release
 518  518                   */
 519  519                  VERIFY(f_getfd_error(svs[0], &orig_flags) == 0);
 520  520  
 521  521                  releasef(svs[0]);
 522  522                  releasef(svs[1]);
 523  523  
 524  524                  /*
 525  525                   * If FD_CLOEXEC was set on the filedescriptor we're
 526  526                   * swapping out, we should set it on the new one too.
 527  527                   */
 528  528                  if (orig_flags & FD_CLOEXEC) {
 529  529                          f_setfd(nfd, FD_CLOEXEC);
 530  530                  }
 531  531  
 532  532                  /*
 533  533                   * The socketpair library routine will close the original
 534  534                   * svs[0] when this code passes out a different file
 535  535                   * descriptor.
 536  536                   */
 537  537                  svs[0] = nfd;
 538  538  
 539  539                  if (copyout(svs, sv, sizeof (svs))) {
 540  540                          (void) closeandsetf(nfd, NULL);
 541  541                          eprintline(EFAULT);
 542  542                          return (set_errno(EFAULT));
 543  543                  }
 544  544          }
 545  545          return (0);
 546  546  
 547  547  done:
 548  548          releasef(svs[0]);
 549  549          releasef(svs[1]);
 550  550          return (set_errno(error));
 551  551  }
 552  552  
 553  553  int
 554  554  bind(int sock, struct sockaddr *name, socklen_t namelen, int version)
 555  555  {
 556  556          struct sonode *so;
 557  557          int error;
 558  558  
 559  559          dprint(1, ("bind(%d, %p, %d)\n",
 560  560              sock, (void *)name, namelen));
 561  561  
 562  562          if ((so = getsonode(sock, &error, NULL)) == NULL)
 563  563                  return (set_errno(error));
 564  564  
 565  565          /* Allocate and copyin name */
 566  566          /*
 567  567           * X/Open test does not expect EFAULT with NULL name and non-zero
 568  568           * namelen.
 569  569           */
 570  570          if (name != NULL && namelen != 0) {
 571  571                  ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 572  572                  name = copyin_name(so, name, &namelen, &error);
 573  573                  if (name == NULL) {
 574  574                          releasef(sock);
 575  575                          return (set_errno(error));
 576  576                  }
 577  577          } else {
 578  578                  name = NULL;
 579  579                  namelen = 0;
 580  580          }
 581  581  
 582  582          switch (version) {
 583  583          default:
 584  584                  error = socket_bind(so, name, namelen, 0, CRED());
 585  585                  break;
 586  586          case SOV_XPG4_2:
 587  587                  error = socket_bind(so, name, namelen, _SOBIND_XPG4_2, CRED());
 588  588                  break;
 589  589          case SOV_SOCKBSD:
 590  590                  error = socket_bind(so, name, namelen, _SOBIND_SOCKBSD, CRED());
 591  591                  break;
 592  592          }
 593  593  done:
 594  594          releasef(sock);
 595  595          if (name != NULL)
 596  596                  kmem_free(name, (size_t)namelen);
 597  597  
 598  598          if (error)
 599  599                  return (set_errno(error));
 600  600          return (0);
 601  601  }
 602  602  
 603  603  /* ARGSUSED2 */
 604  604  int
 605  605  listen(int sock, int backlog, int version)
 606  606  {
 607  607          struct sonode *so;
 608  608          int error;
 609  609  
 610  610          dprint(1, ("listen(%d, %d)\n",
 611  611              sock, backlog));
 612  612  
 613  613          if ((so = getsonode(sock, &error, NULL)) == NULL)
 614  614                  return (set_errno(error));
 615  615  
 616  616          error = socket_listen(so, backlog, CRED());
 617  617  
 618  618          releasef(sock);
 619  619          if (error)
 620  620                  return (set_errno(error));
 621  621          return (0);
 622  622  }
 623  623  
 624  624  /*ARGSUSED3*/
 625  625  int
 626  626  accept(int sock, struct sockaddr *name, socklen_t *namelenp, int version,
 627  627      int flags)
 628  628  {
 629  629          struct sonode *so;
 630  630          file_t *fp;
 631  631          int error;
 632  632          socklen_t namelen;
 633  633          struct sonode *nso;
 634  634          struct vnode *nvp;
 635  635          struct file *nfp;
 636  636          int nfd;
 637  637          int ssflags;
 638  638          struct sockaddr *addrp;
 639  639          socklen_t addrlen;
 640  640  
 641  641          dprint(1, ("accept(%d, %p, %p)\n",
 642  642              sock, (void *)name, (void *)namelenp));
 643  643  
 644  644          if (flags & ~(SOCK_CLOEXEC|SOCK_NONBLOCK|SOCK_NDELAY)) {
 645  645                  return (set_errno(EINVAL));
 646  646          }
 647  647  
 648  648          /* Translate SOCK_ flags to their SS_ variant */
 649  649          ssflags = 0;
 650  650          if (flags & SOCK_NONBLOCK)
 651  651                  ssflags |= SS_NONBLOCK;
 652  652          if (flags & SOCK_NDELAY)
 653  653                  ssflags |= SS_NDELAY;
 654  654  
 655  655          if ((so = getsonode(sock, &error, &fp)) == NULL)
 656  656                  return (set_errno(error));
 657  657  
 658  658          if (name != NULL) {
 659  659                  ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 660  660                  if (copyin(namelenp, &namelen, sizeof (namelen))) {
 661  661                          releasef(sock);
 662  662                          return (set_errno(EFAULT));
 663  663                  }
 664  664                  if (namelen != 0) {
 665  665                          error = useracc(name, (size_t)namelen, B_WRITE);
 666  666                          if (error && do_useracc) {
 667  667                                  releasef(sock);
 668  668                                  return (set_errno(EFAULT));
 669  669                          }
 670  670                  } else
 671  671                          name = NULL;
 672  672          } else {
 673  673                  namelen = 0;
 674  674          }
 675  675  
 676  676          /*
 677  677           * Allocate the user fd before socket_accept() in order to
 678  678           * catch EMFILE errors before calling socket_accept().
 679  679           */
 680  680          if ((nfd = ufalloc(0)) == -1) {
 681  681                  eprintsoline(so, EMFILE);
 682  682                  releasef(sock);
 683  683                  return (set_errno(EMFILE));
 684  684          }
 685  685          error = socket_accept(so, fp->f_flag, CRED(), &nso);
 686  686          if (error) {
 687  687                  setf(nfd, NULL);
 688  688                  releasef(sock);
 689  689                  return (set_errno(error));
 690  690          }
 691  691  
 692  692          nvp = SOTOV(nso);
 693  693  
 694  694          ASSERT(MUTEX_NOT_HELD(&nso->so_lock));
 695  695          if (namelen != 0) {
 696  696                  addrlen = so->so_max_addr_len;
 697  697                  addrp = (struct sockaddr *)kmem_alloc(addrlen, KM_SLEEP);
 698  698  
 699  699                  if ((error = socket_getpeername(nso, (struct sockaddr *)addrp,
 700  700                      &addrlen, B_TRUE, CRED())) == 0) {
 701  701                          error = copyout_name(name, namelen, namelenp,
 702  702                              addrp, addrlen);
 703  703                  } else {
 704  704                          ASSERT(error == EINVAL || error == ENOTCONN);
 705  705                          error = ECONNABORTED;
 706  706                  }
 707  707                  kmem_free(addrp, so->so_max_addr_len);
 708  708          }
 709  709  
 710  710          if (error) {
 711  711                  setf(nfd, NULL);
 712  712                  (void) socket_close(nso, 0, CRED());
 713  713                  socket_destroy(nso);
 714  714                  releasef(sock);
 715  715                  return (set_errno(error));
 716  716          }
 717  717          if (error = falloc(NULL, FWRITE|FREAD, &nfp, NULL)) {
 718  718                  setf(nfd, NULL);
 719  719                  (void) socket_close(nso, 0, CRED());
 720  720                  socket_destroy(nso);
 721  721                  eprintsoline(so, error);
 722  722                  releasef(sock);
 723  723                  return (set_errno(error));
 724  724          }
 725  725          /*
 726  726           * fill in the entries that falloc reserved
 727  727           */
 728  728          nfp->f_vnode = nvp;
 729  729          mutex_exit(&nfp->f_tlock);
 730  730          setf(nfd, nfp);
 731  731  
 732  732          /*
 733  733           * Act on SOCK_CLOEXEC from flags
 734  734           */
 735  735          if (flags & SOCK_CLOEXEC) {
 736  736                  f_setfd(nfd, FD_CLOEXEC);
 737  737          }
 738  738  
 739  739          /*
 740  740           * Copy FNDELAY and FNONBLOCK from listener to acceptor
 741  741           * and from ssflags
 742  742           */
 743  743          if ((ssflags | so->so_state) & (SS_NDELAY|SS_NONBLOCK)) {
 744  744                  uint_t oflag = nfp->f_flag;
 745  745                  int arg = 0;
 746  746  
 747  747                  if ((ssflags | so->so_state) & SS_NONBLOCK)
 748  748                          arg |= FNONBLOCK;
 749  749                  else if ((ssflags | so->so_state) & SS_NDELAY)
 750  750                          arg |= FNDELAY;
 751  751  
 752  752                  /*
 753  753                   * This code is a simplification of the F_SETFL code in fcntl()
 754  754                   * Ignore any errors from VOP_SETFL.
 755  755                   */
 756  756                  if ((error = VOP_SETFL(nvp, oflag, arg, nfp->f_cred, NULL))
 757  757                      != 0) {
 758  758                          eprintsoline(so, error);
 759  759                          error = 0;
 760  760                  } else {
 761  761                          mutex_enter(&nfp->f_tlock);
 762  762                          nfp->f_flag &= ~FMASK | (FREAD|FWRITE);
 763  763                          nfp->f_flag |= arg;
 764  764                          mutex_exit(&nfp->f_tlock);
 765  765                  }
 766  766          }
 767  767          releasef(sock);
 768  768          return (nfd);
 769  769  }
 770  770  
 771  771  int
 772  772  connect(int sock, struct sockaddr *name, socklen_t namelen, int version)
 773  773  {
 774  774          struct sonode *so;
 775  775          file_t *fp;
 776  776          int error;
 777  777  
 778  778          dprint(1, ("connect(%d, %p, %d)\n",
 779  779              sock, (void *)name, namelen));
 780  780  
 781  781          if ((so = getsonode(sock, &error, &fp)) == NULL)
 782  782                  return (set_errno(error));
 783  783  
 784  784          /* Allocate and copyin name */
 785  785          if (namelen != 0) {
 786  786                  ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 787  787                  name = copyin_name(so, name, &namelen, &error);
 788  788                  if (name == NULL) {
 789  789                          releasef(sock);
 790  790                          return (set_errno(error));
 791  791                  }
 792  792          } else
 793  793                  name = NULL;
 794  794  
 795  795          error = socket_connect(so, name, namelen, fp->f_flag,
 796  796              (version != SOV_XPG4_2) ? 0 : _SOCONNECT_XPG4_2, CRED());
 797  797          releasef(sock);
 798  798          if (name)
 799  799                  kmem_free(name, (size_t)namelen);
 800  800          if (error)
 801  801                  return (set_errno(error));
 802  802          return (0);
 803  803  }
 804  804  
 805  805  /*ARGSUSED2*/
 806  806  int
 807  807  shutdown(int sock, int how, int version)
 808  808  {
 809  809          struct sonode *so;
 810  810          int error;
 811  811  
 812  812          dprint(1, ("shutdown(%d, %d)\n",
 813  813              sock, how));
 814  814  
 815  815          if ((so = getsonode(sock, &error, NULL)) == NULL)
 816  816                  return (set_errno(error));
 817  817  
 818  818          error = socket_shutdown(so, how, CRED());
 819  819  
 820  820          releasef(sock);
 821  821          if (error)
 822  822                  return (set_errno(error));
 823  823          return (0);
 824  824  }
 825  825  
 826  826  /*
 827  827   * Common receive routine.
 828  828   */
 829  829  static ssize_t
 830  830  recvit(int sock,
 831  831          struct nmsghdr *msg,
 832  832          struct uio *uiop,
 833  833          int flags,
 834  834          socklen_t *namelenp,
 835  835          socklen_t *controllenp,
 836  836          int *flagsp)
 837  837  {
 838  838          struct sonode *so;
 839  839          file_t *fp;
 840  840          void *name;
 841  841          socklen_t namelen;
 842  842          void *control;
 843  843          socklen_t controllen;
 844  844          ssize_t len;
 845  845          int error;
 846  846  
 847  847          if ((so = getsonode(sock, &error, &fp)) == NULL)
 848  848                  return (set_errno(error));
 849  849  
 850  850          len = uiop->uio_resid;
 851  851          uiop->uio_fmode = fp->f_flag;
 852  852          uiop->uio_extflg = UIO_COPY_CACHED;
 853  853  
 854  854          name = msg->msg_name;
 855  855          namelen = msg->msg_namelen;
 856  856          control = msg->msg_control;
 857  857          controllen = msg->msg_controllen;
 858  858  
 859  859          msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
 860  860              MSG_DONTWAIT | MSG_XPG4_2);
 861  861  
 862  862          error = socket_recvmsg(so, msg, uiop, CRED());
 863  863          if (error) {
 864  864                  releasef(sock);
 865  865                  return (set_errno(error));
 866  866          }
 867  867          lwp_stat_update(LWP_STAT_MSGRCV, 1);
 868  868          releasef(sock);
 869  869  
 870  870          error = copyout_name(name, namelen, namelenp,
 871  871              msg->msg_name, msg->msg_namelen);
 872  872          if (error)
 873  873                  goto err;
 874  874  
 875  875          if (flagsp != NULL) {
 876  876                  /*
 877  877                   * Clear internal flag.
 878  878                   */
 879  879                  msg->msg_flags &= ~MSG_XPG4_2;
 880  880  
 881  881                  /*
 882  882                   * Determine MSG_CTRUNC. sorecvmsg sets MSG_CTRUNC only
 883  883                   * when controllen is zero and there is control data to
 884  884                   * copy out.
 885  885                   */
 886  886                  if (controllen != 0 &&
 887  887                      (msg->msg_controllen > controllen || control == NULL)) {
 888  888                          dprint(1, ("recvit: CTRUNC %d %d %p\n",
 889  889                              msg->msg_controllen, controllen, control));
 890  890  
 891  891                          msg->msg_flags |= MSG_CTRUNC;
 892  892                  }
 893  893                  if (copyout(&msg->msg_flags, flagsp,
 894  894                      sizeof (msg->msg_flags))) {
 895  895                          error = EFAULT;
 896  896                          goto err;
 897  897                  }
 898  898          }
 899  899          /*
 900  900           * Note: This MUST be done last. There can be no "goto err" after this
 901  901           * point since it could make so_closefds run twice on some part
 902  902           * of the file descriptor array.
 903  903           */
 904  904          if (controllen != 0) {
 905  905                  if (!(flags & MSG_XPG4_2)) {
 906  906                          /*
 907  907                           * Good old msg_accrights can only return a multiple
 908  908                           * of 4 bytes.
 909  909                           */
 910  910                          controllen &= ~((int)sizeof (uint32_t) - 1);
 911  911                  }
 912  912                  error = copyout_arg(control, controllen, controllenp,
 913  913                      msg->msg_control, msg->msg_controllen);
 914  914                  if (error)
 915  915                          goto err;
 916  916  
 917  917                  if (msg->msg_controllen > controllen || control == NULL) {
 918  918                          if (control == NULL)
 919  919                                  controllen = 0;
 920  920                          so_closefds(msg->msg_control, msg->msg_controllen,
 921  921                              !(flags & MSG_XPG4_2), controllen);
 922  922                  }
 923  923          }
 924  924          if (msg->msg_namelen != 0)
 925  925                  kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
 926  926          if (msg->msg_controllen != 0)
 927  927                  kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
 928  928          return (len - uiop->uio_resid);
 929  929  
 930  930  err:
 931  931          /*
 932  932           * If we fail and the control part contains file descriptors
 933  933           * we have to close the fd's.
 934  934           */
 935  935          if (msg->msg_controllen != 0)
 936  936                  so_closefds(msg->msg_control, msg->msg_controllen,
 937  937                      !(flags & MSG_XPG4_2), 0);
 938  938          if (msg->msg_namelen != 0)
 939  939                  kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
 940  940          if (msg->msg_controllen != 0)
 941  941                  kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
 942  942          return (set_errno(error));
 943  943  }
 944  944  
 945  945  /*
 946  946   * Native system call
 947  947   */
 948  948  ssize_t
 949  949  recv(int sock, void *buffer, size_t len, int flags)
 950  950  {
 951  951          struct nmsghdr lmsg;
 952  952          struct uio auio;
 953  953          struct iovec aiov[1];
 954  954  
 955  955          dprint(1, ("recv(%d, %p, %ld, %d)\n",
 956  956              sock, buffer, len, flags));
 957  957  
 958  958          if ((ssize_t)len < 0) {
 959  959                  return (set_errno(EINVAL));
 960  960          }
 961  961  
 962  962          aiov[0].iov_base = buffer;
 963  963          aiov[0].iov_len = len;
 964  964          auio.uio_loffset = 0;
 965  965          auio.uio_iov = aiov;
 966  966          auio.uio_iovcnt = 1;
 967  967          auio.uio_resid = len;
 968  968          auio.uio_segflg = UIO_USERSPACE;
 969  969          auio.uio_limit = 0;
 970  970  
 971  971          lmsg.msg_namelen = 0;
 972  972          lmsg.msg_controllen = 0;
 973  973          lmsg.msg_flags = 0;
 974  974          return (recvit(sock, &lmsg, &auio, flags, NULL, NULL, NULL));
 975  975  }
 976  976  
 977  977  ssize_t
 978  978  recvfrom(int sock, void *buffer, size_t len, int flags,
 979  979          struct sockaddr *name, socklen_t *namelenp)
 980  980  {
 981  981          struct nmsghdr lmsg;
 982  982          struct uio auio;
 983  983          struct iovec aiov[1];
 984  984  
 985  985          dprint(1, ("recvfrom(%d, %p, %ld, %d, %p, %p)\n",
 986  986              sock, buffer, len, flags, (void *)name, (void *)namelenp));
 987  987  
 988  988          if ((ssize_t)len < 0) {
 989  989                  return (set_errno(EINVAL));
 990  990          }
 991  991  
 992  992          aiov[0].iov_base = buffer;
 993  993          aiov[0].iov_len = len;
 994  994          auio.uio_loffset = 0;
 995  995          auio.uio_iov = aiov;
 996  996          auio.uio_iovcnt = 1;
 997  997          auio.uio_resid = len;
 998  998          auio.uio_segflg = UIO_USERSPACE;
 999  999          auio.uio_limit = 0;
1000 1000  
1001 1001          lmsg.msg_name = (char *)name;
1002 1002          if (namelenp != NULL) {
1003 1003                  if (copyin(namelenp, &lmsg.msg_namelen,
1004 1004                      sizeof (lmsg.msg_namelen)))
1005 1005                          return (set_errno(EFAULT));
1006 1006          } else {
1007 1007                  lmsg.msg_namelen = 0;
1008 1008          }
1009 1009          lmsg.msg_controllen = 0;
1010 1010          lmsg.msg_flags = 0;
1011 1011  
1012 1012          return (recvit(sock, &lmsg, &auio, flags, namelenp, NULL, NULL));
1013 1013  }
1014 1014  
1015 1015  /*
1016 1016   * Uses the MSG_XPG4_2 flag to determine if the caller is using
1017 1017   * struct omsghdr or struct nmsghdr.
1018 1018   */
1019 1019  ssize_t
1020 1020  recvmsg(int sock, struct nmsghdr *msg, int flags)
1021 1021  {
1022 1022          STRUCT_DECL(nmsghdr, u_lmsg);
1023 1023          STRUCT_HANDLE(nmsghdr, umsgptr);
1024 1024          struct nmsghdr lmsg;
1025 1025          struct uio auio;
1026 1026          struct iovec buf[IOV_MAX_STACK], *aiov = buf;
1027 1027          ssize_t iovsize = 0;
1028 1028          int iovcnt;
1029 1029          ssize_t len, rval;
1030 1030          int i;
1031 1031          int *flagsp;
1032 1032          model_t model;
1033 1033  
1034 1034          dprint(1, ("recvmsg(%d, %p, %d)\n",
1035 1035              sock, (void *)msg, flags));
1036 1036  
1037 1037          model = get_udatamodel();
1038 1038          STRUCT_INIT(u_lmsg, model);
1039 1039          STRUCT_SET_HANDLE(umsgptr, model, msg);
1040 1040  
1041 1041          if (flags & MSG_XPG4_2) {
1042 1042                  if (copyin(msg, STRUCT_BUF(u_lmsg), STRUCT_SIZE(u_lmsg)))
1043 1043                          return (set_errno(EFAULT));
1044 1044                  flagsp = STRUCT_FADDR(umsgptr, msg_flags);
1045 1045          } else {
1046 1046                  /*
1047 1047                   * Assumes that nmsghdr and omsghdr are identically shaped
1048 1048                   * except for the added msg_flags field.
1049 1049                   */
1050 1050                  if (copyin(msg, STRUCT_BUF(u_lmsg),
1051 1051                      SIZEOF_STRUCT(omsghdr, model)))
1052 1052                          return (set_errno(EFAULT));
1053 1053                  STRUCT_FSET(u_lmsg, msg_flags, 0);
1054 1054                  flagsp = NULL;
1055 1055          }
1056 1056  
1057 1057          /*
1058 1058           * Code below us will kmem_alloc memory and hang it
1059 1059           * off msg_control and msg_name fields. This forces
1060 1060           * us to copy the structure to its native form.
1061 1061           */
1062 1062          lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
1063 1063          lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
1064 1064          lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
1065 1065          lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
1066 1066          lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
1067 1067          lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
1068 1068          lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);
1069 1069  
1070 1070          iovcnt = lmsg.msg_iovlen;
1071 1071  
1072 1072          if (iovcnt <= 0 || iovcnt > IOV_MAX) {
1073 1073                  return (set_errno(EMSGSIZE));
1074 1074          }
1075 1075  
1076 1076          if (iovcnt > IOV_MAX_STACK) {
1077 1077                  iovsize = iovcnt * sizeof (struct iovec);
1078 1078                  aiov = kmem_alloc(iovsize, KM_SLEEP);
1079 1079          }
1080 1080  
1081 1081  #ifdef _SYSCALL32_IMPL
1082 1082          /*
1083 1083           * 32-bit callers need to have their iovec expanded, while ensuring
1084 1084           * that they can't move more than 2Gbytes of data in a single call.
1085 1085           */
1086 1086          if (model == DATAMODEL_ILP32) {
1087 1087                  struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
1088 1088                  ssize_t iov32size;
1089 1089                  ssize32_t count32;
1090 1090  
1091 1091                  iov32size = iovcnt * sizeof (struct iovec32);
1092 1092                  if (iovsize != 0)
1093 1093                          aiov32 = kmem_alloc(iov32size, KM_SLEEP);
1094 1094  
1095 1095                  if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) {
1096 1096                          if (iovsize != 0) {
1097 1097                                  kmem_free(aiov32, iov32size);
1098 1098                                  kmem_free(aiov, iovsize);
1099 1099                          }
1100 1100  
1101 1101                          return (set_errno(EFAULT));
1102 1102                  }
1103 1103  
1104 1104                  count32 = 0;
1105 1105                  for (i = 0; i < iovcnt; i++) {
1106 1106                          ssize32_t iovlen32;
1107 1107  
1108 1108                          iovlen32 = aiov32[i].iov_len;
1109 1109                          count32 += iovlen32;
1110 1110                          if (iovlen32 < 0 || count32 < 0) {
1111 1111                                  if (iovsize != 0) {
1112 1112                                          kmem_free(aiov32, iov32size);
1113 1113                                          kmem_free(aiov, iovsize);
1114 1114                                  }
1115 1115  
1116 1116                                  return (set_errno(EINVAL));
1117 1117                          }
1118 1118  
1119 1119                          aiov[i].iov_len = iovlen32;
1120 1120                          aiov[i].iov_base =
1121 1121                              (caddr_t)(uintptr_t)aiov32[i].iov_base;
1122 1122                  }
1123 1123  
1124 1124                  if (iovsize != 0)
1125 1125                          kmem_free(aiov32, iov32size);
1126 1126          } else
1127 1127  #endif /* _SYSCALL32_IMPL */
1128 1128          if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) {
1129 1129                  if (iovsize != 0)
1130 1130                          kmem_free(aiov, iovsize);
1131 1131  
1132 1132                  return (set_errno(EFAULT));
1133 1133          }
1134 1134          len = 0;
1135 1135          for (i = 0; i < iovcnt; i++) {
1136 1136                  ssize_t iovlen = aiov[i].iov_len;
1137 1137                  len += iovlen;
1138 1138                  if (iovlen < 0 || len < 0) {
1139 1139                          if (iovsize != 0)
1140 1140                                  kmem_free(aiov, iovsize);
1141 1141  
1142 1142                          return (set_errno(EINVAL));
1143 1143                  }
1144 1144          }
1145 1145          auio.uio_loffset = 0;
1146 1146          auio.uio_iov = aiov;
1147 1147          auio.uio_iovcnt = iovcnt;
1148 1148          auio.uio_resid = len;
1149 1149          auio.uio_segflg = UIO_USERSPACE;
1150 1150          auio.uio_limit = 0;
1151 1151  
1152 1152          if (lmsg.msg_control != NULL &&
1153 1153              (do_useracc == 0 ||
1154 1154              useracc(lmsg.msg_control, lmsg.msg_controllen,
1155 1155              B_WRITE) != 0)) {
1156 1156                  if (iovsize != 0)
1157 1157                          kmem_free(aiov, iovsize);
1158 1158  
1159 1159                  return (set_errno(EFAULT));
1160 1160          }
1161 1161  
1162 1162          rval = recvit(sock, &lmsg, &auio, flags,
1163 1163              STRUCT_FADDR(umsgptr, msg_namelen),
1164 1164              STRUCT_FADDR(umsgptr, msg_controllen), flagsp);
1165 1165  
1166 1166          if (iovsize != 0)
1167 1167                  kmem_free(aiov, iovsize);
1168 1168  
1169 1169          return (rval);
1170 1170  }
1171 1171  
1172 1172  /*
1173 1173   * Common send function.
1174 1174   */
1175 1175  static ssize_t
1176 1176  sendit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags)
1177 1177  {
1178 1178          struct sonode *so;
1179 1179          file_t *fp;
1180 1180          void *name;
1181 1181          socklen_t namelen;
1182 1182          void *control;
1183 1183          socklen_t controllen;
1184 1184          ssize_t len;
1185 1185          int error;
1186 1186  
1187 1187          if ((so = getsonode(sock, &error, &fp)) == NULL)
1188 1188                  return (set_errno(error));
1189 1189  
1190 1190          uiop->uio_fmode = fp->f_flag;
1191 1191  
1192 1192          if (so->so_family == AF_UNIX)
1193 1193                  uiop->uio_extflg = UIO_COPY_CACHED;
1194 1194          else
1195 1195                  uiop->uio_extflg = UIO_COPY_DEFAULT;
1196 1196  
1197 1197          /* Allocate and copyin name and control */
1198 1198          name = msg->msg_name;
1199 1199          namelen = msg->msg_namelen;
1200 1200          if (name != NULL && namelen != 0) {
1201 1201                  ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1202 1202                  name = copyin_name(so,
1203 1203                      (struct sockaddr *)name,
1204 1204                      &namelen, &error);
1205 1205                  if (name == NULL)
1206 1206                          goto done3;
1207 1207                  /* copyin_name null terminates addresses for AF_UNIX */
1208 1208                  msg->msg_namelen = namelen;
1209 1209                  msg->msg_name = name;
1210 1210          } else {
1211 1211                  msg->msg_name = name = NULL;
1212 1212                  msg->msg_namelen = namelen = 0;
1213 1213          }
1214 1214  
1215 1215          control = msg->msg_control;
1216 1216          controllen = msg->msg_controllen;
1217 1217          if ((control != NULL) && (controllen != 0)) {
1218 1218                  /*
1219 1219                   * Verify that the length is not excessive to prevent
1220 1220                   * an application from consuming all of kernel memory.
1221 1221                   */
1222 1222                  if (controllen > SO_MAXARGSIZE) {
1223 1223                          error = EINVAL;
1224 1224                          goto done2;
1225 1225                  }
1226 1226                  control = kmem_alloc(controllen, KM_SLEEP);
1227 1227  
1228 1228                  ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1229 1229                  if (copyin(msg->msg_control, control, controllen)) {
1230 1230                          error = EFAULT;
1231 1231                          goto done1;
1232 1232                  }
1233 1233                  msg->msg_control = control;
1234 1234          } else {
1235 1235                  msg->msg_control = control = NULL;
1236 1236                  msg->msg_controllen = controllen = 0;
1237 1237          }
1238 1238  
1239 1239          len = uiop->uio_resid;
1240 1240          msg->msg_flags = flags;
1241 1241  
1242 1242          error = socket_sendmsg(so, msg, uiop, CRED());
1243 1243  done1:
1244 1244          if (control != NULL)
1245 1245                  kmem_free(control, controllen);
1246 1246  done2:
1247 1247          if (name != NULL)
1248 1248                  kmem_free(name, namelen);
1249 1249  done3:
1250 1250          if (error != 0) {
1251 1251                  releasef(sock);
1252 1252                  return (set_errno(error));
1253 1253          }
1254 1254          lwp_stat_update(LWP_STAT_MSGSND, 1);
1255 1255          releasef(sock);
1256 1256          return (len - uiop->uio_resid);
1257 1257  }
1258 1258  
1259 1259  /*
1260 1260   * Native system call
1261 1261   */
1262 1262  ssize_t
1263 1263  send(int sock, void *buffer, size_t len, int flags)
1264 1264  {
1265 1265          struct nmsghdr lmsg;
1266 1266          struct uio auio;
1267 1267          struct iovec aiov[1];
1268 1268  
1269 1269          dprint(1, ("send(%d, %p, %ld, %d)\n",
1270 1270              sock, buffer, len, flags));
1271 1271  
1272 1272          if ((ssize_t)len < 0) {
1273 1273                  return (set_errno(EINVAL));
1274 1274          }
1275 1275  
1276 1276          aiov[0].iov_base = buffer;
1277 1277          aiov[0].iov_len = len;
1278 1278          auio.uio_loffset = 0;
1279 1279          auio.uio_iov = aiov;
1280 1280          auio.uio_iovcnt = 1;
1281 1281          auio.uio_resid = len;
1282 1282          auio.uio_segflg = UIO_USERSPACE;
1283 1283          auio.uio_limit = 0;
1284 1284  
1285 1285          lmsg.msg_name = NULL;
1286 1286          lmsg.msg_control = NULL;
1287 1287          if (!(flags & MSG_XPG4_2)) {
1288 1288                  /*
1289 1289                   * In order to be compatible with the libsocket/sockmod
1290 1290                   * implementation we set EOR for all send* calls.
1291 1291                   */
1292 1292                  flags |= MSG_EOR;
1293 1293          }
1294 1294          return (sendit(sock, &lmsg, &auio, flags));
1295 1295  }
1296 1296  
1297 1297  /*
1298 1298   * Uses the MSG_XPG4_2 flag to determine if the caller is using
1299 1299   * struct omsghdr or struct nmsghdr.
1300 1300   */
1301 1301  ssize_t
1302 1302  sendmsg(int sock, struct nmsghdr *msg, int flags)
1303 1303  {
1304 1304          struct nmsghdr lmsg;
1305 1305          STRUCT_DECL(nmsghdr, u_lmsg);
1306 1306          struct uio auio;
1307 1307          struct iovec buf[IOV_MAX_STACK], *aiov = buf;
1308 1308          ssize_t iovsize = 0;
1309 1309          int iovcnt;
1310 1310          ssize_t len, rval;
1311 1311          int i;
1312 1312          model_t model;
1313 1313  
1314 1314          dprint(1, ("sendmsg(%d, %p, %d)\n", sock, (void *)msg, flags));
1315 1315  
1316 1316          model = get_udatamodel();
1317 1317          STRUCT_INIT(u_lmsg, model);
1318 1318  
1319 1319          if (flags & MSG_XPG4_2) {
1320 1320                  if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
1321 1321                      STRUCT_SIZE(u_lmsg)))
1322 1322                          return (set_errno(EFAULT));
1323 1323          } else {
1324 1324                  /*
1325 1325                   * Assumes that nmsghdr and omsghdr are identically shaped
1326 1326                   * except for the added msg_flags field.
1327 1327                   */
1328 1328                  if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
1329 1329                      SIZEOF_STRUCT(omsghdr, model)))
1330 1330                          return (set_errno(EFAULT));
1331 1331                  /*
1332 1332                   * In order to be compatible with the libsocket/sockmod
1333 1333                   * implementation we set EOR for all send* calls.
1334 1334                   */
1335 1335                  flags |= MSG_EOR;
1336 1336          }
1337 1337  
1338 1338          /*
1339 1339           * Code below us will kmem_alloc memory and hang it
1340 1340           * off msg_control and msg_name fields. This forces
1341 1341           * us to copy the structure to its native form.
1342 1342           */
1343 1343          lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
1344 1344          lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
1345 1345          lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
1346 1346          lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
1347 1347          lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
1348 1348          lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
1349 1349          lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);
1350 1350  
1351 1351          iovcnt = lmsg.msg_iovlen;
1352 1352  
1353 1353          if (iovcnt <= 0 || iovcnt > IOV_MAX) {
1354 1354                  /*
1355 1355                   * Unless this is XPG 4.2 we allow iovcnt == 0 to
1356 1356                   * be compatible with SunOS 4.X and 4.4BSD.
1357 1357                   */
1358 1358                  if (iovcnt != 0 || (flags & MSG_XPG4_2))
1359 1359                          return (set_errno(EMSGSIZE));
1360 1360          }
1361 1361  
1362 1362          if (iovcnt > IOV_MAX_STACK) {
1363 1363                  iovsize = iovcnt * sizeof (struct iovec);
1364 1364                  aiov = kmem_alloc(iovsize, KM_SLEEP);
1365 1365          }
1366 1366  
1367 1367  #ifdef _SYSCALL32_IMPL
1368 1368          /*
1369 1369           * 32-bit callers need to have their iovec expanded, while ensuring
1370 1370           * that they can't move more than 2Gbytes of data in a single call.
1371 1371           */
1372 1372          if (model == DATAMODEL_ILP32) {
1373 1373                  struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
1374 1374                  ssize_t iov32size;
1375 1375                  ssize32_t count32;
1376 1376  
1377 1377                  iov32size = iovcnt * sizeof (struct iovec32);
1378 1378                  if (iovsize != 0)
1379 1379                          aiov32 = kmem_alloc(iov32size, KM_SLEEP);
1380 1380  
1381 1381                  if (iovcnt != 0 &&
1382 1382                      copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) {
1383 1383                          if (iovsize != 0) {
1384 1384                                  kmem_free(aiov32, iov32size);
1385 1385                                  kmem_free(aiov, iovsize);
1386 1386                          }
1387 1387  
1388 1388                          return (set_errno(EFAULT));
1389 1389                  }
1390 1390  
1391 1391                  count32 = 0;
1392 1392                  for (i = 0; i < iovcnt; i++) {
1393 1393                          ssize32_t iovlen32;
1394 1394  
1395 1395                          iovlen32 = aiov32[i].iov_len;
1396 1396                          count32 += iovlen32;
1397 1397                          if (iovlen32 < 0 || count32 < 0) {
1398 1398                                  if (iovsize != 0) {
1399 1399                                          kmem_free(aiov32, iov32size);
1400 1400                                          kmem_free(aiov, iovsize);
1401 1401                                  }
1402 1402  
1403 1403                                  return (set_errno(EINVAL));
1404 1404                          }
1405 1405  
1406 1406                          aiov[i].iov_len = iovlen32;
1407 1407                          aiov[i].iov_base =
1408 1408                              (caddr_t)(uintptr_t)aiov32[i].iov_base;
1409 1409                  }
1410 1410  
1411 1411                  if (iovsize != 0)
1412 1412                          kmem_free(aiov32, iov32size);
1413 1413          } else
1414 1414  #endif /* _SYSCALL32_IMPL */
1415 1415          if (iovcnt != 0 &&
1416 1416              copyin(lmsg.msg_iov, aiov,
1417 1417              (unsigned)iovcnt * sizeof (struct iovec))) {
1418 1418                  if (iovsize != 0)
1419 1419                          kmem_free(aiov, iovsize);
1420 1420  
1421 1421                  return (set_errno(EFAULT));
1422 1422          }
1423 1423          len = 0;
1424 1424          for (i = 0; i < iovcnt; i++) {
1425 1425                  ssize_t iovlen = aiov[i].iov_len;
1426 1426                  len += iovlen;
1427 1427                  if (iovlen < 0 || len < 0) {
1428 1428                          if (iovsize != 0)
1429 1429                                  kmem_free(aiov, iovsize);
1430 1430  
1431 1431                          return (set_errno(EINVAL));
1432 1432                  }
1433 1433          }
1434 1434          auio.uio_loffset = 0;
1435 1435          auio.uio_iov = aiov;
1436 1436          auio.uio_iovcnt = iovcnt;
1437 1437          auio.uio_resid = len;
1438 1438          auio.uio_segflg = UIO_USERSPACE;
1439 1439          auio.uio_limit = 0;
1440 1440  
1441 1441          rval = sendit(sock, &lmsg, &auio, flags);
1442 1442  
1443 1443          if (iovsize != 0)
1444 1444                  kmem_free(aiov, iovsize);
1445 1445  
1446 1446          return (rval);
1447 1447  }
1448 1448  
           /*
            * sendto(): send `len' bytes from the user buffer `buffer' on socket
            * `sock', addressed to `name'/`namelen'.
            *
            * Wraps the buffer in a one-element iovec/uio marked UIO_USERSPACE and
            * builds an nmsghdr that carries the destination address and a NULL
            * control pointer, then passes both to sendit().
            *
            * Returns whatever sendit() returns, or the result of set_errno(EINVAL)
            * when `len' is negative once viewed as a ssize_t.
            */
1449 1449  ssize_t
1450 1450  sendto(int sock, void *buffer, size_t len, int flags,
1451 1451      struct sockaddr *name, socklen_t namelen)
1452 1452  {
1453 1453          struct nmsghdr lmsg;
1454 1454          struct uio auio;
1455 1455          struct iovec aiov[1];
1456 1456  
1457 1457          dprint(1, ("sendto(%d, %p, %ld, %d, %p, %d)\n",
1458 1458              sock, buffer, len, flags, (void *)name, namelen));
1459 1459  
                   /* Reject lengths that do not fit in the ssize_t return type. */
1460 1460          if ((ssize_t)len < 0) {
1461 1461                  return (set_errno(EINVAL));
1462 1462          }
1463 1463  
                   /* Describe the caller's buffer as a single user-space segment. */
1464 1464          aiov[0].iov_base = buffer;
1465 1465          aiov[0].iov_len = len;
1466 1466          auio.uio_loffset = 0;
1467 1467          auio.uio_iov = aiov;
1468 1468          auio.uio_iovcnt = 1;
1469 1469          auio.uio_resid = len;
1470 1470          auio.uio_segflg = UIO_USERSPACE;
1471 1471          auio.uio_limit = 0;
1472 1472  
                   /*
                    * NOTE(review): only msg_name, msg_namelen and msg_control are
                    * initialized; presumably sendit() ignores the remaining nmsghdr
                    * fields when msg_control is NULL -- confirm.
                    */
1473 1473          lmsg.msg_name = (char *)name;
1474 1474          lmsg.msg_namelen = namelen;
1475 1475          lmsg.msg_control = NULL;
1476 1476          if (!(flags & MSG_XPG4_2)) {
1477 1477                  /*
1478 1478                   * In order to be compatible with the libsocket/sockmod
1479 1479                   * implementation we set EOR for all send* calls.
1480 1480                   */
1481 1481                  flags |= MSG_EOR;
1482 1482          }
1483 1483          return (sendit(sock, &lmsg, &auio, flags));
1484 1484  }
1485 1485  
           /*
            * getpeername(): report the peer address of socket `sock'.
            *
            * `namelenp' is read to learn the size of the user buffer `name'.  The
            * address is fetched with socket_getpeername() into a temporary kernel
            * buffer of so_max_addr_len bytes and then handed to copyout_name()
            * together with `name', the user-supplied length and `namelenp'.
            * `version' is not used.
            *
            * Returns 0 on success, otherwise the result of set_errno() for the
            * failing step (EFAULT when `namelenp' cannot be read or when `name' is
            * NULL with a non-zero length).
            */
1486 1486  /*ARGSUSED3*/
1487 1487  int
1488 1488  getpeername(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
1489 1489  {
1490 1490          struct sonode *so;
1491 1491          int error;
1492 1492          socklen_t namelen;
1493 1493          socklen_t sock_addrlen;
1494 1494          struct sockaddr *sock_addrp;
1495 1495  
1496 1496          dprint(1, ("getpeername(%d, %p, %p)\n",
1497 1497              sock, (void *)name, (void *)namelenp));
1498 1498  
                   /* On failure `error' is reported as-is and releasef() is skipped. */
1499 1499          if ((so = getsonode(sock, &error, NULL)) == NULL)
1500 1500                  goto bad;
1501 1501  
1502 1502          ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1503 1503          if (copyin(namelenp, &namelen, sizeof (namelen)) ||
1504 1504              (name == NULL && namelen != 0)) {
1505 1505                  error = EFAULT;
1506 1506                  goto rel_out;
1507 1507          }
                   /* Stage the address in a kernel buffer sized by so_max_addr_len. */
1508 1508          sock_addrlen = so->so_max_addr_len;
1509 1509          sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
1510 1510  
1511 1511          if ((error = socket_getpeername(so, sock_addrp, &sock_addrlen,
1512 1512              B_FALSE, CRED())) == 0) {
1513 1513                  ASSERT(sock_addrlen <= so->so_max_addr_len);
1514 1514                  error = copyout_name(name, namelen, namelenp,
1515 1515                      (void *)sock_addrp, sock_addrlen);
1516 1516          }
                   /*
                    * sock_addrlen is passed by reference above and may have changed,
                    * so free with the size that was actually allocated.
                    */
1517 1517          kmem_free(sock_addrp, so->so_max_addr_len);
1518 1518  rel_out:
1519 1519          releasef(sock);
1520 1520  bad:    return (error != 0 ? set_errno(error) : 0);
1521 1521  }
1522 1522  
           /*
            * getsockname(): report the local address of socket `sock'.
            *
            * `namelenp' is read to learn the size of the user buffer `name'.  The
            * address is fetched with socket_getsockname() into a temporary kernel
            * buffer of so_max_addr_len bytes and then handed to copyout_name()
            * together with `name', the user-supplied length and `namelenp'.
            * `version' is not used.
            *
            * Returns 0 on success, otherwise the result of set_errno() for the
            * failing step (EFAULT when `namelenp' cannot be read or when `name' is
            * NULL with a non-zero length).
            */
1523 1523  /*ARGSUSED3*/
1524 1524  int
1525 1525  getsockname(int sock, struct sockaddr *name,
1526 1526                  socklen_t *namelenp, int version)
1527 1527  {
1528 1528          struct sonode *so;
1529 1529          int error;
1530 1530          socklen_t namelen, sock_addrlen;
1531 1531          struct sockaddr *sock_addrp;
1532 1532  
1533 1533          dprint(1, ("getsockname(%d, %p, %p)\n",
1534 1534              sock, (void *)name, (void *)namelenp));
1535 1535  
                   /* On failure `error' is reported as-is and releasef() is skipped. */
1536 1536          if ((so = getsonode(sock, &error, NULL)) == NULL)
1537 1537                  goto bad;
1538 1538  
1539 1539          ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1540 1540          if (copyin(namelenp, &namelen, sizeof (namelen)) ||
1541 1541              (name == NULL && namelen != 0)) {
1542 1542                  error = EFAULT;
1543 1543                  goto rel_out;
1544 1544          }
1545 1545  
                   /* Stage the address in a kernel buffer sized by so_max_addr_len. */
1546 1546          sock_addrlen = so->so_max_addr_len;
1547 1547          sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
1548 1548          if ((error = socket_getsockname(so, sock_addrp, &sock_addrlen,
1549 1549              CRED())) == 0) {
1550 1550                  ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1551 1551                  ASSERT(sock_addrlen <= so->so_max_addr_len);
1552 1552                  error = copyout_name(name, namelen, namelenp,
1553 1553                      (void *)sock_addrp, sock_addrlen);
1554 1554          }
                   /*
                    * sock_addrlen is passed by reference above and may have changed,
                    * so free with the size that was actually allocated.
                    */
1555 1555          kmem_free(sock_addrp, so->so_max_addr_len);
1556 1556  rel_out:
1557 1557          releasef(sock);
1558 1558  bad:    return (error != 0 ? set_errno(error) : 0);
1559 1559  }
1560 1560  
           /*
            * getsockopt(): fetch the value of option `option_name' at `level' for
            * socket `sock'.
            *
            * The caller's buffer length is read from `option_lenp' and bounded by
            * SO_MAXARGSIZE.  The value is obtained with socket_getsockopt() into a
            * kernel buffer of that length and then passed to copyout_arg() along
            * with `option_value', `option_lenp' and the resulting length.
            *
            * Returns 0 on success, otherwise the result of set_errno(): EFAULT when
            * `option_lenp' cannot be read, EINVAL when the length exceeds
            * SO_MAXARGSIZE, or the error from getsonode(), socket_getsockopt() or
            * copyout_arg().
            */
1561 1561  /*ARGSUSED5*/
1562 1562  int
1563 1563  getsockopt(int sock,
1564 1564          int level,
1565 1565          int option_name,
1566 1566          void *option_value,
1567 1567          socklen_t *option_lenp,
1568 1568          int version)
1569 1569  {
1570 1570          struct sonode *so;
1571 1571          socklen_t optlen, optlen_res;
1572 1572          void *optval;
1573 1573          int error;
1574 1574  
1575 1575          dprint(1, ("getsockopt(%d, %d, %d, %p, %p)\n",
1576 1576              sock, level, option_name, option_value, (void *)option_lenp));
1577 1577  
1578 1578          if ((so = getsonode(sock, &error, NULL)) == NULL)
1579 1579                  return (set_errno(error));
1580 1580  
1581 1581          ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1582 1582          if (copyin(option_lenp, &optlen, sizeof (optlen))) {
1583 1583                  releasef(sock);
1584 1584                  return (set_errno(EFAULT));
1585 1585          }
1586 1586          /*
1587 1587           * Verify that the length is not excessive to prevent
1588 1588           * an application from consuming all of kernel memory.
1589 1589           */
1590 1590          if (optlen > SO_MAXARGSIZE) {
1591 1591                  error = EINVAL;
1592 1592                  releasef(sock);
1593 1593                  return (set_errno(error));
1594 1594          }
                   /*
                    * NOTE(review): optlen may be 0 here; this relies on kmem_alloc()
                    * and kmem_free() tolerating zero-size requests -- confirm.
                    */
1595 1595          optval = kmem_alloc(optlen, KM_SLEEP);
                   /*
                    * optlen_res is passed by reference and carries the result length
                    * afterwards; optlen keeps the allocation size for kmem_free().
                    */
1596 1596          optlen_res = optlen;
1597 1597          error = socket_getsockopt(so, level, option_name, optval,
1598 1598              &optlen_res, (version != SOV_XPG4_2) ? 0 : _SOGETSOCKOPT_XPG4_2,
1599 1599              CRED());
                   /* The socket is released before the result is copied out. */
1600 1600          releasef(sock);
1601 1601          if (error) {
1602 1602                  kmem_free(optval, optlen);
1603 1603                  return (set_errno(error));
1604 1604          }
1605 1605          error = copyout_arg(option_value, optlen, option_lenp,
1606 1606              optval, optlen_res);
1607 1607          kmem_free(optval, optlen);
1608 1608          if (error)
1609 1609                  return (set_errno(error));
1610 1610          return (0);
1611 1611  }
1612 1612  
           /*
            * setsockopt(): set option `option_name' at `level' on socket `sock'.
            *
            * When `option_value' is non-NULL and `option_len' is non-zero, the value
            * is copied in from user space (at most SO_MAXARGSIZE bytes) and passed
            * to socket_setsockopt().  A NULL `option_value' forces the length to 0;
            * in that case, and when `option_len' is 0, a NULL value pointer is
            * passed down.
            *
            * Returns 0 on success, otherwise the result of set_errno(): EINVAL when
            * the length exceeds SO_MAXARGSIZE, EFAULT when the value cannot be
            * copied in, or the error from getsonode() or socket_setsockopt().
            */
1613 1613  /*ARGSUSED5*/
1614 1614  int
1615 1615  setsockopt(int sock,
1616 1616          int level,
1617 1617          int option_name,
1618 1618          void *option_value,
1619 1619          socklen_t option_len,
1620 1620          int version)
1621 1621  {
1622 1622          struct sonode *so;
                   /* On-stack staging area for option values that fit in it. */
1623 1623          intptr_t buffer[2];
1624 1624          void *optval = NULL;
1625 1625          int error;
1626 1626  
1627 1627          dprint(1, ("setsockopt(%d, %d, %d, %p, %d)\n",
1628 1628              sock, level, option_name, option_value, option_len));
1629 1629  
1630 1630          if ((so = getsonode(sock, &error, NULL)) == NULL)
1631 1631                  return (set_errno(error));
1632 1632  
1633 1633          if (option_value != NULL) {
1634 1634                  if (option_len != 0) {
1635 1635                          /*
1636 1636                           * Verify that the length is not excessive to prevent
1637 1637                           * an application from consuming all of kernel memory.
1638 1638                           */
1639 1639                          if (option_len > SO_MAXARGSIZE) {
1640 1640                                  error = EINVAL;
1641 1641                                  goto done2;
1642 1642                          }
                                   /* Use the stack buffer when it is large enough. */
1643 1643                          optval = option_len <= sizeof (buffer) ?
1644 1644                              &buffer : kmem_alloc((size_t)option_len, KM_SLEEP);
1645 1645                          ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1646 1646                          if (copyin(option_value, optval, (size_t)option_len)) {
1647 1647                                  error = EFAULT;
1648 1648                                  goto done1;
1649 1649                          }
1650 1650                  }
1651 1651          } else
1652 1652                  option_len = 0;
1653 1653  
1654 1654          error = socket_setsockopt(so, level, option_name, optval,
1655 1655              (t_uscalar_t)option_len, CRED());
1656 1656  done1:
                   /*
                    * NOTE(review): when no value was supplied optval is NULL and
                    * option_len is 0, so this becomes kmem_free(NULL, 0); presumably
                    * that is a no-op -- confirm.
                    */
1657 1657          if (optval != buffer)
1658 1658                  kmem_free(optval, (size_t)option_len);
1659 1659  done2:
1660 1660          releasef(sock);
1661 1661          if (error)
1662 1662                  return (set_errno(error));
1663 1663          return (0);
1664 1664  }
1665 1665  
           /*
            * Add a socket configuration entry for <family, type, protocol>.
            *
            * `name' is a user-space string of at most MAXPATHLEN bytes.  A string
            * starting with "/dev" is treated as a device path, anything else as a
            * socket module name.  A right-sized kernel copy of the string is passed
            * to sockparams_create() and the resulting entry is registered with
            * sockparams_add().
            *
            * Returns 0 on success or an errno value: EINVAL for a NULL `name' or an
            * unsupported NCA tuple, otherwise the error from copyinstr(),
            * sockparams_create() or sockparams_add().
            */
1666 1666  static int
1667 1667  sockconf_add_sock(int family, int type, int protocol, char *name)
1668 1668  {
1669 1669          int error = 0;
1670 1670          char *kdevpath = NULL;
1671 1671          char *kmodule = NULL;
1672 1672          char *buf = NULL;
1673 1673          size_t pathlen = 0;
1674 1674          struct sockparams *sp;
1675 1675  
1676 1676          if (name == NULL)
1677 1677                  return (EINVAL);
1678 1678          /*
1679 1679           * Copyin the name.
1680 1680           * This also makes it possible to check for too long pathnames.
1681 1681           * Compress the space needed for the name before passing it
1682 1682           * to soconfig - soconfig will store the string until
1683 1683           * the configuration is removed.
1684 1684           */
1685 1685          buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1686 1686          if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) {
1687 1687                  kmem_free(buf, MAXPATHLEN);
1688 1688                  return (error);
1689 1689          }
1690 1690          if (strncmp(buf, "/dev", strlen("/dev")) == 0) {
1691 1691                  /* For device */
1692 1692  
1693 1693                  /*
1694 1694                   * Special handling for NCA:
1695 1695                   *
1696 1696                   * DEV_NCA is never opened even if an application
1697 1697                   * requests for AF_NCA. The device opened is instead a
1698 1698                   * predefined AF_INET transport (NCA_INET_DEV).
1699 1699                   *
1700 1700                   * Prior to Volo (PSARC/2007/587) NCA would determine
1701 1701                   * the device using a lookup, which worked then because
1702 1702                   * all protocols were based on TPI. Since TPI is no
1703 1703                   * longer the default, we have to explicitly state
1704 1704                   * which device to use.
1705 1705                   */
1706 1706                  if (strcmp(buf, NCA_DEV) == 0) {
1707 1707                          /* only support entry <28, 2, 0> */
1708 1708                          if (family != AF_NCA || type != SOCK_STREAM ||
1709 1709                              protocol != 0) {
1710 1710                                  kmem_free(buf, MAXPATHLEN);
1711 1711                                  return (EINVAL);
1712 1712                          }
1713 1713  
                                   /* Substitute NCA_INET_DEV for the supplied path. */
1714 1714                          pathlen = strlen(NCA_INET_DEV) + 1;
1715 1715                          kdevpath = kmem_alloc(pathlen, KM_SLEEP);
1716 1716                          bcopy(NCA_INET_DEV, kdevpath, pathlen);
1717 1717                          kdevpath[pathlen - 1] = '\0';
1718 1718                  } else {
                                   /*
                                    * pathlen is the length reported by copyinstr();
                                    * the [pathlen - 1] store below implies it counts
                                    * the terminating NUL -- confirm.
                                    */
1719 1719                          kdevpath = kmem_alloc(pathlen, KM_SLEEP);
1720 1720                          bcopy(buf, kdevpath, pathlen);
1721 1721                          kdevpath[pathlen - 1] = '\0';
1722 1722                  }
1723 1723          } else {
1724 1724                  /* For socket module */
1725 1725                  kmodule = kmem_alloc(pathlen, KM_SLEEP);
1726 1726                  bcopy(buf, kmodule, pathlen);
1727 1727                  kmodule[pathlen - 1] = '\0';
                           /* No device path in this case, so its length is zero. */
1728 1728                  pathlen = 0;
1729 1729          }
1730 1730          kmem_free(buf, MAXPATHLEN);
1731 1731  
1732 1732          /* sockparams_create frees mod name and devpath upon failure */
1733 1733          sp = sockparams_create(family, type, protocol, kmodule,
1734 1734              kdevpath, pathlen, 0, KM_SLEEP, &error);
1735 1735          if (sp != NULL) {
1736 1736                  error = sockparams_add(sp);
1737 1737                  if (error != 0)
1738 1738                          sockparams_destroy(sp);
1739 1739          }
1740 1740  
1741 1741          return (error);
1742 1742  }
1743 1743  
           /*
            * Remove the socket configuration entry for <family, type, protocol>.
            * Simply returns the result of sockparams_delete().
            */
1744 1744  static int
1745 1745  sockconf_remove_sock(int family, int type, int protocol)
1746 1746  {
1747 1747          return (sockparams_delete(family, type, protocol));
1748 1748  }
1749 1749  
           /*
            * Remove the socket filter whose name is given by the user-space string
            * `uname' (at most SOF_MAXNAMELEN bytes).
            *
            * The entry is looked up and removed with sof_entry_remove_by_name().  It
            * is freed right away when its reference count is zero; otherwise it is
            * flagged SOFEF_CONDEMED and left for later.
            *
            * Returns 0 on success, the copyinstr() error when the name cannot be
            * copied in, or ENXIO when no such filter exists.
            */
1750 1750  static int
1751 1751  sockconfig_remove_filter(const char *uname)
1752 1752  {
1753 1753          char kname[SOF_MAXNAMELEN];
1754 1754          size_t len;
1755 1755          int error;
1756 1756          sof_entry_t *ent;
1757 1757  
1758 1758          if ((error = copyinstr(uname, kname, SOF_MAXNAMELEN, &len)) != 0)
1759 1759                  return (error);
1760 1760  
1761 1761          ent = sof_entry_remove_by_name(kname);
1762 1762          if (ent == NULL)
1763 1763                  return (ENXIO);
1764 1764  
                   /* sofe_refcnt and sofe_flags are examined under sofe_lock. */
1765 1765          mutex_enter(&ent->sofe_lock);
1766 1766          ASSERT(!(ent->sofe_flags & SOFEF_CONDEMED));
1767 1767          if (ent->sofe_refcnt == 0) {
                           /* Drop the lock first; sof_entry_free() is called unlocked. */
1768 1768                  mutex_exit(&ent->sofe_lock);
1769 1769                  sof_entry_free(ent);
1770 1770          } else {
1771 1771                  /* let the last socket free the filter */
1772 1772                  ent->sofe_flags |= SOFEF_CONDEMED;
1773 1773                  mutex_exit(&ent->sofe_lock);
1774 1774          }
1775 1775  
1776 1776          return (0);
1777 1777  }
1778 1778  
           /*
            * Create and register a socket filter.
            *
            * `uname' is the user-space filter name and `ufilpropp' points at the
            * user-space filter properties, laid out as struct sockconfig_filter_props
            * for native callers or struct sockconfig_filter_props32 otherwise.
            *
            * The properties supply the module name, the attach mode (automatic or
            * programmatic), a placement hint with optional argument, and between 1
            * and SOF_MAXSOCKTUPLECNT socket tuples.  Hints are rejected for
            * programmatic filters.
            *
            * Every failure path hands the partially built entry to
            * sof_entry_free(), which presumably also releases sofe_hintarg and
            * sofe_socktuple -- confirm.
            *
            * Returns 0 on success or an errno value: EFAULT/EINVAL for bad input,
            * otherwise the error from copyinstr() or sof_entry_add().
            */
1779 1779  static int
1780 1780  sockconfig_add_filter(const char *uname, void *ufilpropp)
1781 1781  {
1782 1782          struct sockconfig_filter_props filprop;
1783 1783          sof_entry_t *ent;
1784 1784          int error;
1785 1785          size_t tuplesz, len;
1786 1786          char hintbuf[SOF_MAXNAMELEN];
1787 1787  
1788 1788          ent = kmem_zalloc(sizeof (sof_entry_t), KM_SLEEP);
1789 1789          mutex_init(&ent->sofe_lock, NULL, MUTEX_DEFAULT, NULL);
1790 1790  
1791 1791          if ((error = copyinstr(uname, ent->sofe_name, SOF_MAXNAMELEN,
1792 1792              &len)) != 0) {
1793 1793                  sof_entry_free(ent);
1794 1794                  return (error);
1795 1795          }
1796 1796  
                   /* Fetch the properties, converting from the 32-bit layout if needed. */
1797 1797          if (get_udatamodel() == DATAMODEL_NATIVE) {
1798 1798                  if (copyin(ufilpropp, &filprop, sizeof (filprop)) != 0) {
1799 1799                          sof_entry_free(ent);
1800 1800                          return (EFAULT);
1801 1801                  }
1802 1802          }
1803 1803  #ifdef  _SYSCALL32_IMPL
1804 1804          else {
1805 1805                  struct sockconfig_filter_props32 filprop32;
1806 1806  
1807 1807                  if (copyin(ufilpropp, &filprop32, sizeof (filprop32)) != 0) {
1808 1808                          sof_entry_free(ent);
1809 1809                          return (EFAULT);
1810 1810                  }
1811 1811                  filprop.sfp_modname = (char *)(uintptr_t)filprop32.sfp_modname;
1812 1812                  filprop.sfp_autoattach = filprop32.sfp_autoattach;
1813 1813                  filprop.sfp_hint = filprop32.sfp_hint;
1814 1814                  filprop.sfp_hintarg = (char *)(uintptr_t)filprop32.sfp_hintarg;
1815 1815                  filprop.sfp_socktuple_cnt = filprop32.sfp_socktuple_cnt;
1816 1816                  filprop.sfp_socktuple =
1817 1817                      (sof_socktuple_t *)(uintptr_t)filprop32.sfp_socktuple;
1818 1818          }
1819 1819  #endif  /* _SYSCALL32_IMPL */
1820 1820  
1821 1821          if ((error = copyinstr(filprop.sfp_modname, ent->sofe_modname,
1822 1822              sizeof (ent->sofe_modname), &len)) != 0) {
1823 1823                  sof_entry_free(ent);
1824 1824                  return (error);
1825 1825          }
1826 1826  
1827 1827          /*
1828 1828           * A filter must specify at least one socket tuple.
1829 1829           */
1830 1830          if (filprop.sfp_socktuple_cnt == 0 ||
1831 1831              filprop.sfp_socktuple_cnt > SOF_MAXSOCKTUPLECNT) {
1832 1832                  sof_entry_free(ent);
1833 1833                  return (EINVAL);
1834 1834          }
1835 1835          ent->sofe_flags = filprop.sfp_autoattach ? SOFEF_AUTO : SOFEF_PROG;
1836 1836          ent->sofe_hint = filprop.sfp_hint;
1837 1837  
1838 1838          /*
1839 1839           * Verify the hint, and copy in the hint argument, if necessary.
1840 1840           */
1841 1841          switch (ent->sofe_hint) {
1842 1842          case SOF_HINT_BEFORE:
1843 1843          case SOF_HINT_AFTER:
1844 1844                  if ((error = copyinstr(filprop.sfp_hintarg, hintbuf,
1845 1845                      sizeof (hintbuf), &len)) != 0) {
1846 1846                          sof_entry_free(ent);
1847 1847                          return (error);
1848 1848                  }
                           /* Keep a copy sized by the length copyinstr() reported. */
1849 1849                  ent->sofe_hintarg = kmem_alloc(len, KM_SLEEP);
1850 1850                  bcopy(hintbuf, ent->sofe_hintarg, len);
1851 1851                  /* FALLTHRU */
1852 1852          case SOF_HINT_TOP:
1853 1853          case SOF_HINT_BOTTOM:
1854 1854                  /* hints cannot be used with programmatic filters */
1855 1855                  if (ent->sofe_flags & SOFEF_PROG) {
1856 1856                          sof_entry_free(ent);
1857 1857                          return (EINVAL);
1858 1858                  }
1859 1859                  break;
1860 1860          case SOF_HINT_NONE:
1861 1861                  break;
1862 1862          default:
1863 1863                  /* bad hint value */
1864 1864                  sof_entry_free(ent);
1865 1865                  return (EINVAL);
1866 1866          }
1867 1867  
1868 1868          ent->sofe_socktuple_cnt = filprop.sfp_socktuple_cnt;
1869 1869          tuplesz = sizeof (sof_socktuple_t) * ent->sofe_socktuple_cnt;
1870 1870          ent->sofe_socktuple = kmem_alloc(tuplesz, KM_SLEEP);
1871 1871  
                   /* Native tuples are copied in bulk; 32-bit tuples one at a time. */
1872 1872          if (get_udatamodel() == DATAMODEL_NATIVE) {
1873 1873                  if (copyin(filprop.sfp_socktuple, ent->sofe_socktuple,
1874 1874                      tuplesz)) {
1875 1875                          sof_entry_free(ent);
1876 1876                          return (EFAULT);
1877 1877                  }
1878 1878          }
1879 1879  #ifdef  _SYSCALL32_IMPL
1880 1880          else {
1881 1881                  int i;
1882 1882                  caddr_t data = (caddr_t)filprop.sfp_socktuple;
1883 1883                  sof_socktuple_t *tup = ent->sofe_socktuple;
1884 1884                  sof_socktuple32_t tup32;
1885 1885  
1887 1887                  for (i = 0; i < ent->sofe_socktuple_cnt; i++, tup++) {
                                   /*
                                    * Bound by element count: tuplesz is a byte
                                    * count and must not be added to a typed
                                    * pointer.
                                    */
1888 1888                          ASSERT(tup <
                                       ent->sofe_socktuple + ent->sofe_socktuple_cnt);
1889 1889  
1890 1890                          if (copyin(data, &tup32, sizeof (tup32)) != 0) {
1891 1891                                  sof_entry_free(ent);
1892 1892                                  return (EFAULT);
1893 1893                          }
1894 1894                          tup->sofst_family = tup32.sofst_family;
1895 1895                          tup->sofst_type = tup32.sofst_type;
1896 1896                          tup->sofst_protocol = tup32.sofst_protocol;
1897 1897  
1898 1898                          data += sizeof (tup32);
1899 1899                  }
1900 1900          }
1901 1901  #endif  /* _SYSCALL32_IMPL */
1902 1902  
1903 1903          /* Sockets can start using the filter as soon as the filter is added */
1904 1904          if ((error = sof_entry_add(ent)) != 0)
1905 1905                  sof_entry_free(ent);
1906 1906  
1907 1907          return (error);
1908 1908  }
1909 1909  
1910 1910  /*
1911 1911   * Socket configuration system call. It is used to add and remove
1912 1912   * socket types and socket filters, and to copy out the socket table.
            *
            * The caller must pass the secpolicy_net_config() check; otherwise EPERM
            * is set.  If sockfs_defer_nl7c_init is set, nl7c_init() is run first and
            * the flag is cleared.
            *
            * `cmd' selects the operation and determines how arg1..arg4 are
            * interpreted:
            *   SOCKCONFIG_ADD_SOCK        family, type, protocol, name
            *   SOCKCONFIG_REMOVE_SOCK     family, type, protocol
            *   SOCKCONFIG_ADD_FILTER      filter name, filter properties
            *   SOCKCONFIG_REMOVE_FILTER   filter name
            *   SOCKCONFIG_GET_SOCKTABLE   arg1 as an int, passed through to
            *                              sockparams_copyout_socktable()
            *
            * Returns 0 on success, otherwise the result of set_errno() with the
            * error from the selected operation (EINVAL for an unknown `cmd').
1913 1913   */
1914 1914  int
1915 1915  sockconfig(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
1916 1916  {
1917 1917          int error = 0;
1918 1918  
1919 1919          if (secpolicy_net_config(CRED(), B_FALSE) != 0)
1920 1920                  return (set_errno(EPERM));
1921 1921  
1922 1922          if (sockfs_defer_nl7c_init) {
1923 1923                  nl7c_init();
1924 1924                  sockfs_defer_nl7c_init = 0;
1925 1925          }
1926 1926  
1927 1927          switch (cmd) {
1928 1928          case SOCKCONFIG_ADD_SOCK:
1929 1929                  error = sockconf_add_sock((int)(uintptr_t)arg1,
1930 1930                      (int)(uintptr_t)arg2, (int)(uintptr_t)arg3, arg4);
1931 1931                  break;
1932 1932          case SOCKCONFIG_REMOVE_SOCK:
1933 1933                  error = sockconf_remove_sock((int)(uintptr_t)arg1,
1934 1934                      (int)(uintptr_t)arg2, (int)(uintptr_t)arg3);
1935 1935                  break;
1936 1936          case SOCKCONFIG_ADD_FILTER:
1937 1937                  error = sockconfig_add_filter((const char *)arg1, arg2);
1938 1938                  break;
1939 1939          case SOCKCONFIG_REMOVE_FILTER:
1940 1940                  error = sockconfig_remove_filter((const char *)arg1);
1941 1941                  break;
1942 1942          case SOCKCONFIG_GET_SOCKTABLE:
1943 1943                  error = sockparams_copyout_socktable((int)(uintptr_t)arg1);
1944 1944                  break;
1945 1945          default:
1946 1946  #ifdef  DEBUG
1947 1947                  cmn_err(CE_NOTE, "sockconfig: unknown subcommand %d", cmd);
1948 1948  #endif
1949 1949                  error = EINVAL;
1950 1950                  break;
1951 1951          }
1952 1952  
1953 1953          if (error != 0) {
1954 1954                  eprintline(error);
1955 1955                  return (set_errno(error));
1956 1956          }
1957 1957          return (0);
1958 1958  }
1959 1959  
1960 1960  
1961 1961  /*
1962 1962   * Sendfile is implemented through two schemes, direct I/O or by
1963 1963   * caching in the filesystem page cache. We cache the input file by
1964 1964   * default and use direct I/O only if sendfile_max_size is set
1965 1965   * appropriately as explained below. Note that this logic is consistent
1966 1966   * with other filesystems where caching is turned on by default
1967 1967   * unless explicitly turned off by using the DIRECTIO ioctl.
1968 1968   *
1969 1969   * We choose a slightly different scheme here. One can turn off
1970 1970   * caching by setting sendfile_max_size to 0. One can also enable
1971 1971   * caching of files <= sendfile_max_size by setting sendfile_max_size
1972 1972   * to an appropriate value. By default sendfile_max_size is set to the
1973 1973   * maximum value so that all files are cached. In future, we may provide
1974 1974   * better interfaces for caching the file.
1975 1975   *
1976 1976   * Sendfile through Direct I/O (Zero copy)
1977 1977   * --------------------------------------
1978 1978   *
1979 1979   * As disks are normally slower than the network, we can't have a
1980 1980   * single thread that reads the disk and writes to the network. We
1981 1981   * need to have parallelism. This is done by having the sendfile
1982 1982   * thread create another thread that reads from the filesystem
1983 1983   * and queues it for network processing. In this scheme, the data
1984 1984   * is never copied anywhere, i.e. it is zero copy, unlike the other
1985 1985   * scheme.
1986 1986   *
1987 1987   * We have a sendfile queue (snfq) where each sendfile
1988 1988   * request (snf_req_t) is queued for processing by a thread. Number
1989 1989   * of threads is dynamically allocated and they exit if they are idling
1990 1990   * beyond a specified amount of time. When each request (snf_req_t) is
1991 1991   * processed by a thread, it produces a number of mblk_t structures to
1992 1992   * be consumed by the sendfile thread. snf_deque and snf_enque are
1993 1993   * used for consuming and producing mblks. Size of the filesystem
1994 1994   * read is determined by the tunable (sendfile_read_size). A single
1995 1995   * mblk holds sendfile_read_size worth of data (except the last
1996 1996   * read of the file) which is sent down as a whole to the network.
1997 1997   * sendfile_read_size is set to 1 MB as this seems to be the optimal
1998 1998   * value for the UFS filesystem backed by a striped storage array.
1999 1999   *
2000 2000   * Synchronisation between read (producer) and write (consumer) threads.
2001 2001   * --------------------------------------------------------------------
2002 2002   *
2003 2003   * sr_lock protects sr_mp_head and sr_mp_tail. The lock is held while
2004 2004   * adding and deleting items in this list. An error can happen anytime
2005 2005   * during read or write. There could be unprocessed mblks in the
2006 2006   * sr_mp_XXX list when a read or write error occurs. Whenever an error
2007 2007   * is encountered, we need two things to happen:
2008 2008   *
2009 2009   * a) One of the threads needs to clean up the mblks.
2010 2010   * b) When one thread encounters an error, the other should stop.
2011 2011   *
2012 2012   * For (a), we don't want to penalize the reader thread as it could do
2013 2013   * some useful work processing other requests. For (b), the error can
2014 2014   * be detected by examining sr_read_error or sr_write_error.
2015 2015   * sr_lock protects sr_read_error and sr_write_error. If both the reader
2016 2016   * and the writer encounter errors, we need to report the write error back to
2017 2017   * the application as that's what would have happened if the operations
2018 2018   * were done sequentially. With this in mind, following should work :
2019 2019   *
2020 2020   *      - Check for errors before read or write.
2021 2021   *      - If the reader encounters error, set the error in sr_read_error.
2022 2022   *        Check sr_write_error, if it is set, send cv_signal as it is
2023 2023   *        waiting for reader to complete. If it is not set, the writer
2024 2024   *        is either running sinking data to the network or blocked
2025 2025   *        because of flow control. For handling the latter case, we
2026 2026   *        always send a signal. In any case, it will examine sr_read_error
2027 2027   *        and return. sr_read_error is marked with SR_READ_DONE to tell
2028 2028   *        the writer that the reader is done in all the cases.
2029 2029   *      - If the writer encounters error, set the error in sr_write_error.
2030 2030   *        The reader thread is either blocked because of flow control or
2031 2031   *        running reading data from the disk. For the former, we need to
2032 2032   *        wakeup the thread. Again to keep it simple, we always wake up
2033 2033   *        the reader thread. Then, wait for the read thread to complete
2034 2034   *        if it is not done yet. Cleanup and return.
2035 2035   *
2036 2036   * High and low water marks for the read thread.
2037 2037   * --------------------------------------------
2038 2038   *
2039 2039   * If sendfile() is used to send data over a slow network, we need to
2040 2040   * make sure that the read thread does not produce data at a faster
2041 2041   * rate than the network. This can happen if the disk is faster than
2042 2042   * the network. In such a case, we don't want to build a very large queue.
2043 2043   * But we would still like to get all of the network throughput possible.
2044 2044   * This implies that network should never block waiting for data.
2045 2045   * As there are a lot of disk throughput/network throughput combinations
2046 2046   * possible, it is difficult to come up with an accurate number.
2047 2047   * A typical 10K RPM disk has a max seek latency 17ms and rotational
2048 2048   * latency of 3ms for reading a disk block. Thus, the total latency to
2049 2049   * initiate a new read, transfer data from the disk and queue for
2050 2050   * transmission would take about a max of 25ms. Today's max transfer rate
2051 2051   * for network is 100MB/sec. If the thread is blocked because of flow
2052 2052   * control, it would take 25ms to get new data ready for transmission.
2053 2053   * We have to make sure that network is not idling, while we are initiating
2054 2054   * new transfers. So, at 100MB/sec, to keep network busy we would need
2055 2055   * 2.5MB of data. Rounding off, we keep the low water mark to be 3MB of data.
2056 2056   * We need to pick a high water mark so that the woken up thread would
2057 2057   * do considerable work before blocking again to prevent thrashing. Currently,
2058 2058   * we pick this to be 10 times that of the low water mark.
2059 2059   *
2060 2060   * Sendfile with segmap caching (One copy from page cache to mblks).
2061 2061   * ----------------------------------------------------------------
2062 2062   *
2063 2063   * We use the segmap cache for caching the file, if the size of file
2064 2064   * is <= sendfile_max_size. In this case we don't use threads as VM
2065 2065   * is reasonably fast enough to keep up with the network. If the underlying
2066 2066   * transport allows, we call segmap_getmapflt() to map MAXBSIZE (8K) worth
2067 2067   * of data into segmap space, and use the virtual address from segmap
2068 2068   * directly through desballoc() to avoid copy. Once the transport is done
2069 2069   * with the data, the mapping will be released through segmap_release()
2070 2070   * called by the call-back routine.
2071 2071   *
2072 2072   * If zero-copy is not allowed by the transport, we simply call VOP_READ()
2073 2073   * to copy the data from the filesystem into our temporary network buffer.
2074 2074   *
2075 2075   * To disable caching, set sendfile_max_size to 0.
2076 2076   */
2077 2077  
           /*
            * Sendfile tunables and shared state; see the design comment above.
            *
            *   sendfile_read_size   size of each filesystem read (1 MB)
            *   sendfile_req_lowat   low water mark for queued data (3 MB)
            *   sendfile_req_hiwat   high water mark, 10 times the low water mark
            *   sendfile_max_size    files no larger than this are cached; 0 turns
            *                        caching off
            *
            * SENDFILE_REQ_LOWAT is parenthesized so that it expands as a single
            * operand wherever it is used.
            */
2078 2078  uint_t sendfile_read_size = 1024 * 1024;
2079 2079  #define SENDFILE_REQ_LOWAT      (3 * 1024 * 1024)
2080 2080  uint_t sendfile_req_lowat = SENDFILE_REQ_LOWAT;
2081 2081  uint_t sendfile_req_hiwat = 10 * SENDFILE_REQ_LOWAT;
2082 2082  struct sendfile_stats sf_stats;
2083 2083  struct sendfile_queue *snfq;
2084 2084  clock_t snfq_timeout;
2085 2085  off64_t sendfile_max_size;
2086 2086  
2087 2087  static void snf_enque(snf_req_t *, mblk_t *);
2088 2088  static mblk_t *snf_deque(snf_req_t *);
2089 2089  
           /*
            * Set up the global sendfile state: allocate the zeroed sendfile queue
            * (snfq), initialize its lock and condition variable, set its thread
            * limit to max_ncpus, set snfq_timeout to SNFQ_TIMEOUT, and set
            * sendfile_max_size to MAXOFFSET_T.
            */
2090 2090  void
2091 2091  sendfile_init(void)
2092 2092  {
2093 2093          snfq = kmem_zalloc(sizeof (struct sendfile_queue), KM_SLEEP);
2094 2094  
2095 2095          mutex_init(&snfq->snfq_lock, NULL, MUTEX_DEFAULT, NULL);
2096 2096          cv_init(&snfq->snfq_cv, NULL, CV_DEFAULT, NULL);
2097 2097          snfq->snfq_max_threads = max_ncpus;
2098 2098          snfq_timeout = SNFQ_TIMEOUT;
2099 2099          /* Cache all files by default. */
2100 2100          sendfile_max_size = MAXOFFSET_T;
2101 2101  }
2102 2102  
2103 2103  /*
2104 2104   * Queues a mblk_t for network processing.
            *
            * Appends `mp' to the tail of the request's mblk list under sr_lock,
            * signalling sr_cv when the list was previously empty, and adds
            * MBLKL(mp) to sr_qlen.  The caller then waits on sr_cv for as long as
            * sr_qlen is above sr_hiwat and sr_write_error is zero.
2105 2105   */
2106 2106  static void
2107 2107  snf_enque(snf_req_t *sr, mblk_t *mp)
2108 2108  {
2109 2109          mp->b_next = NULL;
2110 2110          mutex_enter(&sr->sr_lock);
2111 2111          if (sr->sr_mp_head == NULL) {
2112 2112                  sr->sr_mp_head = sr->sr_mp_tail = mp;
                           /* snf_deque() waits on sr_cv while the list is empty. */
2113 2113                  cv_signal(&sr->sr_cv);
2114 2114          } else {
2115 2115                  sr->sr_mp_tail->b_next = mp;
2116 2116                  sr->sr_mp_tail = mp;
2117 2117          }
2118 2118          sr->sr_qlen += MBLKL(mp);
                   /* Hold the producer back until the queue drains or a write fails. */
2119 2119          while ((sr->sr_qlen > sr->sr_hiwat) &&
2120 2120              (sr->sr_write_error == 0)) {
2121 2121                  sf_stats.ss_full_waits++;
2122 2122                  cv_wait(&sr->sr_cv, &sr->sr_lock);
2123 2123          }
2124 2124          mutex_exit(&sr->sr_lock);
2125 2125  }
2126 2126  
2127 2127  /*
2128 2128   * De-queues a mblk_t for network processing.
            *
            * Returns the mblk at the head of the request's list with b_next
            * cleared, after subtracting MBLKL() of it from sr_qlen and signalling
            * sr_cv when sr_qlen has dropped below sr_lowat.  Waits on sr_cv while
            * the list is empty and sr_read_error is zero.
            *
            * Returns NULL when sr_read_error holds an error other than
            * SR_READ_DONE, or when SR_READ_DONE is set and the list is empty.
2129 2129   */
2130 2130  static mblk_t *
2131 2131  snf_deque(snf_req_t *sr)
2132 2132  {
2133 2133          mblk_t *mp;
2134 2134  
2135 2135          mutex_enter(&sr->sr_lock);
2136 2136          /*
2137 2137           * If we have encountered an error on read or read is
2138 2138           * completed and no more mblks, return NULL.
2139 2139           * We need to check for NULL sr_mp_head also as
2140 2140           * the reads could have completed and there is
2141 2141           * nothing more to come.
2142 2142           */
2143 2143          if (((sr->sr_read_error & ~SR_READ_DONE) != 0) ||
2144 2144              ((sr->sr_read_error & SR_READ_DONE) &&
2145 2145              sr->sr_mp_head == NULL)) {
2146 2146                  mutex_exit(&sr->sr_lock);
2147 2147                  return (NULL);
2148 2148          }
2149 2149          /*
2150 2150           * To start with neither SR_READ_DONE is marked nor
2151 2151           * the error is set. When we wake up from cv_wait,
2152 2152           * following are the possibilities :
2153 2153           *
2154 2154           *      a) sr_read_error is zero and mblks are queued.
2155 2155           *      b) sr_read_error is set to SR_READ_DONE
2156 2156           *         and mblks are queued.
2157 2157           *      c) sr_read_error is set to SR_READ_DONE
2158 2158           *         and no mblks.
2159 2159           *      d) sr_read_error is set to some error other
2160 2160           *         than SR_READ_DONE.
2161 2161           */
2162 2162  
2163 2163          while ((sr->sr_read_error == 0) && (sr->sr_mp_head == NULL)) {
2164 2164                  sf_stats.ss_empty_waits++;
2165 2165                  cv_wait(&sr->sr_cv, &sr->sr_lock);
2166 2166          }
2167 2167          /* Handle (a) and (b) first  - the normal case. */
2168 2168          if (((sr->sr_read_error & ~SR_READ_DONE) == 0) &&
2169 2169              (sr->sr_mp_head != NULL)) {
2170 2170                  mp = sr->sr_mp_head;
2171 2171                  sr->sr_mp_head = mp->b_next;
2172 2172                  sr->sr_qlen -= MBLKL(mp);
                           /* snf_enque() waits on sr_cv while sr_qlen exceeds sr_hiwat. */
2173 2173                  if (sr->sr_qlen < sr->sr_lowat)
2174 2174                          cv_signal(&sr->sr_cv);
2175 2175                  mutex_exit(&sr->sr_lock);
2176 2176                  mp->b_next = NULL;
2177 2177                  return (mp);
2178 2178          }
2179 2179          /* Handle (c) and (d). */
2180 2180          mutex_exit(&sr->sr_lock);
2181 2181          return (NULL);
2182 2182  }
2183 2183  
2184 2184  /*
2185 2185   * Reads data from the filesystem and queues it for network processing.
2186 2186   */
2187 2187  void
2188 2188  snf_async_read(snf_req_t *sr)
2189 2189  {
2190 2190          size_t iosize;
2191 2191          u_offset_t fileoff;
2192 2192          u_offset_t size;
2193 2193          int ret_size;
2194 2194          int error;
2195 2195          file_t *fp;
2196 2196          mblk_t *mp;
2197 2197          struct vnode *vp;
2198 2198          int extra = 0;
2199 2199          int maxblk = 0;
2200 2200          int wroff = 0;
2201 2201          struct sonode *so;
2202 2202  
2203 2203          fp = sr->sr_fp;
2204 2204          size = sr->sr_file_size;
2205 2205          fileoff = sr->sr_file_off;
2206 2206  
2207 2207          /*
2208 2208           * Ignore the error for filesystems that doesn't support DIRECTIO.
2209 2209           */
2210 2210          (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON, 0,
2211 2211              kcred, NULL, NULL);
2212 2212  
2213 2213          vp = sr->sr_vp;
2214 2214          if (vp->v_type == VSOCK) {
2215 2215                  stdata_t *stp;
2216 2216  
2217 2217                  /*
2218 2218                   * Get the extra space to insert a header and a trailer.
2219 2219                   */
2220 2220                  so = VTOSO(vp);
2221 2221                  stp = vp->v_stream;
2222 2222                  if (stp == NULL) {
2223 2223                          wroff = so->so_proto_props.sopp_wroff;
2224 2224                          maxblk = so->so_proto_props.sopp_maxblk;
2225 2225                          extra = wroff + so->so_proto_props.sopp_tail;
2226 2226                  } else {
2227 2227                          wroff = (int)(stp->sd_wroff);
2228 2228                          maxblk = (int)(stp->sd_maxblk);
2229 2229                          extra = wroff + (int)(stp->sd_tail);
2230 2230                  }
2231 2231          }
2232 2232  
2233 2233          while ((size != 0) && (sr->sr_write_error == 0)) {
2234 2234  
2235 2235                  iosize = (int)MIN(sr->sr_maxpsz, size);
2236 2236  
2237 2237                  /*
2238 2238                   * Socket filters can limit the mblk size,
2239 2239                   * so limit reads to maxblk if there are
2240 2240                   * filters present.
2241 2241                   */
2242 2242                  if (vp->v_type == VSOCK &&
2243 2243                      so->so_filter_active > 0 && maxblk != INFPSZ)
2244 2244                          iosize = (int)MIN(iosize, maxblk);
2245 2245  
2246 2246                  if (is_system_labeled()) {
2247 2247                          mp = allocb_cred(iosize + extra, CRED(),
2248 2248                              curproc->p_pid);
2249 2249                  } else {
2250 2250                          mp = allocb(iosize + extra, BPRI_MED);
2251 2251                  }
2252 2252                  if (mp == NULL) {
2253 2253                          error = EAGAIN;
2254 2254                          break;
2255 2255                  }
2256 2256  
2257 2257                  mp->b_rptr += wroff;
2258 2258  
2259 2259                  ret_size = soreadfile(fp, mp->b_rptr, fileoff, &error, iosize);
2260 2260  
2261 2261                  /* Error or Reached EOF ? */
2262 2262                  if ((error != 0) || (ret_size == 0)) {
2263 2263                          freeb(mp);
2264 2264                          break;
2265 2265                  }
2266 2266                  mp->b_wptr = mp->b_rptr + ret_size;
2267 2267  
2268 2268                  snf_enque(sr, mp);
2269 2269                  size -= ret_size;
2270 2270                  fileoff += ret_size;
2271 2271          }
2272 2272          (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_OFF, 0,
2273 2273              kcred, NULL, NULL);
2274 2274          mutex_enter(&sr->sr_lock);
2275 2275          sr->sr_read_error = error;
2276 2276          sr->sr_read_error |= SR_READ_DONE;
2277 2277          cv_signal(&sr->sr_cv);
2278 2278          mutex_exit(&sr->sr_lock);
2279 2279  }
2280 2280  
2281 2281  void
2282 2282  snf_async_thread(void)
2283 2283  {
2284 2284          snf_req_t *sr;
2285 2285          callb_cpr_t cprinfo;
2286 2286          clock_t time_left = 1;
2287 2287  
2288 2288          CALLB_CPR_INIT(&cprinfo, &snfq->snfq_lock, callb_generic_cpr, "snfq");
2289 2289  
2290 2290          mutex_enter(&snfq->snfq_lock);
2291 2291          for (;;) {
2292 2292                  /*
2293 2293                   * If we didn't find a entry, then block until woken up
2294 2294                   * again and then look through the queues again.
2295 2295                   */
2296 2296                  while ((sr = snfq->snfq_req_head) == NULL) {
2297 2297                          CALLB_CPR_SAFE_BEGIN(&cprinfo);
2298 2298                          if (time_left <= 0) {
2299 2299                                  snfq->snfq_svc_threads--;
2300 2300                                  CALLB_CPR_EXIT(&cprinfo);
2301 2301                                  thread_exit();
2302 2302                                  /* NOTREACHED */
2303 2303                          }
2304 2304                          snfq->snfq_idle_cnt++;
2305 2305  
2306 2306                          time_left = cv_reltimedwait(&snfq->snfq_cv,
2307 2307                              &snfq->snfq_lock, snfq_timeout, TR_CLOCK_TICK);
2308 2308                          snfq->snfq_idle_cnt--;
2309 2309  
2310 2310                          CALLB_CPR_SAFE_END(&cprinfo, &snfq->snfq_lock);
2311 2311                  }
2312 2312                  snfq->snfq_req_head = sr->sr_next;
2313 2313                  snfq->snfq_req_cnt--;
2314 2314                  mutex_exit(&snfq->snfq_lock);
2315 2315                  snf_async_read(sr);
2316 2316                  mutex_enter(&snfq->snfq_lock);
2317 2317          }
2318 2318  }
2319 2319  
2320 2320  
2321 2321  snf_req_t *
2322 2322  create_thread(int operation, struct vnode *vp, file_t *fp,
2323 2323      u_offset_t fileoff, u_offset_t size)
2324 2324  {
2325 2325          snf_req_t *sr;
2326 2326          stdata_t *stp;
2327 2327  
2328 2328          sr = (snf_req_t *)kmem_zalloc(sizeof (snf_req_t), KM_SLEEP);
2329 2329  
2330 2330          sr->sr_vp = vp;
2331 2331          sr->sr_fp = fp;
2332 2332          stp = vp->v_stream;
2333 2333  
2334 2334          /*
2335 2335           * store sd_qn_maxpsz into sr_maxpsz while we have stream head.
2336 2336           * stream might be closed before thread returns from snf_async_read.
2337 2337           */
2338 2338          if (stp != NULL && stp->sd_qn_maxpsz > 0) {
2339 2339                  sr->sr_maxpsz = MIN(MAXBSIZE, stp->sd_qn_maxpsz);
2340 2340          } else {
2341 2341                  sr->sr_maxpsz = MAXBSIZE;
2342 2342          }
2343 2343  
2344 2344          sr->sr_operation = operation;
2345 2345          sr->sr_file_off = fileoff;
2346 2346          sr->sr_file_size = size;
2347 2347          sr->sr_hiwat = sendfile_req_hiwat;
2348 2348          sr->sr_lowat = sendfile_req_lowat;
2349 2349          mutex_init(&sr->sr_lock, NULL, MUTEX_DEFAULT, NULL);
2350 2350          cv_init(&sr->sr_cv, NULL, CV_DEFAULT, NULL);
2351 2351          /*
2352 2352           * See whether we need another thread for servicing this
2353 2353           * request. If there are already enough requests queued
2354 2354           * for the threads, create one if not exceeding
2355 2355           * snfq_max_threads.
2356 2356           */
2357 2357          mutex_enter(&snfq->snfq_lock);
2358 2358          if (snfq->snfq_req_cnt >= snfq->snfq_idle_cnt &&
2359 2359              snfq->snfq_svc_threads < snfq->snfq_max_threads) {
2360 2360                  (void) thread_create(NULL, 0, &snf_async_thread, 0, 0, &p0,
2361 2361                      TS_RUN, minclsyspri);
2362 2362                  snfq->snfq_svc_threads++;
2363 2363          }
2364 2364          if (snfq->snfq_req_head == NULL) {
2365 2365                  snfq->snfq_req_head = snfq->snfq_req_tail = sr;
2366 2366                  cv_signal(&snfq->snfq_cv);
2367 2367          } else {
2368 2368                  snfq->snfq_req_tail->sr_next = sr;
2369 2369                  snfq->snfq_req_tail = sr;
2370 2370          }
2371 2371          snfq->snfq_req_cnt++;
2372 2372          mutex_exit(&snfq->snfq_lock);
2373 2373          return (sr);
2374 2374  }
2375 2375  
2376 2376  int
2377 2377  snf_direct_io(file_t *fp, file_t *rfp, u_offset_t fileoff, u_offset_t size,
2378 2378      ssize_t *count)
2379 2379  {
2380 2380          snf_req_t *sr;
2381 2381          mblk_t *mp;
2382 2382          int iosize;
2383 2383          int error = 0;
2384 2384          short fflag;
2385 2385          struct vnode *vp;
2386 2386          int ksize;
2387 2387          struct nmsghdr msg;
2388 2388  
2389 2389          ksize = 0;
2390 2390          *count = 0;
2391 2391          bzero(&msg, sizeof (msg));
2392 2392  
2393 2393          vp = fp->f_vnode;
2394 2394          fflag = fp->f_flag;
2395 2395          if ((sr = create_thread(READ_OP, vp, rfp, fileoff, size)) == NULL)
2396 2396                  return (EAGAIN);
2397 2397  
2398 2398          /*
2399 2399           * We check for read error in snf_deque. It has to check
2400 2400           * for successful READ_DONE and return NULL, and we might
2401 2401           * as well make an additional check there.
2402 2402           */
2403 2403          while ((mp = snf_deque(sr)) != NULL) {
2404 2404  
2405 2405                  if (ISSIG(curthread, JUSTLOOKING)) {
2406 2406                          freeb(mp);
2407 2407                          error = EINTR;
2408 2408                          break;
2409 2409                  }
2410 2410                  iosize = MBLKL(mp);
2411 2411  
2412 2412                  error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2413 2413  
2414 2414                  if (error != 0) {
2415 2415                          if (mp != NULL)
2416 2416                                  freeb(mp);
2417 2417                          break;
2418 2418                  }
2419 2419                  ksize += iosize;
2420 2420          }
2421 2421          *count = ksize;
2422 2422  
2423 2423          mutex_enter(&sr->sr_lock);
2424 2424          sr->sr_write_error = error;
2425 2425          /* Look at the big comments on why we cv_signal here. */
2426 2426          cv_signal(&sr->sr_cv);
2427 2427  
2428 2428          /* Wait for the reader to complete always. */
2429 2429          while (!(sr->sr_read_error & SR_READ_DONE)) {
2430 2430                  cv_wait(&sr->sr_cv, &sr->sr_lock);
2431 2431          }
2432 2432          /* If there is no write error, check for read error. */
2433 2433          if (error == 0)
2434 2434                  error = (sr->sr_read_error & ~SR_READ_DONE);
2435 2435  
2436 2436          if (error != 0) {
2437 2437                  mblk_t *next_mp;
2438 2438  
2439 2439                  mp = sr->sr_mp_head;
2440 2440                  while (mp != NULL) {
2441 2441                          next_mp = mp->b_next;
2442 2442                          mp->b_next = NULL;
2443 2443                          freeb(mp);
2444 2444                          mp = next_mp;
2445 2445                  }
2446 2446          }
2447 2447          mutex_exit(&sr->sr_lock);
2448 2448          kmem_free(sr, sizeof (snf_req_t));
2449 2449          return (error);
2450 2450  }
2451 2451  
/* Upper bound on the pages vpm maps for sendfile in a single request. */
#define SNF_VPMMAXPGS   (VPMMAXPGS / 2)

/*
 * Number of slots in the mapping list handed back by vpm: one more than
 * SNF_VPMMAXPGS, so that the list can end with a NULL entry.
 */
#define SNF_MAXVMAPS    (SNF_VPMMAXPGS + 1)
2460 2460  
2461 2461  typedef struct {
2462 2462          unsigned int    snfv_ref;
2463 2463          frtn_t          snfv_frtn;
2464 2464          vnode_t         *snfv_vp;
2465 2465          struct vmap     snfv_vml[SNF_MAXVMAPS];
2466 2466  } snf_vmap_desbinfo;
2467 2467  
2468 2468  typedef struct {
2469 2469          frtn_t          snfi_frtn;
2470 2470          caddr_t         snfi_base;
2471 2471          uint_t          snfi_mapoff;
2472 2472          size_t          snfi_len;
2473 2473          vnode_t         *snfi_vp;
2474 2474  } snf_smap_desbinfo;
2475 2475  
2476 2476  /*
2477 2477   * The callback function used for vpm mapped mblks called when the last ref of
2478 2478   * the mblk is dropped which normally occurs when TCP receives the ack. But it
2479 2479   * can be the driver too due to lazy reclaim.
2480 2480   */
2481 2481  void
2482 2482  snf_vmap_desbfree(snf_vmap_desbinfo *snfv)
2483 2483  {
2484 2484          ASSERT(snfv->snfv_ref != 0);
2485 2485          if (atomic_dec_32_nv(&snfv->snfv_ref) == 0) {
2486 2486                  vpm_unmap_pages(snfv->snfv_vml, S_READ);
2487 2487                  VN_RELE(snfv->snfv_vp);
2488 2488                  kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2489 2489          }
2490 2490  }
2491 2491  
2492 2492  /*
2493 2493   * The callback function used for segmap'ped mblks called when the last ref of
2494 2494   * the mblk is dropped which normally occurs when TCP receives the ack. But it
2495 2495   * can be the driver too due to lazy reclaim.
2496 2496   */
2497 2497  void
2498 2498  snf_smap_desbfree(snf_smap_desbinfo *snfi)
2499 2499  {
2500 2500          if (! IS_KPM_ADDR(snfi->snfi_base)) {
2501 2501                  /*
2502 2502                   * We don't need to call segmap_fault(F_SOFTUNLOCK) for
2503 2503                   * segmap_kpm as long as the latter never falls back to
2504 2504                   * "use_segmap_range". (See segmap_getmapflt().)
2505 2505                   *
2506 2506                   * Using S_OTHER saves an redundant hat_setref() in
2507 2507                   * segmap_unlock()
2508 2508                   */
2509 2509                  (void) segmap_fault(kas.a_hat, segkmap,
2510 2510                      (caddr_t)(uintptr_t)(((uintptr_t)snfi->snfi_base +
2511 2511                      snfi->snfi_mapoff) & PAGEMASK), snfi->snfi_len,
2512 2512                      F_SOFTUNLOCK, S_OTHER);
2513 2513          }
2514 2514          (void) segmap_release(segkmap, snfi->snfi_base, SM_DONTNEED);
2515 2515          VN_RELE(snfi->snfi_vp);
2516 2516          kmem_free(snfi, sizeof (*snfi));
2517 2517  }
2518 2518  
2519 2519  /*
2520 2520   * Use segmap or vpm instead of bcopy to send down a desballoca'ed, mblk.
2521 2521   * When segmap is used, the mblk contains a segmap slot of no more
2522 2522   * than MAXBSIZE.
2523 2523   *
2524 2524   * With vpm, a maximum of SNF_MAXVMAPS page-sized mappings can be obtained
2525 2525   * in each iteration and sent by socket_sendmblk until an error occurs or
2526 2526   * the requested size has been transferred. An mblk is esballoca'ed from
2527 2527   * each mapped page and a chain of these mblk is sent to the transport layer.
2528 2528   * vpm will be called to unmap the pages when all mblks have been freed by
2529 2529   * free_func.
2530 2530   *
2531 2531   * At the end of the whole sendfile() operation, we wait till the data from
2532 2532   * the last mblk is ack'ed by the transport before returning so that the
2533 2533   * caller of sendfile() can safely modify the file content.
2534 2534   */
2535 2535  int
2536 2536  snf_segmap(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t total_size,
2537 2537      ssize_t *count, boolean_t nowait)
2538 2538  {
2539 2539          caddr_t base;
2540 2540          int mapoff;
2541 2541          vnode_t *vp;
2542 2542          mblk_t *mp = NULL;
2543 2543          int chain_size;
2544 2544          int error;
2545 2545          clock_t deadlk_wait;
2546 2546          short fflag;
2547 2547          int ksize;
2548 2548          struct vattr va;
2549 2549          boolean_t dowait = B_FALSE;
2550 2550          struct nmsghdr msg;
2551 2551  
2552 2552          vp = fp->f_vnode;
2553 2553          fflag = fp->f_flag;
2554 2554          ksize = 0;
2555 2555          bzero(&msg, sizeof (msg));
2556 2556  
2557 2557          for (;;) {
2558 2558                  if (ISSIG(curthread, JUSTLOOKING)) {
2559 2559                          error = EINTR;
2560 2560                          break;
2561 2561                  }
2562 2562  
2563 2563                  if (vpm_enable) {
2564 2564                          snf_vmap_desbinfo *snfv;
2565 2565                          mblk_t *nmp;
2566 2566                          int mblk_size;
2567 2567                          int maxsize;
2568 2568                          int i;
2569 2569  
2570 2570                          mapoff = fileoff & PAGEOFFSET;
2571 2571                          maxsize = MIN((SNF_VPMMAXPGS * PAGESIZE), total_size);
2572 2572  
2573 2573                          snfv = kmem_zalloc(sizeof (snf_vmap_desbinfo),
2574 2574                              KM_SLEEP);
2575 2575  
2576 2576                          /*
2577 2577                           * Get vpm mappings for maxsize with read access.
2578 2578                           * If the pages aren't available yet, we get
2579 2579                           * DEADLK, so wait and try again a little later using
2580 2580                           * an increasing wait. We might be here a long time.
2581 2581                           *
2582 2582                           * If delay_sig returns EINTR, be sure to exit and
2583 2583                           * pass it up to the caller.
2584 2584                           */
2585 2585                          deadlk_wait = 0;
2586 2586                          while ((error = vpm_map_pages(fvp, fileoff,
2587 2587                              (size_t)maxsize, (VPM_FETCHPAGE), snfv->snfv_vml,
2588 2588                              SNF_MAXVMAPS, NULL, S_READ)) == EDEADLK) {
2589 2589                                  deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2590 2590                                  if ((error = delay_sig(deadlk_wait)) != 0) {
2591 2591                                          break;
2592 2592                                  }
2593 2593                          }
2594 2594                          if (error != 0) {
2595 2595                                  kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2596 2596                                  error = (error == EINTR) ? EINTR : EIO;
2597 2597                                  goto out;
2598 2598                          }
2599 2599                          snfv->snfv_frtn.free_func = snf_vmap_desbfree;
2600 2600                          snfv->snfv_frtn.free_arg = (caddr_t)snfv;
2601 2601  
2602 2602                          /* Construct the mblk chain from the page mappings */
2603 2603                          chain_size = 0;
2604 2604                          for (i = 0; (snfv->snfv_vml[i].vs_addr != NULL) &&
2605 2605                              total_size > 0; i++) {
2606 2606                                  ASSERT(chain_size < maxsize);
2607 2607                                  mblk_size = MIN(snfv->snfv_vml[i].vs_len -
2608 2608                                      mapoff, total_size);
2609 2609                                  nmp = esballoca(
2610 2610                                      (uchar_t *)snfv->snfv_vml[i].vs_addr +
2611 2611                                      mapoff, mblk_size, BPRI_HI,
2612 2612                                      &snfv->snfv_frtn);
2613 2613  
2614 2614                                  /*
2615 2615                                   * We return EAGAIN after unmapping the pages
2616 2616                                   * if we cannot allocate the the head of the
2617 2617                                   * chain. Otherwise, we continue sending the
2618 2618                                   * mblks constructed so far.
2619 2619                                   */
2620 2620                                  if (nmp == NULL) {
2621 2621                                          if (i == 0) {
2622 2622                                                  vpm_unmap_pages(snfv->snfv_vml,
2623 2623                                                      S_READ);
2624 2624                                                  kmem_free(snfv,
2625 2625                                                      sizeof (snf_vmap_desbinfo));
2626 2626                                                  error = EAGAIN;
2627 2627                                                  goto out;
2628 2628                                          }
2629 2629                                          break;
2630 2630                                  }
2631 2631                                  /* Mark this dblk with the zero-copy flag */
2632 2632                                  nmp->b_datap->db_struioflag |= STRUIO_ZC;
2633 2633                                  nmp->b_wptr += mblk_size;
2634 2634                                  chain_size += mblk_size;
2635 2635                                  fileoff += mblk_size;
2636 2636                                  total_size -= mblk_size;
2637 2637                                  snfv->snfv_ref++;
2638 2638                                  mapoff = 0;
2639 2639                                  if (i > 0)
2640 2640                                          linkb(mp, nmp);
2641 2641                                  else
2642 2642                                          mp = nmp;
2643 2643                          }
2644 2644                          VN_HOLD(fvp);
2645 2645                          snfv->snfv_vp = fvp;
2646 2646                  } else {
2647 2647                          /* vpm not supported. fallback to segmap */
2648 2648                          snf_smap_desbinfo *snfi;
2649 2649  
2650 2650                          mapoff = fileoff & MAXBOFFSET;
2651 2651                          chain_size = MAXBSIZE - mapoff;
2652 2652                          if (chain_size > total_size)
2653 2653                                  chain_size = total_size;
2654 2654                          /*
2655 2655                           * we don't forcefault because we'll call
2656 2656                           * segmap_fault(F_SOFTLOCK) next.
2657 2657                           *
2658 2658                           * S_READ will get the ref bit set (by either
2659 2659                           * segmap_getmapflt() or segmap_fault()) and page
2660 2660                           * shared locked.
2661 2661                           */
2662 2662                          base = segmap_getmapflt(segkmap, fvp, fileoff,
2663 2663                              chain_size, segmap_kpm ? SM_FAULT : 0, S_READ);
2664 2664  
2665 2665                          snfi = kmem_alloc(sizeof (*snfi), KM_SLEEP);
2666 2666                          snfi->snfi_len = (size_t)roundup(mapoff+chain_size,
2667 2667                              PAGESIZE)- (mapoff & PAGEMASK);
2668 2668                          /*
2669 2669                           * We must call segmap_fault() even for segmap_kpm
2670 2670                           * because that's how error gets returned.
2671 2671                           * (segmap_getmapflt() never fails but segmap_fault()
2672 2672                           * does.)
2673 2673                           *
2674 2674                           * If the pages aren't available yet, we get
2675 2675                           * DEADLK, so wait and try again a little later using
2676 2676                           * an increasing wait. We might be here a long time.
2677 2677                           *
2678 2678                           * If delay_sig returns EINTR, be sure to exit and
2679 2679                           * pass it up to the caller.
2680 2680                           */
2681 2681                          deadlk_wait = 0;
2682 2682                          while ((error = FC_ERRNO(segmap_fault(kas.a_hat,
2683 2683                              segkmap, (caddr_t)(uintptr_t)(((uintptr_t)base +
2684 2684                              mapoff) & PAGEMASK), snfi->snfi_len, F_SOFTLOCK,
2685 2685                              S_READ))) == EDEADLK) {
2686 2686                                  deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2687 2687                                  if ((error = delay_sig(deadlk_wait)) != 0) {
2688 2688                                          break;
2689 2689                                  }
2690 2690                          }
2691 2691                          if (error != 0) {
2692 2692                                  (void) segmap_release(segkmap, base, 0);
2693 2693                                  kmem_free(snfi, sizeof (*snfi));
2694 2694                                  error = (error == EINTR) ? EINTR : EIO;
2695 2695                                  goto out;
2696 2696                          }
2697 2697                          snfi->snfi_frtn.free_func = snf_smap_desbfree;
2698 2698                          snfi->snfi_frtn.free_arg = (caddr_t)snfi;
2699 2699                          snfi->snfi_base = base;
2700 2700                          snfi->snfi_mapoff = mapoff;
2701 2701                          mp = esballoca((uchar_t *)base + mapoff, chain_size,
2702 2702                              BPRI_HI, &snfi->snfi_frtn);
2703 2703  
2704 2704                          if (mp == NULL) {
2705 2705                                  (void) segmap_fault(kas.a_hat, segkmap,
2706 2706                                      (caddr_t)(uintptr_t)(((uintptr_t)base +
2707 2707                                      mapoff) & PAGEMASK), snfi->snfi_len,
2708 2708                                      F_SOFTUNLOCK, S_OTHER);
2709 2709                                  (void) segmap_release(segkmap, base, 0);
2710 2710                                  kmem_free(snfi, sizeof (*snfi));
2711 2711                                  freemsg(mp);
2712 2712                                  error = EAGAIN;
2713 2713                                  goto out;
2714 2714                          }
2715 2715                          VN_HOLD(fvp);
2716 2716                          snfi->snfi_vp = fvp;
2717 2717                          mp->b_wptr += chain_size;
2718 2718  
2719 2719                          /* Mark this dblk with the zero-copy flag */
2720 2720                          mp->b_datap->db_struioflag |= STRUIO_ZC;
2721 2721                          fileoff += chain_size;
2722 2722                          total_size -= chain_size;
2723 2723                  }
2724 2724  
2725 2725                  if (total_size == 0 && !nowait) {
2726 2726                          ASSERT(!dowait);
2727 2727                          dowait = B_TRUE;
2728 2728                          mp->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
2729 2729                  }
2730 2730                  VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2731 2731                  error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2732 2732                  if (error != 0) {
2733 2733                          /*
2734 2734                           * mp contains the mblks that were not sent by
2735 2735                           * socket_sendmblk. Use its size to update *count
2736 2736                           */
2737 2737                          *count = ksize + (chain_size - msgdsize(mp));
2738 2738                          if (mp != NULL)
2739 2739                                  freemsg(mp);
2740 2740                          return (error);
2741 2741                  }
2742 2742                  ksize += chain_size;
2743 2743                  if (total_size == 0)
2744 2744                          goto done;
2745 2745  
2746 2746                  (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2747 2747                  va.va_mask = AT_SIZE;
2748 2748                  error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2749 2749                  if (error)
2750 2750                          break;
2751 2751                  /* Read as much as possible. */
2752 2752                  if (fileoff >= va.va_size)
2753 2753                          break;
2754 2754                  if (total_size + fileoff > va.va_size)
2755 2755                          total_size = va.va_size - fileoff;
2756 2756          }
2757 2757  out:
2758 2758          VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2759 2759  done:
2760 2760          *count = ksize;
2761 2761          if (dowait) {
2762 2762                  stdata_t *stp;
2763 2763  
2764 2764                  stp = vp->v_stream;
2765 2765                  if (stp == NULL) {
2766 2766                          struct sonode *so;
2767 2767                          so = VTOSO(vp);
2768 2768                          error = so_zcopy_wait(so);
2769 2769                  } else {
2770 2770                          mutex_enter(&stp->sd_lock);
2771 2771                          while (!(stp->sd_flag & STZCNOTIFY)) {
2772 2772                                  if (cv_wait_sig(&stp->sd_zcopy_wait,
2773 2773                                      &stp->sd_lock) == 0) {
2774 2774                                          error = EINTR;
2775 2775                                          break;
2776 2776                                  }
2777 2777                          }
2778 2778                          stp->sd_flag &= ~STZCNOTIFY;
2779 2779                          mutex_exit(&stp->sd_lock);
2780 2780                  }
2781 2781          }
2782 2782          return (error);
2783 2783  }
2784 2784  
2785 2785  int
2786 2786  snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
2787 2787      uint_t maxpsz, ssize_t *count)
2788 2788  {
2789 2789          struct vnode *vp;
2790 2790          mblk_t *mp;
2791 2791          int iosize;
2792 2792          int extra = 0;
2793 2793          int error;
2794 2794          short fflag;
2795 2795          int ksize;
2796 2796          int ioflag;
2797 2797          struct uio auio;
2798 2798          struct iovec aiov;
2799 2799          struct vattr va;
2800 2800          int maxblk = 0;
2801 2801          int wroff = 0;
2802 2802          struct sonode *so;
2803 2803          struct nmsghdr msg;
2804 2804  
2805 2805          vp = fp->f_vnode;
2806 2806          if (vp->v_type == VSOCK) {
2807 2807                  stdata_t *stp;
2808 2808  
2809 2809                  /*
2810 2810                   * Get the extra space to insert a header and a trailer.
2811 2811                   */
2812 2812                  so = VTOSO(vp);
2813 2813                  stp = vp->v_stream;
2814 2814                  if (stp == NULL) {
2815 2815                          wroff = so->so_proto_props.sopp_wroff;
2816 2816                          maxblk = so->so_proto_props.sopp_maxblk;
2817 2817                          extra = wroff + so->so_proto_props.sopp_tail;
2818 2818                  } else {
2819 2819                          wroff = (int)(stp->sd_wroff);
2820 2820                          maxblk = (int)(stp->sd_maxblk);
2821 2821                          extra = wroff + (int)(stp->sd_tail);
2822 2822                  }
2823 2823          }
2824 2824          bzero(&msg, sizeof (msg));
2825 2825          fflag = fp->f_flag;
2826 2826          ksize = 0;
2827 2827          auio.uio_iov = &aiov;
2828 2828          auio.uio_iovcnt = 1;
2829 2829          auio.uio_segflg = UIO_SYSSPACE;
2830 2830          auio.uio_llimit = MAXOFFSET_T;
2831 2831          auio.uio_fmode = fflag;
2832 2832          auio.uio_extflg = UIO_COPY_CACHED;
2833 2833          ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
2834 2834          /* If read sync is not asked for, filter sync flags */
2835 2835          if ((ioflag & FRSYNC) == 0)
2836 2836                  ioflag &= ~(FSYNC|FDSYNC);
2837 2837          for (;;) {
2838 2838                  if (ISSIG(curthread, JUSTLOOKING)) {
2839 2839                          error = EINTR;
2840 2840                          break;
2841 2841                  }
2842 2842                  iosize = (int)MIN(maxpsz, size);
2843 2843  
2844 2844                  /*
2845 2845                   * Socket filters can limit the mblk size,
2846 2846                   * so limit reads to maxblk if there are
2847 2847                   * filters present.
2848 2848                   */
2849 2849                  if (vp->v_type == VSOCK &&
2850 2850                      so->so_filter_active > 0 && maxblk != INFPSZ)
2851 2851                          iosize = (int)MIN(iosize, maxblk);
2852 2852  
2853 2853                  if (is_system_labeled()) {
2854 2854                          mp = allocb_cred(iosize + extra, CRED(),
2855 2855                              curproc->p_pid);
2856 2856                  } else {
2857 2857                          mp = allocb(iosize + extra, BPRI_MED);
2858 2858                  }
2859 2859                  if (mp == NULL) {
2860 2860                          error = EAGAIN;
2861 2861                          break;
2862 2862                  }
2863 2863  
2864 2864                  mp->b_rptr += wroff;
2865 2865  
2866 2866                  aiov.iov_base = (caddr_t)mp->b_rptr;
2867 2867                  aiov.iov_len = iosize;
2868 2868                  auio.uio_loffset = fileoff;
2869 2869                  auio.uio_resid = iosize;
2870 2870  
2871 2871                  error = VOP_READ(fvp, &auio, ioflag, fp->f_cred, NULL);
2872 2872                  iosize -= auio.uio_resid;
2873 2873  
2874 2874                  if (error == EINTR && iosize != 0)
2875 2875                          error = 0;
2876 2876  
2877 2877                  if (error != 0 || iosize == 0) {
2878 2878                          freeb(mp);
2879 2879                          break;
2880 2880                  }
2881 2881                  mp->b_wptr = mp->b_rptr + iosize;
2882 2882  
2883 2883                  VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2884 2884  
2885 2885                  error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2886 2886  
2887 2887                  if (error != 0) {
2888 2888                          *count = ksize;
2889 2889                          if (mp != NULL)
2890 2890                                  freeb(mp);
2891 2891                          return (error);
2892 2892                  }
2893 2893                  ksize += iosize;
2894 2894                  size -= iosize;
2895 2895                  if (size == 0)
2896 2896                          goto done;
2897 2897  
2898 2898                  fileoff += iosize;
2899 2899                  (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2900 2900                  va.va_mask = AT_SIZE;
2901 2901                  error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2902 2902                  if (error)
2903 2903                          break;
2904 2904                  /* Read as much as possible. */
2905 2905                  if (fileoff >= va.va_size)
2906 2906                          size = 0;
2907 2907                  else if (size + fileoff > va.va_size)
2908 2908                          size = va.va_size - fileoff;
2909 2909          }
2910 2910          VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2911 2911  done:
2912 2912          *count = ksize;
2913 2913          return (error);
2914 2914  }
2915 2915  
2916 2916  #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
2917 2917  /*
2918 2918   * Largefile support for 32 bit applications only.
2919 2919   */
2920 2920  int
2921 2921  sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv,
2922 2922      ssize32_t *count32)
2923 2923  {
2924 2924          ssize32_t sfv_len;
2925 2925          u_offset_t sfv_off, va_size;
2926 2926          struct vnode *vp, *fvp, *realvp;
2927 2927          struct vattr va;
2928 2928          stdata_t *stp;
2929 2929          ssize_t count = 0;
2930 2930          int error = 0;
2931 2931          boolean_t dozcopy = B_FALSE;
2932 2932          uint_t maxpsz;
2933 2933  
2934 2934          sfv_len = (ssize32_t)sfv->sfv_len;
2935 2935          if (sfv_len < 0) {
2936 2936                  error = EINVAL;
2937 2937                  goto out;
2938 2938          }
2939 2939  
2940 2940          if (sfv_len == 0) goto out;
2941 2941  
2942 2942          sfv_off = (u_offset_t)sfv->sfv_off;
2943 2943  
2944 2944          /* Same checks as in pread */
2945 2945          if (sfv_off > MAXOFFSET_T) {
2946 2946                  error = EINVAL;
2947 2947                  goto out;
2948 2948          }
2949 2949          if (sfv_off + sfv_len > MAXOFFSET_T)
2950 2950                  sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
2951 2951  
2952 2952          /*
2953 2953           * There are no more checks on sfv_len. So, we cast it to
2954 2954           * u_offset_t and share the snf_direct_io/snf_cache code between
2955 2955           * 32 bit and 64 bit.
2956 2956           *
2957 2957           * TODO: should do nbl_need_check() like read()?
2958 2958           */
2959 2959          if (sfv_len > sendfile_max_size) {
2960 2960                  sf_stats.ss_file_not_cached++;
2961 2961                  error = snf_direct_io(fp, rfp, sfv_off, (u_offset_t)sfv_len,
2962 2962                      &count);
2963 2963                  goto out;
2964 2964          }
2965 2965          fvp = rfp->f_vnode;
2966 2966          if (VOP_REALVP(fvp, &realvp, NULL) == 0)
2967 2967                  fvp = realvp;
2968 2968          /*
2969 2969           * Grab the lock as a reader to prevent the file size
2970 2970           * from changing underneath.
2971 2971           */
2972 2972          (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2973 2973          va.va_mask = AT_SIZE;
2974 2974          error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2975 2975          va_size = va.va_size;
2976 2976          if ((error != 0) || (va_size == 0) || (sfv_off >= va_size)) {
2977 2977                  VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2978 2978                  goto out;
2979 2979          }
2980 2980          /* Read as much as possible. */
2981 2981          if (sfv_off + sfv_len > va_size)
2982 2982                  sfv_len = va_size - sfv_off;
2983 2983  
2984 2984          vp = fp->f_vnode;
2985 2985          stp = vp->v_stream;
2986 2986          /*
2987 2987           * When the NOWAIT flag is not set, we enable zero-copy only if the
2988 2988           * transfer size is large enough. This prevents performance loss
2989 2989           * when the caller sends the file piece by piece.
2990 2990           */
2991 2991          if (sfv_len >= MAXBSIZE && (sfv_len >= (va_size >> 1) ||
2992 2992              (sfv->sfv_flag & SFV_NOWAIT) || sfv_len >= 0x1000000) &&
2993 2993              !vn_has_flocks(fvp) && !(fvp->v_flag & VNOMAP)) {
2994 2994                  uint_t copyflag;
2995 2995                  copyflag = stp != NULL ? stp->sd_copyflag :
2996 2996                      VTOSO(vp)->so_proto_props.sopp_zcopyflag;
2997 2997                  if ((copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) {
2998 2998                          int on = 1;
2999 2999  
3000 3000                          if (socket_setsockopt(VTOSO(vp), SOL_SOCKET,
3001 3001                              SO_SND_COPYAVOID, &on, sizeof (on), CRED()) == 0)
3002 3002                                  dozcopy = B_TRUE;
3003 3003                  } else {
3004 3004                          dozcopy = copyflag & STZCVMSAFE;
3005 3005                  }
3006 3006          }
3007 3007          if (dozcopy) {
3008 3008                  sf_stats.ss_file_segmap++;
3009 3009                  error = snf_segmap(fp, fvp, sfv_off, (u_offset_t)sfv_len,
3010 3010                      &count, ((sfv->sfv_flag & SFV_NOWAIT) != 0));
3011 3011          } else {
3012 3012                  if (vp->v_type == VSOCK && stp == NULL) {
3013 3013                          sonode_t *so = VTOSO(vp);
3014 3014                          maxpsz = so->so_proto_props.sopp_maxpsz;
3015 3015                  } else if (stp != NULL) {
3016 3016                          maxpsz = stp->sd_qn_maxpsz;
3017 3017                  } else {
3018 3018                          maxpsz = maxphys;
3019 3019                  }
3020 3020  
3021 3021                  if (maxpsz == INFPSZ)
3022 3022                          maxpsz = maxphys;
3023 3023                  else
3024 3024                          maxpsz = roundup(maxpsz, MAXBSIZE);
3025 3025                  sf_stats.ss_file_cached++;
3026 3026                  error = snf_cache(fp, fvp, sfv_off, (u_offset_t)sfv_len,
3027 3027                      maxpsz, &count);
3028 3028          }
3029 3029  out:
3030 3030          releasef(sfv->sfv_fd);
3031 3031          *count32 = (ssize32_t)count;
3032 3032          return (error);
3033 3033  }
3034 3034  #endif
3035 3035  
3036 3036  #ifdef _SYSCALL32_IMPL
3037 3037  /*
3038 3038   * recv32(), recvfrom32(), send32(), sendto32(): intentionally return a
3039 3039   * ssize_t rather than ssize32_t; see the comments above read32 for details.
3040 3040   */
3041 3041  
3042 3042  ssize_t
3043 3043  recv32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
3044 3044  {
3045 3045          return (recv(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
3046 3046  }
3047 3047  
3048 3048  ssize_t
3049 3049  recvfrom32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
3050 3050          caddr32_t name, caddr32_t namelenp)
3051 3051  {
3052 3052          return (recvfrom(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
3053 3053              (void *)(uintptr_t)name, (void *)(uintptr_t)namelenp));
3054 3054  }
3055 3055  
3056 3056  ssize_t
3057 3057  send32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
3058 3058  {
3059 3059          return (send(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
3060 3060  }
3061 3061  
3062 3062  ssize_t
3063 3063  sendto32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
3064 3064          caddr32_t name, socklen_t namelen)
3065 3065  {
3066 3066          return (sendto(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
3067 3067              (void *)(uintptr_t)name, namelen));
3068 3068  }
3069 3069  #endif  /* _SYSCALL32_IMPL */
3070 3070  
3071 3071  /*
3072 3072   * Function wrappers (mostly around the sonode switch) for
3073 3073   * backward compatibility.
3074 3074   */
3075 3075  
/*
 * Compatibility wrapper: socket_accept() with the caller's credentials.
 */
int
soaccept(struct sonode *so, int fflag, struct sonode **nsop)
{
        int error;

        error = socket_accept(so, fflag, CRED(), nsop);
        return (error);
}
3081 3081  
3082 3082  int
3083 3083  sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3084 3084      int backlog, int flags)
3085 3085  {
3086 3086          int     error;
3087 3087  
3088 3088          error = socket_bind(so, name, namelen, flags, CRED());
3089 3089          if (error == 0 && backlog != 0)
3090 3090                  return (socket_listen(so, backlog, CRED()));
3091 3091  
3092 3092          return (error);
3093 3093  }
3094 3094  
/*
 * Compatibility wrapper: socket_listen() with the caller's credentials.
 */
int
solisten(struct sonode *so, int backlog)
{
        int error;

        error = socket_listen(so, backlog, CRED());
        return (error);
}
3100 3100  
3101 3101  int
3102 3102  soconnect(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3103 3103      int fflag, int flags)
3104 3104  {
3105 3105          return (socket_connect(so, name, namelen, fflag, flags, CRED()));
3106 3106  }
3107 3107  
/*
 * Compatibility wrapper: socket_recvmsg() with the caller's credentials.
 */
int
sorecvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
{
        int error;

        error = socket_recvmsg(so, msg, uiop, CRED());
        return (error);
}
3113 3113  
/*
 * Compatibility wrapper: socket_sendmsg() with the caller's credentials.
 */
int
sosendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
{
        int error;

        error = socket_sendmsg(so, msg, uiop, CRED());
        return (error);
}
3119 3119  
/*
 * Compatibility wrapper: socket_shutdown() with the caller's credentials.
 */
int
soshutdown(struct sonode *so, int how)
{
        int error;

        error = socket_shutdown(so, how, CRED());
        return (error);
}
3125 3125  
3126 3126  int
3127 3127  sogetsockopt(struct sonode *so, int level, int option_name, void *optval,
3128 3128      socklen_t *optlenp, int flags)
3129 3129  {
3130 3130          return (socket_getsockopt(so, level, option_name, optval, optlenp,
3131 3131              flags, CRED()));
3132 3132  }
3133 3133  
3134 3134  int
3135 3135  sosetsockopt(struct sonode *so, int level, int option_name, const void *optval,
3136 3136      t_uscalar_t optlen)
3137 3137  {
3138 3138          return (socket_setsockopt(so, level, option_name, optval, optlen,
3139 3139              CRED()));
3140 3140  }
3141 3141  
3142 3142  /*
3143 3143   * Because this is backward compatibility interface it only needs to be
3144 3144   * able to handle the creation of TPI sockfs sockets.
3145 3145   */
3146 3146  struct sonode *
3147 3147  socreate(struct sockparams *sp, int family, int type, int protocol, int version,
3148 3148      int *errorp)
3149 3149  {
3150 3150          struct sonode *so;
3151 3151  
3152 3152          ASSERT(sp != NULL);
3153 3153  
3154 3154          so = sp->sp_smod_info->smod_sock_create_func(sp, family, type, protocol,
3155 3155              version, SOCKET_SLEEP, errorp, CRED());
3156 3156          if (so == NULL) {
3157 3157                  SOCKPARAMS_DEC_REF(sp);
3158 3158          } else {
3159 3159                  if ((*errorp = SOP_INIT(so, NULL, CRED(), SOCKET_SLEEP)) == 0) {
3160 3160                          /* Cannot fail, only bumps so_count */
3161 3161                          (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, CRED(), NULL);
3162 3162                  } else {
3163 3163                          socket_destroy(so);
3164 3164                          so = NULL;
3165 3165                  }
3166 3166          }
3167 3167          return (so);
3168 3168  }

↓ open down ↓

3168 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX