    
OS-3752 Increase IOV_MAX to at least 1024
OS-3404 lx brand must support sendmsg() with IOV_MAX of 1024
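
In summary, this webrev drops the hard-coded MSG_MAXIOVLEN (16) cap on the iovec count for sendmsg()/recvmsg() in sockfs in favor of IOV_MAX (raised to at least 1024 per OS-3752), keeping small iovec arrays on the stack (IOV_MAX_STACK) and falling back to kmem_alloc() for larger counts. The sketch below is a minimal userland illustration of what the change enables; it is not part of the webrev, and the socketpair() setup and the 64-entry iovec count are illustrative assumptions.

	/*
	 * Userland sketch (not part of the webrev): before this change,
	 * sockfs capped sendmsg()/recvmsg() at MSG_MAXIOVLEN (16) iovec
	 * entries and returned EMSGSIZE above that; with the change the
	 * cap is IOV_MAX.  The AF_UNIX socketpair and 64-entry iovec are
	 * illustrative assumptions.
	 */
	#include <sys/socket.h>
	#include <sys/uio.h>
	#include <limits.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	int
	main(void)
	{
		int fds[2];
		int niov = 64;		/* > 16, well under IOV_MAX */
		struct iovec *iov;
		struct msghdr msg;
		char byte = 'x';
		int i;

		if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) != 0) {
			perror("socketpair");
			return (1);
		}

		iov = calloc(niov, sizeof (struct iovec));
		if (iov == NULL)
			return (1);
		for (i = 0; i < niov; i++) {
			iov[i].iov_base = &byte;
			iov[i].iov_len = 1;
		}

		memset(&msg, 0, sizeof (msg));
		msg.msg_iov = iov;
		msg.msg_iovlen = niov;

		/* Previously failed with EMSGSIZE once niov exceeded 16. */
		if (sendmsg(fds[0], &msg, 0) < 0)
			perror("sendmsg");
		else
			(void) printf("sent %d iovecs\n", niov);

		free(iov);
		return (0);
	}
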
    
      
    
    
          --- old/usr/src/uts/common/fs/sockfs/socksyscalls.c
          +++ new/usr/src/uts/common/fs/sockfs/socksyscalls.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
       24 + * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
       25 + * Copyright 2015, Joyent, Inc.  All rights reserved.
  24   26   */
  25   27  
  26   28  /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */
  27   29  /*
  28   30   * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  29   31   */
  30   32  
  31   33  #include <sys/types.h>
  32   34  #include <sys/t_lock.h>
  33   35  #include <sys/param.h>
  34   36  #include <sys/systm.h>
  35   37  #include <sys/buf.h>
  36   38  #include <sys/conf.h>
  37   39  #include <sys/cred.h>
  38   40  #include <sys/kmem.h>
  39   41  #include <sys/sysmacros.h>
  40   42  #include <sys/vfs.h>
  41   43  #include <sys/vnode.h>
  42   44  #include <sys/debug.h>
  43   45  #include <sys/errno.h>
  44   46  #include <sys/time.h>
  45   47  #include <sys/file.h>
  46   48  #include <sys/user.h>
  
  47   49  #include <sys/stream.h>
  48   50  #include <sys/strsubr.h>
  49   51  #include <sys/strsun.h>
  50   52  #include <sys/sunddi.h>
  51   53  #include <sys/esunddi.h>
  52   54  #include <sys/flock.h>
  53   55  #include <sys/modctl.h>
  54   56  #include <sys/cmn_err.h>
  55   57  #include <sys/vmsystm.h>
  56   58  #include <sys/policy.h>
       59 +#include <sys/limits.h>
  57   60  
  58   61  #include <sys/socket.h>
  59   62  #include <sys/socketvar.h>
  60   63  
  61   64  #include <sys/isa_defs.h>
  62   65  #include <sys/inttypes.h>
  63   66  #include <sys/systm.h>
  64   67  #include <sys/cpuvar.h>
  65   68  #include <sys/filio.h>
  66   69  #include <sys/sendfile.h>
  67   70  #include <sys/ddi.h>
  68   71  #include <vm/seg.h>
  69   72  #include <vm/seg_map.h>
  70   73  #include <vm/seg_kpm.h>
  71   74  
  72   75  #include <fs/sockfs/nl7c.h>
  73   76  #include <fs/sockfs/sockcommon.h>
  74   77  #include <fs/sockfs/sockfilter_impl.h>
  75   78  #include <fs/sockfs/socktpi.h>
  76   79  
  77   80  #ifdef SOCK_TEST
  78   81  int do_useracc = 1;             /* Controlled by setting SO_DEBUG to 4 */
  
  79   82  #else
  80   83  #define do_useracc      1
  81   84  #endif /* SOCK_TEST */
  82   85  
  83   86  extern int      xnet_truncate_print;
  84   87  
  85   88  extern void     nl7c_init(void);
  86   89  extern int      sockfs_defer_nl7c_init;
  87   90  
  88   91  /*
  89      - * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c"
  90      - *       as there isn't a formal definition of IOV_MAX ???
  91      - */
  92      -#define MSG_MAXIOVLEN   16
  93      -
  94      -/*
  95   92   * Kernel component of socket creation.
  96   93   *
  97   94   * The socket library determines which version number to use.
  98   95   * First the library calls this with a NULL devpath. If this fails
  99   96   * to find a transport (using solookup) the library will look in /etc/netconfig
 100   97   * for the appropriate transport. If one is found it will pass in the
 101   98   * devpath for the kernel to use.
 102   99   */
 103  100  int
 104  101  so_socket(int family, int type_w_flags, int protocol, char *devpath,
 105  102      int version)
 106  103  {
 107  104          struct sonode *so;
 108  105          vnode_t *vp;
 109  106          struct file *fp;
 110  107          int fd;
 111  108          int error;
 112  109          int type;
 113  110  
 114  111          type = type_w_flags & SOCK_TYPE_MASK;
 115  112          type_w_flags &= ~SOCK_TYPE_MASK;
 116  113          if (type_w_flags & ~(SOCK_CLOEXEC|SOCK_NDELAY|SOCK_NONBLOCK))
 117  114                  return (set_errno(EINVAL));
 118  115  
 119  116          if (devpath != NULL) {
 120  117                  char *buf;
 121  118                  size_t kdevpathlen = 0;
 122  119  
 123  120                  buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 124  121                  if ((error = copyinstr(devpath, buf,
 125  122                      MAXPATHLEN, &kdevpathlen)) != 0) {
 126  123                          kmem_free(buf, MAXPATHLEN);
 127  124                          return (set_errno(error));
 128  125                  }
 129  126                  so = socket_create(family, type, protocol, buf, NULL,
 130  127                      SOCKET_SLEEP, version, CRED(), &error);
 131  128                  kmem_free(buf, MAXPATHLEN);
 132  129          } else {
 133  130                  so = socket_create(family, type, protocol, NULL, NULL,
 134  131                      SOCKET_SLEEP, version, CRED(), &error);
 135  132          }
 136  133          if (so == NULL)
 137  134                  return (set_errno(error));
 138  135  
 139  136          /* Allocate a file descriptor for the socket */
 140  137          vp = SOTOV(so);
 141  138          if (error = falloc(vp, FWRITE|FREAD, &fp, &fd)) {
 142  139                  (void) socket_close(so, 0, CRED());
 143  140                  socket_destroy(so);
 144  141                  return (set_errno(error));
 145  142          }
 146  143  
 147  144          /*
 148  145           * Now fill in the entries that falloc reserved
 149  146           */
 150  147          if (type_w_flags & SOCK_NDELAY) {
 151  148                  so->so_state |= SS_NDELAY;
 152  149                  fp->f_flag |= FNDELAY;
 153  150          }
 154  151          if (type_w_flags & SOCK_NONBLOCK) {
 155  152                  so->so_state |= SS_NONBLOCK;
 156  153                  fp->f_flag |= FNONBLOCK;
 157  154          }
 158  155          mutex_exit(&fp->f_tlock);
 159  156          setf(fd, fp);
 160  157          if ((type_w_flags & SOCK_CLOEXEC) != 0) {
 161  158                  f_setfd(fd, FD_CLOEXEC);
 162  159          }
 163  160  
 164  161          return (fd);
 165  162  }
 166  163  
 167  164  /*
 168  165   * Map from a file descriptor to a socket node.
 169  166   * Returns with the file descriptor held i.e. the caller has to
 170  167   * use releasef when done with the file descriptor.
 171  168   */
 172  169  struct sonode *
 173  170  getsonode(int sock, int *errorp, file_t **fpp)
 174  171  {
 175  172          file_t *fp;
 176  173          vnode_t *vp;
 177  174          struct sonode *so;
 178  175  
 179  176          if ((fp = getf(sock)) == NULL) {
 180  177                  *errorp = EBADF;
 181  178                  eprintline(*errorp);
 182  179                  return (NULL);
 183  180          }
 184  181          vp = fp->f_vnode;
 185  182          /* Check if it is a socket */
 186  183          if (vp->v_type != VSOCK) {
 187  184                  releasef(sock);
 188  185                  *errorp = ENOTSOCK;
 189  186                  eprintline(*errorp);
 190  187                  return (NULL);
 191  188          }
 192  189          /*
 193  190           * Use the stream head to find the real socket vnode.
 194  191           * This is needed when namefs sits above sockfs.
 195  192           */
 196  193          if (vp->v_stream) {
 197  194                  ASSERT(vp->v_stream->sd_vnode);
 198  195                  vp = vp->v_stream->sd_vnode;
 199  196  
 200  197                  so = VTOSO(vp);
 201  198                  if (so->so_version == SOV_STREAM) {
 202  199                          releasef(sock);
 203  200                          *errorp = ENOTSOCK;
 204  201                          eprintsoline(so, *errorp);
 205  202                          return (NULL);
 206  203                  }
 207  204          } else {
 208  205                  so = VTOSO(vp);
 209  206          }
 210  207          if (fpp)
 211  208                  *fpp = fp;
 212  209          return (so);
 213  210  }
 214  211  
 215  212  /*
 216  213   * Allocate and copyin a sockaddr.
 217  214   * Ensures NULL termination for AF_UNIX addresses by extending them
 218  215   * with one NULL byte if need be. Verifies that the length is not
 219  216   * excessive to prevent an application from consuming all of kernel
 220  217   * memory. Returns NULL when an error occurred.
 221  218   */
 222  219  static struct sockaddr *
 223  220  copyin_name(struct sonode *so, struct sockaddr *name, socklen_t *namelenp,
 224  221              int *errorp)
 225  222  {
 226  223          char    *faddr;
 227  224          size_t  namelen = (size_t)*namelenp;
 228  225  
 229  226          ASSERT(namelen != 0);
 230  227          if (namelen > SO_MAXARGSIZE) {
 231  228                  *errorp = EINVAL;
 232  229                  eprintsoline(so, *errorp);
 233  230                  return (NULL);
 234  231          }
 235  232  
 236  233          faddr = (char *)kmem_alloc(namelen, KM_SLEEP);
 237  234          if (copyin(name, faddr, namelen)) {
 238  235                  kmem_free(faddr, namelen);
 239  236                  *errorp = EFAULT;
 240  237                  eprintsoline(so, *errorp);
 241  238                  return (NULL);
 242  239          }
 243  240  
 244  241          /*
 245  242           * Add space for NULL termination if needed.
 246  243           * Do a quick check if the last byte is NUL.
 247  244           */
 248  245          if (so->so_family == AF_UNIX && faddr[namelen - 1] != '\0') {
 249  246                  /* Check if there is any NULL termination */
 250  247                  size_t  i;
 251  248                  int foundnull = 0;
 252  249  
 253  250                  for (i = sizeof (name->sa_family); i < namelen; i++) {
 254  251                          if (faddr[i] == '\0') {
 255  252                                  foundnull = 1;
 256  253                                  break;
 257  254                          }
 258  255                  }
 259  256                  if (!foundnull) {
 260  257                          /* Add extra byte for NUL padding */
 261  258                          char *nfaddr;
 262  259  
 263  260                          nfaddr = (char *)kmem_alloc(namelen + 1, KM_SLEEP);
 264  261                          bcopy(faddr, nfaddr, namelen);
 265  262                          kmem_free(faddr, namelen);
 266  263  
 267  264                          /* NUL terminate */
 268  265                          nfaddr[namelen] = '\0';
 269  266                          namelen++;
 270  267                          ASSERT((socklen_t)namelen == namelen);
 271  268                          *namelenp = (socklen_t)namelen;
 272  269                          faddr = nfaddr;
 273  270                  }
 274  271          }
 275  272          return ((struct sockaddr *)faddr);
 276  273  }
 277  274  
 278  275  /*
 279  276   * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
 280  277   */
 281  278  static int
 282  279  copyout_arg(void *uaddr, socklen_t ulen, void *ulenp,
 283  280                  void *kaddr, socklen_t klen)
 284  281  {
 285  282          if (uaddr != NULL) {
 286  283                  if (ulen > klen)
 287  284                          ulen = klen;
 288  285  
 289  286                  if (ulen != 0) {
 290  287                          if (copyout(kaddr, uaddr, ulen))
 291  288                                  return (EFAULT);
 292  289                  }
 293  290          } else
 294  291                  ulen = 0;
 295  292  
 296  293          if (ulenp != NULL) {
 297  294                  if (copyout(&ulen, ulenp, sizeof (ulen)))
 298  295                          return (EFAULT);
 299  296          }
 300  297          return (0);
 301  298  }
 302  299  
 303  300  /*
 304  301   * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
 305  302   * If klen is greater than ulen it still uses the non-truncated
 306  303   * klen to update ulenp.
 307  304   */
 308  305  static int
 309  306  copyout_name(void *uaddr, socklen_t ulen, void *ulenp,
 310  307                  void *kaddr, socklen_t klen)
 311  308  {
 312  309          if (uaddr != NULL) {
 313  310                  if (ulen >= klen)
 314  311                          ulen = klen;
 315  312                  else if (ulen != 0 && xnet_truncate_print) {
 316  313                          printf("sockfs: truncating copyout of address using "
 317  314                              "XNET semantics for pid = %d. Lengths %d, %d\n",
 318  315                              curproc->p_pid, klen, ulen);
 319  316                  }
 320  317  
 321  318                  if (ulen != 0) {
 322  319                          if (copyout(kaddr, uaddr, ulen))
 323  320                                  return (EFAULT);
 324  321                  } else
 325  322                          klen = 0;
 326  323          } else
 327  324                  klen = 0;
 328  325  
 329  326          if (ulenp != NULL) {
 330  327                  if (copyout(&klen, ulenp, sizeof (klen)))
 331  328                          return (EFAULT);
 332  329          }
 333  330          return (0);
 334  331  }
 335  332  
 336  333  /*
 337  334   * The socketpair() code in libsocket creates two sockets (using
 338  335   * the /etc/netconfig fallback if needed) before calling this routine
 339  336   * to connect the two sockets together.
 340  337   *
 341  338   * For a SOCK_STREAM socketpair a listener is needed - in that case this
 342  339   * routine will create a new file descriptor as part of accepting the
 343  340   * connection. The library socketpair() will check if svs[2] has changed
 344  341   * in which case it will close the changed fd.
 345  342   *
 346  343   * Note that this code could use the TPI feature of accepting the connection
 347  344   * on the listening endpoint. However, that would require significant changes
 348  345   * to soaccept.
 349  346   */
 350  347  int
 351  348  so_socketpair(int sv[2])
 352  349  {
 353  350          int svs[2];
 354  351          struct sonode *so1, *so2;
 355  352          int error;
 356  353          int orig_flags;
 357  354          struct sockaddr_ux *name;
 358  355          size_t namelen;
 359  356          sotpi_info_t *sti1;
 360  357          sotpi_info_t *sti2;
 361  358  
 362  359          dprint(1, ("so_socketpair(%p)\n", (void *)sv));
 363  360  
 364  361          error = useracc(sv, sizeof (svs), B_WRITE);
 365  362          if (error && do_useracc)
 366  363                  return (set_errno(EFAULT));
 367  364  
 368  365          if (copyin(sv, svs, sizeof (svs)))
 369  366                  return (set_errno(EFAULT));
 370  367  
 371  368          if ((so1 = getsonode(svs[0], &error, NULL)) == NULL)
 372  369                  return (set_errno(error));
 373  370  
 374  371          if ((so2 = getsonode(svs[1], &error, NULL)) == NULL) {
 375  372                  releasef(svs[0]);
 376  373                  return (set_errno(error));
 377  374          }
 378  375  
 379  376          if (so1->so_family != AF_UNIX || so2->so_family != AF_UNIX) {
 380  377                  error = EOPNOTSUPP;
 381  378                  goto done;
 382  379          }
 383  380  
 384  381          sti1 = SOTOTPI(so1);
 385  382          sti2 = SOTOTPI(so2);
 386  383  
 387  384          /*
 388  385           * The code below makes assumptions about the "sockfs" implementation.
 389  386           * So make sure that the correct implementation is really used.
 390  387           */
 391  388          ASSERT(so1->so_ops == &sotpi_sonodeops);
 392  389          ASSERT(so2->so_ops == &sotpi_sonodeops);
 393  390  
 394  391          if (so1->so_type == SOCK_DGRAM) {
 395  392                  /*
 396  393                   * Bind both sockets and connect them with each other.
 397  394                   * Need to allocate name/namelen for soconnect.
 398  395                   */
 399  396                  error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC, CRED());
 400  397                  if (error) {
 401  398                          eprintsoline(so1, error);
 402  399                          goto done;
 403  400                  }
 404  401                  error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
 405  402                  if (error) {
 406  403                          eprintsoline(so2, error);
 407  404                          goto done;
 408  405                  }
 409  406                  namelen = sizeof (struct sockaddr_ux);
 410  407                  name = kmem_alloc(namelen, KM_SLEEP);
 411  408                  name->sou_family = AF_UNIX;
 412  409                  name->sou_addr = sti2->sti_ux_laddr;
 413  410                  error = socket_connect(so1,
 414  411                      (struct sockaddr *)name,
 415  412                      (socklen_t)namelen,
 416  413                      0, _SOCONNECT_NOXLATE, CRED());
 417  414                  if (error) {
 418  415                          kmem_free(name, namelen);
 419  416                          eprintsoline(so1, error);
 420  417                          goto done;
 421  418                  }
 422  419                  name->sou_addr = sti1->sti_ux_laddr;
 423  420                  error = socket_connect(so2,
 424  421                      (struct sockaddr *)name,
 425  422                      (socklen_t)namelen,
 426  423                      0, _SOCONNECT_NOXLATE, CRED());
 427  424                  kmem_free(name, namelen);
 428  425                  if (error) {
 429  426                          eprintsoline(so2, error);
 430  427                          goto done;
 431  428                  }
 432  429                  releasef(svs[0]);
 433  430                  releasef(svs[1]);
 434  431          } else {
 435  432                  /*
 436  433                   * Bind both sockets, with so1 being a listener.
 437  434                   * Connect so2 to so1 - nonblocking to avoid waiting for
 438  435                   * soaccept to complete.
 439  436                   * Accept a connection on so1. Pass out the new fd as sv[0].
 440  437                   * The library will detect the changed fd and close
 441  438                   * the original one.
 442  439                   */
 443  440                  struct sonode *nso;
 444  441                  struct vnode *nvp;
 445  442                  struct file *nfp;
 446  443                  int nfd;
 447  444  
 448  445                  /*
 449  446                   * We could simply call socket_listen() here (which would do the
 450  447                   * binding automatically) if the code didn't rely on passing
 451  448                   * _SOBIND_NOXLATE to the TPI implementation of socket_bind().
 452  449                   */
 453  450                  error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC|
 454  451                      _SOBIND_NOXLATE|_SOBIND_LISTEN|_SOBIND_SOCKETPAIR,
 455  452                      CRED());
 456  453                  if (error) {
 457  454                          eprintsoline(so1, error);
 458  455                          goto done;
 459  456                  }
 460  457                  error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
 461  458                  if (error) {
 462  459                          eprintsoline(so2, error);
 463  460                          goto done;
 464  461                  }
 465  462  
 466  463                  namelen = sizeof (struct sockaddr_ux);
 467  464                  name = kmem_alloc(namelen, KM_SLEEP);
 468  465                  name->sou_family = AF_UNIX;
 469  466                  name->sou_addr = sti1->sti_ux_laddr;
 470  467                  error = socket_connect(so2,
 471  468                      (struct sockaddr *)name,
 472  469                      (socklen_t)namelen,
 473  470                      FNONBLOCK, _SOCONNECT_NOXLATE, CRED());
 474  471                  kmem_free(name, namelen);
 475  472                  if (error) {
 476  473                          if (error != EINPROGRESS) {
 477  474                                  eprintsoline(so2, error); goto done;
 478  475                          }
 479  476                  }
 480  477  
 481  478                  error = socket_accept(so1, 0, CRED(), &nso);
 482  479                  if (error) {
 483  480                          eprintsoline(so1, error);
 484  481                          goto done;
 485  482                  }
 486  483  
 487  484                  /* wait for so2 being SS_CONNECTED ignoring signals */
 488  485                  mutex_enter(&so2->so_lock);
 489  486                  error = sowaitconnected(so2, 0, 1);
 490  487                  mutex_exit(&so2->so_lock);
 491  488                  if (error != 0) {
 492  489                          (void) socket_close(nso, 0, CRED());
 493  490                          socket_destroy(nso);
 494  491                          eprintsoline(so2, error);
 495  492                          goto done;
 496  493                  }
 497  494  
 498  495                  nvp = SOTOV(nso);
 499  496                  if (error = falloc(nvp, FWRITE|FREAD, &nfp, &nfd)) {
 500  497                          (void) socket_close(nso, 0, CRED());
 501  498                          socket_destroy(nso);
 502  499                          eprintsoline(nso, error);
 503  500                          goto done;
 504  501                  }
 505  502                  /*
 506  503                   * copy over FNONBLOCK and FNDELAY flags should they exist
 507  504                   */
 508  505                  if (so1->so_state & SS_NONBLOCK)
 509  506                          nfp->f_flag |= FNONBLOCK;
 510  507                  if (so1->so_state & SS_NDELAY)
 511  508                          nfp->f_flag |= FNDELAY;
 512  509  
 513  510                  /*
 514  511                   * fill in the entries that falloc reserved
 515  512                   */
 516  513                  mutex_exit(&nfp->f_tlock);
 517  514                  setf(nfd, nfp);
 518  515  
 519  516                  /*
 520  517                   * get the original flags before we release
 521  518                   */
 522  519                  VERIFY(f_getfd_error(svs[0], &orig_flags) == 0);
 523  520  
 524  521                  releasef(svs[0]);
 525  522                  releasef(svs[1]);
 526  523  
 527  524                  /*
 528  525                   * If FD_CLOEXEC was set on the filedescriptor we're
 529  526                   * swapping out, we should set it on the new one too.
 530  527                   */
 531  528                  if (orig_flags & FD_CLOEXEC) {
 532  529                          f_setfd(nfd, FD_CLOEXEC);
 533  530                  }
 534  531  
 535  532                  /*
 536  533                   * The socketpair library routine will close the original
 537  534                   * svs[0] when this code passes out a different file
 538  535                   * descriptor.
 539  536                   */
 540  537                  svs[0] = nfd;
 541  538  
 542  539                  if (copyout(svs, sv, sizeof (svs))) {
 543  540                          (void) closeandsetf(nfd, NULL);
 544  541                          eprintline(EFAULT);
 545  542                          return (set_errno(EFAULT));
 546  543                  }
 547  544          }
 548  545          return (0);
 549  546  
 550  547  done:
 551  548          releasef(svs[0]);
 552  549          releasef(svs[1]);
 553  550          return (set_errno(error));
 554  551  }
 555  552  
 556  553  int
 557  554  bind(int sock, struct sockaddr *name, socklen_t namelen, int version)
 558  555  {
 559  556          struct sonode *so;
 560  557          int error;
 561  558  
 562  559          dprint(1, ("bind(%d, %p, %d)\n",
 563  560              sock, (void *)name, namelen));
 564  561  
 565  562          if ((so = getsonode(sock, &error, NULL)) == NULL)
 566  563                  return (set_errno(error));
 567  564  
 568  565          /* Allocate and copyin name */
 569  566          /*
 570  567           * X/Open test does not expect EFAULT with NULL name and non-zero
 571  568           * namelen.
 572  569           */
 573  570          if (name != NULL && namelen != 0) {
 574  571                  ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 575  572                  name = copyin_name(so, name, &namelen, &error);
 576  573                  if (name == NULL) {
 577  574                          releasef(sock);
 578  575                          return (set_errno(error));
 579  576                  }
 580  577          } else {
 581  578                  name = NULL;
 582  579                  namelen = 0;
 583  580          }
 584  581  
 585  582          switch (version) {
 586  583          default:
 587  584                  error = socket_bind(so, name, namelen, 0, CRED());
 588  585                  break;
 589  586          case SOV_XPG4_2:
 590  587                  error = socket_bind(so, name, namelen, _SOBIND_XPG4_2, CRED());
 591  588                  break;
 592  589          case SOV_SOCKBSD:
 593  590                  error = socket_bind(so, name, namelen, _SOBIND_SOCKBSD, CRED());
 594  591                  break;
 595  592          }
 596  593  done:
 597  594          releasef(sock);
 598  595          if (name != NULL)
 599  596                  kmem_free(name, (size_t)namelen);
 600  597  
 601  598          if (error)
 602  599                  return (set_errno(error));
 603  600          return (0);
 604  601  }
 605  602  
 606  603  /* ARGSUSED2 */
 607  604  int
 608  605  listen(int sock, int backlog, int version)
 609  606  {
 610  607          struct sonode *so;
 611  608          int error;
 612  609  
 613  610          dprint(1, ("listen(%d, %d)\n",
 614  611              sock, backlog));
 615  612  
 616  613          if ((so = getsonode(sock, &error, NULL)) == NULL)
 617  614                  return (set_errno(error));
 618  615  
 619  616          error = socket_listen(so, backlog, CRED());
 620  617  
 621  618          releasef(sock);
 622  619          if (error)
 623  620                  return (set_errno(error));
 624  621          return (0);
 625  622  }
 626  623  
 627  624  /*ARGSUSED3*/
 628  625  int
 629  626  accept(int sock, struct sockaddr *name, socklen_t *namelenp, int version,
 630  627      int flags)
 631  628  {
 632  629          struct sonode *so;
 633  630          file_t *fp;
 634  631          int error;
 635  632          socklen_t namelen;
 636  633          struct sonode *nso;
 637  634          struct vnode *nvp;
 638  635          struct file *nfp;
 639  636          int nfd;
 640  637          int ssflags;
 641  638          struct sockaddr *addrp;
 642  639          socklen_t addrlen;
 643  640  
 644  641          dprint(1, ("accept(%d, %p, %p)\n",
 645  642              sock, (void *)name, (void *)namelenp));
 646  643  
 647  644          if (flags & ~(SOCK_CLOEXEC|SOCK_NONBLOCK|SOCK_NDELAY)) {
 648  645                  return (set_errno(EINVAL));
 649  646          }
 650  647  
 651  648          /* Translate SOCK_ flags to their SS_ variant */
 652  649          ssflags = 0;
 653  650          if (flags & SOCK_NONBLOCK)
 654  651                  ssflags |= SS_NONBLOCK;
 655  652          if (flags & SOCK_NDELAY)
 656  653                  ssflags |= SS_NDELAY;
 657  654  
 658  655          if ((so = getsonode(sock, &error, &fp)) == NULL)
 659  656                  return (set_errno(error));
 660  657  
 661  658          if (name != NULL) {
 662  659                  ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 663  660                  if (copyin(namelenp, &namelen, sizeof (namelen))) {
 664  661                          releasef(sock);
 665  662                          return (set_errno(EFAULT));
 666  663                  }
 667  664                  if (namelen != 0) {
 668  665                          error = useracc(name, (size_t)namelen, B_WRITE);
 669  666                          if (error && do_useracc) {
 670  667                                  releasef(sock);
 671  668                                  return (set_errno(EFAULT));
 672  669                          }
 673  670                  } else
 674  671                          name = NULL;
 675  672          } else {
 676  673                  namelen = 0;
 677  674          }
 678  675  
 679  676          /*
 680  677           * Allocate the user fd before socket_accept() in order to
 681  678           * catch EMFILE errors before calling socket_accept().
 682  679           */
 683  680          if ((nfd = ufalloc(0)) == -1) {
 684  681                  eprintsoline(so, EMFILE);
 685  682                  releasef(sock);
 686  683                  return (set_errno(EMFILE));
 687  684          }
 688  685          error = socket_accept(so, fp->f_flag, CRED(), &nso);
 689  686          if (error) {
 690  687                  setf(nfd, NULL);
 691  688                  releasef(sock);
 692  689                  return (set_errno(error));
 693  690          }
 694  691  
 695  692          nvp = SOTOV(nso);
 696  693  
 697  694          ASSERT(MUTEX_NOT_HELD(&nso->so_lock));
 698  695          if (namelen != 0) {
 699  696                  addrlen = so->so_max_addr_len;
 700  697                  addrp = (struct sockaddr *)kmem_alloc(addrlen, KM_SLEEP);
 701  698  
 702  699                  if ((error = socket_getpeername(nso, (struct sockaddr *)addrp,
 703  700                      &addrlen, B_TRUE, CRED())) == 0) {
 704  701                          error = copyout_name(name, namelen, namelenp,
 705  702                              addrp, addrlen);
 706  703                  } else {
 707  704                          ASSERT(error == EINVAL || error == ENOTCONN);
 708  705                          error = ECONNABORTED;
 709  706                  }
 710  707                  kmem_free(addrp, so->so_max_addr_len);
 711  708          }
 712  709  
 713  710          if (error) {
 714  711                  setf(nfd, NULL);
 715  712                  (void) socket_close(nso, 0, CRED());
 716  713                  socket_destroy(nso);
 717  714                  releasef(sock);
 718  715                  return (set_errno(error));
 719  716          }
 720  717          if (error = falloc(NULL, FWRITE|FREAD, &nfp, NULL)) {
 721  718                  setf(nfd, NULL);
 722  719                  (void) socket_close(nso, 0, CRED());
 723  720                  socket_destroy(nso);
 724  721                  eprintsoline(so, error);
 725  722                  releasef(sock);
 726  723                  return (set_errno(error));
 727  724          }
 728  725          /*
 729  726           * fill in the entries that falloc reserved
 730  727           */
 731  728          nfp->f_vnode = nvp;
 732  729          mutex_exit(&nfp->f_tlock);
 733  730          setf(nfd, nfp);
 734  731  
 735  732          /*
 736  733           * Act on SOCK_CLOEXEC from flags
 737  734           */
 738  735          if (flags & SOCK_CLOEXEC) {
 739  736                  f_setfd(nfd, FD_CLOEXEC);
 740  737          }
 741  738  
 742  739          /*
 743  740           * Copy FNDELAY and FNONBLOCK from listener to acceptor
 744  741           * and from ssflags
 745  742           */
 746  743          if ((ssflags | so->so_state) & (SS_NDELAY|SS_NONBLOCK)) {
 747  744                  uint_t oflag = nfp->f_flag;
 748  745                  int arg = 0;
 749  746  
 750  747                  if ((ssflags | so->so_state) & SS_NONBLOCK)
 751  748                          arg |= FNONBLOCK;
 752  749                  else if ((ssflags | so->so_state) & SS_NDELAY)
 753  750                          arg |= FNDELAY;
 754  751  
 755  752                  /*
 756  753                   * This code is a simplification of the F_SETFL code in fcntl()
 757  754                   * Ignore any errors from VOP_SETFL.
 758  755                   */
 759  756                  if ((error = VOP_SETFL(nvp, oflag, arg, nfp->f_cred, NULL))
 760  757                      != 0) {
 761  758                          eprintsoline(so, error);
 762  759                          error = 0;
 763  760                  } else {
 764  761                          mutex_enter(&nfp->f_tlock);
 765  762                          nfp->f_flag &= ~FMASK | (FREAD|FWRITE);
 766  763                          nfp->f_flag |= arg;
 767  764                          mutex_exit(&nfp->f_tlock);
 768  765                  }
 769  766          }
 770  767          releasef(sock);
 771  768          return (nfd);
 772  769  }
 773  770  
 774  771  int
 775  772  connect(int sock, struct sockaddr *name, socklen_t namelen, int version)
 776  773  {
 777  774          struct sonode *so;
 778  775          file_t *fp;
 779  776          int error;
 780  777  
 781  778          dprint(1, ("connect(%d, %p, %d)\n",
 782  779              sock, (void *)name, namelen));
 783  780  
 784  781          if ((so = getsonode(sock, &error, &fp)) == NULL)
 785  782                  return (set_errno(error));
 786  783  
 787  784          /* Allocate and copyin name */
 788  785          if (namelen != 0) {
 789  786                  ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 790  787                  name = copyin_name(so, name, &namelen, &error);
 791  788                  if (name == NULL) {
 792  789                          releasef(sock);
 793  790                          return (set_errno(error));
 794  791                  }
 795  792          } else
 796  793                  name = NULL;
 797  794  
 798  795          error = socket_connect(so, name, namelen, fp->f_flag,
 799  796              (version != SOV_XPG4_2) ? 0 : _SOCONNECT_XPG4_2, CRED());
 800  797          releasef(sock);
 801  798          if (name)
 802  799                  kmem_free(name, (size_t)namelen);
 803  800          if (error)
 804  801                  return (set_errno(error));
 805  802          return (0);
 806  803  }
 807  804  
 808  805  /*ARGSUSED2*/
 809  806  int
 810  807  shutdown(int sock, int how, int version)
 811  808  {
 812  809          struct sonode *so;
 813  810          int error;
 814  811  
 815  812          dprint(1, ("shutdown(%d, %d)\n",
 816  813              sock, how));
 817  814  
 818  815          if ((so = getsonode(sock, &error, NULL)) == NULL)
 819  816                  return (set_errno(error));
 820  817  
 821  818          error = socket_shutdown(so, how, CRED());
 822  819  
 823  820          releasef(sock);
 824  821          if (error)
 825  822                  return (set_errno(error));
 826  823          return (0);
 827  824  }
 828  825  
 829  826  /*
 830  827   * Common receive routine.
 831  828   */
 832  829  static ssize_t
 833  830  recvit(int sock,
 834  831          struct nmsghdr *msg,
 835  832          struct uio *uiop,
 836  833          int flags,
 837  834          socklen_t *namelenp,
 838  835          socklen_t *controllenp,
 839  836          int *flagsp)
 840  837  {
 841  838          struct sonode *so;
 842  839          file_t *fp;
 843  840          void *name;
 844  841          socklen_t namelen;
 845  842          void *control;
 846  843          socklen_t controllen;
 847  844          ssize_t len;
 848  845          int error;
 849  846  
 850  847          if ((so = getsonode(sock, &error, &fp)) == NULL)
 851  848                  return (set_errno(error));
 852  849  
 853  850          len = uiop->uio_resid;
 854  851          uiop->uio_fmode = fp->f_flag;
 855  852          uiop->uio_extflg = UIO_COPY_CACHED;
 856  853  
 857  854          name = msg->msg_name;
 858  855          namelen = msg->msg_namelen;
 859  856          control = msg->msg_control;
 860  857          controllen = msg->msg_controllen;
 861  858  
 862  859          msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
 863  860              MSG_DONTWAIT | MSG_XPG4_2);
 864  861  
 865  862          error = socket_recvmsg(so, msg, uiop, CRED());
 866  863          if (error) {
 867  864                  releasef(sock);
 868  865                  return (set_errno(error));
 869  866          }
 870  867          lwp_stat_update(LWP_STAT_MSGRCV, 1);
 871  868          releasef(sock);
 872  869  
 873  870          error = copyout_name(name, namelen, namelenp,
 874  871              msg->msg_name, msg->msg_namelen);
 875  872          if (error)
 876  873                  goto err;
 877  874  
 878  875          if (flagsp != NULL) {
 879  876                  /*
 880  877                   * Clear internal flag.
 881  878                   */
 882  879                  msg->msg_flags &= ~MSG_XPG4_2;
 883  880  
 884  881                  /*
 885  882                   * Determine MSG_CTRUNC. sorecvmsg sets MSG_CTRUNC only
 886  883                   * when controllen is zero and there is control data to
 887  884                   * copy out.
 888  885                   */
 889  886                  if (controllen != 0 &&
 890  887                      (msg->msg_controllen > controllen || control == NULL)) {
 891  888                          dprint(1, ("recvit: CTRUNC %d %d %p\n",
 892  889                              msg->msg_controllen, controllen, control));
 893  890  
 894  891                          msg->msg_flags |= MSG_CTRUNC;
 895  892                  }
 896  893                  if (copyout(&msg->msg_flags, flagsp,
 897  894                      sizeof (msg->msg_flags))) {
 898  895                          error = EFAULT;
 899  896                          goto err;
 900  897                  }
 901  898          }
 902  899          /*
 903  900           * Note: This MUST be done last. There can be no "goto err" after this
 904  901           * point since it could make so_closefds run twice on some part
 905  902           * of the file descriptor array.
 906  903           */
 907  904          if (controllen != 0) {
 908  905                  if (!(flags & MSG_XPG4_2)) {
 909  906                          /*
 910  907                           * Good old msg_accrights can only return a multiple
 911  908                           * of 4 bytes.
 912  909                           */
 913  910                          controllen &= ~((int)sizeof (uint32_t) - 1);
 914  911                  }
 915  912                  error = copyout_arg(control, controllen, controllenp,
 916  913                      msg->msg_control, msg->msg_controllen);
 917  914                  if (error)
 918  915                          goto err;
 919  916  
 920  917                  if (msg->msg_controllen > controllen || control == NULL) {
 921  918                          if (control == NULL)
 922  919                                  controllen = 0;
 923  920                          so_closefds(msg->msg_control, msg->msg_controllen,
 924  921                              !(flags & MSG_XPG4_2), controllen);
 925  922                  }
 926  923          }
 927  924          if (msg->msg_namelen != 0)
 928  925                  kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
 929  926          if (msg->msg_controllen != 0)
 930  927                  kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
 931  928          return (len - uiop->uio_resid);
 932  929  
 933  930  err:
 934  931          /*
 935  932           * If we fail and the control part contains file descriptors
 936  933           * we have to close the fd's.
 937  934           */
 938  935          if (msg->msg_controllen != 0)
 939  936                  so_closefds(msg->msg_control, msg->msg_controllen,
 940  937                      !(flags & MSG_XPG4_2), 0);
 941  938          if (msg->msg_namelen != 0)
 942  939                  kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
 943  940          if (msg->msg_controllen != 0)
 944  941                  kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
 945  942          return (set_errno(error));
 946  943  }
 947  944  
 948  945  /*
 949  946   * Native system call
 950  947   */
 951  948  ssize_t
 952  949  recv(int sock, void *buffer, size_t len, int flags)
 953  950  {
 954  951          struct nmsghdr lmsg;
 955  952          struct uio auio;
 956  953          struct iovec aiov[1];
 957  954  
 958  955          dprint(1, ("recv(%d, %p, %ld, %d)\n",
 959  956              sock, buffer, len, flags));
 960  957  
 961  958          if ((ssize_t)len < 0) {
 962  959                  return (set_errno(EINVAL));
 963  960          }
 964  961  
 965  962          aiov[0].iov_base = buffer;
 966  963          aiov[0].iov_len = len;
 967  964          auio.uio_loffset = 0;
 968  965          auio.uio_iov = aiov;
 969  966          auio.uio_iovcnt = 1;
 970  967          auio.uio_resid = len;
 971  968          auio.uio_segflg = UIO_USERSPACE;
 972  969          auio.uio_limit = 0;
 973  970  
 974  971          lmsg.msg_namelen = 0;
 975  972          lmsg.msg_controllen = 0;
 976  973          lmsg.msg_flags = 0;
 977  974          return (recvit(sock, &lmsg, &auio, flags, NULL, NULL, NULL));
 978  975  }
 979  976  
 980  977  ssize_t
 981  978  recvfrom(int sock, void *buffer, size_t len, int flags,
 982  979          struct sockaddr *name, socklen_t *namelenp)
 983  980  {
 984  981          struct nmsghdr lmsg;
 985  982          struct uio auio;
 986  983          struct iovec aiov[1];
 987  984  
 988  985          dprint(1, ("recvfrom(%d, %p, %ld, %d, %p, %p)\n",
 989  986              sock, buffer, len, flags, (void *)name, (void *)namelenp));
 990  987  
 991  988          if ((ssize_t)len < 0) {
 992  989                  return (set_errno(EINVAL));
 993  990          }
 994  991  
 995  992          aiov[0].iov_base = buffer;
 996  993          aiov[0].iov_len = len;
 997  994          auio.uio_loffset = 0;
 998  995          auio.uio_iov = aiov;
 999  996          auio.uio_iovcnt = 1;
1000  997          auio.uio_resid = len;
1001  998          auio.uio_segflg = UIO_USERSPACE;
1002  999          auio.uio_limit = 0;
1003 1000  
1004 1001          lmsg.msg_name = (char *)name;
1005 1002          if (namelenp != NULL) {
1006 1003                  if (copyin(namelenp, &lmsg.msg_namelen,
1007 1004                      sizeof (lmsg.msg_namelen)))
1008 1005                          return (set_errno(EFAULT));
1009 1006          } else {
1010 1007                  lmsg.msg_namelen = 0;
1011 1008          }
1012 1009          lmsg.msg_controllen = 0;
1013 1010          lmsg.msg_flags = 0;
1014 1011  
1015 1012          return (recvit(sock, &lmsg, &auio, flags, namelenp, NULL, NULL));
1016 1013  }
1017 1014  
1018 1015  /*
  
1019 1016   * Uses the MSG_XPG4_2 flag to determine if the caller is using
1020 1017   * struct omsghdr or struct nmsghdr.
1021 1018   */
1022 1019  ssize_t
1023 1020  recvmsg(int sock, struct nmsghdr *msg, int flags)
1024 1021  {
1025 1022          STRUCT_DECL(nmsghdr, u_lmsg);
1026 1023          STRUCT_HANDLE(nmsghdr, umsgptr);
1027 1024          struct nmsghdr lmsg;
1028 1025          struct uio auio;
1029      -        struct iovec aiov[MSG_MAXIOVLEN];
     1026 +        struct iovec buf[IOV_MAX_STACK], *aiov = buf;
     1027 +        ssize_t iovsize = 0;
1030 1028          int iovcnt;
1031      -        ssize_t len;
     1029 +        ssize_t len, rval;
1032 1030          int i;
1033 1031          int *flagsp;
1034 1032          model_t model;
1035 1033  
1036 1034          dprint(1, ("recvmsg(%d, %p, %d)\n",
1037 1035              sock, (void *)msg, flags));
1038 1036  
1039 1037          model = get_udatamodel();
1040 1038          STRUCT_INIT(u_lmsg, model);
1041 1039          STRUCT_SET_HANDLE(umsgptr, model, msg);
1042 1040  
1043 1041          if (flags & MSG_XPG4_2) {
1044 1042                  if (copyin(msg, STRUCT_BUF(u_lmsg), STRUCT_SIZE(u_lmsg)))
1045 1043                          return (set_errno(EFAULT));
1046 1044                  flagsp = STRUCT_FADDR(umsgptr, msg_flags);
1047 1045          } else {
1048 1046                  /*
1049 1047                   * Assumes that nmsghdr and omsghdr are identically shaped
1050 1048                   * except for the added msg_flags field.
1051 1049                   */
1052 1050                  if (copyin(msg, STRUCT_BUF(u_lmsg),
1053 1051                      SIZEOF_STRUCT(omsghdr, model)))
1054 1052                          return (set_errno(EFAULT));
1055 1053                  STRUCT_FSET(u_lmsg, msg_flags, 0);
1056 1054                  flagsp = NULL;
1057 1055          }
1058 1056  
1059 1057          /*
1060 1058           * Code below us will kmem_alloc memory and hang it
1061 1059           * off msg_control and msg_name fields. This forces
1062 1060           * us to copy the structure to its native form.
1063 1061           */
  
1064 1062          lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
1065 1063          lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
1066 1064          lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
1067 1065          lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
1068 1066          lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
1069 1067          lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
1070 1068          lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);
1071 1069  
1072 1070          iovcnt = lmsg.msg_iovlen;
1073 1071  
1074      -        if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
     1072 +        if (iovcnt <= 0 || iovcnt > IOV_MAX) {
1075 1073                  return (set_errno(EMSGSIZE));
1076 1074          }
1077 1075  
     1076 +        if (iovcnt > IOV_MAX_STACK) {
     1077 +                iovsize = iovcnt * sizeof (struct iovec);
     1078 +                aiov = kmem_alloc(iovsize, KM_SLEEP);
     1079 +        }
     1080 +
1078 1081  #ifdef _SYSCALL32_IMPL
1079 1082          /*
1080 1083           * 32-bit callers need to have their iovec expanded, while ensuring
1081 1084           * that they can't move more than 2Gbytes of data in a single call.
1082 1085           */
1083 1086          if (model == DATAMODEL_ILP32) {
1084      -                struct iovec32 aiov32[MSG_MAXIOVLEN];
     1087 +                struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
     1088 +                ssize_t iov32size;
1085 1089                  ssize32_t count32;
1086 1090  
1087      -                if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
1088      -                    iovcnt * sizeof (struct iovec32)))
     1091 +                iov32size = iovcnt * sizeof (struct iovec32);
     1092 +                if (iovsize != 0)
     1093 +                        aiov32 = kmem_alloc(iov32size, KM_SLEEP);
     1094 +
     1095 +                if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) {
     1096 +                        if (iovsize != 0) {
     1097 +                                kmem_free(aiov32, iov32size);
     1098 +                                kmem_free(aiov, iovsize);
     1099 +                        }
     1100 +
1089 1101                          return (set_errno(EFAULT));
     1102 +                }
1090 1103  
1091 1104                  count32 = 0;
1092 1105                  for (i = 0; i < iovcnt; i++) {
1093 1106                          ssize32_t iovlen32;
1094 1107  
1095 1108                          iovlen32 = aiov32[i].iov_len;
1096 1109                          count32 += iovlen32;
1097      -                        if (iovlen32 < 0 || count32 < 0)
     1110 +                        if (iovlen32 < 0 || count32 < 0) {
     1111 +                                if (iovsize != 0) {
     1112 +                                        kmem_free(aiov32, iov32size);
     1113 +                                        kmem_free(aiov, iovsize);
     1114 +                                }
     1115 +
1098 1116                                  return (set_errno(EINVAL));
     1117 +                        }
     1118 +
1099 1119                          aiov[i].iov_len = iovlen32;
1100 1120                          aiov[i].iov_base =
1101 1121                              (caddr_t)(uintptr_t)aiov32[i].iov_base;
1102 1122                  }
     1123 +
     1124 +                if (iovsize != 0)
     1125 +                        kmem_free(aiov32, iov32size);
1103 1126          } else
1104 1127  #endif /* _SYSCALL32_IMPL */
1105 1128          if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) {
     1129 +                if (iovsize != 0)
     1130 +                        kmem_free(aiov, iovsize);
     1131 +
1106 1132                  return (set_errno(EFAULT));
1107 1133          }
1108 1134          len = 0;
1109 1135          for (i = 0; i < iovcnt; i++) {
1110 1136                  ssize_t iovlen = aiov[i].iov_len;
1111 1137                  len += iovlen;
1112 1138                  if (iovlen < 0 || len < 0) {
     1139 +                        if (iovsize != 0)
     1140 +                                kmem_free(aiov, iovsize);
     1141 +
1113 1142                          return (set_errno(EINVAL));
1114 1143                  }
1115 1144          }
1116 1145          auio.uio_loffset = 0;
1117 1146          auio.uio_iov = aiov;
1118 1147          auio.uio_iovcnt = iovcnt;
1119 1148          auio.uio_resid = len;
1120 1149          auio.uio_segflg = UIO_USERSPACE;
1121 1150          auio.uio_limit = 0;
1122 1151  
1123 1152          if (lmsg.msg_control != NULL &&
1124 1153              (do_useracc == 0 ||
1125 1154              useracc(lmsg.msg_control, lmsg.msg_controllen,
1126 1155              B_WRITE) != 0)) {
     1156 +                if (iovsize != 0)
     1157 +                        kmem_free(aiov, iovsize);
     1158 +
1127 1159                  return (set_errno(EFAULT));
1128 1160          }
1129 1161  
1130      -        return (recvit(sock, &lmsg, &auio, flags,
     1162 +        rval = recvit(sock, &lmsg, &auio, flags,
1131 1163              STRUCT_FADDR(umsgptr, msg_namelen),
1132      -            STRUCT_FADDR(umsgptr, msg_controllen), flagsp));
     1164 +            STRUCT_FADDR(umsgptr, msg_controllen), flagsp);
     1165 +
     1166 +        if (iovsize != 0)
     1167 +                kmem_free(aiov, iovsize);
     1168 +
     1169 +        return (rval);
1133 1170  }
1134 1171  
1135 1172  /*
1136 1173   * Common send function.
1137 1174   */
1138 1175  static ssize_t
1139 1176  sendit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags)
1140 1177  {
1141 1178          struct sonode *so;
1142 1179          file_t *fp;
1143 1180          void *name;
1144 1181          socklen_t namelen;
1145 1182          void *control;
1146 1183          socklen_t controllen;
1147 1184          ssize_t len;
1148 1185          int error;
1149 1186  
1150 1187          if ((so = getsonode(sock, &error, &fp)) == NULL)
1151 1188                  return (set_errno(error));
1152 1189  
1153 1190          uiop->uio_fmode = fp->f_flag;
1154 1191  
1155 1192          if (so->so_family == AF_UNIX)
1156 1193                  uiop->uio_extflg = UIO_COPY_CACHED;
1157 1194          else
1158 1195                  uiop->uio_extflg = UIO_COPY_DEFAULT;
1159 1196  
1160 1197          /* Allocate and copyin name and control */
1161 1198          name = msg->msg_name;
1162 1199          namelen = msg->msg_namelen;
1163 1200          if (name != NULL && namelen != 0) {
1164 1201                  ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1165 1202                  name = copyin_name(so,
1166 1203                      (struct sockaddr *)name,
1167 1204                      &namelen, &error);
1168 1205                  if (name == NULL)
1169 1206                          goto done3;
1170 1207                  /* copyin_name null terminates addresses for AF_UNIX */
1171 1208                  msg->msg_namelen = namelen;
1172 1209                  msg->msg_name = name;
1173 1210          } else {
1174 1211                  msg->msg_name = name = NULL;
1175 1212                  msg->msg_namelen = namelen = 0;
1176 1213          }
1177 1214  
1178 1215          control = msg->msg_control;
1179 1216          controllen = msg->msg_controllen;
1180 1217          if ((control != NULL) && (controllen != 0)) {
1181 1218                  /*
1182 1219                   * Verify that the length is not excessive to prevent
1183 1220                   * an application from consuming all of kernel memory.
1184 1221                   */
1185 1222                  if (controllen > SO_MAXARGSIZE) {
1186 1223                          error = EINVAL;
1187 1224                          goto done2;
1188 1225                  }
1189 1226                  control = kmem_alloc(controllen, KM_SLEEP);
1190 1227  
1191 1228                  ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1192 1229                  if (copyin(msg->msg_control, control, controllen)) {
1193 1230                          error = EFAULT;
1194 1231                          goto done1;
1195 1232                  }
1196 1233                  msg->msg_control = control;
1197 1234          } else {
1198 1235                  msg->msg_control = control = NULL;
1199 1236                  msg->msg_controllen = controllen = 0;
1200 1237          }
1201 1238  
1202 1239          len = uiop->uio_resid;
1203 1240          msg->msg_flags = flags;
1204 1241  
1205 1242          error = socket_sendmsg(so, msg, uiop, CRED());
1206 1243  done1:
1207 1244          if (control != NULL)
1208 1245                  kmem_free(control, controllen);
1209 1246  done2:
1210 1247          if (name != NULL)
1211 1248                  kmem_free(name, namelen);
1212 1249  done3:
1213 1250          if (error != 0) {
1214 1251                  releasef(sock);
1215 1252                  return (set_errno(error));
1216 1253          }
1217 1254          lwp_stat_update(LWP_STAT_MSGSND, 1);
1218 1255          releasef(sock);
1219 1256          return (len - uiop->uio_resid);
1220 1257  }
1221 1258  
1222 1259  /*
1223 1260   * Native system call
1224 1261   */
1225 1262  ssize_t
1226 1263  send(int sock, void *buffer, size_t len, int flags)
1227 1264  {
1228 1265          struct nmsghdr lmsg;
1229 1266          struct uio auio;
1230 1267          struct iovec aiov[1];
1231 1268  
1232 1269          dprint(1, ("send(%d, %p, %ld, %d)\n",
1233 1270              sock, buffer, len, flags));
1234 1271  
1235 1272          if ((ssize_t)len < 0) {
1236 1273                  return (set_errno(EINVAL));
1237 1274          }
1238 1275  
1239 1276          aiov[0].iov_base = buffer;
1240 1277          aiov[0].iov_len = len;
1241 1278          auio.uio_loffset = 0;
1242 1279          auio.uio_iov = aiov;
1243 1280          auio.uio_iovcnt = 1;
1244 1281          auio.uio_resid = len;
1245 1282          auio.uio_segflg = UIO_USERSPACE;
1246 1283          auio.uio_limit = 0;
1247 1284  
1248 1285          lmsg.msg_name = NULL;
1249 1286          lmsg.msg_control = NULL;
1250 1287          if (!(flags & MSG_XPG4_2)) {
1251 1288                  /*
1252 1289                   * In order to be compatible with the libsocket/sockmod
1253 1290                   * implementation we set EOR for all send* calls.
1254 1291                   */
1255 1292                  flags |= MSG_EOR;
1256 1293          }
1257 1294          return (sendit(sock, &lmsg, &auio, flags));
1258 1295  }
1259 1296  
1260 1297  /*
1261 1298   * Uses the MSG_XPG4_2 flag to determine if the caller is using
1262 1299   * struct omsghdr or struct nmsghdr.
1263 1300   */
1264 1301  ssize_t
1265 1302  sendmsg(int sock, struct nmsghdr *msg, int flags)
1266 1303  {
1267 1304          struct nmsghdr lmsg;
1268 1305          STRUCT_DECL(nmsghdr, u_lmsg);
1269 1306          struct uio auio;
1270      -        struct iovec aiov[MSG_MAXIOVLEN];
     1307 +        struct iovec buf[IOV_MAX_STACK], *aiov = buf;
     1308 +        ssize_t iovsize = 0;
1271 1309          int iovcnt;
1272      -        ssize_t len;
     1310 +        ssize_t len, rval;
1273 1311          int i;
1274 1312          model_t model;
1275 1313  
1276 1314          dprint(1, ("sendmsg(%d, %p, %d)\n", sock, (void *)msg, flags));
1277 1315  
1278 1316          model = get_udatamodel();
1279 1317          STRUCT_INIT(u_lmsg, model);
1280 1318  
1281 1319          if (flags & MSG_XPG4_2) {
1282 1320                  if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
1283 1321                      STRUCT_SIZE(u_lmsg)))
1284 1322                          return (set_errno(EFAULT));
1285 1323          } else {
1286 1324                  /*
1287 1325                   * Assumes that nmsghdr and omsghdr are identically shaped
1288 1326                   * except for the added msg_flags field.
1289 1327                   */
1290 1328                  if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
1291 1329                      SIZEOF_STRUCT(omsghdr, model)))
1292 1330                          return (set_errno(EFAULT));
1293 1331                  /*
1294 1332                   * In order to be compatible with the libsocket/sockmod
1295 1333                   * implementation we set EOR for all send* calls.
1296 1334                   */
1297 1335                  flags |= MSG_EOR;
1298 1336          }
1299 1337  
1300 1338          /*
1301 1339           * Code below us will kmem_alloc memory and hang it
1302 1340           * off msg_control and msg_name fields. This forces
1303 1341           * us to copy the structure to its native form.
1304 1342           */
1305 1343          lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
1306 1344          lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
1307 1345          lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
1308 1346          lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
1309 1347          lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
1310 1348          lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
1311 1349          lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);
1312 1350  
1313 1351          iovcnt = lmsg.msg_iovlen;
1314 1352  
1315      -        if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
     1353 +        if (iovcnt <= 0 || iovcnt > IOV_MAX) {
1316 1354                  /*
1317 1355                   * Unless this is XPG 4.2 we allow iovcnt == 0 to
1318 1356                   * be compatible with SunOS 4.X and 4.4BSD.
1319 1357                   */
1320 1358                  if (iovcnt != 0 || (flags & MSG_XPG4_2))
1321 1359                          return (set_errno(EMSGSIZE));
1322 1360          }
1323 1361  
     1362 +        if (iovcnt > IOV_MAX_STACK) {
     1363 +                iovsize = iovcnt * sizeof (struct iovec);
     1364 +                aiov = kmem_alloc(iovsize, KM_SLEEP);
     1365 +        }
     1366 +
1324 1367  #ifdef _SYSCALL32_IMPL
1325 1368          /*
1326 1369           * 32-bit callers need to have their iovec expanded, while ensuring
1327 1370           * that they can't move more than 2Gbytes of data in a single call.
1328 1371           */
1329 1372          if (model == DATAMODEL_ILP32) {
1330      -                struct iovec32 aiov32[MSG_MAXIOVLEN];
     1373 +                struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
     1374 +                ssize_t iov32size;
1331 1375                  ssize32_t count32;
1332 1376  
     1377 +                iov32size = iovcnt * sizeof (struct iovec32);
     1378 +                if (iovsize != 0)
     1379 +                        aiov32 = kmem_alloc(iov32size, KM_SLEEP);
     1380 +
1333 1381                  if (iovcnt != 0 &&
1334      -                    copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
1335      -                    iovcnt * sizeof (struct iovec32)))
     1382 +                    copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) {
     1383 +                        if (iovsize != 0) {
     1384 +                                kmem_free(aiov32, iov32size);
     1385 +                                kmem_free(aiov, iovsize);
     1386 +                        }
     1387 +
1336 1388                          return (set_errno(EFAULT));
     1389 +                }
1337 1390  
1338 1391                  count32 = 0;
1339 1392                  for (i = 0; i < iovcnt; i++) {
1340 1393                          ssize32_t iovlen32;
1341 1394  
1342 1395                          iovlen32 = aiov32[i].iov_len;
1343 1396                          count32 += iovlen32;
1344      -                        if (iovlen32 < 0 || count32 < 0)
     1397 +                        if (iovlen32 < 0 || count32 < 0) {
     1398 +                                if (iovsize != 0) {
     1399 +                                        kmem_free(aiov32, iov32size);
     1400 +                                        kmem_free(aiov, iovsize);
     1401 +                                }
     1402 +
1345 1403                                  return (set_errno(EINVAL));
     1404 +                        }
     1405 +
1346 1406                          aiov[i].iov_len = iovlen32;
1347 1407                          aiov[i].iov_base =
1348 1408                              (caddr_t)(uintptr_t)aiov32[i].iov_base;
1349 1409                  }
     1410 +
     1411 +                if (iovsize != 0)
     1412 +                        kmem_free(aiov32, iov32size);
1350 1413          } else
1351 1414  #endif /* _SYSCALL32_IMPL */
1352 1415          if (iovcnt != 0 &&
1353 1416              copyin(lmsg.msg_iov, aiov,
1354 1417              (unsigned)iovcnt * sizeof (struct iovec))) {
     1418 +                if (iovsize != 0)
     1419 +                        kmem_free(aiov, iovsize);
     1420 +
1355 1421                  return (set_errno(EFAULT));
1356 1422          }
1357 1423          len = 0;
1358 1424          for (i = 0; i < iovcnt; i++) {
1359 1425                  ssize_t iovlen = aiov[i].iov_len;
1360 1426                  len += iovlen;
1361 1427                  if (iovlen < 0 || len < 0) {
     1428 +                        if (iovsize != 0)
     1429 +                                kmem_free(aiov, iovsize);
     1430 +
1362 1431                          return (set_errno(EINVAL));
1363 1432                  }
1364 1433          }
1365 1434          auio.uio_loffset = 0;
1366 1435          auio.uio_iov = aiov;
1367 1436          auio.uio_iovcnt = iovcnt;
1368 1437          auio.uio_resid = len;
1369 1438          auio.uio_segflg = UIO_USERSPACE;
1370 1439          auio.uio_limit = 0;
1371 1440  
1372      -        return (sendit(sock, &lmsg, &auio, flags));
     1441 +        rval = sendit(sock, &lmsg, &auio, flags);
     1442 +
     1443 +        if (iovsize != 0)
     1444 +                kmem_free(aiov, iovsize);
     1445 +
     1446 +        return (rval);
1373 1447  }
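
The change above lets sendmsg() accept up to IOV_MAX (now 1024) iovecs, switching from the on-stack IOV_MAX_STACK array to kmem_alloc() for larger counts. A minimal userland sketch of the kind of caller this enables is shown below; NIOV and send_many_iovecs() are illustrative names, not part of this change.

#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

#define	NIOV	1024	/* assumed to be <= IOV_MAX on the target system */

/* Sends one byte from each of NIOV iovecs in a single sendmsg() call. */
ssize_t
send_many_iovecs(int sock)
{
	static char byte = 'x';
	struct iovec iov[NIOV];
	struct msghdr msg;
	int i;

	for (i = 0; i < NIOV; i++) {
		iov[i].iov_base = &byte;
		iov[i].iov_len = 1;
	}
	(void) memset(&msg, 0, sizeof (msg));
	msg.msg_iov = iov;
	msg.msg_iovlen = NIOV;
	return (sendmsg(sock, &msg, 0));
}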
1374 1448  
1375 1449  ssize_t
1376 1450  sendto(int sock, void *buffer, size_t len, int flags,
1377 1451      struct sockaddr *name, socklen_t namelen)
1378 1452  {
1379 1453          struct nmsghdr lmsg;
1380 1454          struct uio auio;
1381 1455          struct iovec aiov[1];
1382 1456  
1383 1457          dprint(1, ("sendto(%d, %p, %ld, %d, %p, %d)\n",
1384 1458              sock, buffer, len, flags, (void *)name, namelen));
1385 1459  
1386 1460          if ((ssize_t)len < 0) {
1387 1461                  return (set_errno(EINVAL));
1388 1462          }
1389 1463  
1390 1464          aiov[0].iov_base = buffer;
1391 1465          aiov[0].iov_len = len;
1392 1466          auio.uio_loffset = 0;
1393 1467          auio.uio_iov = aiov;
1394 1468          auio.uio_iovcnt = 1;
1395 1469          auio.uio_resid = len;
1396 1470          auio.uio_segflg = UIO_USERSPACE;
1397 1471          auio.uio_limit = 0;
1398 1472  
1399 1473          lmsg.msg_name = (char *)name;
1400 1474          lmsg.msg_namelen = namelen;
1401 1475          lmsg.msg_control = NULL;
1402 1476          if (!(flags & MSG_XPG4_2)) {
1403 1477                  /*
1404 1478                   * In order to be compatible with the libsocket/sockmod
1405 1479                   * implementation we set EOR for all send* calls.
1406 1480                   */
1407 1481                  flags |= MSG_EOR;
1408 1482          }
1409 1483          return (sendit(sock, &lmsg, &auio, flags));
1410 1484  }
1411 1485  
1412 1486  /*ARGSUSED3*/
1413 1487  int
1414 1488  getpeername(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
1415 1489  {
1416 1490          struct sonode *so;
1417 1491          int error;
1418 1492          socklen_t namelen;
1419 1493          socklen_t sock_addrlen;
1420 1494          struct sockaddr *sock_addrp;
1421 1495  
1422 1496          dprint(1, ("getpeername(%d, %p, %p)\n",
1423 1497              sock, (void *)name, (void *)namelenp));
1424 1498  
1425 1499          if ((so = getsonode(sock, &error, NULL)) == NULL)
1426 1500                  goto bad;
1427 1501  
1428 1502          ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1429 1503          if (copyin(namelenp, &namelen, sizeof (namelen)) ||
1430 1504              (name == NULL && namelen != 0)) {
1431 1505                  error = EFAULT;
1432 1506                  goto rel_out;
1433 1507          }
1434 1508          sock_addrlen = so->so_max_addr_len;
1435 1509          sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
1436 1510  
1437 1511          if ((error = socket_getpeername(so, sock_addrp, &sock_addrlen,
1438 1512              B_FALSE, CRED())) == 0) {
1439 1513                  ASSERT(sock_addrlen <= so->so_max_addr_len);
1440 1514                  error = copyout_name(name, namelen, namelenp,
1441 1515                      (void *)sock_addrp, sock_addrlen);
1442 1516          }
1443 1517          kmem_free(sock_addrp, so->so_max_addr_len);
1444 1518  rel_out:
1445 1519          releasef(sock);
1446 1520  bad:    return (error != 0 ? set_errno(error) : 0);
1447 1521  }
1448 1522  
1449 1523  /*ARGSUSED3*/
1450 1524  int
1451 1525  getsockname(int sock, struct sockaddr *name,
1452 1526                  socklen_t *namelenp, int version)
1453 1527  {
1454 1528          struct sonode *so;
1455 1529          int error;
1456 1530          socklen_t namelen, sock_addrlen;
1457 1531          struct sockaddr *sock_addrp;
1458 1532  
1459 1533          dprint(1, ("getsockname(%d, %p, %p)\n",
1460 1534              sock, (void *)name, (void *)namelenp));
1461 1535  
1462 1536          if ((so = getsonode(sock, &error, NULL)) == NULL)
1463 1537                  goto bad;
1464 1538  
1465 1539          ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1466 1540          if (copyin(namelenp, &namelen, sizeof (namelen)) ||
1467 1541              (name == NULL && namelen != 0)) {
1468 1542                  error = EFAULT;
1469 1543                  goto rel_out;
1470 1544          }
1471 1545  
1472 1546          sock_addrlen = so->so_max_addr_len;
1473 1547          sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
1474 1548          if ((error = socket_getsockname(so, sock_addrp, &sock_addrlen,
1475 1549              CRED())) == 0) {
1476 1550                  ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1477 1551                  ASSERT(sock_addrlen <= so->so_max_addr_len);
1478 1552                  error = copyout_name(name, namelen, namelenp,
1479 1553                      (void *)sock_addrp, sock_addrlen);
1480 1554          }
1481 1555          kmem_free(sock_addrp, so->so_max_addr_len);
1482 1556  rel_out:
1483 1557          releasef(sock);
1484 1558  bad:    return (error != 0 ? set_errno(error) : 0);
1485 1559  }
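
For reference, the copyout_name() path above is what services a typical userland getsockname() call such as the following. This is the standard sockets API, not code from this file; bound_port() is an illustrative helper.

#include <sys/socket.h>
#include <netinet/in.h>

/* Returns the locally bound port in host byte order, or -1 on error. */
int
bound_port(int sock)
{
	struct sockaddr_in sin;
	socklen_t len = sizeof (sin);

	if (getsockname(sock, (struct sockaddr *)&sin, &len) != 0)
		return (-1);
	return (ntohs(sin.sin_port));
}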
1486 1560  
1487 1561  /*ARGSUSED5*/
1488 1562  int
1489 1563  getsockopt(int sock,
1490 1564          int level,
1491 1565          int option_name,
1492 1566          void *option_value,
1493 1567          socklen_t *option_lenp,
1494 1568          int version)
1495 1569  {
1496 1570          struct sonode *so;
1497 1571          socklen_t optlen, optlen_res;
1498 1572          void *optval;
1499 1573          int error;
1500 1574  
1501 1575          dprint(1, ("getsockopt(%d, %d, %d, %p, %p)\n",
1502 1576              sock, level, option_name, option_value, (void *)option_lenp));
1503 1577  
1504 1578          if ((so = getsonode(sock, &error, NULL)) == NULL)
1505 1579                  return (set_errno(error));
1506 1580  
1507 1581          ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1508 1582          if (copyin(option_lenp, &optlen, sizeof (optlen))) {
1509 1583                  releasef(sock);
1510 1584                  return (set_errno(EFAULT));
1511 1585          }
1512 1586          /*
1513 1587           * Verify that the length is not excessive to prevent
1514 1588           * an application from consuming all of kernel memory.
1515 1589           */
1516 1590          if (optlen > SO_MAXARGSIZE) {
1517 1591                  error = EINVAL;
1518 1592                  releasef(sock);
1519 1593                  return (set_errno(error));
1520 1594          }
1521 1595          optval = kmem_alloc(optlen, KM_SLEEP);
1522 1596          optlen_res = optlen;
1523 1597          error = socket_getsockopt(so, level, option_name, optval,
1524 1598              &optlen_res, (version != SOV_XPG4_2) ? 0 : _SOGETSOCKOPT_XPG4_2,
1525 1599              CRED());
1526 1600          releasef(sock);
1527 1601          if (error) {
1528 1602                  kmem_free(optval, optlen);
1529 1603                  return (set_errno(error));
1530 1604          }
1531 1605          error = copyout_arg(option_value, optlen, option_lenp,
1532 1606              optval, optlen_res);
1533 1607          kmem_free(optval, optlen);
1534 1608          if (error)
1535 1609                  return (set_errno(error));
1536 1610          return (0);
1537 1611  }
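
The copyin/copyout of option_lenp above mirrors the userland contract, where the length is both an input (buffer size) and an output (returned option length). A minimal sketch using the standard API follows; get_rcvbuf() is an illustrative name.

#include <sys/socket.h>

/* Queries the socket's receive buffer size via SO_RCVBUF. */
int
get_rcvbuf(int sock, int *sizep)
{
	socklen_t optlen = sizeof (*sizep);

	return (getsockopt(sock, SOL_SOCKET, SO_RCVBUF, sizep, &optlen));
}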
1538 1612  
1539 1613  /*ARGSUSED5*/
1540 1614  int
1541 1615  setsockopt(int sock,
1542 1616          int level,
1543 1617          int option_name,
1544 1618          void *option_value,
1545 1619          socklen_t option_len,
1546 1620          int version)
1547 1621  {
1548 1622          struct sonode *so;
1549 1623          intptr_t buffer[2];
1550 1624          void *optval = NULL;
1551 1625          int error;
1552 1626  
1553 1627          dprint(1, ("setsockopt(%d, %d, %d, %p, %d)\n",
1554 1628              sock, level, option_name, option_value, option_len));
1555 1629  
1556 1630          if ((so = getsonode(sock, &error, NULL)) == NULL)
1557 1631                  return (set_errno(error));
1558 1632  
1559 1633          if (option_value != NULL) {
1560 1634                  if (option_len != 0) {
1561 1635                          /*
1562 1636                           * Verify that the length is not excessive to prevent
1563 1637                           * an application from consuming all of kernel memory.
1564 1638                           */
1565 1639                          if (option_len > SO_MAXARGSIZE) {
1566 1640                                  error = EINVAL;
1567 1641                                  goto done2;
1568 1642                          }
1569 1643                          optval = option_len <= sizeof (buffer) ?
1570 1644                              &buffer : kmem_alloc((size_t)option_len, KM_SLEEP);
1571 1645                          ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1572 1646                          if (copyin(option_value, optval, (size_t)option_len)) {
1573 1647                                  error = EFAULT;
1574 1648                                  goto done1;
1575 1649                          }
1576 1650                  }
1577 1651          } else
1578 1652                  option_len = 0;
1579 1653  
1580 1654          error = socket_setsockopt(so, level, option_name, optval,
1581 1655              (t_uscalar_t)option_len, CRED());
1582 1656  done1:
1583 1657          if (optval != buffer)
1584 1658                  kmem_free(optval, (size_t)option_len);
1585 1659  done2:
1586 1660          releasef(sock);
1587 1661          if (error)
1588 1662                  return (set_errno(error));
1589 1663          return (0);
1590 1664  }
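
A small option such as the one below fits in the on-stack buffer[] fast path above, while anything larger (up to SO_MAXARGSIZE) takes the kmem_alloc() path. Standard API sketch only; set_reuseaddr() is an illustrative name.

#include <sys/socket.h>

/* Enables local address reuse on the socket. */
int
set_reuseaddr(int sock)
{
	int on = 1;

	return (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &on, sizeof (on)));
}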
1591 1665  
1592 1666  static int
1593 1667  sockconf_add_sock(int family, int type, int protocol, char *name)
1594 1668  {
1595 1669          int error = 0;
1596 1670          char *kdevpath = NULL;
1597 1671          char *kmodule = NULL;
1598 1672          char *buf = NULL;
1599 1673          size_t pathlen = 0;
1600 1674          struct sockparams *sp;
1601 1675  
1602 1676          if (name == NULL)
1603 1677                  return (EINVAL);
1604 1678          /*
1605 1679           * Copyin the name.
1606 1680           * This also makes it possible to check for too long pathnames.
1607 1681           * Compress the space needed for the name before passing it
1608 1682           * to soconfig - soconfig will store the string until
1609 1683           * the configuration is removed.
1610 1684           */
1611 1685          buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1612 1686          if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) {
1613 1687                  kmem_free(buf, MAXPATHLEN);
1614 1688                  return (error);
1615 1689          }
1616 1690          if (strncmp(buf, "/dev", strlen("/dev")) == 0) {
1617 1691                  /* For device */
1618 1692  
1619 1693                  /*
1620 1694                   * Special handling for NCA:
1621 1695                   *
1622 1696                   * DEV_NCA is never opened even if an application
1623 1697                   * requests for AF_NCA. The device opened is instead a
1624 1698                   * predefined AF_INET transport (NCA_INET_DEV).
1625 1699                   *
1626 1700                   * Prior to Volo (PSARC/2007/587) NCA would determine
1627 1701                   * the device using a lookup, which worked then because
1628 1702                   * all protocols were based on TPI. Since TPI is no
1629 1703                   * longer the default, we have to explicitly state
1630 1704                   * which device to use.
1631 1705                   */
1632 1706                  if (strcmp(buf, NCA_DEV) == 0) {
1633 1707                          /* only support entry <28, 2, 0> */
1634 1708                          if (family != AF_NCA || type != SOCK_STREAM ||
1635 1709                              protocol != 0) {
1636 1710                                  kmem_free(buf, MAXPATHLEN);
1637 1711                                  return (EINVAL);
1638 1712                          }
1639 1713  
1640 1714                          pathlen = strlen(NCA_INET_DEV) + 1;
1641 1715                          kdevpath = kmem_alloc(pathlen, KM_SLEEP);
1642 1716                          bcopy(NCA_INET_DEV, kdevpath, pathlen);
1643 1717                          kdevpath[pathlen - 1] = '\0';
1644 1718                  } else {
1645 1719                          kdevpath = kmem_alloc(pathlen, KM_SLEEP);
1646 1720                          bcopy(buf, kdevpath, pathlen);
1647 1721                          kdevpath[pathlen - 1] = '\0';
1648 1722                  }
1649 1723          } else {
1650 1724                  /* For socket module */
1651 1725                  kmodule = kmem_alloc(pathlen, KM_SLEEP);
1652 1726                  bcopy(buf, kmodule, pathlen);
1653 1727                  kmodule[pathlen - 1] = '\0';
1654 1728                  pathlen = 0;
1655 1729          }
1656 1730          kmem_free(buf, MAXPATHLEN);
1657 1731  
1658 1732          /* sockparams_create frees mod name and devpath upon failure */
1659 1733          sp = sockparams_create(family, type, protocol, kmodule,
1660 1734              kdevpath, pathlen, 0, KM_SLEEP, &error);
1661 1735          if (sp != NULL) {
1662 1736                  error = sockparams_add(sp);
1663 1737                  if (error != 0)
1664 1738                          sockparams_destroy(sp);
1665 1739          }
1666 1740  
1667 1741          return (error);
1668 1742  }
1669 1743  
1670 1744  static int
1671 1745  sockconf_remove_sock(int family, int type, int protocol)
1672 1746  {
1673 1747          return (sockparams_delete(family, type, protocol));
1674 1748  }
1675 1749  
1676 1750  static int
1677 1751  sockconfig_remove_filter(const char *uname)
1678 1752  {
1679 1753          char kname[SOF_MAXNAMELEN];
1680 1754          size_t len;
1681 1755          int error;
1682 1756          sof_entry_t *ent;
1683 1757  
1684 1758          if ((error = copyinstr(uname, kname, SOF_MAXNAMELEN, &len)) != 0)
1685 1759                  return (error);
1686 1760  
1687 1761          ent = sof_entry_remove_by_name(kname);
1688 1762          if (ent == NULL)
1689 1763                  return (ENXIO);
1690 1764  
1691 1765          mutex_enter(&ent->sofe_lock);
1692 1766          ASSERT(!(ent->sofe_flags & SOFEF_CONDEMED));
1693 1767          if (ent->sofe_refcnt == 0) {
1694 1768                  mutex_exit(&ent->sofe_lock);
1695 1769                  sof_entry_free(ent);
1696 1770          } else {
1697 1771                  /* let the last socket free the filter */
1698 1772                  ent->sofe_flags |= SOFEF_CONDEMED;
1699 1773                  mutex_exit(&ent->sofe_lock);
1700 1774          }
1701 1775  
1702 1776          return (0);
1703 1777  }
1704 1778  
1705 1779  static int
1706 1780  sockconfig_add_filter(const char *uname, void *ufilpropp)
1707 1781  {
1708 1782          struct sockconfig_filter_props filprop;
1709 1783          sof_entry_t *ent;
1710 1784          int error;
1711 1785          size_t tuplesz, len;
1712 1786          char hintbuf[SOF_MAXNAMELEN];
1713 1787  
1714 1788          ent = kmem_zalloc(sizeof (sof_entry_t), KM_SLEEP);
1715 1789          mutex_init(&ent->sofe_lock, NULL, MUTEX_DEFAULT, NULL);
1716 1790  
1717 1791          if ((error = copyinstr(uname, ent->sofe_name, SOF_MAXNAMELEN,
1718 1792              &len)) != 0) {
1719 1793                  sof_entry_free(ent);
1720 1794                  return (error);
1721 1795          }
1722 1796  
1723 1797          if (get_udatamodel() == DATAMODEL_NATIVE) {
1724 1798                  if (copyin(ufilpropp, &filprop, sizeof (filprop)) != 0) {
1725 1799                          sof_entry_free(ent);
1726 1800                          return (EFAULT);
1727 1801                  }
1728 1802          }
1729 1803  #ifdef  _SYSCALL32_IMPL
1730 1804          else {
1731 1805                  struct sockconfig_filter_props32 filprop32;
1732 1806  
1733 1807                  if (copyin(ufilpropp, &filprop32, sizeof (filprop32)) != 0) {
1734 1808                          sof_entry_free(ent);
1735 1809                          return (EFAULT);
1736 1810                  }
1737 1811                  filprop.sfp_modname = (char *)(uintptr_t)filprop32.sfp_modname;
1738 1812                  filprop.sfp_autoattach = filprop32.sfp_autoattach;
1739 1813                  filprop.sfp_hint = filprop32.sfp_hint;
1740 1814                  filprop.sfp_hintarg = (char *)(uintptr_t)filprop32.sfp_hintarg;
1741 1815                  filprop.sfp_socktuple_cnt = filprop32.sfp_socktuple_cnt;
1742 1816                  filprop.sfp_socktuple =
1743 1817                      (sof_socktuple_t *)(uintptr_t)filprop32.sfp_socktuple;
1744 1818          }
1745 1819  #endif  /* _SYSCALL32_IMPL */
1746 1820  
1747 1821          if ((error = copyinstr(filprop.sfp_modname, ent->sofe_modname,
1748 1822              sizeof (ent->sofe_modname), &len)) != 0) {
1749 1823                  sof_entry_free(ent);
1750 1824                  return (error);
1751 1825          }
1752 1826  
1753 1827          /*
1754 1828           * A filter must specify at least one socket tuple.
1755 1829           */
1756 1830          if (filprop.sfp_socktuple_cnt == 0 ||
1757 1831              filprop.sfp_socktuple_cnt > SOF_MAXSOCKTUPLECNT) {
1758 1832                  sof_entry_free(ent);
1759 1833                  return (EINVAL);
1760 1834          }
1761 1835          ent->sofe_flags = filprop.sfp_autoattach ? SOFEF_AUTO : SOFEF_PROG;
1762 1836          ent->sofe_hint = filprop.sfp_hint;
1763 1837  
1764 1838          /*
1765 1839           * Verify the hint, and copy in the hint argument, if necessary.
1766 1840           */
1767 1841          switch (ent->sofe_hint) {
1768 1842          case SOF_HINT_BEFORE:
1769 1843          case SOF_HINT_AFTER:
1770 1844                  if ((error = copyinstr(filprop.sfp_hintarg, hintbuf,
1771 1845                      sizeof (hintbuf), &len)) != 0) {
1772 1846                          sof_entry_free(ent);
1773 1847                          return (error);
1774 1848                  }
1775 1849                  ent->sofe_hintarg = kmem_alloc(len, KM_SLEEP);
1776 1850                  bcopy(hintbuf, ent->sofe_hintarg, len);
1777 1851                  /* FALLTHRU */
1778 1852          case SOF_HINT_TOP:
1779 1853          case SOF_HINT_BOTTOM:
1780 1854                  /* hints cannot be used with programmatic filters */
1781 1855                  if (ent->sofe_flags & SOFEF_PROG) {
1782 1856                          sof_entry_free(ent);
1783 1857                          return (EINVAL);
1784 1858                  }
1785 1859                  break;
1786 1860          case SOF_HINT_NONE:
1787 1861                  break;
1788 1862          default:
1789 1863                  /* bad hint value */
1790 1864                  sof_entry_free(ent);
1791 1865                  return (EINVAL);
1792 1866          }
1793 1867  
1794 1868          ent->sofe_socktuple_cnt = filprop.sfp_socktuple_cnt;
1795 1869          tuplesz = sizeof (sof_socktuple_t) * ent->sofe_socktuple_cnt;
1796 1870          ent->sofe_socktuple = kmem_alloc(tuplesz, KM_SLEEP);
1797 1871  
1798 1872          if (get_udatamodel() == DATAMODEL_NATIVE) {
1799 1873                  if (copyin(filprop.sfp_socktuple, ent->sofe_socktuple,
1800 1874                      tuplesz)) {
1801 1875                          sof_entry_free(ent);
1802 1876                          return (EFAULT);
1803 1877                  }
1804 1878          }
1805 1879  #ifdef  _SYSCALL32_IMPL
1806 1880          else {
1807 1881                  int i;
1808 1882                  caddr_t data = (caddr_t)filprop.sfp_socktuple;
1809 1883                  sof_socktuple_t *tup = ent->sofe_socktuple;
1810 1884                  sof_socktuple32_t tup32;
1811 1885  
1812 1886                  tup = ent->sofe_socktuple;
1813 1887                  for (i = 0; i < ent->sofe_socktuple_cnt; i++, tup++) {
1814 1888                          ASSERT(tup < ent->sofe_socktuple + tuplesz);
1815 1889  
1816 1890                          if (copyin(data, &tup32, sizeof (tup32)) != 0) {
1817 1891                                  sof_entry_free(ent);
1818 1892                                  return (EFAULT);
1819 1893                          }
1820 1894                          tup->sofst_family = tup32.sofst_family;
1821 1895                          tup->sofst_type = tup32.sofst_type;
1822 1896                          tup->sofst_protocol = tup32.sofst_protocol;
1823 1897  
1824 1898                          data += sizeof (tup32);
1825 1899                  }
1826 1900          }
1827 1901  #endif  /* _SYSCALL32_IMPL */
1828 1902  
1829 1903          /* Sockets can start using the filter as soon as the filter is added */
1830 1904          if ((error = sof_entry_add(ent)) != 0)
1831 1905                  sof_entry_free(ent);
1832 1906  
1833 1907          return (error);
1834 1908  }
1835 1909  
1836 1910  /*
1837 1911   * Socket configuration system call. It is used to add and remove
1838 1912   * socket types.
1839 1913   */
1840 1914  int
1841 1915  sockconfig(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
1842 1916  {
1843 1917          int error = 0;
1844 1918  
1845 1919          if (secpolicy_net_config(CRED(), B_FALSE) != 0)
1846 1920                  return (set_errno(EPERM));
1847 1921  
1848 1922          if (sockfs_defer_nl7c_init) {
1849 1923                  nl7c_init();
1850 1924                  sockfs_defer_nl7c_init = 0;
1851 1925          }
1852 1926  
1853 1927          switch (cmd) {
1854 1928          case SOCKCONFIG_ADD_SOCK:
1855 1929                  error = sockconf_add_sock((int)(uintptr_t)arg1,
1856 1930                      (int)(uintptr_t)arg2, (int)(uintptr_t)arg3, arg4);
1857 1931                  break;
1858 1932          case SOCKCONFIG_REMOVE_SOCK:
1859 1933                  error = sockconf_remove_sock((int)(uintptr_t)arg1,
1860 1934                      (int)(uintptr_t)arg2, (int)(uintptr_t)arg3);
1861 1935                  break;
1862 1936          case SOCKCONFIG_ADD_FILTER:
1863 1937                  error = sockconfig_add_filter((const char *)arg1, arg2);
1864 1938                  break;
1865 1939          case SOCKCONFIG_REMOVE_FILTER:
1866 1940                  error = sockconfig_remove_filter((const char *)arg1);
1867 1941                  break;
1868 1942          case SOCKCONFIG_GET_SOCKTABLE:
1869 1943                  error = sockparams_copyout_socktable((int)(uintptr_t)arg1);
1870 1944                  break;
1871 1945          default:
1872 1946  #ifdef  DEBUG
1873 1947                  cmn_err(CE_NOTE, "sockconfig: unknown subcommand %d", cmd);
1874 1948  #endif
1875 1949                  error = EINVAL;
1876 1950                  break;
1877 1951          }
1878 1952  
1879 1953          if (error != 0) {
1880 1954                  eprintline(error);
1881 1955                  return (set_errno(error));
1882 1956          }
1883 1957          return (0);
1884 1958  }
1885 1959  
1886 1960  
1887 1961  /*
1888 1962   * Sendfile is implemented through two schemes: direct I/O or caching
1889 1963   * in the filesystem page cache. We cache the input file by
1890 1964   * default and use direct I/O only if sendfile_max_size is set
1891 1965   * appropriately as explained below. Note that this logic is consistent
1892 1966   * with other filesystems where caching is turned on by default
1893 1967   * unless explicitly turned off by using the DIRECTIO ioctl.
1894 1968   *
1895 1969   * We choose a slightly different scheme here. One can turn off
1896 1970   * caching by setting sendfile_max_size to 0. One can also enable
1897 1971   * caching of files <= sendfile_max_size by setting sendfile_max_size
1898 1972   * to an appropriate value. By default sendfile_max_size is set to the
1899 1973   * maximum value so that all files are cached. In future, we may provide
1900 1974   * better interfaces for caching the file.
1901 1975   *
1902 1976   * Sendfile through Direct I/O (Zero copy)
1903 1977   * --------------------------------------
1904 1978   *
1905 1979   * As disks are normally slower than the network, we can't have a
1906 1980   * single thread that reads the disk and writes to the network. We
1907 1981   * need to have parallelism. This is done by having the sendfile
1908 1982   * thread create another thread that reads from the filesystem
1909 1983   * and queues it for network processing. In this scheme, the data
1910 1984   * is never copied anywhere, i.e. it is zero copy, unlike the other
1911 1985   * scheme.
1912 1986   *
1913 1987   * We have a sendfile queue (snfq) where each sendfile
1914 1988   * request (snf_req_t) is queued for processing by a thread. The number
1915 1989   * of threads is dynamic, and they exit if they are idling
1916 1990   * beyond a specified amount of time. When each request (snf_req_t) is
1917 1991   * processed by a thread, it produces a number of mblk_t structures to
1918 1992   * be consumed by the sendfile thread. snf_deque and snf_enque are
1919 1993   * used for consuming and producing mblks. Size of the filesystem
1920 1994   * read is determined by the tunable (sendfile_read_size). A single
1921 1995   * mblk holds sendfile_read_size worth of data (except the last
1922 1996   * read of the file) which is sent down as a whole to the network.
1923 1997   * sendfile_read_size is set to 1 MB as this seems to be the optimal
1924 1998   * value for the UFS filesystem backed by a striped storage array.
1925 1999   *
1926 2000   * Synchronisation between read (producer) and write (consumer) threads.
1927 2001   * --------------------------------------------------------------------
1928 2002   *
1929 2003   * sr_lock protects sr_ib_head and sr_ib_tail. The lock is held while
1930 2004   * adding and deleting items in this list. An error can happen at any
1931 2005   * time during a read or write. There could be unprocessed mblks in the
1932 2006   * sr_ib_XXX list when a read or write error occurs. Whenever an error
1933 2007   * is encountered, we need two things to happen:
1934 2008   *
1935 2009   * a) One of the threads need to clean the mblks.
1936 2010   * b) When one thread encounters an error, the other should stop.
1937 2011   *
1938 2012   * For (a), we don't want to penalize the reader thread as it could do
1939 2013   * some useful work processing other requests. For (b), the error can
1940 2014   * be detected by examining sr_read_error or sr_write_error.
1941 2015   * sr_lock protects sr_read_error and sr_write_error. If both reader and
1942 2016   * writer encounter errors, we need to report the write error back to
1943 2017   * the application as that's what would have happened if the operations
1944 2018   * were done sequentially. With this in mind, the following should work:
1945 2019   *
1946 2020   *      - Check for errors before read or write.
1947 2021   *      - If the reader encounters error, set the error in sr_read_error.
1948 2022   *        Check sr_write_error, if it is set, send cv_signal as it is
1949 2023   *        waiting for reader to complete. If it is not set, the writer
1950 2024   *        is either running sinking data to the network or blocked
1951 2025   *        because of flow control. For handling the latter case, we
1952 2026   *        always send a signal. In any case, it will examine sr_read_error
1953 2027   *        and return. sr_read_error is marked with SR_READ_DONE to tell
1954 2028   *        the writer that the reader is done in all the cases.
1955 2029   *      - If the writer encounters error, set the error in sr_write_error.
1956 2030   *        The reader thread is either blocked because of flow control or
1957 2031   *        running reading data from the disk. For the former, we need to
1958 2032   *        wakeup the thread. Again to keep it simple, we always wake up
1959 2033   *        the reader thread. Then, wait for the read thread to complete
1960 2034   *        if it is not done yet. Cleanup and return.
1961 2035   *
1962 2036   * High and low water marks for the read thread.
1963 2037   * --------------------------------------------
1964 2038   *
1965 2039   * If sendfile() is used to send data over a slow network, we need to
1966 2040   * make sure that the read thread does not produce data at a faster
1967 2041   * rate than the network. This can happen if the disk is faster than
1968 2042   * the network. In such a case, we don't want to build a very large queue.
1969 2043   * But we would still like to get all of the network throughput possible.
1970 2044   * This implies that the network should never block waiting for data.
1971 2045   * As there are lot of disk throughput/network throughput combinations
1972 2046   * possible, it is difficult to come up with an accurate number.
1973 2047   * A typical 10K RPM disk has a max seek latency of 17ms and a rotational
1974 2048   * latency of 3ms for reading a disk block. Thus, the total latency to
1975 2049   * initiate a new read, transfer data from the disk and queue for
1976 2050   * transmission would take at most about 25ms. Today's max transfer rate
1977 2051   * for a network is 100MB/sec. If the thread is blocked because of flow
1978 2052   * control, it would take 25ms to get new data ready for transmission.
1979 2053   * We have to make sure that the network is not idling while we are initiating
1980 2054   * new transfers. So, at 100MB/sec, to keep the network busy we would need
1981 2055   * 2.5MB of data. Rounding off, we keep the low water mark to be 3MB of data.
1982 2056   * We need to pick a high water mark so that the woken up thread would
1983 2057   * do considerable work before blocking again to prevent thrashing. Currently,
1984 2058   * we pick this to be 10 times that of the low water mark.
1985 2059   *
1986 2060   * Sendfile with segmap caching (One copy from page cache to mblks).
1987 2061   * ----------------------------------------------------------------
1988 2062   *
1989 2063   * We use the segmap cache for caching the file if the size of the file
1990 2064   * is <= sendfile_max_size. In this case we don't use threads as VM
1991 2065   * is reasonably fast enough to keep up with the network. If the underlying
1992 2066   * transport allows, we call segmap_getmapflt() to map MAXBSIZE (8K) worth
1993 2067   * of data into segmap space, and use the virtual address from segmap
1994 2068   * directly through desballoc() to avoid copy. Once the transport is done
1995 2069   * with the data, the mapping will be released through segmap_release()
1996 2070   * called by the call-back routine.
1997 2071   *
1998 2072   * If zero-copy is not allowed by the transport, we simply call VOP_READ()
1999 2073   * to copy the data from the filesystem into our temporary network buffer.
2000 2074   *
2001 2075   * To disable caching, set sendfile_max_size to 0.
2002 2076   */
2003 2077  
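The flow control described above can be summarized by the following userland analog (pthreads). It is only a sketch of the snf_enque()/snf_deque() scheme for one producer and one consumer, not the kernel code itself: the producer blocks once the queued bytes exceed the high water mark, and the consumer signals it again once the backlog drains below the low water mark. The q_t type and its functions are illustrative.

#include <pthread.h>
#include <stddef.h>

typedef struct q {
	pthread_mutex_t	q_lock;
	pthread_cond_t	q_cv;
	size_t		q_len;		/* bytes currently queued */
	size_t		q_hiwat;	/* producer blocks above this */
	size_t		q_lowat;	/* producer is woken below this */
} q_t;

/* Producer: queue nbytes, then block while over the high water mark. */
void
q_produce(q_t *q, size_t nbytes)
{
	(void) pthread_mutex_lock(&q->q_lock);
	q->q_len += nbytes;
	(void) pthread_cond_signal(&q->q_cv);	/* wake a waiting consumer */
	while (q->q_len > q->q_hiwat)
		(void) pthread_cond_wait(&q->q_cv, &q->q_lock);
	(void) pthread_mutex_unlock(&q->q_lock);
}

/* Consumer: wait for data, dequeue nbytes, wake the producer once under lowat. */
void
q_consume(q_t *q, size_t nbytes)
{
	(void) pthread_mutex_lock(&q->q_lock);
	while (q->q_len < nbytes)
		(void) pthread_cond_wait(&q->q_cv, &q->q_lock);
	q->q_len -= nbytes;
	if (q->q_len < q->q_lowat)
		(void) pthread_cond_signal(&q->q_cv);
	(void) pthread_mutex_unlock(&q->q_lock);
}
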
2004 2078  uint_t sendfile_read_size = 1024 * 1024;
2005 2079  #define SENDFILE_REQ_LOWAT      3 * 1024 * 1024
2006 2080  uint_t sendfile_req_lowat = SENDFILE_REQ_LOWAT;
2007 2081  uint_t sendfile_req_hiwat = 10 * SENDFILE_REQ_LOWAT;
2008 2082  struct sendfile_stats sf_stats;
2009 2083  struct sendfile_queue *snfq;
2010 2084  clock_t snfq_timeout;
2011 2085  off64_t sendfile_max_size;
2012 2086  
2013 2087  static void snf_enque(snf_req_t *, mblk_t *);
2014 2088  static mblk_t *snf_deque(snf_req_t *);
2015 2089  
2016 2090  void
2017 2091  sendfile_init(void)
2018 2092  {
2019 2093          snfq = kmem_zalloc(sizeof (struct sendfile_queue), KM_SLEEP);
2020 2094  
2021 2095          mutex_init(&snfq->snfq_lock, NULL, MUTEX_DEFAULT, NULL);
2022 2096          cv_init(&snfq->snfq_cv, NULL, CV_DEFAULT, NULL);
2023 2097          snfq->snfq_max_threads = max_ncpus;
2024 2098          snfq_timeout = SNFQ_TIMEOUT;
2025 2099          /* Cache all files by default. */
2026 2100          sendfile_max_size = MAXOFFSET_T;
2027 2101  }
2028 2102  
2029 2103  /*
2030 2104   * Queues a mblk_t for network processing.
2031 2105   */
2032 2106  static void
2033 2107  snf_enque(snf_req_t *sr, mblk_t *mp)
2034 2108  {
2035 2109          mp->b_next = NULL;
2036 2110          mutex_enter(&sr->sr_lock);
2037 2111          if (sr->sr_mp_head == NULL) {
2038 2112                  sr->sr_mp_head = sr->sr_mp_tail = mp;
2039 2113                  cv_signal(&sr->sr_cv);
2040 2114          } else {
2041 2115                  sr->sr_mp_tail->b_next = mp;
2042 2116                  sr->sr_mp_tail = mp;
2043 2117          }
2044 2118          sr->sr_qlen += MBLKL(mp);
2045 2119          while ((sr->sr_qlen > sr->sr_hiwat) &&
2046 2120              (sr->sr_write_error == 0)) {
2047 2121                  sf_stats.ss_full_waits++;
2048 2122                  cv_wait(&sr->sr_cv, &sr->sr_lock);
2049 2123          }
2050 2124          mutex_exit(&sr->sr_lock);
2051 2125  }
2052 2126  
2053 2127  /*
2054 2128   * De-queues a mblk_t for network processing.
2055 2129   */
2056 2130  static mblk_t *
2057 2131  snf_deque(snf_req_t *sr)
2058 2132  {
2059 2133          mblk_t *mp;
2060 2134  
2061 2135          mutex_enter(&sr->sr_lock);
2062 2136          /*
2063 2137           * If we have encountered an error on read or read is
2064 2138           * completed and no more mblks, return NULL.
2065 2139           * We need to check for NULL sr_mp_head also as
2066 2140           * the reads could have completed and there is
2067 2141           * nothing more to come.
2068 2142           */
2069 2143          if (((sr->sr_read_error & ~SR_READ_DONE) != 0) ||
2070 2144              ((sr->sr_read_error & SR_READ_DONE) &&
2071 2145              sr->sr_mp_head == NULL)) {
2072 2146                  mutex_exit(&sr->sr_lock);
2073 2147                  return (NULL);
2074 2148          }
2075 2149          /*
2076 2150           * To start with neither SR_READ_DONE is marked nor
2077 2151           * the error is set. When we wake up from cv_wait,
2078 2152           * following are the possibilities :
2079 2153           * the following are the possibilities:
2080 2154           *      a) sr_read_error is zero and mblks are queued.
2081 2155           *      b) sr_read_error is set to SR_READ_DONE
2082 2156           *         and mblks are queued.
2083 2157           *      c) sr_read_error is set to SR_READ_DONE
2084 2158           *         and no mblks.
2085 2159           *      d) sr_read_error is set to some error other
2086 2160           *         than SR_READ_DONE.
2087 2161           */
2088 2162  
2089 2163          while ((sr->sr_read_error == 0) && (sr->sr_mp_head == NULL)) {
2090 2164                  sf_stats.ss_empty_waits++;
2091 2165                  cv_wait(&sr->sr_cv, &sr->sr_lock);
2092 2166          }
2093 2167          /* Handle (a) and (b) first  - the normal case. */
2094 2168          if (((sr->sr_read_error & ~SR_READ_DONE) == 0) &&
2095 2169              (sr->sr_mp_head != NULL)) {
2096 2170                  mp = sr->sr_mp_head;
2097 2171                  sr->sr_mp_head = mp->b_next;
2098 2172                  sr->sr_qlen -= MBLKL(mp);
2099 2173                  if (sr->sr_qlen < sr->sr_lowat)
2100 2174                          cv_signal(&sr->sr_cv);
2101 2175                  mutex_exit(&sr->sr_lock);
2102 2176                  mp->b_next = NULL;
2103 2177                  return (mp);
2104 2178          }
2105 2179          /* Handle (c) and (d). */
2106 2180          mutex_exit(&sr->sr_lock);
2107 2181          return (NULL);
2108 2182  }
2109 2183  
2110 2184  /*
2111 2185   * Reads data from the filesystem and queues it for network processing.
2112 2186   */
2113 2187  void
2114 2188  snf_async_read(snf_req_t *sr)
2115 2189  {
2116 2190          size_t iosize;
2117 2191          u_offset_t fileoff;
2118 2192          u_offset_t size;
2119 2193          int ret_size;
2120 2194          int error;
2121 2195          file_t *fp;
2122 2196          mblk_t *mp;
2123 2197          struct vnode *vp;
2124 2198          int extra = 0;
2125 2199          int maxblk = 0;
2126 2200          int wroff = 0;
2127 2201          struct sonode *so;
2128 2202  
2129 2203          fp = sr->sr_fp;
2130 2204          size = sr->sr_file_size;
2131 2205          fileoff = sr->sr_file_off;
2132 2206  
2133 2207          /*
2134 2208           * Ignore the error for filesystems that don't support DIRECTIO.
2135 2209           */
2136 2210          (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON, 0,
2137 2211              kcred, NULL, NULL);
2138 2212  
2139 2213          vp = sr->sr_vp;
2140 2214          if (vp->v_type == VSOCK) {
2141 2215                  stdata_t *stp;
2142 2216  
2143 2217                  /*
2144 2218                   * Get the extra space to insert a header and a trailer.
2145 2219                   */
2146 2220                  so = VTOSO(vp);
2147 2221                  stp = vp->v_stream;
2148 2222                  if (stp == NULL) {
2149 2223                          wroff = so->so_proto_props.sopp_wroff;
2150 2224                          maxblk = so->so_proto_props.sopp_maxblk;
2151 2225                          extra = wroff + so->so_proto_props.sopp_tail;
2152 2226                  } else {
2153 2227                          wroff = (int)(stp->sd_wroff);
2154 2228                          maxblk = (int)(stp->sd_maxblk);
2155 2229                          extra = wroff + (int)(stp->sd_tail);
2156 2230                  }
2157 2231          }
2158 2232  
2159 2233          while ((size != 0) && (sr->sr_write_error == 0)) {
2160 2234  
2161 2235                  iosize = (int)MIN(sr->sr_maxpsz, size);
2162 2236  
2163 2237                  /*
2164 2238                   * Socket filters can limit the mblk size,
2165 2239                   * so limit reads to maxblk if there are
2166 2240                   * filters present.
2167 2241                   */
2168 2242                  if (vp->v_type == VSOCK &&
2169 2243                      so->so_filter_active > 0 && maxblk != INFPSZ)
2170 2244                          iosize = (int)MIN(iosize, maxblk);
2171 2245  
2172 2246                  if (is_system_labeled()) {
2173 2247                          mp = allocb_cred(iosize + extra, CRED(),
2174 2248                              curproc->p_pid);
2175 2249                  } else {
2176 2250                          mp = allocb(iosize + extra, BPRI_MED);
2177 2251                  }
2178 2252                  if (mp == NULL) {
2179 2253                          error = EAGAIN;
2180 2254                          break;
2181 2255                  }
2182 2256  
2183 2257                  mp->b_rptr += wroff;
2184 2258  
2185 2259                  ret_size = soreadfile(fp, mp->b_rptr, fileoff, &error, iosize);
2186 2260  
2187 2261                  /* Error or reached EOF? */
2188 2262                  if ((error != 0) || (ret_size == 0)) {
2189 2263                          freeb(mp);
2190 2264                          break;
2191 2265                  }
2192 2266                  mp->b_wptr = mp->b_rptr + ret_size;
2193 2267  
2194 2268                  snf_enque(sr, mp);
2195 2269                  size -= ret_size;
2196 2270                  fileoff += ret_size;
2197 2271          }
2198 2272          (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_OFF, 0,
2199 2273              kcred, NULL, NULL);
2200 2274          mutex_enter(&sr->sr_lock);
2201 2275          sr->sr_read_error = error;
2202 2276          sr->sr_read_error |= SR_READ_DONE;
2203 2277          cv_signal(&sr->sr_cv);
2204 2278          mutex_exit(&sr->sr_lock);
2205 2279  }
2206 2280  
2207 2281  void
2208 2282  snf_async_thread(void)
2209 2283  {
2210 2284          snf_req_t *sr;
2211 2285          callb_cpr_t cprinfo;
2212 2286          clock_t time_left = 1;
2213 2287  
2214 2288          CALLB_CPR_INIT(&cprinfo, &snfq->snfq_lock, callb_generic_cpr, "snfq");
2215 2289  
2216 2290          mutex_enter(&snfq->snfq_lock);
2217 2291          for (;;) {
2218 2292                  /*
2219 2293                   * If we didn't find an entry, then block until woken up
2220 2294                   * again and then look through the queues again.
2221 2295                   */
2222 2296                  while ((sr = snfq->snfq_req_head) == NULL) {
2223 2297                          CALLB_CPR_SAFE_BEGIN(&cprinfo);
2224 2298                          if (time_left <= 0) {
2225 2299                                  snfq->snfq_svc_threads--;
2226 2300                                  CALLB_CPR_EXIT(&cprinfo);
2227 2301                                  thread_exit();
2228 2302                                  /* NOTREACHED */
2229 2303                          }
2230 2304                          snfq->snfq_idle_cnt++;
2231 2305  
2232 2306                          time_left = cv_reltimedwait(&snfq->snfq_cv,
2233 2307                              &snfq->snfq_lock, snfq_timeout, TR_CLOCK_TICK);
2234 2308                          snfq->snfq_idle_cnt--;
2235 2309  
2236 2310                          CALLB_CPR_SAFE_END(&cprinfo, &snfq->snfq_lock);
2237 2311                  }
2238 2312                  snfq->snfq_req_head = sr->sr_next;
2239 2313                  snfq->snfq_req_cnt--;
2240 2314                  mutex_exit(&snfq->snfq_lock);
2241 2315                  snf_async_read(sr);
2242 2316                  mutex_enter(&snfq->snfq_lock);
2243 2317          }
2244 2318  }
2245 2319  
2246 2320  
2247 2321  snf_req_t *
2248 2322  create_thread(int operation, struct vnode *vp, file_t *fp,
2249 2323      u_offset_t fileoff, u_offset_t size)
2250 2324  {
2251 2325          snf_req_t *sr;
2252 2326          stdata_t *stp;
2253 2327  
2254 2328          sr = (snf_req_t *)kmem_zalloc(sizeof (snf_req_t), KM_SLEEP);
2255 2329  
2256 2330          sr->sr_vp = vp;
2257 2331          sr->sr_fp = fp;
2258 2332          stp = vp->v_stream;
2259 2333  
2260 2334          /*
2261 2335           * store sd_qn_maxpsz into sr_maxpsz while we have the stream head.
2262 2336           * The stream might be closed before the thread returns from snf_async_read.
2263 2337           */
2264 2338          if (stp != NULL && stp->sd_qn_maxpsz > 0) {
2265 2339                  sr->sr_maxpsz = MIN(MAXBSIZE, stp->sd_qn_maxpsz);
2266 2340          } else {
2267 2341                  sr->sr_maxpsz = MAXBSIZE;
2268 2342          }
2269 2343  
2270 2344          sr->sr_operation = operation;
2271 2345          sr->sr_file_off = fileoff;
2272 2346          sr->sr_file_size = size;
2273 2347          sr->sr_hiwat = sendfile_req_hiwat;
2274 2348          sr->sr_lowat = sendfile_req_lowat;
2275 2349          mutex_init(&sr->sr_lock, NULL, MUTEX_DEFAULT, NULL);
2276 2350          cv_init(&sr->sr_cv, NULL, CV_DEFAULT, NULL);
2277 2351          /*
2278 2352           * See whether we need another thread for servicing this
2279 2353           * request. If there are already enough requests queued
2280 2354           * for the threads, create one if not exceeding
2281 2355           * snfq_max_threads.
2282 2356           */
2283 2357          mutex_enter(&snfq->snfq_lock);
2284 2358          if (snfq->snfq_req_cnt >= snfq->snfq_idle_cnt &&
2285 2359              snfq->snfq_svc_threads < snfq->snfq_max_threads) {
2286 2360                  (void) thread_create(NULL, 0, &snf_async_thread, 0, 0, &p0,
2287 2361                      TS_RUN, minclsyspri);
2288 2362                  snfq->snfq_svc_threads++;
2289 2363          }
2290 2364          if (snfq->snfq_req_head == NULL) {
2291 2365                  snfq->snfq_req_head = snfq->snfq_req_tail = sr;
2292 2366                  cv_signal(&snfq->snfq_cv);
2293 2367          } else {
2294 2368                  snfq->snfq_req_tail->sr_next = sr;
2295 2369                  snfq->snfq_req_tail = sr;
2296 2370          }
2297 2371          snfq->snfq_req_cnt++;
2298 2372          mutex_exit(&snfq->snfq_lock);
2299 2373          return (sr);
2300 2374  }
2301 2375  
2302 2376  int
2303 2377  snf_direct_io(file_t *fp, file_t *rfp, u_offset_t fileoff, u_offset_t size,
2304 2378      ssize_t *count)
2305 2379  {
2306 2380          snf_req_t *sr;
2307 2381          mblk_t *mp;
2308 2382          int iosize;
2309 2383          int error = 0;
2310 2384          short fflag;
2311 2385          struct vnode *vp;
2312 2386          int ksize;
2313 2387          struct nmsghdr msg;
2314 2388  
2315 2389          ksize = 0;
2316 2390          *count = 0;
2317 2391          bzero(&msg, sizeof (msg));
2318 2392  
2319 2393          vp = fp->f_vnode;
2320 2394          fflag = fp->f_flag;
2321 2395          if ((sr = create_thread(READ_OP, vp, rfp, fileoff, size)) == NULL)
2322 2396                  return (EAGAIN);
2323 2397  
2324 2398          /*
2325 2399           * We check for read error in snf_deque. It has to check
2326 2400           * for successful READ_DONE and return NULL, and we might
2327 2401           * as well make an additional check there.
2328 2402           */
2329 2403          while ((mp = snf_deque(sr)) != NULL) {
2330 2404  
2331 2405                  if (ISSIG(curthread, JUSTLOOKING)) {
2332 2406                          freeb(mp);
2333 2407                          error = EINTR;
2334 2408                          break;
2335 2409                  }
2336 2410                  iosize = MBLKL(mp);
2337 2411  
2338 2412                  error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2339 2413  
2340 2414                  if (error != 0) {
2341 2415                          if (mp != NULL)
2342 2416                                  freeb(mp);
2343 2417                          break;
2344 2418                  }
2345 2419                  ksize += iosize;
2346 2420          }
2347 2421          *count = ksize;
2348 2422  
2349 2423          mutex_enter(&sr->sr_lock);
2350 2424          sr->sr_write_error = error;
2351 2425          /* Look at the big comments on why we cv_signal here. */
2352 2426          cv_signal(&sr->sr_cv);
2353 2427  
2354 2428          /* Wait for the reader to complete always. */
2355 2429          while (!(sr->sr_read_error & SR_READ_DONE)) {
2356 2430                  cv_wait(&sr->sr_cv, &sr->sr_lock);
2357 2431          }
2358 2432          /* If there is no write error, check for read error. */
2359 2433          if (error == 0)
2360 2434                  error = (sr->sr_read_error & ~SR_READ_DONE);
2361 2435  
2362 2436          if (error != 0) {
2363 2437                  mblk_t *next_mp;
2364 2438  
2365 2439                  mp = sr->sr_mp_head;
2366 2440                  while (mp != NULL) {
2367 2441                          next_mp = mp->b_next;
2368 2442                          mp->b_next = NULL;
2369 2443                          freeb(mp);
2370 2444                          mp = next_mp;
2371 2445                  }
2372 2446          }
2373 2447          mutex_exit(&sr->sr_lock);
2374 2448          kmem_free(sr, sizeof (snf_req_t));
2375 2449          return (error);
2376 2450  }
2377 2451  
2378 2452  /* Maximum no. of pages allocated by vpm for sendfile at a time */
2379 2453  #define SNF_VPMMAXPGS   (VPMMAXPGS/2)
2380 2454  
2381 2455  /*
2382 2456   * Maximum no. of elements in the list returned by vpm, including
2383 2457   * NULL for the last entry
2384 2458   */
2385 2459  #define SNF_MAXVMAPS    (SNF_VPMMAXPGS + 1)
2386 2460  
2387 2461  typedef struct {
2388 2462          unsigned int    snfv_ref;
2389 2463          frtn_t          snfv_frtn;
2390 2464          vnode_t         *snfv_vp;
2391 2465          struct vmap     snfv_vml[SNF_MAXVMAPS];
2392 2466  } snf_vmap_desbinfo;
2393 2467  
2394 2468  typedef struct {
2395 2469          frtn_t          snfi_frtn;
2396 2470          caddr_t         snfi_base;
2397 2471          uint_t          snfi_mapoff;
2398 2472          size_t          snfi_len;
2399 2473          vnode_t         *snfi_vp;
2400 2474  } snf_smap_desbinfo;
2401 2475  
2402 2476  /*
2403 2477   * The callback function used for vpm mapped mblks called when the last ref of
2404 2478   * the mblk is dropped, which normally occurs when TCP receives the ack. But it
2405 2479   * can be the driver too due to lazy reclaim.
2406 2480   */
2407 2481  void
2408 2482  snf_vmap_desbfree(snf_vmap_desbinfo *snfv)
2409 2483  {
2410 2484          ASSERT(snfv->snfv_ref != 0);
2411 2485          if (atomic_dec_32_nv(&snfv->snfv_ref) == 0) {
2412 2486                  vpm_unmap_pages(snfv->snfv_vml, S_READ);
2413 2487                  VN_RELE(snfv->snfv_vp);
2414 2488                  kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2415 2489          }
2416 2490  }
2417 2491  
2418 2492  /*
2419 2493   * The callback function used for segmap'ped mblks called when the last ref of
2420 2494   * the mblk is dropped, which normally occurs when TCP receives the ack. But it
2421 2495   * can be the driver too due to lazy reclaim.
2422 2496   */
2423 2497  void
2424 2498  snf_smap_desbfree(snf_smap_desbinfo *snfi)
2425 2499  {
2426 2500          if (! IS_KPM_ADDR(snfi->snfi_base)) {
2427 2501                  /*
2428 2502                   * We don't need to call segmap_fault(F_SOFTUNLOCK) for
2429 2503                   * segmap_kpm as long as the latter never falls back to
2430 2504                   * "use_segmap_range". (See segmap_getmapflt().)
2431 2505                   *
2432 2506                   * Using S_OTHER saves a redundant hat_setref() in
2433 2507                   * segmap_unlock()
2434 2508                   */
2435 2509                  (void) segmap_fault(kas.a_hat, segkmap,
2436 2510                      (caddr_t)(uintptr_t)(((uintptr_t)snfi->snfi_base +
2437 2511                      snfi->snfi_mapoff) & PAGEMASK), snfi->snfi_len,
2438 2512                      F_SOFTUNLOCK, S_OTHER);
2439 2513          }
2440 2514          (void) segmap_release(segkmap, snfi->snfi_base, SM_DONTNEED);
2441 2515          VN_RELE(snfi->snfi_vp);
2442 2516          kmem_free(snfi, sizeof (*snfi));
2443 2517  }
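
The two callbacks above are installed as frtn_t free routines on mblks created with esballoca(), so the file mapping is torn down only after the transport drops its last reference to the data. The fragment below is a minimal sketch of that pattern in isolation; my_desbinfo_t, my_desbfree() and my_wrap_buf() are illustrative names rather than part of this change, and the buffer is an ordinary kmem allocation instead of a segmap slot or vpm mapping.

/*
 * Sketch only: the general esballoca()/frtn_t pattern relied on above.
 */
typedef struct my_desbinfo {
	frtn_t	di_frtn;	/* must stay valid until free_func runs */
	caddr_t	di_buf;
	size_t	di_len;
} my_desbinfo_t;

static void
my_desbfree(my_desbinfo_t *di)
{
	/* Runs when the last reference to the wrapping mblk is released. */
	kmem_free(di->di_buf, di->di_len);
	kmem_free(di, sizeof (*di));
}

static mblk_t *
my_wrap_buf(caddr_t buf, size_t len)
{
	my_desbinfo_t *di = kmem_zalloc(sizeof (*di), KM_SLEEP);

	di->di_buf = buf;
	di->di_len = len;
	di->di_frtn.free_func = my_desbfree;
	di->di_frtn.free_arg = (caddr_t)di;

	/* esballoca() does not copy; the mblk's data pointers refer to buf. */
	return (esballoca((uchar_t *)buf, len, BPRI_HI, &di->di_frtn));
}

If esballoca() fails, the caller must free the descriptor itself, which is exactly what the EAGAIN paths in snf_segmap() do for snfi and snfv.
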
2444 2518  
2445 2519  /*
2446 2520   * Use segmap or vpm instead of bcopy to send down a desballoca'ed mblk.
2447 2521   * When segmap is used, the mblk contains a segmap slot of no more
2448 2522   * than MAXBSIZE.
2449 2523   *
2450 2524   * With vpm, a maximum of SNF_MAXVMAPS page-sized mappings can be obtained
2451 2525   * in each iteration and sent by socket_sendmblk until an error occurs or
2452 2526   * the requested size has been transferred. An mblk is esballoca'ed from
2453 2527   * each mapped page and a chain of these mblks is sent to the transport layer.
2454 2528   * vpm will be called to unmap the pages when all mblks have been freed by
2455 2529   * free_func.
2456 2530   *
2457 2531   * At the end of the whole sendfile() operation, we wait till the data from
2458 2532   * the last mblk is ack'ed by the transport before returning so that the
2459 2533   * caller of sendfile() can safely modify the file content.
2460 2534   */
2461 2535  int
2462 2536  snf_segmap(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t total_size,
2463 2537      ssize_t *count, boolean_t nowait)
2464 2538  {
2465 2539          caddr_t base;
2466 2540          int mapoff;
2467 2541          vnode_t *vp;
2468 2542          mblk_t *mp = NULL;
2469 2543          int chain_size;
2470 2544          int error;
2471 2545          clock_t deadlk_wait;
2472 2546          short fflag;
2473 2547          int ksize;
2474 2548          struct vattr va;
2475 2549          boolean_t dowait = B_FALSE;
2476 2550          struct nmsghdr msg;
2477 2551  
2478 2552          vp = fp->f_vnode;
2479 2553          fflag = fp->f_flag;
2480 2554          ksize = 0;
2481 2555          bzero(&msg, sizeof (msg));
2482 2556  
2483 2557          for (;;) {
2484 2558                  if (ISSIG(curthread, JUSTLOOKING)) {
2485 2559                          error = EINTR;
2486 2560                          break;
2487 2561                  }
2488 2562  
2489 2563                  if (vpm_enable) {
2490 2564                          snf_vmap_desbinfo *snfv;
2491 2565                          mblk_t *nmp;
2492 2566                          int mblk_size;
2493 2567                          int maxsize;
2494 2568                          int i;
2495 2569  
2496 2570                          mapoff = fileoff & PAGEOFFSET;
2497 2571                          maxsize = MIN((SNF_VPMMAXPGS * PAGESIZE), total_size);
2498 2572  
2499 2573                          snfv = kmem_zalloc(sizeof (snf_vmap_desbinfo),
2500 2574                              KM_SLEEP);
2501 2575  
2502 2576                          /*
2503 2577                           * Get vpm mappings for maxsize with read access.
2504 2578                           * If the pages aren't available yet, we get
2505 2579                           * DEADLK, so wait and try again a little later using
2506 2580                           * an increasing wait. We might be here a long time.
2507 2581                           *
2508 2582                           * If delay_sig returns EINTR, be sure to exit and
2509 2583                           * pass it up to the caller.
2510 2584                           */
2511 2585                          deadlk_wait = 0;
2512 2586                          while ((error = vpm_map_pages(fvp, fileoff,
2513 2587                              (size_t)maxsize, (VPM_FETCHPAGE), snfv->snfv_vml,
2514 2588                              SNF_MAXVMAPS, NULL, S_READ)) == EDEADLK) {
2515 2589                                  deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2516 2590                                  if ((error = delay_sig(deadlk_wait)) != 0) {
2517 2591                                          break;
2518 2592                                  }
2519 2593                          }
2520 2594                          if (error != 0) {
2521 2595                                  kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2522 2596                                  error = (error == EINTR) ? EINTR : EIO;
2523 2597                                  goto out;
2524 2598                          }
2525 2599                          snfv->snfv_frtn.free_func = snf_vmap_desbfree;
2526 2600                          snfv->snfv_frtn.free_arg = (caddr_t)snfv;
2527 2601  
2528 2602                          /* Construct the mblk chain from the page mappings */
2529 2603                          chain_size = 0;
2530 2604                          for (i = 0; (snfv->snfv_vml[i].vs_addr != NULL) &&
2531 2605                              total_size > 0; i++) {
2532 2606                                  ASSERT(chain_size < maxsize);
2533 2607                                  mblk_size = MIN(snfv->snfv_vml[i].vs_len -
2534 2608                                      mapoff, total_size);
2535 2609                                  nmp = esballoca(
2536 2610                                      (uchar_t *)snfv->snfv_vml[i].vs_addr +
2537 2611                                      mapoff, mblk_size, BPRI_HI,
2538 2612                                      &snfv->snfv_frtn);
2539 2613  
2540 2614                                  /*
2541 2615                                   * We return EAGAIN after unmapping the pages
2542 2616                                   * if we cannot allocate the head of the
2543 2617                                   * chain. Otherwise, we continue sending the
2544 2618                                   * mblks constructed so far.
2545 2619                                   */
2546 2620                                  if (nmp == NULL) {
2547 2621                                          if (i == 0) {
2548 2622                                                  vpm_unmap_pages(snfv->snfv_vml,
2549 2623                                                      S_READ);
2550 2624                                                  kmem_free(snfv,
2551 2625                                                      sizeof (snf_vmap_desbinfo));
2552 2626                                                  error = EAGAIN;
2553 2627                                                  goto out;
2554 2628                                          }
2555 2629                                          break;
2556 2630                                  }
2557 2631                                  /* Mark this dblk with the zero-copy flag */
2558 2632                                  nmp->b_datap->db_struioflag |= STRUIO_ZC;
2559 2633                                  nmp->b_wptr += mblk_size;
2560 2634                                  chain_size += mblk_size;
2561 2635                                  fileoff += mblk_size;
2562 2636                                  total_size -= mblk_size;
2563 2637                                  snfv->snfv_ref++;
2564 2638                                  mapoff = 0;
2565 2639                                  if (i > 0)
2566 2640                                          linkb(mp, nmp);
2567 2641                                  else
2568 2642                                          mp = nmp;
2569 2643                          }
2570 2644                          VN_HOLD(fvp);
2571 2645                          snfv->snfv_vp = fvp;
2572 2646                  } else {
2573 2647                          /* vpm not supported; fall back to segmap */
2574 2648                          snf_smap_desbinfo *snfi;
2575 2649  
2576 2650                          mapoff = fileoff & MAXBOFFSET;
2577 2651                          chain_size = MAXBSIZE - mapoff;
2578 2652                          if (chain_size > total_size)
2579 2653                                  chain_size = total_size;
2580 2654                          /*
2581 2655                           * we don't forcefault because we'll call
2582 2656                           * segmap_fault(F_SOFTLOCK) next.
2583 2657                           *
2584 2658                           * S_READ will get the ref bit set (by either
2585 2659                           * segmap_getmapflt() or segmap_fault()) and page
2586 2660                           * shared locked.
2587 2661                           */
2588 2662                          base = segmap_getmapflt(segkmap, fvp, fileoff,
2589 2663                              chain_size, segmap_kpm ? SM_FAULT : 0, S_READ);
2590 2664  
2591 2665                          snfi = kmem_alloc(sizeof (*snfi), KM_SLEEP);
2592 2666                          snfi->snfi_len = (size_t)roundup(mapoff+chain_size,
2593 2667                              PAGESIZE)- (mapoff & PAGEMASK);
2594 2668                          /*
2595 2669                           * We must call segmap_fault() even for segmap_kpm
2596 2670                           * because that's how error gets returned.
2597 2671                           * (segmap_getmapflt() never fails but segmap_fault()
2598 2672                           * does.)
2599 2673                           *
2600 2674                           * If the pages aren't available yet, we get
2601 2675                           * DEADLK, so wait and try again a little later using
2602 2676                           * an increasing wait. We might be here a long time.
2603 2677                           *
2604 2678                           * If delay_sig returns EINTR, be sure to exit and
2605 2679                           * pass it up to the caller.
2606 2680                           */
2607 2681                          deadlk_wait = 0;
2608 2682                          while ((error = FC_ERRNO(segmap_fault(kas.a_hat,
2609 2683                              segkmap, (caddr_t)(uintptr_t)(((uintptr_t)base +
2610 2684                              mapoff) & PAGEMASK), snfi->snfi_len, F_SOFTLOCK,
2611 2685                              S_READ))) == EDEADLK) {
2612 2686                                  deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2613 2687                                  if ((error = delay_sig(deadlk_wait)) != 0) {
2614 2688                                          break;
2615 2689                                  }
2616 2690                          }
2617 2691                          if (error != 0) {
2618 2692                                  (void) segmap_release(segkmap, base, 0);
2619 2693                                  kmem_free(snfi, sizeof (*snfi));
2620 2694                                  error = (error == EINTR) ? EINTR : EIO;
2621 2695                                  goto out;
2622 2696                          }
2623 2697                          snfi->snfi_frtn.free_func = snf_smap_desbfree;
2624 2698                          snfi->snfi_frtn.free_arg = (caddr_t)snfi;
2625 2699                          snfi->snfi_base = base;
2626 2700                          snfi->snfi_mapoff = mapoff;
2627 2701                          mp = esballoca((uchar_t *)base + mapoff, chain_size,
2628 2702                              BPRI_HI, &snfi->snfi_frtn);
2629 2703  
2630 2704                          if (mp == NULL) {
2631 2705                                  (void) segmap_fault(kas.a_hat, segkmap,
2632 2706                                      (caddr_t)(uintptr_t)(((uintptr_t)base +
2633 2707                                      mapoff) & PAGEMASK), snfi->snfi_len,
2634 2708                                      F_SOFTUNLOCK, S_OTHER);
2635 2709                                  (void) segmap_release(segkmap, base, 0);
2636 2710                                  kmem_free(snfi, sizeof (*snfi));
2637 2711                                  freemsg(mp);
2638 2712                                  error = EAGAIN;
2639 2713                                  goto out;
2640 2714                          }
2641 2715                          VN_HOLD(fvp);
2642 2716                          snfi->snfi_vp = fvp;
2643 2717                          mp->b_wptr += chain_size;
2644 2718  
2645 2719                          /* Mark this dblk with the zero-copy flag */
2646 2720                          mp->b_datap->db_struioflag |= STRUIO_ZC;
2647 2721                          fileoff += chain_size;
2648 2722                          total_size -= chain_size;
2649 2723                  }
2650 2724  
2651 2725                  if (total_size == 0 && !nowait) {
2652 2726                          ASSERT(!dowait);
2653 2727                          dowait = B_TRUE;
2654 2728                          mp->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
2655 2729                  }
2656 2730                  VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2657 2731                  error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2658 2732                  if (error != 0) {
2659 2733                          /*
2660 2734                           * mp contains the mblks that were not sent by
2661 2735                           * socket_sendmblk. Use its size to update *count
2662 2736                           */
2663 2737                          *count = ksize + (chain_size - msgdsize(mp));
2664 2738                          if (mp != NULL)
2665 2739                                  freemsg(mp);
2666 2740                          return (error);
2667 2741                  }
2668 2742                  ksize += chain_size;
2669 2743                  if (total_size == 0)
2670 2744                          goto done;
2671 2745  
2672 2746                  (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2673 2747                  va.va_mask = AT_SIZE;
2674 2748                  error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2675 2749                  if (error)
2676 2750                          break;
2677 2751                  /* Read as much as possible. */
2678 2752                  if (fileoff >= va.va_size)
2679 2753                          break;
2680 2754                  if (total_size + fileoff > va.va_size)
2681 2755                          total_size = va.va_size - fileoff;
2682 2756          }
2683 2757  out:
2684 2758          VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2685 2759  done:
2686 2760          *count = ksize;
2687 2761          if (dowait) {
2688 2762                  stdata_t *stp;
2689 2763  
2690 2764                  stp = vp->v_stream;
2691 2765                  if (stp == NULL) {
2692 2766                          struct sonode *so;
2693 2767                          so = VTOSO(vp);
2694 2768                          error = so_zcopy_wait(so);
2695 2769                  } else {
2696 2770                          mutex_enter(&stp->sd_lock);
2697 2771                          while (!(stp->sd_flag & STZCNOTIFY)) {
2698 2772                                  if (cv_wait_sig(&stp->sd_zcopy_wait,
2699 2773                                      &stp->sd_lock) == 0) {
2700 2774                                          error = EINTR;
2701 2775                                          break;
2702 2776                                  }
2703 2777                          }
2704 2778                          stp->sd_flag &= ~STZCNOTIFY;
2705 2779                          mutex_exit(&stp->sd_lock);
2706 2780                  }
2707 2781          }
2708 2782          return (error);
2709 2783  }
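
A detail worth noting in snf_segmap() is the EDEADLK handling around vpm_map_pages() and segmap_fault(): instead of spinning, the code retries with a slowly growing, signal-interruptible delay. Below is a condensed sketch of that backoff; try_lock_pages() is a hypothetical stand-in for the real page-locking call, not a kernel routine.

static int
lock_with_backoff(void)
{
	clock_t deadlk_wait = 0;
	int error;

	/* try_lock_pages() stands in for vpm_map_pages()/segmap_fault(). */
	while ((error = try_lock_pages()) == EDEADLK) {
		/* waits of 1, 2, 3, 4, 5, 9, 13, ... ticks between retries */
		deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
		if ((error = delay_sig(deadlk_wait)) != 0)
			break;		/* interrupted: report EINTR upward */
	}
	return (error);
}

Both call sites above then map anything other than EINTR to EIO before bailing out, so the caller of sendfile() never sees a raw EDEADLK.
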
2710 2784  
2711 2785  int
2712 2786  snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
2713 2787      uint_t maxpsz, ssize_t *count)
2714 2788  {
2715 2789          struct vnode *vp;
2716 2790          mblk_t *mp;
2717 2791          int iosize;
2718 2792          int extra = 0;
2719 2793          int error;
2720 2794          short fflag;
2721 2795          int ksize;
2722 2796          int ioflag;
2723 2797          struct uio auio;
2724 2798          struct iovec aiov;
2725 2799          struct vattr va;
2726 2800          int maxblk = 0;
2727 2801          int wroff = 0;
2728 2802          struct sonode *so;
2729 2803          struct nmsghdr msg;
2730 2804  
2731 2805          vp = fp->f_vnode;
2732 2806          if (vp->v_type == VSOCK) {
2733 2807                  stdata_t *stp;
2734 2808  
2735 2809                  /*
2736 2810                   * Get the extra space to insert a header and a trailer.
2737 2811                   */
2738 2812                  so = VTOSO(vp);
2739 2813                  stp = vp->v_stream;
2740 2814                  if (stp == NULL) {
2741 2815                          wroff = so->so_proto_props.sopp_wroff;
2742 2816                          maxblk = so->so_proto_props.sopp_maxblk;
2743 2817                          extra = wroff + so->so_proto_props.sopp_tail;
2744 2818                  } else {
2745 2819                          wroff = (int)(stp->sd_wroff);
2746 2820                          maxblk = (int)(stp->sd_maxblk);
2747 2821                          extra = wroff + (int)(stp->sd_tail);
2748 2822                  }
2749 2823          }
2750 2824          bzero(&msg, sizeof (msg));
2751 2825          fflag = fp->f_flag;
2752 2826          ksize = 0;
2753 2827          auio.uio_iov = &aiov;
2754 2828          auio.uio_iovcnt = 1;
2755 2829          auio.uio_segflg = UIO_SYSSPACE;
2756 2830          auio.uio_llimit = MAXOFFSET_T;
2757 2831          auio.uio_fmode = fflag;
2758 2832          auio.uio_extflg = UIO_COPY_CACHED;
2759 2833          ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
2760 2834          /* If read sync is not asked for, filter sync flags */
2761 2835          if ((ioflag & FRSYNC) == 0)
2762 2836                  ioflag &= ~(FSYNC|FDSYNC);
2763 2837          for (;;) {
2764 2838                  if (ISSIG(curthread, JUSTLOOKING)) {
2765 2839                          error = EINTR;
2766 2840                          break;
2767 2841                  }
2768 2842                  iosize = (int)MIN(maxpsz, size);
2769 2843  
2770 2844                  /*
2771 2845                   * Socket filters can limit the mblk size,
2772 2846                   * so limit reads to maxblk if there are
2773 2847                   * filters present.
2774 2848                   */
2775 2849                  if (vp->v_type == VSOCK &&
2776 2850                      so->so_filter_active > 0 && maxblk != INFPSZ)
2777 2851                          iosize = (int)MIN(iosize, maxblk);
2778 2852  
2779 2853                  if (is_system_labeled()) {
2780 2854                          mp = allocb_cred(iosize + extra, CRED(),
2781 2855                              curproc->p_pid);
2782 2856                  } else {
2783 2857                          mp = allocb(iosize + extra, BPRI_MED);
2784 2858                  }
2785 2859                  if (mp == NULL) {
2786 2860                          error = EAGAIN;
2787 2861                          break;
2788 2862                  }
2789 2863  
2790 2864                  mp->b_rptr += wroff;
2791 2865  
2792 2866                  aiov.iov_base = (caddr_t)mp->b_rptr;
2793 2867                  aiov.iov_len = iosize;
2794 2868                  auio.uio_loffset = fileoff;
2795 2869                  auio.uio_resid = iosize;
2796 2870  
2797 2871                  error = VOP_READ(fvp, &auio, ioflag, fp->f_cred, NULL);
2798 2872                  iosize -= auio.uio_resid;
2799 2873  
2800 2874                  if (error == EINTR && iosize != 0)
2801 2875                          error = 0;
2802 2876  
2803 2877                  if (error != 0 || iosize == 0) {
2804 2878                          freeb(mp);
2805 2879                          break;
2806 2880                  }
2807 2881                  mp->b_wptr = mp->b_rptr + iosize;
2808 2882  
2809 2883                  VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2810 2884  
2811 2885                  error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2812 2886  
2813 2887                  if (error != 0) {
2814 2888                          *count = ksize;
2815 2889                          if (mp != NULL)
2816 2890                                  freeb(mp);
2817 2891                          return (error);
2818 2892                  }
2819 2893                  ksize += iosize;
2820 2894                  size -= iosize;
2821 2895                  if (size == 0)
2822 2896                          goto done;
2823 2897  
2824 2898                  fileoff += iosize;
2825 2899                  (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2826 2900                  va.va_mask = AT_SIZE;
2827 2901                  error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2828 2902                  if (error)
2829 2903                          break;
2830 2904                  /* Read as much as possible. */
2831 2905                  if (fileoff >= va.va_size)
2832 2906                          size = 0;
2833 2907                  else if (size + fileoff > va.va_size)
2834 2908                          size = va.va_size - fileoff;
2835 2909          }
2836 2910          VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2837 2911  done:
2838 2912          *count = ksize;
2839 2913          return (error);
2840 2914  }
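
The wroff/tail bookkeeping at the top of snf_cache() ensures that every data mblk already has room reserved for the protocol header and trailer, so the transport can prepend its header without another allocation or copy. A minimal sketch of that reservation, with an illustrative helper name:

static mblk_t *
alloc_payload_mblk(int wroff, int tail, int iosize)
{
	/* Room for the header (wroff) in front and the trailer (tail) behind. */
	mblk_t *mp = allocb(wroff + iosize + tail, BPRI_MED);

	if (mp == NULL)
		return (NULL);
	mp->b_rptr += wroff;		/* header will be prepended here */
	mp->b_wptr = mp->b_rptr;	/* advanced as file data is read in */
	return (mp);
}

snf_cache() does the same thing inline, switching to allocb_cred() when the system is labeled so the mblk carries the sender's credentials.
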
2841 2915  
2842 2916  #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
2843 2917  /*
2844 2918   * Largefile support for 32 bit applications only.
2845 2919   */
2846 2920  int
2847 2921  sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv,
2848 2922      ssize32_t *count32)
2849 2923  {
2850 2924          ssize32_t sfv_len;
2851 2925          u_offset_t sfv_off, va_size;
2852 2926          struct vnode *vp, *fvp, *realvp;
2853 2927          struct vattr va;
2854 2928          stdata_t *stp;
2855 2929          ssize_t count = 0;
2856 2930          int error = 0;
2857 2931          boolean_t dozcopy = B_FALSE;
2858 2932          uint_t maxpsz;
2859 2933  
2860 2934          sfv_len = (ssize32_t)sfv->sfv_len;
2861 2935          if (sfv_len < 0) {
2862 2936                  error = EINVAL;
2863 2937                  goto out;
2864 2938          }
2865 2939  
2866 2940          if (sfv_len == 0) goto out;
2867 2941  
2868 2942          sfv_off = (u_offset_t)sfv->sfv_off;
2869 2943  
2870 2944          /* Same checks as in pread */
2871 2945          if (sfv_off > MAXOFFSET_T) {
2872 2946                  error = EINVAL;
2873 2947                  goto out;
2874 2948          }
2875 2949          if (sfv_off + sfv_len > MAXOFFSET_T)
2876 2950                  sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
2877 2951  
2878 2952          /*
2879 2953           * There are no more checks on sfv_len. So, we cast it to
2880 2954           * u_offset_t and share the snf_direct_io/snf_cache code between
2881 2955           * 32 bit and 64 bit.
2882 2956           *
2883 2957           * TODO: should do nbl_need_check() like read()?
2884 2958           */
2885 2959          if (sfv_len > sendfile_max_size) {
2886 2960                  sf_stats.ss_file_not_cached++;
2887 2961                  error = snf_direct_io(fp, rfp, sfv_off, (u_offset_t)sfv_len,
2888 2962                      &count);
2889 2963                  goto out;
2890 2964          }
2891 2965          fvp = rfp->f_vnode;
2892 2966          if (VOP_REALVP(fvp, &realvp, NULL) == 0)
2893 2967                  fvp = realvp;
2894 2968          /*
2895 2969           * Grab the lock as a reader to prevent the file size
2896 2970           * from changing underneath.
2897 2971           */
2898 2972          (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2899 2973          va.va_mask = AT_SIZE;
2900 2974          error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2901 2975          va_size = va.va_size;
2902 2976          if ((error != 0) || (va_size == 0) || (sfv_off >= va_size)) {
2903 2977                  VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2904 2978                  goto out;
2905 2979          }
2906 2980          /* Read as much as possible. */
2907 2981          if (sfv_off + sfv_len > va_size)
2908 2982                  sfv_len = va_size - sfv_off;
2909 2983  
2910 2984          vp = fp->f_vnode;
2911 2985          stp = vp->v_stream;
2912 2986          /*
2913 2987           * When the NOWAIT flag is not set, we enable zero-copy only if the
2914 2988           * transfer size is large enough. This prevents performance loss
2915 2989           * when the caller sends the file piece by piece.
2916 2990           */
2917 2991          if (sfv_len >= MAXBSIZE && (sfv_len >= (va_size >> 1) ||
2918 2992              (sfv->sfv_flag & SFV_NOWAIT) || sfv_len >= 0x1000000) &&
2919 2993              !vn_has_flocks(fvp) && !(fvp->v_flag & VNOMAP)) {
2920 2994                  uint_t copyflag;
2921 2995                  copyflag = stp != NULL ? stp->sd_copyflag :
2922 2996                      VTOSO(vp)->so_proto_props.sopp_zcopyflag;
2923 2997                  if ((copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) {
2924 2998                          int on = 1;
2925 2999  
2926 3000                          if (socket_setsockopt(VTOSO(vp), SOL_SOCKET,
2927 3001                              SO_SND_COPYAVOID, &on, sizeof (on), CRED()) == 0)
2928 3002                                  dozcopy = B_TRUE;
2929 3003                  } else {
2930 3004                          dozcopy = copyflag & STZCVMSAFE;
2931 3005                  }
2932 3006          }
2933 3007          if (dozcopy) {
2934 3008                  sf_stats.ss_file_segmap++;
2935 3009                  error = snf_segmap(fp, fvp, sfv_off, (u_offset_t)sfv_len,
2936 3010                      &count, ((sfv->sfv_flag & SFV_NOWAIT) != 0));
2937 3011          } else {
2938 3012                  if (vp->v_type == VSOCK && stp == NULL) {
2939 3013                          sonode_t *so = VTOSO(vp);
2940 3014                          maxpsz = so->so_proto_props.sopp_maxpsz;
2941 3015                  } else if (stp != NULL) {
2942 3016                          maxpsz = stp->sd_qn_maxpsz;
2943 3017                  } else {
2944 3018                          maxpsz = maxphys;
2945 3019                  }
2946 3020  
2947 3021                  if (maxpsz == INFPSZ)
2948 3022                          maxpsz = maxphys;
2949 3023                  else
2950 3024                          maxpsz = roundup(maxpsz, MAXBSIZE);
2951 3025                  sf_stats.ss_file_cached++;
2952 3026                  error = snf_cache(fp, fvp, sfv_off, (u_offset_t)sfv_len,
2953 3027                      maxpsz, &count);
2954 3028          }
2955 3029  out:
2956 3030          releasef(sfv->sfv_fd);
2957 3031          *count32 = (ssize32_t)count;
2958 3032          return (error);
2959 3033  }
2960 3034  #endif
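
For reference, sosendfile64() above is the largefile path for 32-bit callers; from userland it is normally reached through sendfile(3EXT)/sendfilev(3EXT) in libsendfile, and it then chooses between snf_direct_io(), snf_segmap() and snf_cache() based on transfer size and the socket's zero-copy support. A minimal userland sketch follows; send_whole_file() is an illustrative helper and assumes sock is a connected TCP socket.

#include <sys/sendfile.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

static int
send_whole_file(int sock, const char *path)
{
	struct stat st;
	off_t off = 0;
	int fd = open(path, O_RDONLY);

	if (fd == -1)
		return (-1);
	if (fstat(fd, &st) == -1) {
		(void) close(fd);
		return (-1);
	}
	/*
	 * Whether the kernel takes the zero-copy segmap/vpm path or the
	 * cached read path is decided by the heuristics in sosendfile64()
	 * above (size relative to MAXBSIZE and sendfile_max_size, SFV_NOWAIT,
	 * and whether SO_SND_COPYAVOID can be enabled on the socket).
	 */
	while (off < st.st_size) {
		if (sendfile(sock, fd, &off, (size_t)(st.st_size - off)) == -1)
			break;
	}
	(void) close(fd);
	return (off == st.st_size ? 0 : -1);
}
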
2961 3035  
2962 3036  #ifdef _SYSCALL32_IMPL
2963 3037  /*
2964 3038   * recv32(), recvfrom32(), send32(), sendto32(): intentionally return a
2965 3039   * ssize_t rather than ssize32_t; see the comments above read32 for details.
2966 3040   */
2967 3041  
2968 3042  ssize_t
2969 3043  recv32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
2970 3044  {
2971 3045          return (recv(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
2972 3046  }
2973 3047  
2974 3048  ssize_t
2975 3049  recvfrom32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
2976 3050          caddr32_t name, caddr32_t namelenp)
2977 3051  {
2978 3052          return (recvfrom(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
2979 3053              (void *)(uintptr_t)name, (void *)(uintptr_t)namelenp));
2980 3054  }
2981 3055  
2982 3056  ssize_t
2983 3057  send32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
2984 3058  {
2985 3059          return (send(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
2986 3060  }
2987 3061  
2988 3062  ssize_t
2989 3063  sendto32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
2990 3064          caddr32_t name, socklen_t namelen)
2991 3065  {
2992 3066          return (sendto(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
2993 3067              (void *)(uintptr_t)name, namelen));
2994 3068  }
2995 3069  #endif  /* _SYSCALL32_IMPL */
2996 3070  
2997 3071  /*
2998 3072   * Function wrappers (mostly around the sonode switch) for
2999 3073   * backward compatibility.
3000 3074   */
3001 3075  
3002 3076  int
3003 3077  soaccept(struct sonode *so, int fflag, struct sonode **nsop)
3004 3078  {
3005 3079          return (socket_accept(so, fflag, CRED(), nsop));
3006 3080  }
3007 3081  
3008 3082  int
3009 3083  sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3010 3084      int backlog, int flags)
3011 3085  {
3012 3086          int     error;
3013 3087  
3014 3088          error = socket_bind(so, name, namelen, flags, CRED());
3015 3089          if (error == 0 && backlog != 0)
3016 3090                  return (socket_listen(so, backlog, CRED()));
3017 3091  
3018 3092          return (error);
3019 3093  }
3020 3094  
3021 3095  int
3022 3096  solisten(struct sonode *so, int backlog)
3023 3097  {
3024 3098          return (socket_listen(so, backlog, CRED()));
3025 3099  }
3026 3100  
3027 3101  int
3028 3102  soconnect(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3029 3103      int fflag, int flags)
3030 3104  {
3031 3105          return (socket_connect(so, name, namelen, fflag, flags, CRED()));
3032 3106  }
3033 3107  
3034 3108  int
3035 3109  sorecvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
3036 3110  {
3037 3111          return (socket_recvmsg(so, msg, uiop, CRED()));
3038 3112  }
3039 3113  
3040 3114  int
3041 3115  sosendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
3042 3116  {
3043 3117          return (socket_sendmsg(so, msg, uiop, CRED()));
3044 3118  }
3045 3119  
3046 3120  int
3047 3121  soshutdown(struct sonode *so, int how)
3048 3122  {
3049 3123          return (socket_shutdown(so, how, CRED()));
3050 3124  }
3051 3125  
3052 3126  int
3053 3127  sogetsockopt(struct sonode *so, int level, int option_name, void *optval,
3054 3128      socklen_t *optlenp, int flags)
3055 3129  {
3056 3130          return (socket_getsockopt(so, level, option_name, optval, optlenp,
3057 3131              flags, CRED()));
3058 3132  }
3059 3133  
3060 3134  int
3061 3135  sosetsockopt(struct sonode *so, int level, int option_name, const void *optval,
3062 3136      t_uscalar_t optlen)
3063 3137  {
3064 3138          return (socket_setsockopt(so, level, option_name, optval, optlen,
3065 3139              CRED()));
3066 3140  }
3067 3141  
3068 3142  /*
3069 3143   * Because this is a backward compatibility interface, it only needs to be
3070 3144   * able to handle the creation of TPI sockfs sockets.
3071 3145   */
3072 3146  struct sonode *
3073 3147  socreate(struct sockparams *sp, int family, int type, int protocol, int version,
3074 3148      int *errorp)
3075 3149  {
3076 3150          struct sonode *so;
3077 3151  
3078 3152          ASSERT(sp != NULL);
3079 3153  
3080 3154          so = sp->sp_smod_info->smod_sock_create_func(sp, family, type, protocol,
3081 3155              version, SOCKET_SLEEP, errorp, CRED());
3082 3156          if (so == NULL) {
3083 3157                  SOCKPARAMS_DEC_REF(sp);
3084 3158          } else {
3085 3159                  if ((*errorp = SOP_INIT(so, NULL, CRED(), SOCKET_SLEEP)) == 0) {
3086 3160                          /* Cannot fail, only bumps so_count */
3087 3161                          (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, CRED(), NULL);
3088 3162                  } else {
3089 3163                          socket_destroy(so);
3090 3164                          so = NULL;
3091 3165                  }
3092 3166          }
3093 3167          return (so);
3094 3168  }
  
    | 1712 lines elided |
  