1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015, Joyent, Inc. All rights reserved.
  25  */
  26 
  27 #include <sys/types.h>
  28 #include <sys/t_lock.h>
  29 #include <sys/param.h>
  30 #include <sys/systm.h>
  31 #include <sys/buf.h>
  32 #include <sys/conf.h>
  33 #include <sys/cred.h>
  34 #include <sys/kmem.h>
  35 #include <sys/sysmacros.h>
  36 #include <sys/vfs.h>
  37 #include <sys/vfs_opreg.h>
  38 #include <sys/vnode.h>
  39 #include <sys/debug.h>
  40 #include <sys/errno.h>
  41 #include <sys/time.h>
  42 #include <sys/file.h>
  43 #include <sys/open.h>
  44 #include <sys/user.h>
  45 #include <sys/termios.h>
  46 #include <sys/stream.h>
  47 #include <sys/strsubr.h>
  48 #include <sys/strsun.h>
  49 #include <sys/esunddi.h>
  50 #include <sys/flock.h>
  51 #include <sys/modctl.h>
  52 #include <sys/cmn_err.h>
  53 #include <sys/mkdev.h>
  54 #include <sys/pathname.h>
  55 #include <sys/ddi.h>
  56 #include <sys/stat.h>
  57 #include <sys/fs/snode.h>
  58 #include <sys/fs/dv_node.h>
  59 #include <sys/zone.h>
  60 
  61 #include <sys/socket.h>
  62 #include <sys/socketvar.h>
  63 #include <netinet/in.h>
  64 #include <sys/un.h>
  65 #include <sys/ucred.h>
  66 
  67 #include <sys/tiuser.h>
  68 #define _SUN_TPI_VERSION        2
  69 #include <sys/tihdr.h>
  70 
  71 #include <c2/audit.h>
  72 
  73 #include <fs/sockfs/nl7c.h>
  74 #include <fs/sockfs/sockcommon.h>
  75 #include <fs/sockfs/sockfilter_impl.h>
  76 #include <fs/sockfs/socktpi.h>
  77 #include <fs/sockfs/socktpi_impl.h>
  78 #include <fs/sockfs/sodirect.h>
  79 
/*
 * Macros that operate on struct cmsghdr.
 * The CMSG_VALID macro does not assume that the last option buffer is padded.
 */

/* Address of the first byte following the cmsghdr, i.e. the option payload. */
#define CMSG_CONTENT(cmsg)      (&((cmsg)[1]))

/*
 * Payload length: cmsg_len minus the size of the header. The result is only
 * meaningful when cmsg_len is at least sizeof (struct cmsghdr), which is one
 * of the conditions CMSG_VALID checks.
 */
#define CMSG_CONTENTLEN(cmsg)   ((cmsg)->cmsg_len - sizeof (struct cmsghdr))

/*
 * Non-zero when cmsg is a usable header inside the buffer [start, end):
 *  - it satisfies ISALIGNED_cmsghdr(),
 *  - it points at or after start and strictly before end,
 *  - cmsg_len is at least the size of the header itself, and
 *  - the header plus cmsg_len bytes does not extend past end.
 * The arguments are evaluated more than once, so they must be free of
 * side effects.
 */
#define CMSG_VALID(cmsg, start, end)                                    \
        (ISALIGNED_cmsghdr(cmsg) &&                                     \
        ((uintptr_t)(cmsg) >= (uintptr_t)(start)) &&                 \
        ((uintptr_t)(cmsg) < (uintptr_t)(end)) &&                    \
        ((ssize_t)(cmsg)->cmsg_len >= sizeof (struct cmsghdr)) && \
        ((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end)))

/* Passed as the time argument to cv_wait_stop() by the so_lock_*() waiters. */
#define SO_LOCK_WAKEUP_TIME     3000    /* Wakeup time in milliseconds */
  93 
/* Built in sockinit() from getudev() with minor number 0. */
dev_t sockdev;  /* For fsid in getattr */

/*
 * Set to 1 by sockinit() when modrootloaded is zero, in which case
 * sockinit() does not call nl7c_init() itself.
 */
int sockfs_defer_nl7c_init = 0;

/* Its sl_lock mutex is initialized in sockinit(). */
struct socklist socklist;

/* kmem cache of struct sonode, created in sockinit(). */
struct kmem_cache *socket_cache;

/*
 * sockconf_lock protects the socket configuration (socket types and
 * socket filters) which is changed via the sockconfig system call.
 */
krwlock_t sockconf_lock;

/* Forward declarations; the definitions are not in this part of the file. */
static int sockfs_update(kstat_t *, int);
static int sockfs_snapshot(kstat_t *, void *, int);

/* Its result is handed to smod_add() in sockinit(). */
extern smod_info_t *sotpi_smod_create(void);

/* Called from sockinit(). */
extern void sendfile_init();

/* Called from sockinit() only when modrootloaded is non-zero. */
extern void nl7c_init(void);

extern int modrootloaded;
 116 
/*
 * Two characters per byte of a pointer plus one more.
 * NOTE(review): presumably sized for a pointer printed as hex digits with a
 * terminating NUL; the code that fills ks_straddr is not in this section.
 */
#define ADRSTRLEN (2 * sizeof (void *) + 1)
/*
 * kernel structure for passing the sockinfo data back up to the user.
 * the strings array allows us to convert AF_UNIX addresses into strings
 * with a common method regardless of which n-bit kernel we're running.
 */
struct k_sockinfo {
        struct sockinfo ks_si;          /* the sockinfo data itself */
        char            ks_straddr[3][ADRSTRLEN]; /* three address strings */
};
 127 
 128 /*
 129  * Translate from a device pathname (e.g. "/dev/tcp") to a vnode.
 130  * Returns with the vnode held.
 131  */
 132 int
 133 sogetvp(char *devpath, vnode_t **vpp, int uioflag)
 134 {
 135         struct snode *csp;
 136         vnode_t *vp, *dvp;
 137         major_t maj;
 138         int error;
 139 
 140         ASSERT(uioflag == UIO_SYSSPACE || uioflag == UIO_USERSPACE);
 141 
 142         /*
 143          * Lookup the underlying filesystem vnode.
 144          */
 145         error = lookupname(devpath, uioflag, FOLLOW, NULLVPP, &vp);
 146         if (error)
 147                 return (error);
 148 
 149         /* Check that it is the correct vnode */
 150         if (vp->v_type != VCHR) {
 151                 VN_RELE(vp);
 152                 return (ENOTSOCK);
 153         }
 154 
 155         /*
 156          * If devpath went through devfs, the device should already
 157          * be configured. If devpath is a mknod file, however, we
 158          * need to make sure the device is properly configured.
 159          * To do this, we do something similar to spec_open()
 160          * except that we resolve to the minor/leaf level since
 161          * we need to return a vnode.
 162          */
 163         csp = VTOS(VTOS(vp)->s_commonvp);
 164         if (!(csp->s_flag & SDIPSET)) {
 165                 char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 166                 error = ddi_dev_pathname(vp->v_rdev, S_IFCHR, pathname);
 167                 if (error == 0)
 168                         error = devfs_lookupname(pathname, NULLVPP, &dvp);
 169                 VN_RELE(vp);
 170                 kmem_free(pathname, MAXPATHLEN);
 171                 if (error != 0)
 172                         return (ENXIO);
 173                 vp = dvp;       /* use the devfs vp */
 174         }
 175 
 176         /* device is configured at this point */
 177         maj = getmajor(vp->v_rdev);
 178         if (!STREAMSTAB(maj)) {
 179                 VN_RELE(vp);
 180                 return (ENOSTR);
 181         }
 182 
 183         *vpp = vp;
 184         return (0);
 185 }
 186 
 187 /*
 188  * Update the accessed, updated, or changed times in an sonode
 189  * with the current time.
 190  *
 191  * Note that both SunOS 4.X and 4.4BSD sockets do not present reasonable
 192  * attributes in a fstat call. (They return the current time and 0 for
 193  * all timestamps, respectively.) We maintain the current timestamps
 194  * here primarily so that should sockmod be popped the resulting
 195  * file descriptor will behave like a stream w.r.t. the timestamps.
 196  */
 197 void
 198 so_update_attrs(struct sonode *so, int flag)
 199 {
 200         time_t now = gethrestime_sec();
 201 
 202         if (SOCK_IS_NONSTR(so))
 203                 return;
 204 
 205         mutex_enter(&so->so_lock);
 206         so->so_flag |= flag;
 207         if (flag & SOACC)
 208                 SOTOTPI(so)->sti_atime = now;
 209         if (flag & SOMOD)
 210                 SOTOTPI(so)->sti_mtime = now;
 211         mutex_exit(&so->so_lock);
 212 }
 213 
 214 extern so_create_func_t sock_comm_create_function;
 215 extern so_destroy_func_t sock_comm_destroy_function;
 216 /*
 217  * Init function called when sockfs is loaded.
 218  */
 219 int
 220 sockinit(int fstype, char *name)
 221 {
 222         static const fs_operation_def_t sock_vfsops_template[] = {
 223                 NULL, NULL
 224         };
 225         int error;
 226         major_t dev;
 227         char *err_str;
 228 
 229         error = vfs_setfsops(fstype, sock_vfsops_template, NULL);
 230         if (error != 0) {
 231                 zcmn_err(GLOBAL_ZONEID, CE_WARN,
 232                     "sockinit: bad vfs ops template");
 233                 return (error);
 234         }
 235 
 236         error = vn_make_ops(name, socket_vnodeops_template,
 237             &socket_vnodeops);
 238         if (error != 0) {
 239                 err_str = "sockinit: bad socket vnode ops template";
 240                 /* vn_make_ops() does not reset socktpi_vnodeops on failure. */
 241                 socket_vnodeops = NULL;
 242                 goto failure;
 243         }
 244 
 245         socket_cache = kmem_cache_create("socket_cache",
 246             sizeof (struct sonode), 0, sonode_constructor,
 247             sonode_destructor, NULL, NULL, NULL, 0);
 248 
 249         rw_init(&sockconf_lock, NULL, RW_DEFAULT, NULL);
 250 
 251         error = socktpi_init();
 252         if (error != 0) {
 253                 err_str = NULL;
 254                 goto failure;
 255         }
 256 
 257         error = sod_init();
 258         if (error != 0) {
 259                 err_str = NULL;
 260                 goto failure;
 261         }
 262 
 263         /*
 264          * Set up the default create and destroy functions
 265          */
 266         sock_comm_create_function = socket_sonode_create;
 267         sock_comm_destroy_function = socket_sonode_destroy;
 268 
 269         /*
 270          * Build initial list mapping socket parameters to vnode.
 271          */
 272         smod_init();
 273         smod_add(sotpi_smod_create());
 274 
 275         sockparams_init();
 276 
 277         /*
 278          * If sockets are needed before init runs /sbin/soconfig
 279          * it is possible to preload the sockparams list here using
 280          * calls like:
 281          *      sockconfig(1,2,3, "/dev/tcp", 0);
 282          */
 283 
 284         /*
 285          * Create a unique dev_t for use in so_fsid.
 286          */
 287 
 288         if ((dev = getudev()) == (major_t)-1)
 289                 dev = 0;
 290         sockdev = makedevice(dev, 0);
 291 
 292         mutex_init(&socklist.sl_lock, NULL, MUTEX_DEFAULT, NULL);
 293         sendfile_init();
 294         if (!modrootloaded) {
 295                 sockfs_defer_nl7c_init = 1;
 296         } else {
 297                 nl7c_init();
 298         }
 299 
 300         /* Initialize socket filters */
 301         sof_init();
 302 
 303         return (0);
 304 
 305 failure:
 306         (void) vfs_freevfsops_by_type(fstype);
 307         if (socket_vnodeops != NULL)
 308                 vn_freevnodeops(socket_vnodeops);
 309         if (err_str != NULL)
 310                 zcmn_err(GLOBAL_ZONEID, CE_WARN, err_str);
 311         return (error);
 312 }
 313 
 314 /*
 315  * Caller must hold the mutex. Used to set SOLOCKED.
 316  */
 317 void
 318 so_lock_single(struct sonode *so)
 319 {
 320         ASSERT(MUTEX_HELD(&so->so_lock));
 321 
 322         while (so->so_flag & (SOLOCKED | SOASYNC_UNBIND)) {
 323                 cv_wait_stop(&so->so_single_cv, &so->so_lock,
 324                     SO_LOCK_WAKEUP_TIME);
 325         }
 326         so->so_flag |= SOLOCKED;
 327 }
 328 
 329 /*
 330  * Caller must hold the mutex and pass in SOLOCKED or SOASYNC_UNBIND.
 331  * Used to clear SOLOCKED or SOASYNC_UNBIND.
 332  */
 333 void
 334 so_unlock_single(struct sonode *so, int flag)
 335 {
 336         ASSERT(MUTEX_HELD(&so->so_lock));
 337         ASSERT(flag & (SOLOCKED|SOASYNC_UNBIND));
 338         ASSERT((flag & ~(SOLOCKED|SOASYNC_UNBIND)) == 0);
 339         ASSERT(so->so_flag & flag);
 340         /*
 341          * Process the T_DISCON_IND on sti_discon_ind_mp.
 342          *
 343          * Call to so_drain_discon_ind will result in so_lock
 344          * being dropped and re-acquired later.
 345          */
 346         if (!SOCK_IS_NONSTR(so)) {
 347                 sotpi_info_t *sti = SOTOTPI(so);
 348 
 349                 if (sti->sti_discon_ind_mp != NULL)
 350                         so_drain_discon_ind(so);
 351         }
 352 
 353         cv_signal(&so->so_single_cv);
 354         so->so_flag &= ~flag;
 355 }
 356 
 357 /*
 358  * Caller must hold the mutex. Used to set SOREADLOCKED.
 359  * If the caller wants nonblocking behavior it should set fmode.
 360  */
 361 int
 362 so_lock_read(struct sonode *so, int fmode)
 363 {
 364         ASSERT(MUTEX_HELD(&so->so_lock));
 365 
 366         while (so->so_flag & SOREADLOCKED) {
 367                 if (fmode & (FNDELAY|FNONBLOCK))
 368                         return (EWOULDBLOCK);
 369                 cv_wait_stop(&so->so_read_cv, &so->so_lock,
 370                     SO_LOCK_WAKEUP_TIME);
 371         }
 372         so->so_flag |= SOREADLOCKED;
 373         return (0);
 374 }
 375 
 376 /*
 377  * Like so_lock_read above but allows signals.
 378  */
 379 int
 380 so_lock_read_intr(struct sonode *so, int fmode)
 381 {
 382         ASSERT(MUTEX_HELD(&so->so_lock));
 383 
 384         while (so->so_flag & SOREADLOCKED) {
 385                 if (fmode & (FNDELAY|FNONBLOCK))
 386                         return (EWOULDBLOCK);
 387                 if (!cv_wait_sig(&so->so_read_cv, &so->so_lock))
 388                         return (EINTR);
 389         }
 390         so->so_flag |= SOREADLOCKED;
 391         return (0);
 392 }
 393 
 394 /*
 395  * Caller must hold the mutex. Used to clear SOREADLOCKED,
 396  * set in so_lock_read() or so_lock_read_intr().
 397  */
 398 void
 399 so_unlock_read(struct sonode *so)
 400 {
 401         ASSERT(MUTEX_HELD(&so->so_lock));
 402         ASSERT(so->so_flag & SOREADLOCKED);
 403 
 404         cv_signal(&so->so_read_cv);
 405         so->so_flag &= ~SOREADLOCKED;
 406 }
 407 
 408 /*
 409  * Verify that the specified offset falls within the mblk and
 410  * that the resulting pointer is aligned.
 411  * Returns NULL if not.
 412  */
 413 void *
 414 sogetoff(mblk_t *mp, t_uscalar_t offset,
 415     t_uscalar_t length, uint_t align_size)
 416 {
 417         uintptr_t ptr1, ptr2;
 418 
 419         ASSERT(mp && mp->b_wptr >= mp->b_rptr);
 420         ptr1 = (uintptr_t)mp->b_rptr + offset;
 421         ptr2 = (uintptr_t)ptr1 + length;
 422         if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) {
 423                 eprintline(0);
 424                 return (NULL);
 425         }
 426         if ((ptr1 & (align_size - 1)) != 0) {
 427                 eprintline(0);
 428                 return (NULL);
 429         }
 430         return ((void *)ptr1);
 431 }
 432 
 433 /*
 434  * Return the AF_UNIX underlying filesystem vnode matching a given name.
 435  * Makes sure the sending and the destination sonodes are compatible.
 436  * The vnode is returned held.
 437  *
 438  * The underlying filesystem VSOCK vnode has a v_stream pointer that
 439  * references the actual stream head (hence indirectly the actual sonode).
 440  *
 441  * This function is non-static so it can be used by brand emulation.
 442  */
 443 int
 444 so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess,
 445     vnode_t **vpp)
 446 {
 447         vnode_t         *vp;    /* Underlying filesystem vnode */
 448         vnode_t         *rvp;   /* real vnode */
 449         vnode_t         *svp;   /* sockfs vnode */
 450         struct sonode   *so2;
 451         int             error;
 452 
 453         dprintso(so, 1, ("so_ux_lookup(%p) name <%s>\n", (void *)so,
 454             soun->sun_path));
 455 
 456         error = lookupname(soun->sun_path, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
 457         if (error) {
 458                 eprintsoline(so, error);
 459                 return (error);
 460         }
 461 
 462         /*
 463          * Traverse lofs mounts get the real vnode
 464          */
 465         if (VOP_REALVP(vp, &rvp, NULL) == 0) {
 466                 VN_HOLD(rvp);           /* hold the real vnode */
 467                 VN_RELE(vp);            /* release hold from lookup */
 468                 vp = rvp;
 469         }
 470 
 471         if (vp->v_type != VSOCK) {
 472                 error = ENOTSOCK;
 473                 eprintsoline(so, error);
 474                 goto done2;
 475         }
 476 
 477         if (checkaccess) {
 478                 /*
 479                  * Check that we have permissions to access the destination
 480                  * vnode. This check is not done in BSD but it is required
 481                  * by X/Open.
 482                  */
 483                 if (error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL)) {
 484                         eprintsoline(so, error);
 485                         goto done2;
 486                 }
 487         }
 488 
 489         /*
 490          * Check if the remote socket has been closed.
 491          *
 492          * Synchronize with vn_rele_stream by holding v_lock while traversing
 493          * v_stream->sd_vnode.
 494          */
 495         mutex_enter(&vp->v_lock);
 496         if (vp->v_stream == NULL) {
 497                 mutex_exit(&vp->v_lock);
 498                 if (so->so_type == SOCK_DGRAM)
 499                         error = EDESTADDRREQ;
 500                 else
 501                         error = ECONNREFUSED;
 502 
 503                 eprintsoline(so, error);
 504                 goto done2;
 505         }
 506         ASSERT(vp->v_stream->sd_vnode);
 507         svp = vp->v_stream->sd_vnode;
 508         /*
 509          * holding v_lock on underlying filesystem vnode and acquiring
 510          * it on sockfs vnode. Assumes that no code ever attempts to
 511          * acquire these locks in the reverse order.
 512          */
 513         VN_HOLD(svp);
 514         mutex_exit(&vp->v_lock);
 515 
 516         if (svp->v_type != VSOCK) {
 517                 error = ENOTSOCK;
 518                 eprintsoline(so, error);
 519                 goto done;
 520         }
 521 
 522         so2 = VTOSO(svp);
 523 
 524         if (so->so_type != so2->so_type) {
 525                 error = EPROTOTYPE;
 526                 eprintsoline(so, error);
 527                 goto done;
 528         }
 529 
 530         VN_RELE(svp);
 531         *vpp = vp;
 532         return (0);
 533 
 534 done:
 535         VN_RELE(svp);
 536 done2:
 537         VN_RELE(vp);
 538         return (error);
 539 }
 540 
 541 /*
 542  * Verify peer address for connect and sendto/sendmsg.
 543  * Since sendto/sendmsg would not get synchronous errors from the transport
 544  * provider we have to do these ugly checks in the socket layer to
 545  * preserve compatibility with SunOS 4.X.
 546  */
 547 int
 548 so_addr_verify(struct sonode *so, const struct sockaddr *name,
 549     socklen_t namelen)
 550 {
 551         int             family;
 552 
 553         dprintso(so, 1, ("so_addr_verify(%p, %p, %d)\n",
 554             (void *)so, (void *)name, namelen));
 555 
 556         ASSERT(name != NULL);
 557 
 558         family = so->so_family;
 559         switch (family) {
 560         case AF_INET:
 561                 if (name->sa_family != family) {
 562                         eprintsoline(so, EAFNOSUPPORT);
 563                         return (EAFNOSUPPORT);
 564                 }
 565                 if (namelen != (socklen_t)sizeof (struct sockaddr_in)) {
 566                         eprintsoline(so, EINVAL);
 567                         return (EINVAL);
 568                 }
 569                 break;
 570         case AF_INET6: {
 571 #ifdef DEBUG
 572                 struct sockaddr_in6 *sin6;
 573 #endif /* DEBUG */
 574 
 575                 if (name->sa_family != family) {
 576                         eprintsoline(so, EAFNOSUPPORT);
 577                         return (EAFNOSUPPORT);
 578                 }
 579                 if (namelen != (socklen_t)sizeof (struct sockaddr_in6)) {
 580                         eprintsoline(so, EINVAL);
 581                         return (EINVAL);
 582                 }
 583 #ifdef DEBUG
 584                 /* Verify that apps don't forget to clear sin6_scope_id etc */
 585                 sin6 = (struct sockaddr_in6 *)name;
 586                 if (sin6->sin6_scope_id != 0 &&
 587                     !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
 588                         zcmn_err(getzoneid(), CE_WARN,
 589                             "connect/send* with uninitialized sin6_scope_id "
 590                             "(%d) on socket. Pid = %d\n",
 591                             (int)sin6->sin6_scope_id, (int)curproc->p_pid);
 592                 }
 593 #endif /* DEBUG */
 594                 break;
 595         }
 596         case AF_UNIX:
 597                 if (SOTOTPI(so)->sti_faddr_noxlate) {
 598                         return (0);
 599                 }
 600                 if (namelen < (socklen_t)sizeof (short)) {
 601                         eprintsoline(so, ENOENT);
 602                         return (ENOENT);
 603                 }
 604                 if (name->sa_family != family) {
 605                         eprintsoline(so, EAFNOSUPPORT);
 606                         return (EAFNOSUPPORT);
 607                 }
 608                 /* MAXPATHLEN + soun_family + nul termination */
 609                 if (namelen > (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
 610                         eprintsoline(so, ENAMETOOLONG);
 611                         return (ENAMETOOLONG);
 612                 }
 613 
 614                 break;
 615 
 616         default:
 617                 /*
 618                  * Default is don't do any length or sa_family check
 619                  * to allow non-sockaddr style addresses.
 620                  */
 621                 break;
 622         }
 623 
 624         return (0);
 625 }
 626 
 627 
 628 /*
 629  * Translate an AF_UNIX sockaddr_un to the transport internal name.
 630  * Assumes caller has called so_addr_verify first.
 631  */
 632 /*ARGSUSED*/
 633 int
 634 so_ux_addr_xlate(struct sonode *so, struct sockaddr *name,
 635     socklen_t namelen, int checkaccess,
 636     void **addrp, socklen_t *addrlenp)
 637 {
 638         int                     error;
 639         struct sockaddr_un      *soun;
 640         vnode_t                 *vp;
 641         void                    *addr;
 642         socklen_t               addrlen;
 643         sotpi_info_t            *sti = SOTOTPI(so);
 644 
 645         dprintso(so, 1, ("so_ux_addr_xlate(%p, %p, %d, %d)\n",
 646             (void *)so, (void *)name, namelen, checkaccess));
 647 
 648         ASSERT(name != NULL);
 649         ASSERT(so->so_family == AF_UNIX);
 650         ASSERT(!sti->sti_faddr_noxlate);
 651         ASSERT(namelen >= (socklen_t)sizeof (short));
 652         ASSERT(name->sa_family == AF_UNIX);
 653         soun = (struct sockaddr_un *)name;
 654         /*
 655          * Lookup vnode for the specified path name and verify that
 656          * it is a socket.
 657          */
 658         error = so_ux_lookup(so, soun, checkaccess, &vp);
 659         if (error) {
 660                 eprintsoline(so, error);
 661                 return (error);
 662         }
 663         /*
 664          * Use the address of the peer vnode as the address to send
 665          * to. We release the peer vnode here. In case it has been
 666          * closed by the time the T_CONN_REQ or T_UNITDATA_REQ reaches the
 667          * transport the message will get an error or be dropped.
 668          */
 669         sti->sti_ux_faddr.soua_vp = vp;
 670         sti->sti_ux_faddr.soua_magic = SOU_MAGIC_EXPLICIT;
 671         addr = &sti->sti_ux_faddr;
 672         addrlen = (socklen_t)sizeof (sti->sti_ux_faddr);
 673         dprintso(so, 1, ("ux_xlate UNIX: addrlen %d, vp %p\n",
 674             addrlen, (void *)vp));
 675         VN_RELE(vp);
 676         *addrp = addr;
 677         *addrlenp = (socklen_t)addrlen;
 678         return (0);
 679 }
 680 
 681 /*
 682  * Esballoc free function for messages that contain SO_FILEP option.
 683  * Decrement the reference count on the file pointers using closef.
 684  */
 685 void
 686 fdbuf_free(struct fdbuf *fdbuf)
 687 {
 688         int     i;
 689         struct file *fp;
 690 
 691         dprint(1, ("fdbuf_free: %d fds\n", fdbuf->fd_numfd));
 692         for (i = 0; i < fdbuf->fd_numfd; i++) {
 693                 /*
 694                  * We need pointer size alignment for fd_fds. On a LP64
 695                  * kernel, the required alignment is 8 bytes while
 696                  * the option headers and values are only 4 bytes
 697                  * aligned. So its safer to do a bcopy compared to
 698                  * assigning fdbuf->fd_fds[i] to fp.
 699                  */
 700                 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp));
 701                 dprint(1, ("fdbuf_free: [%d] = %p\n", i, (void *)fp));
 702                 (void) closef(fp);
 703         }
 704         if (fdbuf->fd_ebuf != NULL)
 705                 kmem_free(fdbuf->fd_ebuf, fdbuf->fd_ebuflen);
 706         kmem_free(fdbuf, fdbuf->fd_size);
 707 }
 708 
 709 /*
 710  * Allocate an esballoc'ed message for AF_UNIX file descriptor passing.
 711  * Waits if memory is not available.
 712  */
 713 mblk_t *
 714 fdbuf_allocmsg(int size, struct fdbuf *fdbuf)
 715 {
 716         uchar_t *buf;
 717         mblk_t  *mp;
 718 
 719         dprint(1, ("fdbuf_allocmsg: size %d, %d fds\n", size, fdbuf->fd_numfd));
 720         buf = kmem_alloc(size, KM_SLEEP);
 721         fdbuf->fd_ebuf = (caddr_t)buf;
 722         fdbuf->fd_ebuflen = size;
 723         fdbuf->fd_frtn.free_func = fdbuf_free;
 724         fdbuf->fd_frtn.free_arg = (caddr_t)fdbuf;
 725 
 726         mp = esballoc_wait(buf, size, BPRI_MED, &fdbuf->fd_frtn);
 727         mp->b_datap->db_type = M_PROTO;
 728         return (mp);
 729 }
 730 
 731 /*
 732  * Extract file descriptors from a fdbuf.
 733  * Return list in rights/rightslen.
 734  */
 735 /*ARGSUSED*/
 736 static int
 737 fdbuf_extract(struct fdbuf *fdbuf, void *rights, int rightslen)
 738 {
 739         int     i, fd;
 740         int     *rp;
 741         struct file *fp;
 742         int     numfd;
 743 
 744         dprint(1, ("fdbuf_extract: %d fds, len %d\n",
 745             fdbuf->fd_numfd, rightslen));
 746 
 747         numfd = fdbuf->fd_numfd;
 748         ASSERT(rightslen == numfd * (int)sizeof (int));
 749 
 750         /*
 751          * Allocate a file descriptor and increment the f_count.
 752          * The latter is needed since we always call fdbuf_free
 753          * which performs a closef.
 754          */
 755         rp = (int *)rights;
 756         for (i = 0; i < numfd; i++) {
 757                 if ((fd = ufalloc(0)) == -1)
 758                         goto cleanup;
 759                 /*
 760                  * We need pointer size alignment for fd_fds. On a LP64
 761                  * kernel, the required alignment is 8 bytes while
 762                  * the option headers and values are only 4 bytes
 763                  * aligned. So its safer to do a bcopy compared to
 764                  * assigning fdbuf->fd_fds[i] to fp.
 765                  */
 766                 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp));
 767                 mutex_enter(&fp->f_tlock);
 768                 fp->f_count++;
 769                 mutex_exit(&fp->f_tlock);
 770                 setf(fd, fp);
 771                 *rp++ = fd;
 772                 if (AU_AUDITING())
 773                         audit_fdrecv(fd, fp);
 774                 dprint(1, ("fdbuf_extract: [%d] = %d, %p refcnt %d\n",
 775                     i, fd, (void *)fp, fp->f_count));
 776         }
 777         return (0);
 778 
 779 cleanup:
 780         /*
 781          * Undo whatever partial work the loop above has done.
 782          */
 783         {
 784                 int j;
 785 
 786                 rp = (int *)rights;
 787                 for (j = 0; j < i; j++) {
 788                         dprint(0,
 789                             ("fdbuf_extract: cleanup[%d] = %d\n", j, *rp));
 790                         (void) closeandsetf(*rp++, NULL);
 791                 }
 792         }
 793 
 794         return (EMFILE);
 795 }
 796 
 797 /*
 798  * Insert file descriptors into an fdbuf.
 799  * Returns a kmem_alloc'ed fdbuf. The fdbuf should be freed
 800  * by calling fdbuf_free().
 801  */
 802 int
 803 fdbuf_create(void *rights, int rightslen, struct fdbuf **fdbufp)
 804 {
 805         int             numfd, i;
 806         int             *fds;
 807         struct file     *fp;
 808         struct fdbuf    *fdbuf;
 809         int             fdbufsize;
 810 
 811         dprint(1, ("fdbuf_create: len %d\n", rightslen));
 812 
 813         numfd = rightslen / (int)sizeof (int);
 814 
 815         fdbufsize = (int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *));
 816         fdbuf = kmem_alloc(fdbufsize, KM_SLEEP);
 817         fdbuf->fd_size = fdbufsize;
 818         fdbuf->fd_numfd = 0;
 819         fdbuf->fd_ebuf = NULL;
 820         fdbuf->fd_ebuflen = 0;
 821         fds = (int *)rights;
 822         for (i = 0; i < numfd; i++) {
 823                 if ((fp = getf(fds[i])) == NULL) {
 824                         fdbuf_free(fdbuf);
 825                         return (EBADF);
 826                 }
 827                 dprint(1, ("fdbuf_create: [%d] = %d, %p refcnt %d\n",
 828                     i, fds[i], (void *)fp, fp->f_count));
 829                 mutex_enter(&fp->f_tlock);
 830                 fp->f_count++;
 831                 mutex_exit(&fp->f_tlock);
 832                 /*
 833                  * The maximum alignment for fdbuf (or any option header
 834                  * and its value) it 4 bytes. On a LP64 kernel, the alignment
 835                  * is not sufficient for pointers (fd_fds in this case). Since
 836                  * we just did a kmem_alloc (we get a double word alignment),
 837                  * we don't need to do anything on the send side (we loose
 838                  * the double word alignment because fdbuf goes after an
 839                  * option header (eg T_unitdata_req) which is only 4 byte
 840                  * aligned). We take care of this when we extract the file
 841                  * descriptor in fdbuf_extract or fdbuf_free.
 842                  */
 843                 fdbuf->fd_fds[i] = fp;
 844                 fdbuf->fd_numfd++;
 845                 releasef(fds[i]);
 846                 if (AU_AUDITING())
 847                         audit_fdsend(fds[i], fp, 0);
 848         }
 849         *fdbufp = fdbuf;
 850         return (0);
 851 }
 852 
 853 static int
 854 fdbuf_optlen(int rightslen)
 855 {
 856         int numfd;
 857 
 858         numfd = rightslen / (int)sizeof (int);
 859 
 860         return ((int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *)));
 861 }
 862 
/*
 * Inverse of fdbuf_optlen(): given the size of a fdbuf, return the number
 * of bytes its descriptors take up as an array of int, i.e. the number of
 * file pointers following the header times sizeof (int).
 * NOTE(review): the subtraction is carried out in whatever type
 * FDBUF_HDRSIZE has; callers presumably only pass lengths that are at
 * least FDBUF_HDRSIZE - confirm.
 */
static t_uscalar_t
fdbuf_cmsglen(int fdbuflen)
{
        return (t_uscalar_t)((fdbuflen - FDBUF_HDRSIZE) /
            (int)sizeof (struct file *) * (int)sizeof (int));
}
 869 
 870 
 871 /*
 872  * Return non-zero if the mblk and fdbuf are consistent.
 873  */
 874 static int
 875 fdbuf_verify(mblk_t *mp, struct fdbuf *fdbuf, int fdbuflen)
 876 {
 877         if (fdbuflen >= FDBUF_HDRSIZE &&
 878             fdbuflen == fdbuf->fd_size) {
 879                 frtn_t *frp = mp->b_datap->db_frtnp;
 880                 /*
 881                  * Check that the SO_FILEP portion of the
 882                  * message has not been modified by
 883                  * the loopback transport. The sending sockfs generates
 884                  * a message that is esballoc'ed with the free function
 885                  * being fdbuf_free() and where free_arg contains the
 886                  * identical information as the SO_FILEP content.
 887                  *
 888                  * If any of these constraints are not satisfied we
 889                  * silently ignore the option.
 890                  */
 891                 ASSERT(mp);
 892                 if (frp != NULL &&
 893                     frp->free_func == fdbuf_free &&
 894                     frp->free_arg != NULL &&
 895                     bcmp(frp->free_arg, fdbuf, fdbuflen) == 0) {
 896                         dprint(1, ("fdbuf_verify: fdbuf %p len %d\n",
 897                             (void *)fdbuf, fdbuflen));
 898                         return (1);
 899                 } else {
 900                         zcmn_err(getzoneid(), CE_WARN,
 901                             "sockfs: mismatched fdbuf content (%p)",
 902                             (void *)mp);
 903                         return (0);
 904                 }
 905         } else {
 906                 zcmn_err(getzoneid(), CE_WARN,
 907                     "sockfs: mismatched fdbuf len %d, %d\n",
 908                     fdbuflen, fdbuf->fd_size);
 909                 return (0);
 910         }
 911 }
 912 
 913 /*
 914  * When the file descriptors returned by sorecvmsg can not be passed
 915  * to the application this routine will cleanup the references on
 916  * the files. Start at startoff bytes into the buffer.
 917  */
 918 static void
 919 close_fds(void *fdbuf, int fdbuflen, int startoff)
 920 {
 921         int *fds = (int *)fdbuf;
 922         int numfd = fdbuflen / (int)sizeof (int);
 923         int i;
 924 
 925         dprint(1, ("close_fds(%p, %d, %d)\n", fdbuf, fdbuflen, startoff));
 926 
 927         for (i = 0; i < numfd; i++) {
 928                 if (startoff < 0)
 929                         startoff = 0;
 930                 if (startoff < (int)sizeof (int)) {
 931                         /*
 932                          * This file descriptor is partially or fully after
 933                          * the offset
 934                          */
 935                         dprint(0,
 936                             ("close_fds: cleanup[%d] = %d\n", i, fds[i]));
 937                         (void) closeandsetf(fds[i], NULL);
 938                 }
 939                 startoff -= (int)sizeof (int);
 940         }
 941 }
 942 
 943 /*
 944  * Close all file descriptors contained in the control part starting at
 945  * the startoffset.
 946  */
 947 void
 948 so_closefds(void *control, t_uscalar_t controllen, int oldflg,
 949     int startoff)
 950 {
 951         struct cmsghdr *cmsg;
 952 
 953         if (control == NULL)
 954                 return;
 955 
 956         if (oldflg) {
 957                 close_fds(control, controllen, startoff);
 958                 return;
 959         }
 960         /* Scan control part for file descriptors. */
 961         for (cmsg = (struct cmsghdr *)control;
 962             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
 963             cmsg = CMSG_NEXT(cmsg)) {
 964                 if (cmsg->cmsg_level == SOL_SOCKET &&
 965                     cmsg->cmsg_type == SCM_RIGHTS) {
 966                         close_fds(CMSG_CONTENT(cmsg),
 967                             (int)CMSG_CONTENTLEN(cmsg),
 968                             startoff - (int)sizeof (struct cmsghdr));
 969                 }
 970                 startoff -= cmsg->cmsg_len;
 971         }
 972 }
 973 
 974 /*
 975  * Returns a pointer/length for the file descriptors contained
 976  * in the control buffer. Returns with *fdlenp == -1 if there are no
 977  * file descriptor options present. This is different than there being
 978  * a zero-length file descriptor option.
 979  * Fail if there are multiple SCM_RIGHT cmsgs.
 980  */
 981 int
 982 so_getfdopt(void *control, t_uscalar_t controllen, int oldflg,
 983     void **fdsp, int *fdlenp)
 984 {
 985         struct cmsghdr *cmsg;
 986         void *fds;
 987         int fdlen;
 988 
 989         if (control == NULL) {
 990                 *fdsp = NULL;
 991                 *fdlenp = -1;
 992                 return (0);
 993         }
 994 
 995         if (oldflg) {
 996                 *fdsp = control;
 997                 if (controllen == 0)
 998                         *fdlenp = -1;
 999                 else
1000                         *fdlenp = controllen;
1001                 dprint(1, ("so_getfdopt: old %d\n", *fdlenp));
1002                 return (0);
1003         }
1004 
1005         fds = NULL;
1006         fdlen = 0;
1007 
1008         for (cmsg = (struct cmsghdr *)control;
1009             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1010             cmsg = CMSG_NEXT(cmsg)) {
1011                 if (cmsg->cmsg_level == SOL_SOCKET &&
1012                     cmsg->cmsg_type == SCM_RIGHTS) {
1013                         if (fds != NULL)
1014                                 return (EINVAL);
1015                         fds = CMSG_CONTENT(cmsg);
1016                         fdlen = (int)CMSG_CONTENTLEN(cmsg);
1017                         dprint(1, ("so_getfdopt: new %lu\n",
1018                             (size_t)CMSG_CONTENTLEN(cmsg)));
1019                 }
1020         }
1021         if (fds == NULL) {
1022                 dprint(1, ("so_getfdopt: NONE\n"));
1023                 *fdlenp = -1;
1024         } else
1025                 *fdlenp = fdlen;
1026         *fdsp = fds;
1027         return (0);
1028 }
1029 
1030 /*
1031  * Return the length of the options including any file descriptor options.
1032  */
1033 t_uscalar_t
1034 so_optlen(void *control, t_uscalar_t controllen, int oldflg)
1035 {
1036         struct cmsghdr *cmsg;
1037         t_uscalar_t optlen = 0;
1038         t_uscalar_t len;
1039 
1040         if (control == NULL)
1041                 return (0);
1042 
1043         if (oldflg)
1044                 return ((t_uscalar_t)(sizeof (struct T_opthdr) +
1045                     fdbuf_optlen(controllen)));
1046 
1047         for (cmsg = (struct cmsghdr *)control;
1048             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1049             cmsg = CMSG_NEXT(cmsg)) {
1050                 if (cmsg->cmsg_level == SOL_SOCKET &&
1051                     cmsg->cmsg_type == SCM_RIGHTS) {
1052                         len = fdbuf_optlen((int)CMSG_CONTENTLEN(cmsg));
1053                 } else {
1054                         len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
1055                 }
1056                 optlen += (t_uscalar_t)(_TPI_ALIGN_TOPT(len) +
1057                     sizeof (struct T_opthdr));
1058         }
1059         dprint(1, ("so_optlen: controllen %d, flg %d -> optlen %d\n",
1060             controllen, oldflg, optlen));
1061         return (optlen);
1062 }
1063 
1064 /*
1065  * Copy options from control to the mblk. Skip any file descriptor options.
1066  */
1067 void
1068 so_cmsg2opt(void *control, t_uscalar_t controllen, int oldflg, mblk_t *mp)
1069 {
1070         struct T_opthdr toh;
1071         struct cmsghdr *cmsg;
1072 
1073         if (control == NULL)
1074                 return;
1075 
1076         if (oldflg) {
1077                 /* No real options - caller has handled file descriptors */
1078                 return;
1079         }
1080         for (cmsg = (struct cmsghdr *)control;
1081             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1082             cmsg = CMSG_NEXT(cmsg)) {
1083                 /*
1084                  * Note: The caller handles file descriptors prior
1085                  * to calling this function.
1086                  */
1087                 t_uscalar_t len;
1088 
1089                 if (cmsg->cmsg_level == SOL_SOCKET &&
1090                     cmsg->cmsg_type == SCM_RIGHTS)
1091                         continue;
1092 
1093                 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
1094                 toh.level = cmsg->cmsg_level;
1095                 toh.name = cmsg->cmsg_type;
1096                 toh.len = len + (t_uscalar_t)sizeof (struct T_opthdr);
1097                 toh.status = 0;
1098 
1099                 soappendmsg(mp, &toh, sizeof (toh));
1100                 soappendmsg(mp, CMSG_CONTENT(cmsg), len);
1101                 mp->b_wptr += _TPI_ALIGN_TOPT(len) - len;
1102                 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
1103         }
1104 }
1105 
1106 /*
1107  * Return the length of the control message derived from the options.
1108  * Exclude SO_SRCADDR and SO_UNIX_CLOSE options. Include SO_FILEP.
1109  * When oldflg is set only include SO_FILEP.
1110  * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen
1111  * allocates the space that so_opt2cmsg fills. If one changes, the other should
1112  * also be checked for any possible impacts.
1113  */
1114 t_uscalar_t
1115 so_cmsglen(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg)
1116 {
1117         t_uscalar_t cmsglen = 0;
1118         struct T_opthdr *tohp;
1119         t_uscalar_t len;
1120         t_uscalar_t last_roundup = 0;
1121 
1122         ASSERT(__TPI_TOPT_ISALIGNED(opt));
1123 
1124         for (tohp = (struct T_opthdr *)opt;
1125             tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1126             tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1127                 dprint(1, ("so_cmsglen: level 0x%x, name %d, len %d\n",
1128                     tohp->level, tohp->name, tohp->len));
1129                 if (tohp->level == SOL_SOCKET &&
1130                     (tohp->name == SO_SRCADDR ||
1131                     tohp->name == SO_UNIX_CLOSE)) {
1132                         continue;
1133                 }
1134                 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) {
1135                         struct fdbuf *fdbuf;
1136                         int fdbuflen;
1137 
1138                         fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp);
1139                         fdbuflen = (int)_TPI_TOPT_DATALEN(tohp);
1140 
1141                         if (!fdbuf_verify(mp, fdbuf, fdbuflen))
1142                                 continue;
1143                         if (oldflg) {
1144                                 cmsglen += fdbuf_cmsglen(fdbuflen);
1145                                 continue;
1146                         }
1147                         len = fdbuf_cmsglen(fdbuflen);
1148                 } else if (tohp->level == SOL_SOCKET &&
1149                     tohp->name == SCM_TIMESTAMP) {
1150                         if (oldflg)
1151                                 continue;
1152 
1153                         if (get_udatamodel() == DATAMODEL_NATIVE) {
1154                                 len = sizeof (struct timeval);
1155                         } else {
1156                                 len = sizeof (struct timeval32);
1157                         }
1158                 } else {
1159                         if (oldflg)
1160                                 continue;
1161                         len = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp);
1162                 }
1163                 /*
1164                  * Exclude roundup for last option to not set
1165                  * MSG_CTRUNC when the cmsg fits but the padding doesn't fit.
1166                  */
1167                 last_roundup = (t_uscalar_t)
1168                     (ROUNDUP_cmsglen(len + (int)sizeof (struct cmsghdr)) -
1169                     (len + (int)sizeof (struct cmsghdr)));
1170                 cmsglen += (t_uscalar_t)(len + (int)sizeof (struct cmsghdr)) +
1171                     last_roundup;
1172         }
1173         cmsglen -= last_roundup;
1174         dprint(1, ("so_cmsglen: optlen %d, flg %d -> cmsglen %d\n",
1175             optlen, oldflg, cmsglen));
1176         return (cmsglen);
1177 }
1178 
1179 /*
1180  * Copy options from options to the control. Convert SO_FILEP to
1181  * file descriptors.
1182  * Returns errno or zero.
1183  * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen
1184  * allocates the space that so_opt2cmsg fills. If one changes, the other should
1185  * also be checked for any possible impacts.
1186  */
1187 int
1188 so_opt2cmsg(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg,
1189     void *control, t_uscalar_t controllen)
1190 {
1191         struct T_opthdr *tohp;
1192         struct cmsghdr *cmsg;
1193         struct fdbuf *fdbuf;
1194         int fdbuflen;
1195         int error;
1196 #if defined(DEBUG) || defined(__lint)
1197         struct cmsghdr *cend = (struct cmsghdr *)
1198             (((uint8_t *)control) + ROUNDUP_cmsglen(controllen));
1199 #endif
1200         cmsg = (struct cmsghdr *)control;
1201 
1202         ASSERT(__TPI_TOPT_ISALIGNED(opt));
1203 
1204         for (tohp = (struct T_opthdr *)opt;
1205             tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1206             tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1207                 dprint(1, ("so_opt2cmsg: level 0x%x, name %d, len %d\n",
1208                     tohp->level, tohp->name, tohp->len));
1209 
1210                 if (tohp->level == SOL_SOCKET &&
1211                     (tohp->name == SO_SRCADDR ||
1212                     tohp->name == SO_UNIX_CLOSE)) {
1213                         continue;
1214                 }
1215                 ASSERT((uintptr_t)cmsg <= (uintptr_t)control + controllen);
1216                 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) {
1217                         fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp);
1218                         fdbuflen = (int)_TPI_TOPT_DATALEN(tohp);
1219 
1220                         if (!fdbuf_verify(mp, fdbuf, fdbuflen))
1221                                 return (EPROTO);
1222                         if (oldflg) {
1223                                 error = fdbuf_extract(fdbuf, control,
1224                                     (int)controllen);
1225                                 if (error != 0)
1226                                         return (error);
1227                                 continue;
1228                         } else {
1229                                 int fdlen;
1230 
1231                                 fdlen = (int)fdbuf_cmsglen(
1232                                     (int)_TPI_TOPT_DATALEN(tohp));
1233 
1234                                 cmsg->cmsg_level = tohp->level;
1235                                 cmsg->cmsg_type = SCM_RIGHTS;
1236                                 cmsg->cmsg_len = (socklen_t)(fdlen +
1237                                     sizeof (struct cmsghdr));
1238 
1239                                 error = fdbuf_extract(fdbuf,
1240                                     CMSG_CONTENT(cmsg), fdlen);
1241                                 if (error != 0)
1242                                         return (error);
1243                         }
1244                 } else if (tohp->level == SOL_SOCKET &&
1245                     tohp->name == SCM_TIMESTAMP) {
1246                         timestruc_t *timestamp;
1247 
1248                         if (oldflg)
1249                                 continue;
1250 
1251                         cmsg->cmsg_level = tohp->level;
1252                         cmsg->cmsg_type = tohp->name;
1253 
1254                         timestamp =
1255                             (timestruc_t *)P2ROUNDUP((intptr_t)&tohp[1],
1256                             sizeof (intptr_t));
1257 
1258                         if (get_udatamodel() == DATAMODEL_NATIVE) {
1259                                 struct timeval tv;
1260 
1261                                 cmsg->cmsg_len = sizeof (struct timeval) +
1262                                     sizeof (struct cmsghdr);
1263                                 tv.tv_sec = timestamp->tv_sec;
1264                                 tv.tv_usec = timestamp->tv_nsec /
1265                                     (NANOSEC / MICROSEC);
1266                                 /*
1267                                  * on LP64 systems, the struct timeval in
1268                                  * the destination will not be 8-byte aligned,
1269                                  * so use bcopy to avoid alignment trouble
1270                                  */
1271                                 bcopy(&tv, CMSG_CONTENT(cmsg), sizeof (tv));
1272                         } else {
1273                                 struct timeval32 *time32;
1274 
1275                                 cmsg->cmsg_len = sizeof (struct timeval32) +
1276                                     sizeof (struct cmsghdr);
1277                                 time32 = (struct timeval32 *)CMSG_CONTENT(cmsg);
1278                                 time32->tv_sec = (time32_t)timestamp->tv_sec;
1279                                 time32->tv_usec =
1280                                     (int32_t)(timestamp->tv_nsec /
1281                                     (NANOSEC / MICROSEC));
1282                         }
1283 
1284                 } else {
1285                         if (oldflg)
1286                                 continue;
1287 
1288                         cmsg->cmsg_level = tohp->level;
1289                         cmsg->cmsg_type = tohp->name;
1290                         cmsg->cmsg_len = (socklen_t)(_TPI_TOPT_DATALEN(tohp) +
1291                             sizeof (struct cmsghdr));
1292 
1293                         /* copy content to control data part */
1294                         bcopy(&tohp[1], CMSG_CONTENT(cmsg),
1295                             CMSG_CONTENTLEN(cmsg));
1296                 }
1297                 /* move to next CMSG structure! */
1298                 cmsg = CMSG_NEXT(cmsg);
1299         }
1300         dprint(1, ("so_opt2cmsg: buf %p len %d; cend %p; final cmsg %p\n",
1301             control, controllen, (void *)cend, (void *)cmsg));
1302         ASSERT(cmsg <= cend);
1303         return (0);
1304 }
1305 
1306 /*
1307  * Extract the SO_SRCADDR option value if present.
1308  */
1309 void
1310 so_getopt_srcaddr(void *opt, t_uscalar_t optlen, void **srcp,
1311     t_uscalar_t *srclenp)
1312 {
1313         struct T_opthdr         *tohp;
1314 
1315         ASSERT(__TPI_TOPT_ISALIGNED(opt));
1316 
1317         ASSERT(srcp != NULL && srclenp != NULL);
1318         *srcp = NULL;
1319         *srclenp = 0;
1320 
1321         for (tohp = (struct T_opthdr *)opt;
1322             tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1323             tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1324                 dprint(1, ("so_getopt_srcaddr: level 0x%x, name %d, len %d\n",
1325                     tohp->level, tohp->name, tohp->len));
1326                 if (tohp->level == SOL_SOCKET &&
1327                     tohp->name == SO_SRCADDR) {
1328                         *srcp = _TPI_TOPT_DATA(tohp);
1329                         *srclenp = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp);
1330                 }
1331         }
1332 }
1333 
1334 /*
1335  * Verify if the SO_UNIX_CLOSE option is present.
1336  */
1337 int
1338 so_getopt_unix_close(void *opt, t_uscalar_t optlen)
1339 {
1340         struct T_opthdr         *tohp;
1341 
1342         ASSERT(__TPI_TOPT_ISALIGNED(opt));
1343 
1344         for (tohp = (struct T_opthdr *)opt;
1345             tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1346             tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1347                 dprint(1,
1348                     ("so_getopt_unix_close: level 0x%x, name %d, len %d\n",
1349                     tohp->level, tohp->name, tohp->len));
1350                 if (tohp->level == SOL_SOCKET &&
1351                     tohp->name == SO_UNIX_CLOSE)
1352                         return (1);
1353         }
1354         return (0);
1355 }
1356 
1357 /*
1358  * Allocate an M_PROTO message.
1359  *
1360  * If allocation fails the behavior depends on sleepflg:
1361  *      _ALLOC_NOSLEEP  fail immediately
1362  *      _ALLOC_INTR     sleep for memory until a signal is caught
1363  *      _ALLOC_SLEEP    sleep forever. Don't return NULL.
1364  */
1365 mblk_t *
1366 soallocproto(size_t size, int sleepflg, cred_t *cr)
1367 {
1368         mblk_t  *mp;
1369 
1370         /* Round up size for reuse */
1371         size = MAX(size, 64);
1372         if (cr != NULL)
1373                 mp = allocb_cred(size, cr, curproc->p_pid);
1374         else
1375                 mp = allocb(size, BPRI_MED);
1376 
1377         if (mp == NULL) {
1378                 int error;      /* Dummy - error not returned to caller */
1379 
1380                 switch (sleepflg) {
1381                 case _ALLOC_SLEEP:
1382                         if (cr != NULL) {
1383                                 mp = allocb_cred_wait(size, STR_NOSIG, &error,
1384                                     cr, curproc->p_pid);
1385                         } else {
1386                                 mp = allocb_wait(size, BPRI_MED, STR_NOSIG,
1387                                     &error);
1388                         }
1389                         ASSERT(mp);
1390                         break;
1391                 case _ALLOC_INTR:
1392                         if (cr != NULL) {
1393                                 mp = allocb_cred_wait(size, 0, &error, cr,
1394                                     curproc->p_pid);
1395                         } else {
1396                                 mp = allocb_wait(size, BPRI_MED, 0, &error);
1397                         }
1398                         if (mp == NULL) {
1399                                 /* Caught signal while sleeping for memory */
1400                                 eprintline(ENOBUFS);
1401                                 return (NULL);
1402                         }
1403                         break;
1404                 case _ALLOC_NOSLEEP:
1405                 default:
1406                         eprintline(ENOBUFS);
1407                         return (NULL);
1408                 }
1409         }
1410         DB_TYPE(mp) = M_PROTO;
1411         return (mp);
1412 }
1413 
1414 /*
1415  * Allocate an M_PROTO message with a single component.
1416  * len is the length of buf. size is the amount to allocate.
1417  *
1418  * buf can be NULL with a non-zero len.
1419  * This results in a bzero'ed chunk being placed the message.
1420  */
1421 mblk_t *
1422 soallocproto1(const void *buf, ssize_t len, ssize_t size, int sleepflg,
1423     cred_t *cr)
1424 {
1425         mblk_t  *mp;
1426 
1427         if (size == 0)
1428                 size = len;
1429 
1430         ASSERT(size >= len);
1431         /* Round up size for reuse */
1432         size = MAX(size, 64);
1433         mp = soallocproto(size, sleepflg, cr);
1434         if (mp == NULL)
1435                 return (NULL);
1436         mp->b_datap->db_type = M_PROTO;
1437         if (len != 0) {
1438                 if (buf != NULL)
1439                         bcopy(buf, mp->b_wptr, len);
1440                 else
1441                         bzero(mp->b_wptr, len);
1442                 mp->b_wptr += len;
1443         }
1444         return (mp);
1445 }
1446 
1447 /*
1448  * Append buf/len to mp.
1449  * The caller has to ensure that there is enough room in the mblk.
1450  *
1451  * buf can be NULL with a non-zero len.
1452  * This results in a bzero'ed chunk being placed the message.
1453  */
1454 void
1455 soappendmsg(mblk_t *mp, const void *buf, ssize_t len)
1456 {
1457         ASSERT(mp);
1458 
1459         if (len != 0) {
1460                 /* Assert for room left */
1461                 ASSERT(mp->b_datap->db_lim - mp->b_wptr >= len);
1462                 if (buf != NULL)
1463                         bcopy(buf, mp->b_wptr, len);
1464                 else
1465                         bzero(mp->b_wptr, len);
1466         }
1467         mp->b_wptr += len;
1468 }
1469 
1470 /*
1471  * Create a message using two kernel buffers.
1472  * If size is set that will determine the allocation size (e.g. for future
1473  * soappendmsg calls). If size is zero it is derived from the buffer
1474  * lengths.
1475  */
1476 mblk_t *
1477 soallocproto2(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2,
1478     ssize_t size, int sleepflg, cred_t *cr)
1479 {
1480         mblk_t *mp;
1481 
1482         if (size == 0)
1483                 size = len1 + len2;
1484         ASSERT(size >= len1 + len2);
1485 
1486         mp = soallocproto1(buf1, len1, size, sleepflg, cr);
1487         if (mp)
1488                 soappendmsg(mp, buf2, len2);
1489         return (mp);
1490 }
1491 
1492 /*
1493  * Create a message using three kernel buffers.
1494  * If size is set that will determine the allocation size (for future
1495  * soappendmsg calls). If size is zero it is derived from the buffer
1496  * lengths.
1497  */
1498 mblk_t *
1499 soallocproto3(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2,
1500     const void *buf3, ssize_t len3, ssize_t size, int sleepflg, cred_t *cr)
1501 {
1502         mblk_t *mp;
1503 
1504         if (size == 0)
1505                 size = len1 + len2 +len3;
1506         ASSERT(size >= len1 + len2 + len3);
1507 
1508         mp = soallocproto1(buf1, len1, size, sleepflg, cr);
1509         if (mp != NULL) {
1510                 soappendmsg(mp, buf2, len2);
1511                 soappendmsg(mp, buf3, len3);
1512         }
1513         return (mp);
1514 }
1515 
1516 #ifdef DEBUG
1517 char *
1518 pr_state(uint_t state, uint_t mode)
1519 {
1520         static char buf[1024];
1521 
1522         buf[0] = 0;
1523         if (state & SS_ISCONNECTED)
1524                 (void) strcat(buf, "ISCONNECTED ");
1525         if (state & SS_ISCONNECTING)
1526                 (void) strcat(buf, "ISCONNECTING ");
1527         if (state & SS_ISDISCONNECTING)
1528                 (void) strcat(buf, "ISDISCONNECTING ");
1529         if (state & SS_CANTSENDMORE)
1530                 (void) strcat(buf, "CANTSENDMORE ");
1531 
1532         if (state & SS_CANTRCVMORE)
1533                 (void) strcat(buf, "CANTRCVMORE ");
1534         if (state & SS_ISBOUND)
1535                 (void) strcat(buf, "ISBOUND ");
1536         if (state & SS_NDELAY)
1537                 (void) strcat(buf, "NDELAY ");
1538         if (state & SS_NONBLOCK)
1539                 (void) strcat(buf, "NONBLOCK ");
1540 
1541         if (state & SS_ASYNC)
1542                 (void) strcat(buf, "ASYNC ");
1543         if (state & SS_ACCEPTCONN)
1544                 (void) strcat(buf, "ACCEPTCONN ");
1545         if (state & SS_SAVEDEOR)
1546                 (void) strcat(buf, "SAVEDEOR ");
1547 
1548         if (state & SS_RCVATMARK)
1549                 (void) strcat(buf, "RCVATMARK ");
1550         if (state & SS_OOBPEND)
1551                 (void) strcat(buf, "OOBPEND ");
1552         if (state & SS_HAVEOOBDATA)
1553                 (void) strcat(buf, "HAVEOOBDATA ");
1554         if (state & SS_HADOOBDATA)
1555                 (void) strcat(buf, "HADOOBDATA ");
1556 
1557         if (mode & SM_PRIV)
1558                 (void) strcat(buf, "PRIV ");
1559         if (mode & SM_ATOMIC)
1560                 (void) strcat(buf, "ATOMIC ");
1561         if (mode & SM_ADDR)
1562                 (void) strcat(buf, "ADDR ");
1563         if (mode & SM_CONNREQUIRED)
1564                 (void) strcat(buf, "CONNREQUIRED ");
1565 
1566         if (mode & SM_FDPASSING)
1567                 (void) strcat(buf, "FDPASSING ");
1568         if (mode & SM_EXDATA)
1569                 (void) strcat(buf, "EXDATA ");
1570         if (mode & SM_OPTDATA)
1571                 (void) strcat(buf, "OPTDATA ");
1572         if (mode & SM_BYTESTREAM)
1573                 (void) strcat(buf, "BYTESTREAM ");
1574         return (buf);
1575 }
1576 
1577 char *
1578 pr_addr(int family, struct sockaddr *addr, t_uscalar_t addrlen)
1579 {
1580         static char buf[1024];
1581 
1582         if (addr == NULL || addrlen == 0) {
1583                 (void) sprintf(buf, "(len %d) %p", addrlen, (void *)addr);
1584                 return (buf);
1585         }
1586         switch (family) {
1587         case AF_INET: {
1588                 struct sockaddr_in sin;
1589 
1590                 bcopy(addr, &sin, sizeof (sin));
1591 
1592                 (void) sprintf(buf, "(len %d) %x/%d",
1593                     addrlen, ntohl(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1594                 break;
1595         }
1596         case AF_INET6: {
1597                 struct sockaddr_in6 sin6;
1598                 uint16_t *piece = (uint16_t *)&sin6.sin6_addr;
1599 
1600                 bcopy((char *)addr, (char *)&sin6, sizeof (sin6));
1601                 (void) sprintf(buf, "(len %d) %x:%x:%x:%x:%x:%x:%x:%x/%d",
1602                     addrlen,
1603                     ntohs(piece[0]), ntohs(piece[1]),
1604                     ntohs(piece[2]), ntohs(piece[3]),
1605                     ntohs(piece[4]), ntohs(piece[5]),
1606                     ntohs(piece[6]), ntohs(piece[7]),
1607                     ntohs(sin6.sin6_port));
1608                 break;
1609         }
1610         case AF_UNIX: {
1611                 struct sockaddr_un *soun = (struct sockaddr_un *)addr;
1612 
1613                 (void) sprintf(buf, "(len %d) %s", addrlen,
1614                     (soun == NULL) ? "(none)" : soun->sun_path);
1615                 break;
1616         }
1617         default:
1618                 (void) sprintf(buf, "(unknown af %d)", family);
1619                 break;
1620         }
1621         return (buf);
1622 }
1623 
1624 /* The logical equivalence operator (a if-and-only-if b) */
1625 #define EQUIVALENT(a, b)        (((a) && (b)) || (!(a) && (!(b))))
1626 
1627 /*
1628  * Verify limitations and invariants on oob state.
1629  * Return 1 if OK, otherwise 0 so that it can be used as
1630  *      ASSERT(verify_oobstate(so));
1631  */
1632 int
1633 so_verify_oobstate(struct sonode *so)
1634 {
1635         boolean_t havemark;
1636 
1637         ASSERT(MUTEX_HELD(&so->so_lock));
1638 
1639         /*
1640          * The possible state combinations are:
1641          *      0
1642          *      SS_OOBPEND
1643          *      SS_OOBPEND|SS_HAVEOOBDATA
1644          *      SS_OOBPEND|SS_HADOOBDATA
1645          *      SS_HADOOBDATA
1646          */
1647         switch (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA)) {
1648         case 0:
1649         case SS_OOBPEND:
1650         case SS_OOBPEND|SS_HAVEOOBDATA:
1651         case SS_OOBPEND|SS_HADOOBDATA:
1652         case SS_HADOOBDATA:
1653                 break;
1654         default:
1655                 printf("Bad oob state 1 (%p): state %s\n",
1656                     (void *)so, pr_state(so->so_state, so->so_mode));
1657                 return (0);
1658         }
1659 
1660         /* SS_RCVATMARK should only be set when SS_OOBPEND is set */
1661         if ((so->so_state & (SS_RCVATMARK|SS_OOBPEND)) == SS_RCVATMARK) {
1662                 printf("Bad oob state 2 (%p): state %s\n",
1663                     (void *)so, pr_state(so->so_state, so->so_mode));
1664                 return (0);
1665         }
1666 
1667         /*
1668          * (havemark != 0 or SS_RCVATMARK) iff SS_OOBPEND
1669          * For TPI, the presence of a "mark" is indicated by sti_oobsigcnt.
1670          */
1671         havemark = (SOCK_IS_NONSTR(so)) ? so->so_oobmark > 0 :
1672             SOTOTPI(so)->sti_oobsigcnt > 0;
1673 
1674         if (!EQUIVALENT(havemark || (so->so_state & SS_RCVATMARK),
1675             so->so_state & SS_OOBPEND)) {
1676                 printf("Bad oob state 3 (%p): state %s\n",
1677                     (void *)so, pr_state(so->so_state, so->so_mode));
1678                 return (0);
1679         }
1680 
1681         /*
1682          * Unless SO_OOBINLINE we have so_oobmsg != NULL iff SS_HAVEOOBDATA
1683          */
1684         if (!(so->so_options & SO_OOBINLINE) &&
1685             !EQUIVALENT(so->so_oobmsg != NULL, so->so_state & SS_HAVEOOBDATA)) {
1686                 printf("Bad oob state 4 (%p): state %s\n",
1687                     (void *)so, pr_state(so->so_state, so->so_mode));
1688                 return (0);
1689         }
1690 
1691         if (!SOCK_IS_NONSTR(so) &&
1692             SOTOTPI(so)->sti_oobsigcnt < SOTOTPI(so)->sti_oobcnt) {
1693                 printf("Bad oob state 5 (%p): counts %d/%d state %s\n",
1694                     (void *)so, SOTOTPI(so)->sti_oobsigcnt,
1695                     SOTOTPI(so)->sti_oobcnt,
1696                     pr_state(so->so_state, so->so_mode));
1697                 return (0);
1698         }
1699 
1700         return (1);
1701 }
1702 #undef  EQUIVALENT
1703 #endif /* DEBUG */
1704 
1705 /* initialize sockfs zone specific kstat related items                  */
1706 void *
1707 sock_kstat_init(zoneid_t zoneid)
1708 {
1709         kstat_t *ksp;
1710 
1711         ksp = kstat_create_zone("sockfs", 0, "sock_unix_list", "misc",
1712             KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE|KSTAT_FLAG_VIRTUAL, zoneid);
1713 
1714         if (ksp != NULL) {
1715                 ksp->ks_update = sockfs_update;
1716                 ksp->ks_snapshot = sockfs_snapshot;
1717                 ksp->ks_lock = &socklist.sl_lock;
1718                 ksp->ks_private = (void *)(uintptr_t)zoneid;
1719                 kstat_install(ksp);
1720         }
1721 
1722         return (ksp);
1723 }
1724 
1725 /* tear down sockfs zone specific kstat related items                   */
1726 /*ARGSUSED*/
1727 void
1728 sock_kstat_fini(zoneid_t zoneid, void *arg)
1729 {
1730         kstat_t *ksp = (kstat_t *)arg;
1731 
1732         if (ksp != NULL) {
1733                 ASSERT(zoneid == (zoneid_t)(uintptr_t)ksp->ks_private);
1734                 kstat_delete(ksp);
1735         }
1736 }
1737 
1738 /*
1739  * Zones:
1740  * Note that nactive is going to be different for each zone.
1741  * This means we require kstat to call sockfs_update and then sockfs_snapshot
1742  * for the same zone, or sockfs_snapshot will be taken into the wrong size
1743  * buffer. This is safe, but if the buffer is too small, user will not be
1744  * given details of all sockets. However, as this kstat has a ks_lock, kstat
1745  * driver will keep it locked between the update and the snapshot, so no
1746  * other process (zone) can currently get inbetween resulting in a wrong size
1747  * buffer allocation.
1748  */
1749 static int
1750 sockfs_update(kstat_t *ksp, int rw)
1751 {
1752         uint_t  nactive = 0;            /* # of active AF_UNIX sockets  */
1753         struct sonode   *so;            /* current sonode on socklist   */
1754         zoneid_t        myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private;
1755 
1756         ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid());
1757 
1758         if (rw == KSTAT_WRITE) {        /* bounce all writes            */
1759                 return (EACCES);
1760         }
1761 
1762         for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) {
1763                 if (so->so_count != 0 && so->so_zoneid == myzoneid) {
1764                         nactive++;
1765                 }
1766         }
1767         ksp->ks_ndata = nactive;
1768         ksp->ks_data_size = nactive * sizeof (struct k_sockinfo);
1769 
1770         return (0);
1771 }
1772 
1773 static int
1774 sockfs_snapshot(kstat_t *ksp, void *buf, int rw)
1775 {
1776         int                     ns;     /* # of sonodes we've copied    */
1777         struct sonode           *so;    /* current sonode on socklist   */
1778         struct k_sockinfo       *pksi;  /* where we put sockinfo data   */
1779         t_uscalar_t             sn_len; /* soa_len                      */
1780         zoneid_t                myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private;
1781         sotpi_info_t            *sti;
1782 
1783         ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid());
1784 
1785         ksp->ks_snaptime = gethrtime();
1786 
1787         if (rw == KSTAT_WRITE) {        /* bounce all writes            */
1788                 return (EACCES);
1789         }
1790 
1791         /*
1792          * for each sonode on the socklist, we massage the important
1793          * info into buf, in k_sockinfo format.
1794          */
1795         pksi = (struct k_sockinfo *)buf;
1796         ns = 0;
1797         for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) {
1798                 /* only stuff active sonodes and the same zone:         */
1799                 if (so->so_count == 0 || so->so_zoneid != myzoneid) {
1800                         continue;
1801                 }
1802 
1803                 /*
1804                  * If the sonode was activated between the update and the
1805                  * snapshot, we're done - as this is only a snapshot.
1806                  */
1807                 if ((caddr_t)(pksi) >= (caddr_t)buf + ksp->ks_data_size) {
1808                         break;
1809                 }
1810 
1811                 sti = SOTOTPI(so);
1812                 /* copy important info into buf:                        */
1813                 pksi->ks_si.si_size = sizeof (struct k_sockinfo);
1814                 pksi->ks_si.si_family = so->so_family;
1815                 pksi->ks_si.si_type = so->so_type;
1816                 pksi->ks_si.si_flag = so->so_flag;
1817                 pksi->ks_si.si_state = so->so_state;
1818                 pksi->ks_si.si_serv_type = sti->sti_serv_type;
1819                 pksi->ks_si.si_ux_laddr_sou_magic =
1820                     sti->sti_ux_laddr.soua_magic;
1821                 pksi->ks_si.si_ux_faddr_sou_magic =
1822                     sti->sti_ux_faddr.soua_magic;
1823                 pksi->ks_si.si_laddr_soa_len = sti->sti_laddr.soa_len;
1824                 pksi->ks_si.si_faddr_soa_len = sti->sti_faddr.soa_len;
1825                 pksi->ks_si.si_szoneid = so->so_zoneid;
1826                 pksi->ks_si.si_faddr_noxlate = sti->sti_faddr_noxlate;
1827 
1828                 mutex_enter(&so->so_lock);
1829 
1830                 if (sti->sti_laddr_sa != NULL) {
1831                         ASSERT(sti->sti_laddr_sa->sa_data != NULL);
1832                         sn_len = sti->sti_laddr_len;
1833                         ASSERT(sn_len <= sizeof (short) +
1834                             sizeof (pksi->ks_si.si_laddr_sun_path));
1835 
1836                         pksi->ks_si.si_laddr_family =
1837                             sti->sti_laddr_sa->sa_family;
1838                         if (sn_len != 0) {
1839                                 /* AF_UNIX socket names are NULL terminated */
1840                                 (void) strncpy(pksi->ks_si.si_laddr_sun_path,
1841                                     sti->sti_laddr_sa->sa_data,
1842                                     sizeof (pksi->ks_si.si_laddr_sun_path));
1843                                 sn_len = strlen(pksi->ks_si.si_laddr_sun_path);
1844                         }
1845                         pksi->ks_si.si_laddr_sun_path[sn_len] = 0;
1846                 }
1847 
1848                 if (sti->sti_faddr_sa != NULL) {
1849                         ASSERT(sti->sti_faddr_sa->sa_data != NULL);
1850                         sn_len = sti->sti_faddr_len;
1851                         ASSERT(sn_len <= sizeof (short) +
1852                             sizeof (pksi->ks_si.si_faddr_sun_path));
1853 
1854                         pksi->ks_si.si_faddr_family =
1855                             sti->sti_faddr_sa->sa_family;
1856                         if (sn_len != 0) {
1857                                 (void) strncpy(pksi->ks_si.si_faddr_sun_path,
1858                                     sti->sti_faddr_sa->sa_data,
1859                                     sizeof (pksi->ks_si.si_faddr_sun_path));
1860                                 sn_len = strlen(pksi->ks_si.si_faddr_sun_path);
1861                         }
1862                         pksi->ks_si.si_faddr_sun_path[sn_len] = 0;
1863                 }
1864 
1865                 mutex_exit(&so->so_lock);
1866 
1867                 (void) sprintf(pksi->ks_straddr[0], "%p", (void *)so);
1868                 (void) sprintf(pksi->ks_straddr[1], "%p",
1869                     (void *)sti->sti_ux_laddr.soua_vp);
1870                 (void) sprintf(pksi->ks_straddr[2], "%p",
1871                     (void *)sti->sti_ux_faddr.soua_vp);
1872 
1873                 ns++;
1874                 pksi++;
1875         }
1876 
1877         ksp->ks_ndata = ns;
1878         return (0);
1879 }
1880 
1881 ssize_t
1882 soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size)
1883 {
1884         struct uio auio;
1885         struct iovec aiov[1];
1886         register vnode_t *vp;
1887         int ioflag, rwflag;
1888         ssize_t cnt;
1889         int error = 0;
1890         int iovcnt = 0;
1891         short fflag;
1892 
1893         vp = fp->f_vnode;
1894         fflag = fp->f_flag;
1895 
1896         rwflag = 0;
1897         aiov[0].iov_base = (caddr_t)buf;
1898         aiov[0].iov_len = size;
1899         iovcnt = 1;
1900         cnt = (ssize_t)size;
1901         (void) VOP_RWLOCK(vp, rwflag, NULL);
1902 
1903         auio.uio_loffset = fileoff;
1904         auio.uio_iov = aiov;
1905         auio.uio_iovcnt = iovcnt;
1906         auio.uio_resid = cnt;
1907         auio.uio_segflg = UIO_SYSSPACE;
1908         auio.uio_llimit = MAXOFFSET_T;
1909         auio.uio_fmode = fflag;
1910         auio.uio_extflg = UIO_COPY_CACHED;
1911 
1912         ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1913 
1914         /* If read sync is not asked for, filter sync flags */
1915         if ((ioflag & FRSYNC) == 0)
1916                 ioflag &= ~(FSYNC|FDSYNC);
1917         error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1918         cnt -= auio.uio_resid;
1919 
1920         VOP_RWUNLOCK(vp, rwflag, NULL);
1921 
1922         if (error == EINTR && cnt != 0)
1923                 error = 0;
1924 out:
1925         if (error != 0) {
1926                 *err = error;
1927                 return (0);
1928         } else {
1929                 *err = 0;
1930                 return (cnt);
1931         }
1932 }
1933 
/*
 * Copy size bytes from "from" to "to".
 *
 * When fromkernel is non-zero the source is copied with bcopy() and 0 is
 * returned.  Otherwise the copy is done with xcopyin() and its return
 * value is passed back to the caller.
 */
int
so_copyin(const void *from, void *to, size_t size, int fromkernel)
{
        if (!fromkernel)
                return (xcopyin(from, to, size));

        bcopy(from, to, size);
        return (0);
}
1943 
/*
 * Copy size bytes from "from" to "to".
 *
 * When tokernel is non-zero the destination is written with bcopy() and 0
 * is returned.  Otherwise the copy is done with xcopyout() and its return
 * value is passed back to the caller.
 */
int
so_copyout(const void *from, void *to, size_t size, int tokernel)
{
        if (!tokernel)
                return (xcopyout(from, to, size));

        bcopy(from, to, size);
        return (0);
}