4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2013, Joyent, Inc. All rights reserved.
25 */
26
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/sysmacros.h>
33 #include <sys/signal.h>
34 #include <sys/cred.h>
35 #include <sys/policy.h>
36 #include <sys/user.h>
37 #include <sys/systm.h>
38 #include <sys/cpuvar.h>
39 #include <sys/vfs.h>
40 #include <sys/vnode.h>
41 #include <sys/file.h>
42 #include <sys/errno.h>
43 #include <sys/time.h>
44 #include <sys/proc.h>
67 #include <sys/resource.h>
68 #include <sys/cyclic.h>
69 #include <sys/lgrp.h>
70 #include <sys/rctl.h>
71 #include <sys/contract_impl.h>
72 #include <sys/contract/process_impl.h>
73 #include <sys/list.h>
74 #include <sys/dtrace.h>
75 #include <sys/pool.h>
76 #include <sys/zone.h>
77 #include <sys/sdt.h>
78 #include <sys/class.h>
79 #include <sys/corectl.h>
80 #include <sys/brand.h>
81 #include <sys/fork.h>
82
83 static int64_t cfork(int, int, int);
84 static int getproc(proc_t **, pid_t, uint_t);
85 #define GETPROC_USER 0x0
86 #define GETPROC_KERNEL 0x1
87
88 static void fork_fail(proc_t *);
89 static void forklwp_fail(proc_t *);
90
91 int fork_fail_pending;
92
93 extern struct kmem_cache *process_cache;
94
95 /*
96 * The vfork() system call trap is no longer invoked by libc.
97 * It is retained only for the benefit of applications running
98 * within a solaris10 branded zone. It should be eliminated
99 * when we no longer support solaris10 branded zones.
100 */
101 int64_t
102 vfork(void)
103 {
104 curthread->t_post_sys = 1; /* so vfwait() will be called */
105 return (cfork(1, 1, 0));
106 }
108 /*
109 * forksys system call - forkx, forkallx, vforkx. This is the
110 * interface invoked by libc for fork1(), forkall(), and vfork()
111 */
112 int64_t
113 forksys(int subcode, int flags)
114 {
115 switch (subcode) {
116 case 0:
117 return (cfork(0, 1, flags)); /* forkx(flags) */
118 case 1:
119 return (cfork(0, 0, flags)); /* forkallx(flags) */
120 case 2:
121 curthread->t_post_sys = 1; /* so vfwait() will be called */
122 return (cfork(1, 1, flags)); /* vforkx(flags) */
123 default:
124 return ((int64_t)set_errno(EINVAL));
125 }
126 }
127
/* ARGSUSED */
/*
 * Common fork implementation backing vfork(), forkx(), forkallx() and
 * vforkx().  isvfork selects vfork semantics (shared address space);
 * isfork1 selects fork1-style (duplicate only the calling lwp).
 * flags accepts only FORK_NOSIGCHLD | FORK_WAITPID.
 *
 * NOTE(review): large interior spans of this function are elided from
 * this excerpt; only the flag check and two failure paths are visible.
 */
static int64_t
cfork(int isvfork, int isfork1, int flags)
{
    proc_t *p = ttoproc(curthread);
    struct as *as;
    proc_t *cp, **orphpp;
    klwp_t *clone;
    kthread_t *t;
    task_t *tk;
    rval_t r;
    int error;
    int i;
    rctl_set_t *dup_set;
    rctl_alloc_gp_t *dup_gp;
    rctl_entity_p_t e;
    lwpdir_t *ldp;
    lwpent_t *lep;
    lwpent_t *clep;

    /*
     * Allow only these two flags.
     */
    if ((flags & ~(FORK_NOSIGCHLD | FORK_WAITPID)) != 0) {
        error = EINVAL;
        /* Account the miscellaneous fork failure against the zone. */
        atomic_inc_32(&curproc->p_zone->zone_ffmisc);
        goto forkerr;
    /* NOTE(review): intervening source elided from this excerpt. */
    cp->p_segacct = p->p_segacct;
    } else {
        /*
         * We need to hold P_PR_LOCK until the address space has
         * been duplicated and we've had a chance to remove from the
         * child any DTrace probes that were in the parent. Holding
         * P_PR_LOCK prevents any new probes from being added and any
         * extant probes from being removed.
         */
        mutex_enter(&p->p_lock);
        sprlock_proc(p);
        p->p_flag |= SFORKING;
        mutex_exit(&p->p_lock);

        error = as_dup(p->p_as, cp);
        if (error != 0) {
            /*
             * Address-space duplication failed: tear down the
             * partially-built child and restore the parent.
             */
            mutex_enter(&p->p_lock);
            sprunlock(p);
            fork_fail(cp);
            mutex_enter(&pidlock);
            /* Unlink cp from the parent's orphan list. */
            orphpp = &p->p_orphan;
            while (*orphpp != cp)
                orphpp = &(*orphpp)->p_nextorph;
            *orphpp = cp->p_nextorph;
            /* Unlink cp from the sibling list. */
            if (p->p_child == cp)
                p->p_child = cp->p_sibling;
            if (cp->p_sibling)
                cp->p_sibling->p_psibling = cp->p_psibling;
            if (cp->p_psibling)
                cp->p_psibling->p_sibling = cp->p_sibling;
            mutex_enter(&cp->p_lock);
            tk = cp->p_task;
            task_detach(cp);
            ASSERT(cp->p_pool->pool_ref > 0);
            atomic_dec_32(&cp->p_pool->pool_ref);
            mutex_exit(&cp->p_lock);
            pid_exit(cp, tk);
            mutex_exit(&pidlock);
            task_rele(tk);

            mutex_enter(&p->p_lock);
            p->p_flag &= ~SFORKING;
            pool_barrier_exit();
            continuelwps(p);
            mutex_exit(&p->p_lock);
            /*
             * Preserve ENOMEM error condition but
             * map all others to EAGAIN.
             */
            error = (error == ENOMEM) ? ENOMEM : EAGAIN;
    /* NOTE(review): intervening source elided from this excerpt. */
    kmem_free(cp->p_tidhash,
        cp->p_tidhash_sz * sizeof (*cp->p_tidhash));
    cp->p_tidhash = NULL;
    cp->p_tidhash_sz = 0;

    /* lwp duplication failed: free the lwps created so far. */
    forklwp_fail(cp);
    fork_fail(cp);
    rctl_set_free(cp->p_rctls);
    mutex_enter(&pidlock);

    /*
     * Detach failed child from task.
     */
    mutex_enter(&cp->p_lock);
    tk = cp->p_task;
    task_detach(cp);
    ASSERT(cp->p_pool->pool_ref > 0);
    atomic_dec_32(&cp->p_pool->pool_ref);
    mutex_exit(&cp->p_lock);

    /* Unlink cp from the parent's orphan and sibling lists. */
    orphpp = &p->p_orphan;
    while (*orphpp != cp)
        orphpp = &(*orphpp)->p_nextorph;
    *orphpp = cp->p_nextorph;
    if (p->p_child == cp)
        p->p_child = cp->p_sibling;
    if (cp->p_sibling)
        cp->p_sibling->p_psibling = cp->p_psibling;
    if (cp->p_psibling)
        cp->p_psibling->p_sibling = cp->p_sibling;
    pid_exit(cp, tk);
    mutex_exit(&pidlock);

    task_rele(tk);

    mutex_enter(&p->p_lock);
    pool_barrier_exit();
    continuelwps(p);
    mutex_exit(&p->p_lock);
    error = EAGAIN;
forkerr:
    return ((int64_t)set_errno(error));
}
666
/*
 * Free allocated resources from getproc() if a fork failed.
 * Undoes the credential, file-list, vnode and branding holds
 * taken on behalf of the stillborn child cp.
 */
static void
fork_fail(proc_t *cp)
{
    /*
     * NOTE(review): the opening of this function (including the
     * declaration of `fip` and the acquisition of pidlock released
     * below) is elided from this excerpt.
     */
    upcount_dec(crgetruid(cp->p_cred), crgetzoneid(cp->p_cred));
    mutex_exit(&pidlock);

    /*
     * single threaded, so no locking needed here
     */
    crfree(cp->p_cred);

    kmem_free(fip->fi_list, fip->fi_nfiles * sizeof (uf_entry_t));

    /* Drop the directory/executable vnode holds taken in getproc(). */
    VN_RELE(PTOU(curproc)->u_cdir);
    if (PTOU(curproc)->u_rdir)
        VN_RELE(PTOU(curproc)->u_rdir);
    if (cp->p_exec)
        VN_RELE(cp->p_exec);
    if (cp->p_execdir)
        VN_RELE(cp->p_execdir);
    if (PTOU(curproc)->u_cwd)
        refstr_rele(PTOU(curproc)->u_cwd);
    if (PROC_IS_BRANDED(cp)) {
        brand_clearbrand(cp, B_TRUE);
    }
}
702
/*
 * Clean up the lwps already created for this child process.
 * The fork failed while duplicating all the lwps of the parent
 * and those lwps already created must be freed.
 * This process is invisible to the rest of the system,
 * so we don't need to hold p->p_lock to protect the list.
 */
static void
forklwp_fail(proc_t *p)
{
    kthread_t *t;
    task_t *tk;
    int branded = 0;

    if (PROC_IS_BRANDED(p))
        branded = 1;

    /*
     * NOTE(review): the loop header iterating over the child's thread
     * list (which assigns `t`) is elided from this excerpt; the
     * indented statements below are its body.
     */
        /* Remove t from the process's thread ring and lwp count. */
        p->p_lwpcnt--;
        t->t_forw->t_back = t->t_back;
        t->t_back->t_forw = t->t_forw;

        /* Return the lwp accounting to task, project and zone. */
        tk = p->p_task;
        mutex_enter(&p->p_zone->zone_nlwps_lock);
        tk->tk_nlwps--;
        tk->tk_proj->kpj_nlwps--;
        p->p_zone->zone_nlwps--;
        mutex_exit(&p->p_zone->zone_nlwps_lock);

        ASSERT(t->t_schedctl == NULL);

        /* Give the brand a chance to release its per-lwp state. */
        if (branded)
            BROP(p)->b_freelwp(ttolwp(t));

        if (t->t_door != NULL) {
            kmem_free(t->t_door, sizeof (door_data_t));
            t->t_door = NULL;
        }
        lwp_ctmpl_clear(ttolwp(t));

        /*
         * Remove the thread from the all threads list.
         * We need to hold pidlock for this.
         */
        mutex_enter(&pidlock);
        t->t_next->t_prev = t->t_prev;
        t->t_prev->t_next = t->t_next;
        CL_EXIT(t);     /* tell the scheduler that we're exiting */
        cv_broadcast(&t->t_joincv);     /* tell anyone in thread_join */
        mutex_exit(&pidlock);

        /*
         * Let the lgroup load averages know that this thread isn't
         * going to show up (i.e. un-do what was done on behalf of
         * this thread by the earlier lgrp_move_thread()).
         */
        kpreempt_disable();
        lgrp_move_thread(t, NULL, 1);
        kpreempt_enable();

        /*
         * The thread was created TS_STOPPED.
         * We change it to TS_FREE to avoid an
         * ASSERT() panic in thread_free().
         */
        t->t_state = TS_FREE;
        thread_rele(t);
        thread_free(t);
    }
}
780
781 extern struct as kas;
782
/*
 * fork a kernel process.
 *
 * pc/arg: entry point and argument for the new process's first lwp.
 * cid/pri: scheduling class id and priority for that lwp.
 * ct: if non-NULL, receives the new process contract (user procs only).
 * pid: requested pid for the new process.
 * Returns 0 on success or EAGAIN on resource failure.
 */
int
newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
    pid_t pid)
{
    proc_t *p;
    struct user *up;
    kthread_t *t;
    cont_process_t *ctp = NULL;
    rctl_entity_p_t e;

    ASSERT(cid != sysdccid);
    ASSERT(cid != syscid || ct == NULL);
    if (CLASS_KERNEL(cid)) {
        /* Kernel process: child of p0, attached to task0. */
        rctl_alloc_gp_t *init_gp;
        rctl_set_t *init_set;

        ASSERT(pid != 1);

        if (getproc(&p, pid, GETPROC_KERNEL) < 0)
            return (EAGAIN);

        /*
         * Release the hold on the p_exec and p_execdir, these
         * were acquired in getproc()
         */
        if (p->p_execdir != NULL)
            VN_RELE(p->p_execdir);
        if (p->p_exec != NULL)
            VN_RELE(p->p_exec);
        p->p_flag |= SNOWAIT;
        p->p_exec = NULL;
        p->p_execdir = NULL;

        init_set = rctl_set_create();
        init_gp = rctl_set_init_prealloc(RCENTITY_PROCESS);

        /* NOTE(review): comment body elided from this excerpt. */
        /*
        premptyset(&p->p_fltmask);
        up = PTOU(p);
        up->u_systrap = 0;
        premptyset(&(up->u_entrymask));
        premptyset(&(up->u_exitmask));
        mutex_enter(&p->p_lock);
        e.rcep_p.proc = p;
        e.rcep_t = RCENTITY_PROCESS;
        p->p_rctls = rctl_set_init(RCENTITY_PROCESS, p, &e, init_set,
            init_gp);
        mutex_exit(&p->p_lock);

        rctl_prealloc_destroy(init_gp);

        t = lwp_kernel_create(p, pc, arg, TS_STOPPED, pri);
    } else {
        /* User process (e.g. init): gets its own task and contract. */
        rctl_alloc_gp_t *init_gp, *default_gp;
        rctl_set_t *init_set;
        task_t *tk, *tk_old;
        klwp_t *lwp;

        if (getproc(&p, pid, GETPROC_USER) < 0)
            return (EAGAIN);
        /*
         * init creates a new task, distinct from the task
         * containing kernel "processes".
         */
        tk = task_create(0, p->p_zone);
        mutex_enter(&tk->tk_zone->zone_nlwps_lock);
        tk->tk_proj->kpj_ntasks++;
        tk->tk_nprocs++;
        mutex_exit(&tk->tk_zone->zone_nlwps_lock);

        default_gp = rctl_rlimit_set_prealloc(RLIM_NLIMITS);
        init_gp = rctl_set_init_prealloc(RCENTITY_PROCESS);
        init_set = rctl_set_create();

        mutex_enter(&pidlock);
        mutex_enter(&p->p_lock);
        tk_old = p->p_task;     /* switch to new task */

        task_detach(p);
        /* NOTE(review): one source line elided from this excerpt. */
        mutex_exit(&pidlock);

        mutex_enter(&tk_old->tk_zone->zone_nlwps_lock);
        tk_old->tk_nprocs--;
        mutex_exit(&tk_old->tk_zone->zone_nlwps_lock);

        e.rcep_p.proc = p;
        e.rcep_t = RCENTITY_PROCESS;
        p->p_rctls = rctl_set_init(RCENTITY_PROCESS, p, &e, init_set,
            init_gp);
        rctlproc_default_init(p, default_gp);
        mutex_exit(&p->p_lock);

        task_rele(tk_old);
        rctl_prealloc_destroy(default_gp);
        rctl_prealloc_destroy(init_gp);

        if ((lwp = lwp_create(pc, arg, 0, p, TS_STOPPED, pri,
            &curthread->t_hold, cid, 1)) == NULL) {
            /* lwp creation failed: dismantle the new process. */
            task_t *tk;
            fork_fail(p);
            mutex_enter(&pidlock);
            mutex_enter(&p->p_lock);
            tk = p->p_task;
            task_detach(p);
            ASSERT(p->p_pool->pool_ref > 0);
            atomic_add_32(&p->p_pool->pool_ref, -1);
            mutex_exit(&p->p_lock);
            pid_exit(p, tk);
            mutex_exit(&pidlock);
            task_rele(tk);

            return (EAGAIN);
        }
        t = lwptot(lwp);

        ctp = contract_process_fork(sys_process_tmpl, p, curproc,
            B_FALSE);
        ASSERT(ctp != NULL);
        if (ct != NULL)
            *ct = &ctp->conp_contract;
    }

    /* Common completion: join the parent's pgrp and let the lwp run. */
    ASSERT3U(t->t_tid, ==, 1);
    p->p_lwpid = 1;
    mutex_enter(&pidlock);
    pgjoin(p, p->p_parent->p_pgidp);
    p->p_stat = SRUN;
    mutex_enter(&p->p_lock);
    t->t_proc_flag &= ~TP_HOLDLWP;
    lwp_create_done(t);
    mutex_exit(&p->p_lock);
    mutex_exit(&pidlock);
    return (0);
}
924
925 /*
926 * create a child proc struct.
927 */
928 static int
929 getproc(proc_t **cpp, pid_t pid, uint_t flags)
930 {
931 proc_t *pp, *cp;
932 pid_t newpid;
933 struct user *uarea;
934 extern uint_t nproc;
935 struct cred *cr;
936 uid_t ruid;
937 zoneid_t zoneid;
938 task_t *task;
939 kproject_t *proj;
940 zone_t *zone;
941 int rctlfail = 0;
942
943 if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
944 return (-1); /* no point in starting new processes */
945
946 pp = (flags & GETPROC_KERNEL) ? &p0 : curproc;
947 task = pp->p_task;
948 proj = task->tk_proj;
949 zone = pp->p_zone;
950
951 mutex_enter(&pp->p_lock);
952 mutex_enter(&zone->zone_nlwps_lock);
953 if (proj != proj0p) {
954 if (task->tk_nprocs >= task->tk_nprocs_ctl)
955 if (rctl_test(rc_task_nprocs, task->tk_rctls,
956 pp, 1, 0) & RCT_DENY)
957 rctlfail = 1;
958
959 if (proj->kpj_nprocs >= proj->kpj_nprocs_ctl)
960 if (rctl_test(rc_project_nprocs, proj->kpj_rctls,
961 pp, 1, 0) & RCT_DENY)
962 rctlfail = 1;
963
964 if (zone->zone_nprocs >= zone->zone_nprocs_ctl)
965 if (rctl_test(rc_zone_nprocs, zone->zone_rctls,
966 pp, 1, 0) & RCT_DENY)
987 */
988 mutex_init(&cp->p_splock, NULL, MUTEX_DEFAULT, NULL);
989 mutex_init(&cp->p_crlock, NULL, MUTEX_DEFAULT, NULL);
990 mutex_init(&cp->p_pflock, NULL, MUTEX_DEFAULT, NULL);
991 #if defined(__x86)
992 mutex_init(&cp->p_ldtlock, NULL, MUTEX_DEFAULT, NULL);
993 #endif
994 mutex_init(&cp->p_maplock, NULL, MUTEX_DEFAULT, NULL);
995 cp->p_stat = SIDL;
996 cp->p_mstart = gethrtime();
997 cp->p_as = &kas;
998 /*
999 * p_zone must be set before we call pid_allocate since the process
1000 * will be visible after that and code such as prfind_zone will
1001 * look at the p_zone field.
1002 */
1003 cp->p_zone = pp->p_zone;
1004 cp->p_t1_lgrpid = LGRP_NONE;
1005 cp->p_tr_lgrpid = LGRP_NONE;
1006
1007 if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) {
1008 if (nproc == v.v_proc) {
1009 CPU_STATS_ADDQ(CPU, sys, procovf, 1);
1010 cmn_err(CE_WARN, "out of processes");
1011 }
1012 goto bad;
1013 }
1014
1015 mutex_enter(&pp->p_lock);
1016 cp->p_exec = pp->p_exec;
1017 cp->p_execdir = pp->p_execdir;
1018 mutex_exit(&pp->p_lock);
1019
1020 if (cp->p_exec) {
1021 VN_HOLD(cp->p_exec);
1022 /*
1023 * Each VOP_OPEN() must be paired with a corresponding
1024 * VOP_CLOSE(). In this case, the executable will be
1025 * closed for the child in either proc_exit() or gexec().
1026 */
1054 goto bad;
1055 }
1056
1057 /*
1058 * Everything is cool, put the new proc on the active process list.
1059 * It is already on the pid list and in /proc.
1060 * Increment the per uid process count (upcount).
1061 */
1062 nproc++;
1063 upcount_inc(ruid, zoneid);
1064
1065 cp->p_next = practive;
1066 practive->p_prev = cp;
1067 practive = cp;
1068
1069 cp->p_ignore = pp->p_ignore;
1070 cp->p_siginfo = pp->p_siginfo;
1071 cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD);
1072 cp->p_sessp = pp->p_sessp;
1073 sess_hold(pp);
1074 cp->p_brand = pp->p_brand;
1075 if (PROC_IS_BRANDED(pp))
1076 BROP(pp)->b_copy_procdata(cp, pp);
1077 cp->p_bssbase = pp->p_bssbase;
1078 cp->p_brkbase = pp->p_brkbase;
1079 cp->p_brksize = pp->p_brksize;
1080 cp->p_brkpageszc = pp->p_brkpageszc;
1081 cp->p_stksize = pp->p_stksize;
1082 cp->p_stkpageszc = pp->p_stkpageszc;
1083 cp->p_stkprot = pp->p_stkprot;
1084 cp->p_datprot = pp->p_datprot;
1085 cp->p_usrstack = pp->p_usrstack;
1086 cp->p_model = pp->p_model;
1087 cp->p_ppid = pp->p_pid;
1088 cp->p_ancpid = pp->p_pid;
1089 cp->p_portcnt = pp->p_portcnt;
1090
1091 /*
1092 * Initialize watchpoint structures
1093 */
1094 avl_create(&cp->p_warea, wa_compare, sizeof (struct watched_area),
1095 offsetof(struct watched_area, wa_link));
1096
1136 cp->p_pool = pool_default;
1137 cp->p_flag |= SSYS;
1138 } else {
1139 cp->p_pool = pp->p_pool;
1140 }
1141 atomic_inc_32(&cp->p_pool->pool_ref);
1142 mutex_exit(&pp->p_lock);
1143
1144 /*
1145 * Add the child process to the current task. Kernel processes
1146 * are always attached to task0.
1147 */
1148 mutex_enter(&cp->p_lock);
1149 if (flags & GETPROC_KERNEL)
1150 task_attach(task0p, cp);
1151 else
1152 task_attach(pp->p_task, cp);
1153 mutex_exit(&cp->p_lock);
1154 mutex_exit(&pidlock);
1155
1156 avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t),
1157 offsetof(contract_t, ct_ctlist));
1158
1159 /*
1160 * Duplicate any audit information kept in the process table
1161 */
1162 if (audit_active) /* copy audit data to cp */
1163 audit_newproc(cp);
1164
1165 crhold(cp->p_cred = cr);
1166
1167 /*
1168 * Bump up the counts on the file structures pointed at by the
1169 * parent's file table since the child will point at them too.
1170 */
1171 fcnt_add(P_FINFO(pp), 1);
1172
1173 if (PTOU(pp)->u_cdir) {
1174 VN_HOLD(PTOU(pp)->u_cdir);
1175 } else {
|
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2016, Joyent, Inc.
25 */
26
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/sysmacros.h>
33 #include <sys/signal.h>
34 #include <sys/cred.h>
35 #include <sys/policy.h>
36 #include <sys/user.h>
37 #include <sys/systm.h>
38 #include <sys/cpuvar.h>
39 #include <sys/vfs.h>
40 #include <sys/vnode.h>
41 #include <sys/file.h>
42 #include <sys/errno.h>
43 #include <sys/time.h>
44 #include <sys/proc.h>
67 #include <sys/resource.h>
68 #include <sys/cyclic.h>
69 #include <sys/lgrp.h>
70 #include <sys/rctl.h>
71 #include <sys/contract_impl.h>
72 #include <sys/contract/process_impl.h>
73 #include <sys/list.h>
74 #include <sys/dtrace.h>
75 #include <sys/pool.h>
76 #include <sys/zone.h>
77 #include <sys/sdt.h>
78 #include <sys/class.h>
79 #include <sys/corectl.h>
80 #include <sys/brand.h>
81 #include <sys/fork.h>
82
83 static int64_t cfork(int, int, int);
84 static int getproc(proc_t **, pid_t, uint_t);
85 #define GETPROC_USER 0x0
86 #define GETPROC_KERNEL 0x1
87 #define GETPROC_ZSCHED 0x2
88
89 static void fork_fail(proc_t *);
90 static void forklwp_fail(proc_t *);
91
92 int fork_fail_pending;
93
94 extern struct kmem_cache *process_cache;
95
96 /*
97 * The vfork() system call trap is no longer invoked by libc.
98 * It is retained only for the benefit of applications running
99 * within a solaris10 branded zone. It should be eliminated
100 * when we no longer support solaris10 branded zones.
101 */
102 int64_t
103 vfork(void)
104 {
105 curthread->t_post_sys = 1; /* so vfwait() will be called */
106 return (cfork(1, 1, 0));
107 }
109 /*
110 * forksys system call - forkx, forkallx, vforkx. This is the
111 * interface invoked by libc for fork1(), forkall(), and vfork()
112 */
113 int64_t
114 forksys(int subcode, int flags)
115 {
116 switch (subcode) {
117 case 0:
118 return (cfork(0, 1, flags)); /* forkx(flags) */
119 case 1:
120 return (cfork(0, 0, flags)); /* forkallx(flags) */
121 case 2:
122 curthread->t_post_sys = 1; /* so vfwait() will be called */
123 return (cfork(1, 1, flags)); /* vforkx(flags) */
124 default:
125 return ((int64_t)set_errno(EINVAL));
126 }
127 }
128
129 /*
130 * Remove the associations of a child process from its parent and siblings.
131 */
132 static void
133 disown_proc(proc_t *pp, proc_t *cp)
134 {
135 proc_t **orphpp;
136
137 ASSERT(MUTEX_HELD(&pidlock));
138
139 orphpp = &pp->p_orphan;
140 while (*orphpp != cp)
141 orphpp = &(*orphpp)->p_nextorph;
142 *orphpp = cp->p_nextorph;
143
144 if (pp->p_child == cp)
145 pp->p_child = cp->p_sibling;
146 if (cp->p_sibling)
147 cp->p_sibling->p_psibling = cp->p_psibling;
148 if (cp->p_psibling)
149 cp->p_psibling->p_sibling = cp->p_sibling;
150 }
151
/* ARGSUSED */
/*
 * Common fork implementation backing vfork(), forkx(), forkallx() and
 * vforkx().  isvfork selects vfork semantics (shared address space);
 * isfork1 selects fork1-style (duplicate only the calling lwp).
 * flags accepts only FORK_NOSIGCHLD | FORK_WAITPID.
 *
 * NOTE(review): large interior spans of this function are elided from
 * this excerpt; only the flag check and two failure paths are visible.
 */
static int64_t
cfork(int isvfork, int isfork1, int flags)
{
    proc_t *p = ttoproc(curthread);
    struct as *as;
    proc_t *cp;
    klwp_t *clone;
    kthread_t *t;
    task_t *tk;
    rval_t r;
    int error;
    int i;
    rctl_set_t *dup_set;
    rctl_alloc_gp_t *dup_gp;
    rctl_entity_p_t e;
    lwpdir_t *ldp;
    lwpent_t *lep;
    lwpent_t *clep;

    /*
     * Allow only these two flags.
     */
    if ((flags & ~(FORK_NOSIGCHLD | FORK_WAITPID)) != 0) {
        error = EINVAL;
        /* Account the miscellaneous fork failure against the zone. */
        atomic_inc_32(&curproc->p_zone->zone_ffmisc);
        goto forkerr;
    /* NOTE(review): intervening source elided from this excerpt. */
    cp->p_segacct = p->p_segacct;
    } else {
        /*
         * We need to hold P_PR_LOCK until the address space has
         * been duplicated and we've had a chance to remove from the
         * child any DTrace probes that were in the parent. Holding
         * P_PR_LOCK prevents any new probes from being added and any
         * extant probes from being removed.
         */
        mutex_enter(&p->p_lock);
        sprlock_proc(p);
        p->p_flag |= SFORKING;
        mutex_exit(&p->p_lock);

        error = as_dup(p->p_as, cp);
        if (error != 0) {
            /*
             * Address-space duplication failed: tear down the
             * partially-built child and restore the parent.
             */
            mutex_enter(&p->p_lock);
            sprunlock(p);
            fork_fail(cp);
            mutex_enter(&pidlock);
            disown_proc(p, cp);
            mutex_enter(&cp->p_lock);
            tk = cp->p_task;
            task_detach(cp);
            ASSERT(cp->p_pool->pool_ref > 0);
            atomic_dec_32(&cp->p_pool->pool_ref);
            mutex_exit(&cp->p_lock);
            pid_exit(cp, tk);
            mutex_exit(&pidlock);
            task_rele(tk);

            mutex_enter(&p->p_lock);
            p->p_flag &= ~SFORKING;
            pool_barrier_exit();
            continuelwps(p);
            mutex_exit(&p->p_lock);
            /*
             * Preserve ENOMEM error condition but
             * map all others to EAGAIN.
             */
            error = (error == ENOMEM) ? ENOMEM : EAGAIN;
    /* NOTE(review): intervening source elided from this excerpt. */
    kmem_free(cp->p_tidhash,
        cp->p_tidhash_sz * sizeof (*cp->p_tidhash));
    cp->p_tidhash = NULL;
    cp->p_tidhash_sz = 0;

    /* lwp duplication failed: free the lwps created so far. */
    forklwp_fail(cp);
    fork_fail(cp);
    rctl_set_free(cp->p_rctls);
    mutex_enter(&pidlock);

    /*
     * Detach failed child from task.
     */
    mutex_enter(&cp->p_lock);
    tk = cp->p_task;
    task_detach(cp);
    ASSERT(cp->p_pool->pool_ref > 0);
    atomic_dec_32(&cp->p_pool->pool_ref);
    mutex_exit(&cp->p_lock);

    disown_proc(p, cp);
    pid_exit(cp, tk);
    mutex_exit(&pidlock);

    task_rele(tk);

    mutex_enter(&p->p_lock);
    pool_barrier_exit();
    continuelwps(p);
    mutex_exit(&p->p_lock);
    error = EAGAIN;
forkerr:
    return ((int64_t)set_errno(error));
}
672
/*
 * Free allocated resources from getproc() if a fork failed.
 * Undoes the credential, file-list, vnode and branding holds
 * taken on behalf of the stillborn child cp.
 */
static void
fork_fail(proc_t *cp)
{
    /*
     * NOTE(review): the opening of this function (including the
     * declaration of `fip` and the acquisition of pidlock released
     * below) is elided from this excerpt.
     */
    upcount_dec(crgetruid(cp->p_cred), crgetzoneid(cp->p_cred));
    mutex_exit(&pidlock);

    /*
     * single threaded, so no locking needed here
     */
    crfree(cp->p_cred);

    kmem_free(fip->fi_list, fip->fi_nfiles * sizeof (uf_entry_t));

    /* Drop the directory/executable vnode holds taken in getproc(). */
    VN_RELE(PTOU(curproc)->u_cdir);
    if (PTOU(curproc)->u_rdir)
        VN_RELE(PTOU(curproc)->u_rdir);
    if (cp->p_exec)
        VN_RELE(cp->p_exec);
    if (cp->p_execdir)
        VN_RELE(cp->p_execdir);
    if (PTOU(curproc)->u_cwd)
        refstr_rele(PTOU(curproc)->u_cwd);
    if (PROC_IS_BRANDED(cp)) {
        brand_clearbrand(cp, B_FALSE);
    }
}
708
/*
 * Clean up the lwps already created for this child process.
 * The fork failed while duplicating all the lwps of the parent
 * and those lwps already created must be freed.
 * This process is invisible to the rest of the system,
 * so we don't need to hold p->p_lock to protect the list.
 */
static void
forklwp_fail(proc_t *p)
{
    kthread_t *t;
    task_t *tk;
    int branded = 0;

    if (PROC_IS_BRANDED(p))
        branded = 1;

    /*
     * NOTE(review): the loop header iterating over the child's thread
     * list (which assigns `t`) is elided from this excerpt; the
     * indented statements below are its body.
     */
        /* Remove t from the process's thread ring and lwp count. */
        p->p_lwpcnt--;
        t->t_forw->t_back = t->t_back;
        t->t_back->t_forw = t->t_forw;

        /* Return the lwp accounting to task, project and zone. */
        tk = p->p_task;
        mutex_enter(&p->p_zone->zone_nlwps_lock);
        tk->tk_nlwps--;
        tk->tk_proj->kpj_nlwps--;
        p->p_zone->zone_nlwps--;
        mutex_exit(&p->p_zone->zone_nlwps_lock);

        ASSERT(t->t_schedctl == NULL);

        /* Give the brand a chance to release its per-lwp state. */
        if (branded)
            BROP(p)->b_freelwp(ttolwp(t));

        if (t->t_door != NULL) {
            kmem_free(t->t_door, sizeof (door_data_t));
            t->t_door = NULL;
        }
        lwp_ctmpl_clear(ttolwp(t), B_FALSE);

        /*
         * Remove the thread from the all threads list.
         * We need to hold pidlock for this.
         */
        mutex_enter(&pidlock);
        t->t_next->t_prev = t->t_prev;
        t->t_prev->t_next = t->t_next;
        CL_EXIT(t);     /* tell the scheduler that we're exiting */
        cv_broadcast(&t->t_joincv);     /* tell anyone in thread_join */
        mutex_exit(&pidlock);

        /*
         * Let the lgroup load averages know that this thread isn't
         * going to show up (i.e. un-do what was done on behalf of
         * this thread by the earlier lgrp_move_thread()).
         */
        kpreempt_disable();
        lgrp_move_thread(t, NULL, 1);
        kpreempt_enable();

        /*
         * The thread was created TS_STOPPED.
         * We change it to TS_FREE to avoid an
         * ASSERT() panic in thread_free().
         */
        t->t_state = TS_FREE;
        thread_rele(t);
        thread_free(t);
    }
}
786
787 extern struct as kas;
788
/*
 * fork a kernel process.
 *
 * Passing a pid argument of -1 indicates that the new process should be
 * launched as a child of 'zsched' within the zone.
 *
 * pc/arg: entry point and argument for the new process's first lwp.
 * cid/pri: scheduling class id and priority for that lwp.
 * ct: if non-NULL, receives the new process contract (user procs only).
 * Returns 0 on success or EAGAIN on resource failure.
 */
int
newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
    pid_t pid)
{
    proc_t *p;
    struct user *up;
    kthread_t *t;
    cont_process_t *ctp = NULL;
    rctl_entity_p_t e;

    ASSERT(cid != sysdccid);
    ASSERT(cid != syscid || ct == NULL);
    if (CLASS_KERNEL(cid)) {
        /* Kernel process: child of p0, attached to task0. */
        rctl_alloc_gp_t *init_gp;
        rctl_set_t *init_set;

        ASSERT(pid != 1);
        /* The zsched (-1) convention is only for user processes. */
        ASSERT(pid >= 0);

        if (getproc(&p, pid, GETPROC_KERNEL) < 0)
            return (EAGAIN);

        /*
         * Release the hold on the p_exec and p_execdir, these
         * were acquired in getproc()
         */
        if (p->p_execdir != NULL)
            VN_RELE(p->p_execdir);
        if (p->p_exec != NULL)
            VN_RELE(p->p_exec);
        p->p_flag |= SNOWAIT;
        p->p_exec = NULL;
        p->p_execdir = NULL;

        init_set = rctl_set_create();
        init_gp = rctl_set_init_prealloc(RCENTITY_PROCESS);

        /* NOTE(review): comment body elided from this excerpt. */
        /*
        premptyset(&p->p_fltmask);
        up = PTOU(p);
        up->u_systrap = 0;
        premptyset(&(up->u_entrymask));
        premptyset(&(up->u_exitmask));
        mutex_enter(&p->p_lock);
        e.rcep_p.proc = p;
        e.rcep_t = RCENTITY_PROCESS;
        p->p_rctls = rctl_set_init(RCENTITY_PROCESS, p, &e, init_set,
            init_gp);
        mutex_exit(&p->p_lock);

        rctl_prealloc_destroy(init_gp);

        t = lwp_kernel_create(p, pc, arg, TS_STOPPED, pri);
    } else {
        /* User process: gets its own task and process contract. */
        rctl_alloc_gp_t *init_gp, *default_gp;
        rctl_set_t *init_set;
        task_t *tk, *tk_old;
        klwp_t *lwp;
        boolean_t pzsched = B_FALSE;
        int flag = GETPROC_USER;

        /* Handle a new user-level thread as child of zsched. */
        if (pid < 0) {
            VERIFY(curzone != global_zone);
            flag = GETPROC_ZSCHED;
            pzsched = B_TRUE;
            pid = 0;
        }

        if (getproc(&p, pid, flag) < 0)
            return (EAGAIN);
        /*
         * init creates a new task, distinct from the task
         * containing kernel "processes".
         */
        tk = task_create(0, p->p_zone);
        mutex_enter(&tk->tk_zone->zone_nlwps_lock);
        tk->tk_proj->kpj_ntasks++;
        tk->tk_nprocs++;
        mutex_exit(&tk->tk_zone->zone_nlwps_lock);

        default_gp = rctl_rlimit_set_prealloc(RLIM_NLIMITS);
        init_gp = rctl_set_init_prealloc(RCENTITY_PROCESS);
        init_set = rctl_set_create();

        mutex_enter(&pidlock);
        mutex_enter(&p->p_lock);
        tk_old = p->p_task;     /* switch to new task */

        task_detach(p);
        /* NOTE(review): one source line elided from this excerpt. */
        mutex_exit(&pidlock);

        mutex_enter(&tk_old->tk_zone->zone_nlwps_lock);
        tk_old->tk_nprocs--;
        mutex_exit(&tk_old->tk_zone->zone_nlwps_lock);

        e.rcep_p.proc = p;
        e.rcep_t = RCENTITY_PROCESS;
        p->p_rctls = rctl_set_init(RCENTITY_PROCESS, p, &e, init_set,
            init_gp);
        rctlproc_default_init(p, default_gp);
        mutex_exit(&p->p_lock);

        task_rele(tk_old);
        rctl_prealloc_destroy(default_gp);
        rctl_prealloc_destroy(init_gp);

        if ((lwp = lwp_create(pc, arg, 0, p, TS_STOPPED, pri,
            &curthread->t_hold, cid, 1)) == NULL) {
            /* lwp creation failed: dismantle the new process. */
            task_t *tk;

            fork_fail(p);
            mutex_enter(&pidlock);
            disown_proc(p->p_parent, p);

            mutex_enter(&p->p_lock);
            tk = p->p_task;
            task_detach(p);
            ASSERT(p->p_pool->pool_ref > 0);
            atomic_add_32(&p->p_pool->pool_ref, -1);
            mutex_exit(&p->p_lock);

            pid_exit(p, tk);
            mutex_exit(&pidlock);
            task_rele(tk);
            return (EAGAIN);
        }
        t = lwptot(lwp);

        /* For a zsched child, the contract parent is zsched itself. */
        ctp = contract_process_fork(sys_process_tmpl, p,
            (pzsched ? curproc->p_zone->zone_zsched : curproc),
            B_FALSE);
        ASSERT(ctp != NULL);
        if (ct != NULL)
            *ct = &ctp->conp_contract;
    }

    /* Common completion: join the parent's pgrp and let the lwp run. */
    ASSERT3U(t->t_tid, ==, 1);
    p->p_lwpid = 1;
    mutex_enter(&pidlock);
    pgjoin(p, p->p_parent->p_pgidp);
    p->p_stat = SRUN;
    mutex_enter(&p->p_lock);
    t->t_proc_flag &= ~TP_HOLDLWP;
    lwp_create_done(t);
    mutex_exit(&p->p_lock);
    mutex_exit(&pidlock);
    return (0);
}
948
949 /*
950 * create a child proc struct.
951 */
952 static int
953 getproc(proc_t **cpp, pid_t pid, uint_t flags)
954 {
955 proc_t *pp, *cp;
956 pid_t newpid;
957 struct user *uarea;
958 extern uint_t nproc;
959 struct cred *cr;
960 uid_t ruid;
961 zoneid_t zoneid;
962 task_t *task;
963 kproject_t *proj;
964 zone_t *zone;
965 int rctlfail = 0;
966
967 if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
968 return (-1); /* no point in starting new processes */
969
970 if (flags & GETPROC_ZSCHED) {
971 pp = curproc->p_zone->zone_zsched;
972 } else {
973 pp = (flags & GETPROC_KERNEL) ? &p0 : curproc;
974 }
975 task = pp->p_task;
976 proj = task->tk_proj;
977 zone = pp->p_zone;
978
979 mutex_enter(&pp->p_lock);
980 mutex_enter(&zone->zone_nlwps_lock);
981 if (proj != proj0p) {
982 if (task->tk_nprocs >= task->tk_nprocs_ctl)
983 if (rctl_test(rc_task_nprocs, task->tk_rctls,
984 pp, 1, 0) & RCT_DENY)
985 rctlfail = 1;
986
987 if (proj->kpj_nprocs >= proj->kpj_nprocs_ctl)
988 if (rctl_test(rc_project_nprocs, proj->kpj_rctls,
989 pp, 1, 0) & RCT_DENY)
990 rctlfail = 1;
991
992 if (zone->zone_nprocs >= zone->zone_nprocs_ctl)
993 if (rctl_test(rc_zone_nprocs, zone->zone_rctls,
994 pp, 1, 0) & RCT_DENY)
1015 */
1016 mutex_init(&cp->p_splock, NULL, MUTEX_DEFAULT, NULL);
1017 mutex_init(&cp->p_crlock, NULL, MUTEX_DEFAULT, NULL);
1018 mutex_init(&cp->p_pflock, NULL, MUTEX_DEFAULT, NULL);
1019 #if defined(__x86)
1020 mutex_init(&cp->p_ldtlock, NULL, MUTEX_DEFAULT, NULL);
1021 #endif
1022 mutex_init(&cp->p_maplock, NULL, MUTEX_DEFAULT, NULL);
1023 cp->p_stat = SIDL;
1024 cp->p_mstart = gethrtime();
1025 cp->p_as = &kas;
1026 /*
1027 * p_zone must be set before we call pid_allocate since the process
1028 * will be visible after that and code such as prfind_zone will
1029 * look at the p_zone field.
1030 */
1031 cp->p_zone = pp->p_zone;
1032 cp->p_t1_lgrpid = LGRP_NONE;
1033 cp->p_tr_lgrpid = LGRP_NONE;
1034
1035 /* Default to native brand initially */
1036 cp->p_brand = &native_brand;
1037
1038 if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) {
1039 if (nproc == v.v_proc) {
1040 CPU_STATS_ADDQ(CPU, sys, procovf, 1);
1041 cmn_err(CE_WARN, "out of processes");
1042 }
1043 goto bad;
1044 }
1045
1046 mutex_enter(&pp->p_lock);
1047 cp->p_exec = pp->p_exec;
1048 cp->p_execdir = pp->p_execdir;
1049 mutex_exit(&pp->p_lock);
1050
1051 if (cp->p_exec) {
1052 VN_HOLD(cp->p_exec);
1053 /*
1054 * Each VOP_OPEN() must be paired with a corresponding
1055 * VOP_CLOSE(). In this case, the executable will be
1056 * closed for the child in either proc_exit() or gexec().
1057 */
1085 goto bad;
1086 }
1087
1088 /*
1089 * Everything is cool, put the new proc on the active process list.
1090 * It is already on the pid list and in /proc.
1091 * Increment the per uid process count (upcount).
1092 */
1093 nproc++;
1094 upcount_inc(ruid, zoneid);
1095
1096 cp->p_next = practive;
1097 practive->p_prev = cp;
1098 practive = cp;
1099
1100 cp->p_ignore = pp->p_ignore;
1101 cp->p_siginfo = pp->p_siginfo;
1102 cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD);
1103 cp->p_sessp = pp->p_sessp;
1104 sess_hold(pp);
1105 cp->p_bssbase = pp->p_bssbase;
1106 cp->p_brkbase = pp->p_brkbase;
1107 cp->p_brksize = pp->p_brksize;
1108 cp->p_brkpageszc = pp->p_brkpageszc;
1109 cp->p_stksize = pp->p_stksize;
1110 cp->p_stkpageszc = pp->p_stkpageszc;
1111 cp->p_stkprot = pp->p_stkprot;
1112 cp->p_datprot = pp->p_datprot;
1113 cp->p_usrstack = pp->p_usrstack;
1114 cp->p_model = pp->p_model;
1115 cp->p_ppid = pp->p_pid;
1116 cp->p_ancpid = pp->p_pid;
1117 cp->p_portcnt = pp->p_portcnt;
1118
1119 /*
1120 * Initialize watchpoint structures
1121 */
1122 avl_create(&cp->p_warea, wa_compare, sizeof (struct watched_area),
1123 offsetof(struct watched_area, wa_link));
1124
1164 cp->p_pool = pool_default;
1165 cp->p_flag |= SSYS;
1166 } else {
1167 cp->p_pool = pp->p_pool;
1168 }
1169 atomic_inc_32(&cp->p_pool->pool_ref);
1170 mutex_exit(&pp->p_lock);
1171
1172 /*
1173 * Add the child process to the current task. Kernel processes
1174 * are always attached to task0.
1175 */
1176 mutex_enter(&cp->p_lock);
1177 if (flags & GETPROC_KERNEL)
1178 task_attach(task0p, cp);
1179 else
1180 task_attach(pp->p_task, cp);
1181 mutex_exit(&cp->p_lock);
1182 mutex_exit(&pidlock);
1183
1184 if (PROC_IS_BRANDED(pp)) {
1185 /*
1186 * The only reason why process branding should fail is when
1187 * the procedure is complicated by multiple LWPs on the scene.
1188 * With an LWP count of 0, this newly allocated process has no
1189 * reason to fail branding.
1190 */
1191 VERIFY0(brand_setbrand(cp, B_FALSE));
1192
1193 BROP(pp)->b_copy_procdata(cp, pp);
1194 }
1195
1196 avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t),
1197 offsetof(contract_t, ct_ctlist));
1198
1199 /*
1200 * Duplicate any audit information kept in the process table
1201 */
1202 if (audit_active) /* copy audit data to cp */
1203 audit_newproc(cp);
1204
1205 crhold(cp->p_cred = cr);
1206
1207 /*
1208 * Bump up the counts on the file structures pointed at by the
1209 * parent's file table since the child will point at them too.
1210 */
1211 fcnt_add(P_FINFO(pp), 1);
1212
1213 if (PTOU(pp)->u_cdir) {
1214 VN_HOLD(PTOU(pp)->u_cdir);
1215 } else {
|