1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2016 Joyent, Inc.
24 */
25
26 #include <sys/types.h>
27 #include <sys/errno.h>
28 #include <sys/param.h>
29 #include <sys/t_lock.h>
30 #include <sys/systm.h>
31 #include <sys/sysmacros.h>
32 #include <sys/debug.h>
33 #include <sys/time.h>
34 #include <sys/cmn_err.h>
35 #include <sys/vnode.h>
36 #include <sys/stat.h>
37 #include <sys/vfs.h>
38 #include <sys/cred.h>
39 #include <sys/kmem.h>
40 #include <sys/atomic.h>
41 #include <sys/policy.h>
42 #include <sys/fs/tmp.h>
43 #include <sys/fs/tmpnode.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <vm/anon.h>
47
/* Byte multipliers for the magnitude suffixes accepted by tmp_convnum(). */
#define KILOBYTE 1024
#define MEGABYTE (1024 * KILOBYTE)
#define GIGABYTE (1024 * MEGABYTE)

/*
 * Number of bits tmp_taccess() shifts tn_mode left by for each step from the
 * owner permission class to the group class, and from group to other.
 */
#define MODESHIFT 3

/* Mask of the mode bits tmp_convmode() accepts; anything else is rejected. */
#define VALIDMODEBITS 07777

/*
 * NOTE(review): nothing in the visible part of this file references
 * swapfs_minfree (the allocation path uses tmpfs_minfree) -- confirm whether
 * this declaration is still needed.
 */
extern pgcnt_t swapfs_minfree;
57
58 void *
59 tmp_kmem_zalloc(struct tmount *tm, size_t size, int flag)
60 {
61 void *buf;
62 zone_t *zone;
63 size_t pages;
64
65 mutex_enter(&tm->tm_contents);
66 zone = tm->tm_vfsp->vfs_zone;
67 if (tm->tm_anonmem + size > tm->tm_anonmax ||
68 tm->tm_anonmem + size < tm->tm_anonmem ||
69 size + ptob(tmpfs_minfree) <= size ||
70 !anon_checkspace(size + ptob(tmpfs_minfree), zone)) {
71 mutex_exit(&tm->tm_contents);
72 return (NULL);
73 }
74
75 /*
76 * Only make anonymous memory reservations when a page boundary is
77 * crossed. This is necessary since the anon_resv functions rounds up
78 * to PAGESIZE internally.
79 */
80 pages = btopr(tm->tm_allocmem + size);
81 pages -= btopr(tm->tm_allocmem);
82 if (pages > 0 && anon_try_resv_zone(ptob(pages), zone) == 0) {
83 mutex_exit(&tm->tm_contents);
84 return (NULL);
85 }
86
87 tm->tm_allocmem += size;
88 tm->tm_anonmem += size;
89 mutex_exit(&tm->tm_contents);
90
91 buf = kmem_zalloc(size, flag);
92 if (buf == NULL) {
93 mutex_enter(&tm->tm_contents);
94 ASSERT(tm->tm_anonmem > tm->tm_anonmem - size);
95 tm->tm_anonmem -= size;
96 if (pages > 0) {
97 /*
98 * Re-chasing the zone pointer is necessary since a
99 * forced umount could have been performed while the
100 * tm_contents lock was dropped during allocation.
101 */
102 anon_unresv_zone(ptob(pages), tm->tm_vfsp->vfs_zone);
103 }
104 mutex_exit(&tm->tm_contents);
105 }
106
107 return (buf);
108 }
109
110 void
111 tmp_kmem_free(struct tmount *tm, void *buf, size_t size)
112 {
113 size_t pages;
114
115 kmem_free(buf, size);
116 mutex_enter(&tm->tm_contents);
117 ASSERT(tm->tm_anonmem > tm->tm_anonmem - size);
118 tm->tm_anonmem -= size;
119 pages = btopr(tm->tm_allocmem);
120 tm->tm_allocmem -= size;
121 pages -= btopr(tm->tm_allocmem);
122 /*
123 * Like the tmp_kmem_zalloc case, only unreserve anonymous memory when
124 * a page boundary has been crossed.
125 */
126 if (pages > 0) {
127 anon_unresv_zone(size, tm->tm_vfsp->vfs_zone);
128 }
129 mutex_exit(&tm->tm_contents);
130 }
131
132 int
133 tmp_taccess(void *vtp, int mode, struct cred *cred)
134 {
135 struct tmpnode *tp = vtp;
136 int shift = 0;
137 /*
138 * Check access based on owner, group and
139 * public permissions in tmpnode.
140 */
141 if (crgetuid(cred) != tp->tn_uid) {
142 shift += MODESHIFT;
143 if (groupmember(tp->tn_gid, cred) == 0)
144 shift += MODESHIFT;
145 }
146
147 return (secpolicy_vnode_access2(cred, TNTOV(tp), tp->tn_uid,
148 tp->tn_mode << shift, mode));
149 }
150
151 /*
152 * Decide whether it is okay to remove within a sticky directory.
153 * Two conditions need to be met: write access to the directory
154 * is needed. In sticky directories, write access is not sufficient;
155 * you can remove entries from a directory only if you own the directory,
156 * if you are privileged, if you own the entry or if they entry is
157 * a plain file and you have write access to that file.
158 * Function returns 0 if remove access is granted.
159 */
160 int
161 tmp_sticky_remove_access(struct tmpnode *dir, struct tmpnode *entry,
162 struct cred *cr)
163 {
164 uid_t uid = crgetuid(cr);
165
166 if ((dir->tn_mode & S_ISVTX) &&
167 uid != dir->tn_uid &&
168 uid != entry->tn_uid &&
169 (entry->tn_type != VREG ||
170 tmp_taccess(entry, VWRITE, cr) != 0))
171 return (secpolicy_vnode_remove(cr));
172
173 return (0);
174 }
175
176 /*
177 * Convert a string containing a number (number of bytes) to a size_t,
178 * containing the corresponding number of bytes. On 32-bit kernels, the
179 * maximum value encoded in 'str' is PAGESIZE * ULONG_MAX, while the value
180 * returned in 'maxpg' is at most ULONG_MAX.
181 *
182 * The number may be followed by a magnitude suffix: "k" or "K" for kilobytes;
183 * "m" or "M" for megabytes; "g" or "G" for gigabytes. This interface allows
184 * for an arguably esoteric interpretation of multiple suffix characters:
185 * namely, they cascade. For example, the caller may specify "2mk", which is
186 * interpreted as 2 gigabytes. It would seem, at this late stage, that the
187 * horse has left not only the barn but indeed the country, and possibly the
188 * entire planetary system. Alternatively, the number may be followed by a
189 * single '%' sign, indicating the size is a percentage of either the zone's
190 * swap limit or the system's overall swap size.
191 *
192 * Parse and overflow errors are detected and a non-zero number returned on
193 * error.
194 */
195 int
196 tmp_convnum(char *str, size_t *maxbytes)
197 {
198 u_longlong_t num = 0;
199 u_longlong_t max_bytes = (uint64_t)SIZE_MAX;
200 size_t pages;
201
202 char *c;
203 const struct convchar {
204 char *cc_char;
205 uint64_t cc_factor;
206 } convchars[] = {
207 { "kK", KILOBYTE },
208 { "mM", MEGABYTE },
209 { "gG", GIGABYTE },
210 { NULL, 0 }
211 };
212
213 if (str == NULL) {
214 return (EINVAL);
215 }
216 c = str;
217
218 /*
219 * Convert the initial numeric portion of the input string.
220 */
221 if (ddi_strtoull(str, &c, 10, &num) != 0) {
222 return (EINVAL);
223 }
224
225 /*
226 * Handle a size in percent. Anything other than a single percent
227 * modifier is invalid. We use either the zone's swap limit or the
228 * system's total available swap size as the initial value. Perform the
229 * intermediate calculation in pages to avoid overflow.
230 */
231 if (*c == '%') {
232 u_longlong_t cap;
233
234 if (*(c + 1) != '\0')
235 return (EINVAL);
236
237 if (num > 100)
238 return (EINVAL);
239
240 cap = (u_longlong_t)curproc->p_zone->zone_max_swap_ctl;
241 if (cap == UINT64_MAX) {
242 /*
243 * Use the amount of available physical and memory swap
244 */
245 mutex_enter(&anoninfo_lock);
246 cap = TOTAL_AVAILABLE_SWAP;
247 mutex_exit(&anoninfo_lock);
248 } else {
249 cap = btop(cap);
250 }
251
252 num = ptob(cap * num / 100);
253 goto done;
254 }
255
256 /*
257 * Apply the (potentially cascading) magnitude suffixes until an
258 * invalid character is found, or the string comes to an end.
259 */
260 for (; *c != '\0'; c++) {
261 int i;
262
263 for (i = 0; convchars[i].cc_char != NULL; i++) {
264 /*
265 * Check if this character matches this multiplier
266 * class:
267 */
268 if (strchr(convchars[i].cc_char, *c) != NULL) {
269 /*
270 * Check for overflow:
271 */
272 if (num > max_bytes / convchars[i].cc_factor) {
273 return (EINVAL);
274 }
275
276 num *= convchars[i].cc_factor;
277 goto valid_char;
278 }
279 }
280
281 /*
282 * This was not a valid multiplier suffix character.
283 */
284 return (EINVAL);
285
286 valid_char:
287 continue;
288 }
289
290 done:
291
292 /*
293 * We've been given a size in bytes; however, we want to make sure that
294 * we have at least one page worth no matter what. Therefore we use
295 * btopr to round up. However, this may cause an overflow only if 'num'
296 * is between (max_bytes - PAGESIZE) and (max_bytes). In this case the
297 * resulting number is zero, which is what we check for below. Note, we
298 * require at least one page, so if pages is zero, well, it wasn't going
299 * to work anyways.
300 */
301 pages = btopr(num);
302 if (pages == 0) {
303 return (EINVAL);
304 }
305
306 *maxbytes = ptob(pages);
307
308 return (0);
309 }
310
311 /*
312 * Parse an octal mode string for use as the permissions set for the root
313 * of the tmpfs mount.
314 */
315 int
316 tmp_convmode(char *str, mode_t *mode)
317 {
318 ulong_t num;
319 char *c;
320
321 if (str == NULL) {
322 return (EINVAL);
323 }
324
325 if (ddi_strtoul(str, &c, 8, &num) != 0) {
326 return (EINVAL);
327 }
328
329 if ((num & ~VALIDMODEBITS) != 0) {
330 return (EINVAL);
331 }
332
333 *mode = VALIDMODEBITS & num;
334 return (0);
335 }