113 * zonecfg attribute tunables for memory capping.
114 * phys-mcap-cmd
115 * type: string
116 * specifies a command that can be run when over the cap
117 * phys-mcap-no-vmusage
118 * type: boolean
119 * true disables vm_getusage and just uses zone's proc. rss sum
120 * phys-mcap-no-pageout
121 * type: boolean
122 * true disables pageout when over the cap
123 * phys-mcap-no-pf-throttle
124 * type: boolean
125 * true disables page fault throttling when over the cap
126 */
127 #define TUNE_CMD "phys-mcap-cmd" /* string: command that can be run when over the cap */
128 #define TUNE_NVMU "phys-mcap-no-vmusage" /* boolean: true disables vm_getusage */
129 #define TUNE_NPAGE "phys-mcap-no-pageout" /* boolean: true disables pageout when over the cap */
130 #define TUNE_NPFTHROT "phys-mcap-no-pf-throttle" /* boolean: true disables page fault throttling */
131
132 /*
133 * These are only used in get_mem_info but are global. scale_rss and
134 * prev_fast_rss must always be persistent; the other two are also global so
135 * that they can easily be inspected with mdb.
136 */
137 uint64_t scale_rss = 0; /* must persist between uses in get_mem_info */
138 uint64_t prev_fast_rss = 0; /* must persist between uses in get_mem_info */
139 uint64_t fast_rss = 0; /* global only so it is visible to mdb */
140 uint64_t accurate_rss = 0; /* global only so it is visible to mdb */
141
142 static char zoneproc[MAXPATHLEN]; /* zone's proc directory; built in create_mcap_thread */
143 static char debug_log[MAXPATHLEN]; /* "<zonepath>/mcap_debug.log" */
144 static zoneid_t zid; /* zone id handed to create_mcap_thread */
145 static mutex_t shutdown_mx; /* NOTE(review): presumably guards shutdown signalling - confirm */
146 static cond_t shutdown_cv; /* NOTE(review): presumably paired with shutdown_mx - confirm */
147 static int shutting_down = 0; /* non-zero ends the pageout loop; cleared on thread create */
148 static thread_t mcap_tid; /* memory cap thread id; 0 if thr_create failed */
149 static FILE *debug_log_fp = NULL; /* NOTE(review): presumably the open debug_log stream - confirm */
150 static uint64_t zone_rss_cap; /* RSS cap(KB) */
151 static char over_cmd[2 * BUFSIZ]; /* same size as zone_attr_value */
152 static boolean_t skip_vmusage = B_FALSE; /* NOTE(review): presumably set from TUNE_NVMU - confirm */
153 static boolean_t skip_pageout = B_FALSE; /* when set, the thread sleeps after each process pass */
154 static boolean_t skip_pf_throttle = B_FALSE; /* NOTE(review): presumably set from TUNE_NPFTHROT - confirm */
155
156 static zlog_t *logp; /* log handle saved by create_mcap_thread */
157
158 static int64_t check_suspend();
159 static void get_mcap_tunables();
160
161 /*
162 * State describing the process address space that is currently being
163 * worked on: the mapping array plus a cursor into it.
164 */
165 typedef struct {
166 int pr_curr; /* index of the mapping we're working on */
167 int pr_nmap; /* number of mappings in the address space */
168 prmap_t *pr_mapp; /* process's map array (NULL before init_map) */
169 } proc_map_t;
170
171 typedef struct zsd_vmusage64 {
172 id_t vmu_zoneid;
173 uint_t vmu_type;
174 id_t vmu_id;
175 /*
176 * An amd64 kernel will align the following uint64_t members, but a
177 * 32bit i386 process will not without help.
362 if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM)
363 return (0);
364
365 errno = 0;
366 res = syscall(SYS_rusagesys, _RUSAGESYS_INVALMAP, pid, pmp->pr_vaddr,
367 pmp->pr_size);
368
369 return (res);
370 }
371
372 /*
373 * Work through a process paging out mappings until the whole address space was
374 * examined or the excess is < 0. Return our estimate of the updated excess.
375 */
376 static int64_t
377 pageout_process(pid_t pid, int64_t excess)
378 {
379 int psfd;
380 prmap_t *pmap;
381 proc_map_t cur;
382 int res;
383 int64_t sum_d_rss, d_rss;
384 int64_t old_rss;
385 int map_cnt;
386 psinfo_t psinfo;
387 char pathbuf[MAXPATHLEN];
388
389 (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
390 pid);
391 if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
392 return (excess);
393
394 cur.pr_mapp = NULL;
395
396 if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo))
397 goto done;
398
399 old_rss = (int64_t)psinfo.pr_rssize;
400 map_cnt = 0;
401
402 /* If unscannable, skip it. */
414 }
415
416 /* Get segment residency information. */
417 pmap = init_map(&cur, pid);
418
419 /* Skip process if it has no mappings. */
420 if (pmap == NULL) {
421 debug("pid %ld: map unreadable; ignoring\n", pid);
422 goto done;
423 }
424
425 debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n",
426 pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs);
427
428 /*
429 * Within the process's address space, attempt to page out mappings.
430 */
431 sum_d_rss = 0;
432 while (excess > 0 && pmap != NULL && !shutting_down) {
433 /* invalidate the entire mapping */
434 if ((res = pageout_mapping(pid, pmap)) < 0)
435 debug("pid %ld: mapping 0x%p %ldkb unpageable (%d)\n",
436 pid, pmap->pr_vaddr, pmap->pr_size / 1024, errno);
437
438 map_cnt++;
439
440 /*
441 * Re-check the process rss and get the delta.
442 */
443 if (pread(psfd, &psinfo, sizeof (psinfo), 0)
444 != sizeof (psinfo)) {
445 excess -= old_rss;
446 goto done;
447 }
448
449 d_rss = (int64_t)psinfo.pr_rssize - old_rss;
450 old_rss = (int64_t)psinfo.pr_rssize;
451 sum_d_rss += d_rss;
452
453 /*
454 * d_rss hopefully should be negative (or 0 if nothing
455 * invalidated) but can be positive if more got paged in.
456 */
1130
1131 debug("process pass done; excess %lld\n", (long long)excess);
1132 rewinddir(pdir);
1133
1134 if (skip_pageout)
1135 (void) sleep_shutdown(120);
1136 }
1137
1138 if (pdir != NULL)
1139 (void) closedir(pdir);
1140 debug("thread shutdown\n");
1141 }
1142
1143 void
1144 create_mcap_thread(zlog_t *zlogp, zoneid_t id)
1145 {
1146 int res;
1147
1148 shutting_down = 0;
1149 zid = id;
1150 logp = zlogp;
1151
1152 /* all but the lx brand currently use /proc */
1153 if (strcmp(brand_name, "lx") == 0) {
1154 (void) snprintf(zoneproc, sizeof (zoneproc),
1155 "%s/root/native/proc", zonepath);
1156 } else {
1157 (void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc",
1158 zonepath);
1159 }
1160
1161 (void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log",
1162 zonepath);
1163
1164 res = thr_create(NULL, NULL, (void *(*)(void *))mcap_zone, NULL, NULL,
1165 &mcap_tid);
1166 if (res != 0) {
1167 zerror(zlogp, B_FALSE, "error %d creating memory cap thread",
1168 res);
1169 mcap_tid = 0;
1170 }
|
113 * zonecfg attribute tunables for memory capping.
114 * phys-mcap-cmd
115 * type: string
116 * specifies a command that can be run when over the cap
117 * phys-mcap-no-vmusage
118 * type: boolean
119 * true disables vm_getusage and just uses zone's proc. rss sum
120 * phys-mcap-no-pageout
121 * type: boolean
122 * true disables pageout when over the cap
123 * phys-mcap-no-pf-throttle
124 * type: boolean
125 * true disables page fault throttling when over the cap
126 */
127 #define TUNE_CMD "phys-mcap-cmd" /* string: command that can be run when over the cap */
128 #define TUNE_NVMU "phys-mcap-no-vmusage" /* boolean: true disables vm_getusage */
129 #define TUNE_NPAGE "phys-mcap-no-pageout" /* boolean: true disables pageout when over the cap */
130 #define TUNE_NPFTHROT "phys-mcap-no-pf-throttle" /* boolean: true disables page fault throttling */
131
132 /*
133 * The large mapping value was derived empirically: mappings much bigger
134 * than 16MB sometimes take a relatively long time to invalidate
135 * (a significant fraction of a second).
136 */
137 #define SEC_INTERIM 4 /* seconds to pause after a process was stopped too long */
138 #define MSEC_TOO_LONG 100 /* release the process after it has been stopped for 100ms */
139 #define LARGE_MAPPING 16384 /* >= 16MB, expressed in KB - pageout in chunks */
140
141 /*
142 * These are only used in get_mem_info but are global. scale_rss and
143 * prev_fast_rss must always be persistent; the other two are also global so
144 * that they can easily be inspected with mdb.
145 */
146 uint64_t scale_rss = 0; /* must persist between uses in get_mem_info */
147 uint64_t prev_fast_rss = 0; /* must persist between uses in get_mem_info */
148 uint64_t fast_rss = 0; /* global only so it is visible to mdb */
149 uint64_t accurate_rss = 0; /* global only so it is visible to mdb */
150
151 static char zoneproc[MAXPATHLEN]; /* zone's proc directory; built in create_mcap_thread */
152 static char debug_log[MAXPATHLEN]; /* "<zonepath>/mcap_debug.log" */
153 static zoneid_t zid; /* zone id handed to create_mcap_thread */
154 static mutex_t shutdown_mx; /* NOTE(review): presumably guards shutdown signalling - confirm */
155 static cond_t shutdown_cv; /* NOTE(review): presumably paired with shutdown_mx - confirm */
156 static int shutting_down = 0; /* non-zero ends the pageout loop; cleared on thread create */
157 static thread_t mcap_tid; /* memory cap thread id; 0 if thr_create failed */
158 static FILE *debug_log_fp = NULL; /* NOTE(review): presumably the open debug_log stream - confirm */
159 static uint64_t zone_rss_cap; /* RSS cap(KB) */
160 static char over_cmd[2 * BUFSIZ]; /* same size as zone_attr_value */
161 static boolean_t skip_vmusage = B_FALSE; /* NOTE(review): presumably set from TUNE_NVMU - confirm */
162 static boolean_t skip_pageout = B_FALSE; /* when set, the thread sleeps after each process pass */
163 static boolean_t skip_pf_throttle = B_FALSE; /* NOTE(review): presumably set from TUNE_NPFTHROT - confirm */
164
165 static int64_t check_suspend();
166 static void get_mcap_tunables();
167
168 /*
169 * State describing the process address space that is currently being
170 * worked on: the mapping array plus a cursor into it.
171 */
172 typedef struct {
173 int pr_curr; /* index of the mapping we're working on */
174 int pr_nmap; /* number of mappings in the address space */
175 prmap_t *pr_mapp; /* process's map array (NULL before init_map) */
176 } proc_map_t;
177
178 typedef struct zsd_vmusage64 {
179 id_t vmu_zoneid;
180 uint_t vmu_type;
181 id_t vmu_id;
182 /*
183 * An amd64 kernel will align the following uint64_t members, but a
184 * 32bit i386 process will not without help.
369 if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM)
370 return (0);
371
372 errno = 0;
373 res = syscall(SYS_rusagesys, _RUSAGESYS_INVALMAP, pid, pmp->pr_vaddr,
374 pmp->pr_size);
375
376 return (res);
377 }
378
379 /*
380 * Work through a process paging out mappings until the whole address space was
381 * examined or the excess is < 0. Return our estimate of the updated excess.
382 */
383 static int64_t
384 pageout_process(pid_t pid, int64_t excess)
385 {
386 int psfd;
387 prmap_t *pmap;
388 proc_map_t cur;
389 int64_t sum_d_rss, d_rss;
390 int64_t old_rss;
391 int map_cnt;
392 psinfo_t psinfo;
393 char pathbuf[MAXPATHLEN];
394
395 (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
396 pid);
397 if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
398 return (excess);
399
400 cur.pr_mapp = NULL;
401
402 if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo))
403 goto done;
404
405 old_rss = (int64_t)psinfo.pr_rssize;
406 map_cnt = 0;
407
408 /* If unscannable, skip it. */
420 }
421
422 /* Get segment residency information. */
423 pmap = init_map(&cur, pid);
424
425 /* Skip process if it has no mappings. */
426 if (pmap == NULL) {
427 debug("pid %ld: map unreadable; ignoring\n", pid);
428 goto done;
429 }
430
431 debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n",
432 pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs);
433
434 /*
435 * Within the process's address space, attempt to page out mappings.
436 */
437 sum_d_rss = 0;
438 while (excess > 0 && pmap != NULL && !shutting_down) {
439 /* invalidate the entire mapping */
440 if (pageout_mapping(pid, pmap) < 0)
441 debug("pid %ld: mapping 0x%p %ldkb unpageable (%d)\n",
442 pid, (void *)pmap->pr_vaddr,
443 (long)pmap->pr_size / 1024L, errno);
444
445 map_cnt++;
446
447 /*
448 * Re-check the process rss and get the delta.
449 */
450 if (pread(psfd, &psinfo, sizeof (psinfo), 0)
451 != sizeof (psinfo)) {
452 excess -= old_rss;
453 goto done;
454 }
455
456 d_rss = (int64_t)psinfo.pr_rssize - old_rss;
457 old_rss = (int64_t)psinfo.pr_rssize;
458 sum_d_rss += d_rss;
459
460 /*
461 * d_rss hopefully should be negative (or 0 if nothing
462 * invalidated) but can be positive if more got paged in.
463 */
1137
1138 debug("process pass done; excess %lld\n", (long long)excess);
1139 rewinddir(pdir);
1140
1141 if (skip_pageout)
1142 (void) sleep_shutdown(120);
1143 }
1144
1145 if (pdir != NULL)
1146 (void) closedir(pdir);
1147 debug("thread shutdown\n");
1148 }
1149
1150 void
1151 create_mcap_thread(zlog_t *zlogp, zoneid_t id)
1152 {
1153 int res;
1154
1155 shutting_down = 0;
1156 zid = id;
1157
1158 /* all but the lx brand currently use /proc */
1159 if (strcmp(brand_name, "lx") == 0) {
1160 (void) snprintf(zoneproc, sizeof (zoneproc),
1161 "%s/root/native/proc", zonepath);
1162 } else {
1163 (void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc",
1164 zonepath);
1165 }
1166
1167 (void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log",
1168 zonepath);
1169
1170 res = thr_create(NULL, NULL, (void *(*)(void *))mcap_zone, NULL, NULL,
1171 &mcap_tid);
1172 if (res != 0) {
1173 zerror(zlogp, B_FALSE, "error %d creating memory cap thread",
1174 res);
1175 mcap_tid = 0;
1176 }
|