Print this page




 113  * zonecfg attribute tunables for memory capping.
 114  *    phys-mcap-cmd
 115  *      type: string
 116  *      specifies a command that can be run when over the cap
 117  *    phys-mcap-no-vmusage
 118  *      type: boolean
 119  *      true disables vm_getusage and just uses zone's proc. rss sum
 120  *    phys-mcap-no-pageout
 121  *      type: boolean
 122  *      true disables pageout when over
 123  *    phys-mcap-no-pf-throttle
 124  *      type: boolean
 125  *      true disables page fault throttling when over
 126  */
 127 #define TUNE_CMD        "phys-mcap-cmd"         /* string attribute */
 128 #define TUNE_NVMU       "phys-mcap-no-vmusage"  /* boolean attribute */
 129 #define TUNE_NPAGE      "phys-mcap-no-pageout"  /* boolean attribute */
 130 #define TUNE_NPFTHROT   "phys-mcap-no-pf-throttle" /* boolean attribute */
 131 
 132 /*









 133  * These are only used in get_mem_info but are global. scale_rss and
 134  * prev_fast_rss must always be persistent; the other two are also global so
 135  * that we can easily see them with mdb.
 136  */
 137 uint64_t        scale_rss = 0;
 138 uint64_t        prev_fast_rss = 0;
 139 uint64_t        fast_rss = 0;
 140 uint64_t        accurate_rss = 0;
 141 
 142 static char     zoneproc[MAXPATHLEN];   /* <zonepath>/root[/native]/proc */
 143 static char     debug_log[MAXPATHLEN];  /* <zonepath>/mcap_debug.log */
 144 static zoneid_t zid;                    /* zone id from create_mcap_thread */
 145 static mutex_t  shutdown_mx;
 146 static cond_t   shutdown_cv;
 147 static int      shutting_down = 0;      /* nonzero ends the pageout loop */
 148 static thread_t mcap_tid;               /* mcap thread; 0 if create failed */
 149 static FILE     *debug_log_fp = NULL;
 150 static uint64_t zone_rss_cap;           /* RSS cap(KB) */
 151 static char     over_cmd[2 * BUFSIZ];   /* same size as zone_attr_value */
 152 static boolean_t skip_vmusage = B_FALSE;
 153 static boolean_t skip_pageout = B_FALSE;
 154 static boolean_t skip_pf_throttle = B_FALSE;
 155 
 156 static zlog_t   *logp;                  /* log from create_mcap_thread */
 157 
 158 static int64_t check_suspend();
 159 static void get_mcap_tunables();
 160 
 161 /*
 162  * State for walking one process's address space: the array of mappings,
 163  * how many there are, and which one we are currently working on.
 164  */
 165 typedef struct {
 166         int pr_curr;            /* the # of the mapping we're working on */
 167         int pr_nmap;            /* number of mappings in address space */
 168         prmap_t *pr_mapp;       /* process's map array */
 169 } proc_map_t;
 170 
 171 typedef struct zsd_vmusage64 {
 172         id_t vmu_zoneid;
 173         uint_t vmu_type;
 174         id_t vmu_id;
 175         /*
 176          * An amd64 kernel will align the following uint64_t members, but a
 177          * 32bit i386 process will not without help.


 362         if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM)
 363                 return (0);
 364 
 365         errno = 0;
 366         res = syscall(SYS_rusagesys, _RUSAGESYS_INVALMAP, pid, pmp->pr_vaddr,
 367             pmp->pr_size);
 368 
 369         return (res);
 370 }
 371 
 372 /*
 373  * Work through a process, paging out mappings until its whole address space
 374  * has been examined or the excess is <= 0.  Returns the updated excess estimate.
 375  */
 376 static int64_t
 377 pageout_process(pid_t pid, int64_t excess)
 378 {
 379         int                     psfd;
 380         prmap_t                 *pmap;
 381         proc_map_t              cur;
 382         int                     res;
 383         int64_t                 sum_d_rss, d_rss;
 384         int64_t                 old_rss;
 385         int                     map_cnt;
 386         psinfo_t                psinfo;
 387         char                    pathbuf[MAXPATHLEN];
 388 
 389         (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
 390             pid);
 391         if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
 392                 return (excess);
 393 
 394         cur.pr_mapp = NULL;
 395 
 396         if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo))
 397                 goto done;
 398 
 399         old_rss = (int64_t)psinfo.pr_rssize;
 400         map_cnt = 0;
 401 
 402         /* If unscannable, skip it. */


 414         }
 415 
 416         /* Get segment residency information. */
 417         pmap = init_map(&cur, pid);
 418 
 419         /* Skip process if it has no mappings. */
 420         if (pmap == NULL) {
 421                 debug("pid %ld: map unreadable; ignoring\n", pid);
 422                 goto done;
 423         }
 424 
 425         debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n",
 426             pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs);
 427 
 428         /*
 429          * Within the process's address space, attempt to page out mappings.
 430          */
 431         sum_d_rss = 0;
 432         while (excess > 0 && pmap != NULL && !shutting_down) {
 433                 /* invalidate the entire mapping */
 434                 if ((res = pageout_mapping(pid, pmap)) < 0)
 435                         debug("pid %ld: mapping 0x%p %ldkb unpageable (%d)\n",
 436                             pid, pmap->pr_vaddr, pmap->pr_size / 1024, errno);

 437 
 438                 map_cnt++;
 439 
 440                 /*
 441                  * Re-check the process rss and get the delta.
 442                  */
 443                 if (pread(psfd, &psinfo, sizeof (psinfo), 0)
 444                     != sizeof (psinfo)) {
 445                         excess -= old_rss;
 446                         goto done;
 447                 }
 448 
 449                 d_rss = (int64_t)psinfo.pr_rssize - old_rss;
 450                 old_rss = (int64_t)psinfo.pr_rssize;
 451                 sum_d_rss += d_rss;
 452 
 453                 /*
 454                  * d_rss hopefully should be negative (or 0 if nothing
 455                  * invalidated) but can be positive if more got paged in.
 456                  */


1130 
1131                 debug("process pass done; excess %lld\n", (long long)excess);
1132                 rewinddir(pdir);
1133 
1134                 if (skip_pageout)
1135                         (void) sleep_shutdown(120);
1136         }
1137 
1138         if (pdir != NULL)
1139                 (void) closedir(pdir);
1140         debug("thread shutdown\n");
1141 }
1142 
1143 void
1144 create_mcap_thread(zlog_t *zlogp, zoneid_t id)
1145 {
1146         int             res;
1147 
1148         shutting_down = 0;
1149         zid = id;
1150         logp = zlogp;
1151 
1152         /* all but the lx brand currently use /proc */
1153         if (strcmp(brand_name, "lx") == 0) {
1154                 (void) snprintf(zoneproc, sizeof (zoneproc),
1155                     "%s/root/native/proc", zonepath);
1156         } else {
1157                 (void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc",
1158                     zonepath);
1159         }
1160 
1161         (void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log",
1162             zonepath);
1163 
1164         res = thr_create(NULL, NULL, (void *(*)(void *))mcap_zone, NULL, NULL,
1165             &mcap_tid);
1166         if (res != 0) {
1167                 zerror(zlogp, B_FALSE, "error %d creating memory cap thread",
1168                     res);
1169                 mcap_tid = 0;
1170         }


 113  * zonecfg attribute tunables for memory capping.
 114  *    phys-mcap-cmd
 115  *      type: string
 116  *      specifies a command that can be run when over the cap
 117  *    phys-mcap-no-vmusage
 118  *      type: boolean
 119  *      true disables vm_getusage and just uses zone's proc. rss sum
 120  *    phys-mcap-no-pageout
 121  *      type: boolean
 122  *      true disables pageout when over
 123  *    phys-mcap-no-pf-throttle
 124  *      type: boolean
 125  *      true disables page fault throttling when over
 126  */
 127 #define TUNE_CMD        "phys-mcap-cmd"         /* string attribute */
 128 #define TUNE_NVMU       "phys-mcap-no-vmusage"  /* boolean attribute */
 129 #define TUNE_NPAGE      "phys-mcap-no-pageout"  /* boolean attribute */
 130 #define TUNE_NPFTHROT   "phys-mcap-no-pf-throttle" /* boolean attribute */
 131 
 132 /*
 133  * The large mapping value was derived empirically, by observing that mappings
 134  * much bigger than 16MB sometimes take a relatively long time to invalidate
 135  * (a significant fraction of a second).
 136  */
 137 #define SEC_INTERIM     4       /* num secs to pause after stopped too long */
 138 #define MSEC_TOO_LONG   100     /* release proc. after stopped for 100ms */
 139 #define LARGE_MAPPING   16384   /* >= 16MB in KB - pageout in chunks */
 140 
 141 /*
 142  * These are only used in get_mem_info but are global. scale_rss and
 143  * prev_fast_rss must always be persistent; the other two are also global so
 144  * that we can easily see them with mdb.
 145  */
 146 uint64_t        scale_rss = 0;
 147 uint64_t        prev_fast_rss = 0;
 148 uint64_t        fast_rss = 0;
 149 uint64_t        accurate_rss = 0;
 150 
 151 static char     zoneproc[MAXPATHLEN];   /* <zonepath>/root[/native]/proc */
 152 static char     debug_log[MAXPATHLEN];  /* <zonepath>/mcap_debug.log */
 153 static zoneid_t zid;                    /* zone id from create_mcap_thread */
 154 static mutex_t  shutdown_mx;
 155 static cond_t   shutdown_cv;
 156 static int      shutting_down = 0;      /* nonzero ends the pageout loop */
 157 static thread_t mcap_tid;               /* mcap thread; 0 if create failed */
 158 static FILE     *debug_log_fp = NULL;
 159 static uint64_t zone_rss_cap;           /* RSS cap(KB) */
 160 static char     over_cmd[2 * BUFSIZ];   /* same size as zone_attr_value */
 161 static boolean_t skip_vmusage = B_FALSE;
 162 static boolean_t skip_pageout = B_FALSE;
 163 static boolean_t skip_pf_throttle = B_FALSE;
 164 


 165 static int64_t check_suspend();
 166 static void get_mcap_tunables();
 167 
 168 /*
 169  * State for walking one process's address space: the array of mappings,
 170  * how many there are, and which one we are currently working on.
 171  */
 172 typedef struct {
 173         int pr_curr;            /* the # of the mapping we're working on */
 174         int pr_nmap;            /* number of mappings in address space */
 175         prmap_t *pr_mapp;       /* process's map array */
 176 } proc_map_t;
 177 
 178 typedef struct zsd_vmusage64 {
 179         id_t vmu_zoneid;
 180         uint_t vmu_type;
 181         id_t vmu_id;
 182         /*
 183          * An amd64 kernel will align the following uint64_t members, but a
 184          * 32bit i386 process will not without help.


 369         if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM)
 370                 return (0);
 371 
 372         errno = 0;
 373         res = syscall(SYS_rusagesys, _RUSAGESYS_INVALMAP, pid, pmp->pr_vaddr,
 374             pmp->pr_size);
 375 
 376         return (res);
 377 }
 378 
 379 /*
 380  * Work through a process, paging out mappings until its whole address space
 381  * has been examined or the excess is <= 0.  Returns the updated excess estimate.
 382  */
 383 static int64_t
 384 pageout_process(pid_t pid, int64_t excess)
 385 {
 386         int                     psfd;
 387         prmap_t                 *pmap;
 388         proc_map_t              cur;

 389         int64_t                 sum_d_rss, d_rss;
 390         int64_t                 old_rss;
 391         int                     map_cnt;
 392         psinfo_t                psinfo;
 393         char                    pathbuf[MAXPATHLEN];
 394 
 395         (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
 396             pid);
 397         if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
 398                 return (excess);
 399 
 400         cur.pr_mapp = NULL;
 401 
 402         if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo))
 403                 goto done;
 404 
 405         old_rss = (int64_t)psinfo.pr_rssize;
 406         map_cnt = 0;
 407 
 408         /* If unscannable, skip it. */


 420         }
 421 
 422         /* Get segment residency information. */
 423         pmap = init_map(&cur, pid);
 424 
 425         /* Skip process if it has no mappings. */
 426         if (pmap == NULL) {
 427                 debug("pid %ld: map unreadable; ignoring\n", pid);
 428                 goto done;
 429         }
 430 
 431         debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n",
 432             pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs);
 433 
 434         /*
 435          * Within the process's address space, attempt to page out mappings.
 436          */
 437         sum_d_rss = 0;
 438         while (excess > 0 && pmap != NULL && !shutting_down) {
 439                 /* invalidate the entire mapping */
 440                 if (pageout_mapping(pid, pmap) < 0)
 441                         debug("pid %ld: mapping 0x%p %ldkb unpageable (%d)\n",
 442                             pid, (void *)pmap->pr_vaddr,
 443                             (long)pmap->pr_size / 1024L, errno);
 444 
 445                 map_cnt++;
 446 
 447                 /*
 448                  * Re-check the process rss and get the delta.
 449                  */
 450                 if (pread(psfd, &psinfo, sizeof (psinfo), 0)
 451                     != sizeof (psinfo)) {
 452                         excess -= old_rss;
 453                         goto done;
 454                 }
 455 
 456                 d_rss = (int64_t)psinfo.pr_rssize - old_rss;
 457                 old_rss = (int64_t)psinfo.pr_rssize;
 458                 sum_d_rss += d_rss;
 459 
 460                 /*
 461                  * d_rss hopefully should be negative (or 0 if nothing
 462                  * invalidated) but can be positive if more got paged in.
 463                  */


1137 
1138                 debug("process pass done; excess %lld\n", (long long)excess);
1139                 rewinddir(pdir);
1140 
1141                 if (skip_pageout)
1142                         (void) sleep_shutdown(120);
1143         }
1144 
1145         if (pdir != NULL)
1146                 (void) closedir(pdir);
1147         debug("thread shutdown\n");
1148 }
1149 
1150 void
1151 create_mcap_thread(zlog_t *zlogp, zoneid_t id)
1152 {
1153         int             res;
1154 
1155         shutting_down = 0;
1156         zid = id;

1157 
1158         /* all but the lx brand currently use /proc */
1159         if (strcmp(brand_name, "lx") == 0) {
1160                 (void) snprintf(zoneproc, sizeof (zoneproc),
1161                     "%s/root/native/proc", zonepath);
1162         } else {
1163                 (void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc",
1164                     zonepath);
1165         }
1166 
1167         (void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log",
1168             zonepath);
1169 
1170         res = thr_create(NULL, NULL, (void *(*)(void *))mcap_zone, NULL, NULL,
1171             &mcap_tid);
1172         if (res != 0) {
1173                 zerror(zlogp, B_FALSE, "error %d creating memory cap thread",
1174                     res);
1175                 mcap_tid = 0;
1176         }