1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 /*
  26  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  27  */
  28 
  29 #include <sys/types.h>
  30 #include <sys/param.h>
  31 #include <sys/systm.h>
  32 #include <sys/errno.h>
  33 #include <sys/kmem.h>
  34 #include <sys/vnode.h>
  35 #include <sys/vfs_opreg.h>
  36 #include <sys/swap.h>
  37 #include <sys/sysmacros.h>
  38 #include <sys/buf.h>
  39 #include <sys/callb.h>
  40 #include <sys/debug.h>
  41 #include <vm/seg.h>
  42 #include <sys/fs/swapnode.h>
  43 #include <fs/fs_subr.h>
  44 #include <sys/cmn_err.h>
  45 #include <sys/mem_config.h>
  46 #include <sys/atomic.h>
  47 
  48 extern const fs_operation_def_t swap_vnodeops_template[];
  49 
  50 /*
  51  * swapfs_minfree is the amount of physical memory (actually remaining
  52  * availrmem) that we want to keep free for the rest of the system.  This
  53  * means that swapfs can only grow to availrmem - swapfs_minfree.  This
  54  * can be set as just constant value or a certain percentage of installed
  55  * physical memory. It is set in swapinit().
  56  *
  57  * Users who want to change the amount of memory that can be used as swap
  58  * space should do so by setting swapfs_desfree at boot time,
  59  * not swapfs_minfree.
  60  */
  61 
  62 pgcnt_t swapfs_desfree = 0;
  63 volatile pgcnt_t swapfs_minfree = 0;
  64 volatile pgcnt_t swapfs_reserve = 0;
  65 
  66 #ifdef SWAPFS_DEBUG
  67 int swapfs_debug;
  68 #endif /* SWAPFS_DEBUG */
  69 
  70 
  71 static int swapfs_vpcount;
  72 static kmutex_t swapfs_lock;
  73 static struct async_reqs *sw_ar, *sw_pendlist, *sw_freelist;
  74 
  75 static struct vnode **swap_vnodes;      /* ptr's to swap vnodes */
  76 
  77 static void swap_init_mem_config(void);
  78 
  79 static pgcnt_t initial_swapfs_desfree;
  80 static pgcnt_t initial_swapfs_minfree;
  81 static pgcnt_t initial_swapfs_reserve;
  82 
  83 static int swap_sync(struct vfs *vfsp, short flag, struct cred *cr);
  84 
  85 static void
  86 swapfs_recalc_save_initial(void)
  87 {
  88         initial_swapfs_desfree = swapfs_desfree;
  89         initial_swapfs_minfree = swapfs_minfree;
  90         initial_swapfs_reserve = swapfs_reserve;
  91 }
  92 
  93 static int
  94 swapfs_recalc(pgcnt_t pgs)
  95 {
  96         pgcnt_t new_swapfs_desfree;
  97         pgcnt_t new_swapfs_minfree;
  98         pgcnt_t new_swapfs_reserve;
  99 
 100         new_swapfs_desfree = initial_swapfs_desfree;
 101         new_swapfs_minfree = initial_swapfs_minfree;
 102         new_swapfs_reserve = initial_swapfs_reserve;
 103 
 104         if (new_swapfs_desfree == 0)
 105                 new_swapfs_desfree = btopr(7 * 512 * 1024); /* 3-1/2Mb */;
 106 
 107         if (new_swapfs_minfree == 0) {
 108                 /*
 109                  * We set this lower than we'd like here, 2Mb, because we
 110                  * always boot on swapfs. It's up to a safer value,
 111                  * swapfs_desfree, when/if we add physical swap devices
 112                  * in swapadd(). Users who want to change the amount of
 113                  * memory that can be used as swap space should do so by
 114                  * setting swapfs_desfree at boot time, not swapfs_minfree.
 115                  * However, swapfs_minfree is tunable by install as a
 116                  * workaround for bugid 1147463.
 117                  */
 118                 new_swapfs_minfree = MAX(btopr(2 * 1024 * 1024), pgs >> 3);
 119         }
 120 
 121         /*
 122          * priv processes can reserve memory as swap as long as availrmem
 123          * remains greater than swapfs_minfree; in the case of non-priv
 124          * processes, memory can be reserved as swap only if availrmem
 125          * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus,
 126          * swapfs_reserve amount of memswap is not available to non-priv
 127          * processes. This protects daemons such as automounter dying
 128          * as a result of application processes eating away almost entire
 129          * membased swap. This safeguard becomes useless if apps are run
 130          * with root access.
 131          *
 132          * set swapfs_reserve to a minimum of 4Mb or 1/128 of physmem whichever
 133          * is greater up to the limit of 128 MB.
 134          */
 135         if (new_swapfs_reserve == 0)
 136                 new_swapfs_reserve = MIN(btopr(128 * 1024 * 1024),
 137                     MAX(btopr(4 * 1024 * 1024), pgs >> 7));
 138 
 139         /* Test basic numeric viability. */
 140         if (new_swapfs_minfree > pgs)
 141                 return (0);
 142 
 143         /* Equivalent test to anon_resvmem() check. */
 144         if (availrmem < new_swapfs_minfree) {
 145                 /*
 146                  * If ism pages are being used, then there must be agreement
 147                  * between these two policies.
 148                  */
 149                 if ((availrmem > segspt_minfree) && (segspt_minfree > 0)) {
 150                         new_swapfs_minfree = segspt_minfree;
 151                 } else {
 152                         return (0);
 153                 }
 154         }
 155 
 156         swapfs_desfree = new_swapfs_desfree;
 157         swapfs_minfree = new_swapfs_minfree;
 158         swapfs_reserve = new_swapfs_reserve;
 159 
 160         return (1);
 161 }
 162 
 163 /*ARGSUSED1*/
 164 int
 165 swapinit(int fstype, char *name)
 166 {                                                       /* reserve for mp */
 167         ssize_t sw_freelist_size = klustsize / PAGESIZE * 2;
 168         int i, error;
 169 
 170         static const fs_operation_def_t swap_vfsops[] = {
 171                 VFSNAME_SYNC, { .vfs_sync = swap_sync },
 172                 NULL, NULL
 173         };
 174 
 175         SWAPFS_PRINT(SWAP_SUBR, "swapinit\n", 0, 0, 0, 0, 0);
 176         mutex_init(&swapfs_lock, NULL, MUTEX_DEFAULT, NULL);
 177 
 178         swap_vnodes = kmem_zalloc(MAX_SWAP_VNODES * sizeof (struct vnode *),
 179             KM_SLEEP);
 180 
 181         swapfs_recalc_save_initial();
 182         if (!swapfs_recalc(physmem))
 183                 cmn_err(CE_PANIC, "swapfs_minfree(%lu) > physmem(%lu)",
 184                     swapfs_minfree, physmem);
 185 
 186         /*
 187          * Arrange for a callback on memory size change.
 188          */
 189         swap_init_mem_config();
 190 
 191         sw_ar = (struct async_reqs *)
 192             kmem_zalloc(sw_freelist_size*sizeof (struct async_reqs), KM_SLEEP);
 193 
 194         error = vfs_setfsops(fstype, swap_vfsops, NULL);
 195         if (error != 0) {
 196                 cmn_err(CE_WARN, "swapinit: bad vfs ops template");
 197                 return (error);
 198         }
 199 
 200         error = vn_make_ops(name, swap_vnodeops_template, &swap_vnodeops);
 201         if (error != 0) {
 202                 (void) vfs_freevfsops_by_type(fstype);
 203                 cmn_err(CE_WARN, "swapinit: bad vnode ops template");
 204                 return (error);
 205         }
 206         sw_freelist = sw_ar;
 207         for (i = 0; i < sw_freelist_size - 1; i++)
 208                 sw_ar[i].a_next = &sw_ar[i + 1];
 209 
 210         return (0);
 211 }
 212 
 213 /*
 214  * Get a swapfs vnode corresponding to the specified identifier.
 215  */
 216 struct vnode *
 217 swapfs_getvp(ulong_t vidx)
 218 {
 219         struct vnode *vp;
 220 
 221         vp = swap_vnodes[vidx];
 222         if (vp) {
 223                 return (vp);
 224         }
 225 
 226         mutex_enter(&swapfs_lock);
 227         vp = swap_vnodes[vidx];
 228         if (vp == NULL) {
 229                 vp = vn_alloc(KM_SLEEP);
 230                 vn_setops(vp, swap_vnodeops);
 231                 vp->v_type = VREG;
 232                 vp->v_flag |= (VISSWAP|VISSWAPFS);
 233                 swap_vnodes[vidx] = vp;
 234                 swapfs_vpcount++;
 235         }
 236         mutex_exit(&swapfs_lock);
 237         return (vp);
 238 }
 239 
 240 int swap_lo;
 241 
 242 /*ARGSUSED*/
 243 static int
 244 swap_sync(struct vfs *vfsp, short flag, struct cred *cr)
 245 {
 246         struct vnode *vp;
 247         int i;
 248 
 249         if (!(flag & SYNC_ALL))
 250                 return (1);
 251 
 252         /*
 253          * assumes that we are the only one left to access this so that
 254          * no need to use swapfs_lock (since it's staticly defined)
 255          */
 256         for (i = 0; i < MAX_SWAP_VNODES; i++) {
 257                 vp = swap_vnodes[i];
 258                 if (vp) {
 259                         VN_HOLD(vp);
 260                         (void) VOP_PUTPAGE(vp, (offset_t)0, 0,
 261                             (B_ASYNC | B_FREE), kcred, NULL);
 262                         VN_RELE(vp);
 263                 }
 264         }
 265         return (0);
 266 }
 267 
 268 extern int sw_pending_size;
 269 
 270 /*
 271  * Take an async request off the pending queue
 272  */
 273 struct async_reqs *
 274 sw_getreq()
 275 {
 276         struct async_reqs *arg;
 277 
 278         mutex_enter(&swapfs_lock);
 279         arg = sw_pendlist;
 280         if (arg) {
 281                 sw_pendlist = arg->a_next;
 282                 arg->a_next = NULL;
 283                 sw_pending_size -= PAGESIZE;
 284         }
 285         ASSERT(sw_pending_size >= 0);
 286         mutex_exit(&swapfs_lock);
 287         return (arg);
 288 }
 289 
 290 /*
 291  * Put an async request on the pending queue
 292  */
 293 void
 294 sw_putreq(struct async_reqs *arg)
 295 {
 296         /* Hold onto it */
 297         VN_HOLD(arg->a_vp);
 298 
 299         mutex_enter(&swapfs_lock);
 300         arg->a_next = sw_pendlist;
 301         sw_pendlist = arg;
 302         sw_pending_size += PAGESIZE;
 303         mutex_exit(&swapfs_lock);
 304 }
 305 
 306 /*
 307  * Put an async request back on the pending queue
 308  */
 309 void
 310 sw_putbackreq(struct async_reqs *arg)
 311 {
 312         mutex_enter(&swapfs_lock);
 313         arg->a_next = sw_pendlist;
 314         sw_pendlist = arg;
 315         sw_pending_size += PAGESIZE;
 316         mutex_exit(&swapfs_lock);
 317 }
 318 
 319 /*
 320  * Take an async request structure off the free list
 321  */
 322 struct async_reqs *
 323 sw_getfree()
 324 {
 325         struct async_reqs *arg;
 326 
 327         mutex_enter(&swapfs_lock);
 328         arg = sw_freelist;
 329         if (arg) {
 330                 sw_freelist = arg->a_next;
 331                 arg->a_next = NULL;
 332         }
 333         mutex_exit(&swapfs_lock);
 334         return (arg);
 335 }
 336 
 337 /*
 338  * Put an async request structure on the free list
 339  */
 340 void
 341 sw_putfree(struct async_reqs *arg)
 342 {
 343         /* Release our hold - should have locked the page by now */
 344         VN_RELE(arg->a_vp);
 345 
 346         mutex_enter(&swapfs_lock);
 347         arg->a_next = sw_freelist;
 348         sw_freelist = arg;
 349         mutex_exit(&swapfs_lock);
 350 }
 351 
 352 static pgcnt_t swapfs_pending_delete;
 353 
 354 /*ARGSUSED*/
 355 static void
 356 swap_mem_config_post_add(
 357         void *arg,
 358         pgcnt_t delta_swaps)
 359 {
 360         (void) swapfs_recalc(physmem - swapfs_pending_delete);
 361 }
 362 
 363 /*ARGSUSED*/
 364 static int
 365 swap_mem_config_pre_del(
 366         void *arg,
 367         pgcnt_t delta_swaps)
 368 {
 369         pgcnt_t nv;
 370 
 371         nv = atomic_add_long_nv(&swapfs_pending_delete, (spgcnt_t)delta_swaps);
 372         if (!swapfs_recalc(physmem - nv)) {
 373                 /*
 374                  * Tidy-up is done by the call to post_del which
 375                  * is always made.
 376                  */
 377                 cmn_err(CE_NOTE, "Memory operation refused to ensure system "
 378                     "doesn't deadlock due to excessive consumption by swapfs.");
 379                 return (EBUSY);
 380         }
 381         return (0);
 382 }
 383 
 384 /*ARGSUSED*/
 385 static void
 386 swap_mem_config_post_del(
 387         void *arg,
 388         pgcnt_t delta_swaps,
 389         int cancelled)
 390 {
 391         pgcnt_t nv;
 392 
 393         nv = atomic_add_long_nv(&swapfs_pending_delete, -(spgcnt_t)delta_swaps);
 394         (void) swapfs_recalc(physmem - nv);
 395 }
 396 
 397 static kphysm_setup_vector_t swap_mem_config_vec = {
 398         KPHYSM_SETUP_VECTOR_VERSION,
 399         swap_mem_config_post_add,
 400         swap_mem_config_pre_del,
 401         swap_mem_config_post_del,
 402 };
 403 
 404 static void
 405 swap_init_mem_config(void)
 406 {
 407         int ret;
 408 
 409         ret = kphysm_setup_func_register(&swap_mem_config_vec, (void *)NULL);
 410         ASSERT(ret == 0);
 411 }