Commit | Line | Data |
---|---|---|
a50777c7 DM |
1 | /****************************************************************************** |
2 | * Xen selfballoon driver (and optional frontswap self-shrinking driver) | |
3 | * | |
4 | * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. | |
5 | * | |
6 | * This code complements the cleancache and frontswap patchsets to optimize | |
7 | * support for Xen Transcendent Memory ("tmem"). The policy it implements | |
8 | * is rudimentary and will likely improve over time, but it does work well | |
9 | * enough today. | |
10 | * | |
11 | * Two functionalities are implemented here which both use "control theory" | |
12 | * (feedback) to optimize memory utilization. In a virtualized environment | |
13 | * such as Xen, RAM is often a scarce resource and we would like to ensure | |
14 | * that each of a possibly large number of virtual machines is using RAM | |
15 | * efficiently, i.e. using as little as possible when under light load | |
16 | * and obtaining as much as possible when memory demands are high. | |
17 | * Since RAM needs vary highly dynamically and sometimes dramatically, | |
18 | * "hysteresis" is used, that is, memory target is determined not just | |
19 | * on current data but also on past data stored in the system. | |
20 | * | |
21 | * "Selfballooning" creates memory pressure by managing the Xen balloon | |
22 | * driver to decrease and increase available kernel memory, driven | |
23 | * largely by the target value of "Committed_AS" (see /proc/meminfo). | |
24 | * Since Committed_AS does not account for clean mapped pages (i.e. pages | |
25 | * in RAM that are identical to pages on disk), selfballooning has the | |
26 | * effect of pushing less frequently used clean pagecache pages out of | |
27 | * kernel RAM and, presumably using cleancache, into Xen tmem where | |
28 | * Xen can more efficiently optimize RAM utilization for such pages. | |
29 | * | |
30 | * When kernel memory demand unexpectedly increases faster than Xen, via | |
31 | * the selfballoon driver, is able to (or chooses to) provide usable RAM, | |
32 | * the kernel may invoke swapping. In most cases, frontswap is able | |
33 | * to absorb this swapping into Xen tmem. However, due to the fact | |
34 | * that the kernel swap subsystem assumes swapping occurs to a disk, | |
35 | * swapped pages may sit on the disk for a very long time, even if | |
36 | * the kernel knows the page will never be used again. This is because | |
37 | * the disk space costs very little and can be overwritten when | |
38 | * necessary. When such stale pages are in frontswap, however, they | |
39 | * are taking up valuable real estate. "Frontswap selfshrinking" works | |
40 | * to resolve this: When frontswap activity is otherwise stable | |
41 | * and the guest kernel is not under memory pressure, the "frontswap | |
42 | * selfshrinking" accounts for this by providing pressure to remove some | |
43 | * pages from frontswap and return them to kernel memory. | |
44 | * | |
45 | * For both "selfballooning" and "frontswap-selfshrinking", a worker | |
46 | * thread is used and sysfs tunables are provided to adjust the frequency | |
47 | * and rate of adjustments to achieve the goal, as well as to disable one | |
48 | * or both functions independently. | |
49 | * | |
50 | * While some argue that this functionality can and should be implemented | |
51 | * in userspace, it has been observed that bad things happen (e.g. OOMs). | |
52 | * | |
53 | * System configuration note: Selfballooning should not be enabled on | |
54 | * systems without a sufficiently large swap device configured; for best | |
55 | * results, it is recommended that total swap be increased by the size | |
56 | * of the guest memory. Also, while technically not required to be | |
57 | * configured, it is highly recommended that frontswap also be configured | |
58 | * and enabled when selfballooning is running. So, selfballooning | |
59 | * is disabled by default if frontswap is not configured and can only | |
60 | * be enabled with the "selfballooning" kernel boot option; similarly | |
61 | * selfballooning is enabled by default if frontswap is configured and | |
62 | * can be disabled with the "noselfballooning" kernel boot option. Finally, | |
63 | * when frontswap is configured, frontswap-selfshrinking can be disabled | |
64 | * with the "noselfshrink" kernel boot option. | |
65 | * | |
66 | * Selfballooning is disallowed in domain0 and force-disabled. | |
67 | * | |
68 | */ | |
69 | ||
70 | #include <linux/kernel.h> | |
71 | #include <linux/mm.h> | |
72 | #include <linux/mman.h> | |
0642d2ed | 73 | #include <linux/workqueue.h> |
a50777c7 | 74 | #include <xen/balloon.h> |
a50777c7 | 75 | #include <xen/tmem.h> |
0642d2ed | 76 | #include <xen/xen.h> |
a50777c7 DM |
77 | |
78 | /* Non-zero while selfballooning is active; set at init and toggled via the "selfballooning" sysfs attribute. */ | |
79 | static int xen_selfballooning_enabled __read_mostly; | |
80 | ||
81 | /* | |
82 | * Controls rate at which memory target (this iteration) approaches | |
83 | * ultimate goal when memory need is increasing (up-hysteresis) or | |
84 | * decreasing (down-hysteresis). Each value divides the gap between | |
85 | * the current size and the goal, so higher values cause slower | |
86 | * changes and zero is rejected by sysfs. Defaults were deemed reasonable | |
87 | * by experimentation, may be workload-dependent, and are sysfs-tunable. | |
88 | */ | |
89 | static unsigned int selfballoon_downhysteresis __read_mostly = 8; | |
90 | static unsigned int selfballoon_uphysteresis __read_mostly = 1; | |
91 | ||
92 | /* In seconds (multiplied by HZ when the work is queued); controls frequency of worker invocation. */ | |
93 | static unsigned int selfballoon_interval __read_mostly = 5; | |
94 | ||
95 | static void selfballoon_process(struct work_struct *work); /* forward declaration for the work item below */ | |
96 | static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process); /* single delayed work item; its handler drives both selfballooning and frontswap selfshrinking */ | |
97 | ||
98 | #ifdef CONFIG_FRONTSWAP | |
99 | #include <linux/frontswap.h> | |
100 | ||
101 | /* True while frontswap selfshrinking is active; set at init and toggled via the "frontswap_selfshrinking" sysfs attribute. */ | |
102 | static bool frontswap_selfshrinking __read_mostly; | |
103 | ||
104 | /* Boot-time default for frontswap selfshrinking; cleared by the "noselfshrink" kernel boot option. */ | |
105 | static bool use_frontswap_selfshrink __initdata = true; | |
106 | ||
107 | /* | |
108 | * The default values for the following parameters were deemed reasonable | |
109 | * by experimentation, may be workload-dependent, and can all be | |
110 | * adjusted via sysfs. | |
111 | */ | |
112 | ||
113 | /* Control rate for frontswap shrinking: each shrink removes 1/frontswap_hysteresis of the current pages, so higher is slower. Used as a divisor, so it must stay non-zero. */ | |
114 | static unsigned int frontswap_hysteresis __read_mostly = 20; | |
115 | ||
116 | /* | |
117 | * Number of selfballoon worker invocations to wait before observing that | |
118 | * frontswap selfshrinking should commence. Note that selfshrinking does | |
119 | * not use a separate worker thread. | |
120 | */ | |
121 | static unsigned int frontswap_inertia __read_mostly = 3; | |
122 | ||
123 | /* Countdown to next invocation of frontswap_shrink(); reloaded from frontswap_inertia whenever frontswap is empty or has grown. */ | |
124 | static unsigned long frontswap_inertia_counter; | |
125 | ||
126 | /* | |
127 | * Invoked by the selfballoon worker thread, uses current number of pages | |
128 | * in frontswap (frontswap_curr_pages()), previous status, and control | |
129 | * values (hysteresis and inertia) to determine if frontswap should be | |
130 | * shrunk and what the new frontswap size should be. Note that | |
131 | * frontswap_shrink is essentially a partial swapoff that immediately | |
132 | * transfers pages from the "swap device" (frontswap) back into kernel | |
133 | * RAM; despite the name, frontswap "shrinking" is very different from | |
134 | * the "shrinker" interface used by the kernel MM subsystem to reclaim | |
135 | * memory. | |
136 | */ | |
137 | static void frontswap_selfshrink(void) | |
138 | { | |
139 | static unsigned long cur_frontswap_pages; | |
140 | static unsigned long last_frontswap_pages; | |
141 | static unsigned long tgt_frontswap_pages; | |
142 | ||
143 | last_frontswap_pages = cur_frontswap_pages; | |
144 | cur_frontswap_pages = frontswap_curr_pages(); | |
145 | if (!cur_frontswap_pages || | |
146 | (cur_frontswap_pages > last_frontswap_pages)) { | |
147 | frontswap_inertia_counter = frontswap_inertia; | |
148 | return; | |
149 | } | |
150 | if (frontswap_inertia_counter && --frontswap_inertia_counter) | |
151 | return; | |
152 | if (cur_frontswap_pages <= frontswap_hysteresis) | |
153 | tgt_frontswap_pages = 0; | |
154 | else | |
155 | tgt_frontswap_pages = cur_frontswap_pages - | |
156 | (cur_frontswap_pages / frontswap_hysteresis); | |
157 | frontswap_shrink(tgt_frontswap_pages); | |
158 | } | |
159 | ||
160 | static int __init xen_nofrontswap_selfshrink_setup(char *s) | |
161 | { | |
162 | use_frontswap_selfshrink = false; | |
163 | return 1; | |
164 | } | |
165 | ||
166 | __setup("noselfshrink", xen_nofrontswap_selfshrink_setup); | |
167 | ||
168 | /* Disable with kernel boot option. */ | |
169 | static bool use_selfballooning __initdata = true; | |
170 | ||
171 | static int __init xen_noselfballooning_setup(char *s) | |
172 | { | |
173 | use_selfballooning = false; | |
174 | return 1; | |
175 | } | |
176 | ||
177 | __setup("noselfballooning", xen_noselfballooning_setup); | |
178 | #else /* !CONFIG_FRONTSWAP */ | |
179 | /* Enable with kernel boot option. */ | |
180 | static bool use_selfballooning __initdata = false; | |
181 | ||
182 | static int __init xen_selfballooning_setup(char *s) | |
183 | { | |
184 | use_selfballooning = true; | |
185 | return 1; | |
186 | } | |
187 | ||
188 | __setup("selfballooning", xen_selfballooning_setup); | |
189 | #endif /* CONFIG_FRONTSWAP */ | |
190 | ||
191 | /* | |
192 | * Use current balloon size, the goal (vm_committed_as), and hysteresis | |
193 | * parameters to set a new target balloon size | |
194 | */ | |
195 | static void selfballoon_process(struct work_struct *work) | |
196 | { | |
197 | unsigned long cur_pages, goal_pages, tgt_pages; | |
198 | bool reset_timer = false; | |
199 | ||
200 | if (xen_selfballooning_enabled) { | |
201 | cur_pages = balloon_stats.current_pages; | |
202 | tgt_pages = cur_pages; /* default is no change */ | |
203 | goal_pages = percpu_counter_read_positive(&vm_committed_as) + | |
204 | balloon_stats.current_pages - totalram_pages; | |
205 | #ifdef CONFIG_FRONTSWAP | |
206 | /* allow space for frontswap pages to be repatriated */ | |
207 | if (frontswap_selfshrinking && frontswap_enabled) | |
208 | goal_pages += frontswap_curr_pages(); | |
209 | #endif | |
210 | if (cur_pages > goal_pages) | |
211 | tgt_pages = cur_pages - | |
212 | ((cur_pages - goal_pages) / | |
213 | selfballoon_downhysteresis); | |
214 | else if (cur_pages < goal_pages) | |
215 | tgt_pages = cur_pages + | |
216 | ((goal_pages - cur_pages) / | |
217 | selfballoon_uphysteresis); | |
218 | /* else if cur_pages == goal_pages, no change */ | |
219 | balloon_set_new_target(tgt_pages); | |
220 | reset_timer = true; | |
221 | } | |
222 | #ifdef CONFIG_FRONTSWAP | |
223 | if (frontswap_selfshrinking && frontswap_enabled) { | |
224 | frontswap_selfshrink(); | |
225 | reset_timer = true; | |
226 | } | |
227 | #endif | |
228 | if (reset_timer) | |
229 | schedule_delayed_work(&selfballoon_worker, | |
230 | selfballoon_interval * HZ); | |
231 | } | |
232 | ||
233 | #ifdef CONFIG_SYSFS | |
234 | ||
235 | #include <linux/sysdev.h> | |
236 | #include <linux/capability.h> | |
237 | ||
/*
 * Generate show_<name>(), a sysdev attribute show handler that formats
 * the supplied arguments into buf with sprintf() and returns sprintf()'s
 * result.
 */
#define SELFBALLOON_SHOW(name, fmt, ...)				\
	static ssize_t show_##name(struct sys_device *dev,		\
				   struct sysdev_attribute *attr,	\
				   char *buf)				\
	{								\
		return sprintf(buf, fmt, ##__VA_ARGS__);		\
	}
245 | ||
246 | SELFBALLOON_SHOW(selfballooning, "%d\n", xen_selfballooning_enabled); | |
247 | ||
248 | static ssize_t store_selfballooning(struct sys_device *dev, | |
249 | struct sysdev_attribute *attr, | |
250 | const char *buf, | |
251 | size_t count) | |
252 | { | |
253 | bool was_enabled = xen_selfballooning_enabled; | |
254 | unsigned long tmp; | |
255 | int err; | |
256 | ||
257 | if (!capable(CAP_SYS_ADMIN)) | |
258 | return -EPERM; | |
259 | ||
260 | err = strict_strtoul(buf, 10, &tmp); | |
261 | if (err || ((tmp != 0) && (tmp != 1))) | |
262 | return -EINVAL; | |
263 | ||
264 | xen_selfballooning_enabled = !!tmp; | |
265 | if (!was_enabled && xen_selfballooning_enabled) | |
266 | schedule_delayed_work(&selfballoon_worker, | |
267 | selfballoon_interval * HZ); | |
268 | ||
269 | return count; | |
270 | } | |
271 | ||
272 | static SYSDEV_ATTR(selfballooning, S_IRUGO | S_IWUSR, | |
273 | show_selfballooning, store_selfballooning); | |
274 | ||
275 | SELFBALLOON_SHOW(selfballoon_interval, "%d\n", selfballoon_interval); | |
276 | ||
277 | static ssize_t store_selfballoon_interval(struct sys_device *dev, | |
278 | struct sysdev_attribute *attr, | |
279 | const char *buf, | |
280 | size_t count) | |
281 | { | |
282 | unsigned long val; | |
283 | int err; | |
284 | ||
285 | if (!capable(CAP_SYS_ADMIN)) | |
286 | return -EPERM; | |
287 | err = strict_strtoul(buf, 10, &val); | |
288 | if (err || val == 0) | |
289 | return -EINVAL; | |
290 | selfballoon_interval = val; | |
291 | return count; | |
292 | } | |
293 | ||
294 | static SYSDEV_ATTR(selfballoon_interval, S_IRUGO | S_IWUSR, | |
295 | show_selfballoon_interval, store_selfballoon_interval); | |
296 | ||
297 | SELFBALLOON_SHOW(selfballoon_downhys, "%d\n", selfballoon_downhysteresis); | |
298 | ||
299 | static ssize_t store_selfballoon_downhys(struct sys_device *dev, | |
300 | struct sysdev_attribute *attr, | |
301 | const char *buf, | |
302 | size_t count) | |
303 | { | |
304 | unsigned long val; | |
305 | int err; | |
306 | ||
307 | if (!capable(CAP_SYS_ADMIN)) | |
308 | return -EPERM; | |
309 | err = strict_strtoul(buf, 10, &val); | |
310 | if (err || val == 0) | |
311 | return -EINVAL; | |
312 | selfballoon_downhysteresis = val; | |
313 | return count; | |
314 | } | |
315 | ||
316 | static SYSDEV_ATTR(selfballoon_downhysteresis, S_IRUGO | S_IWUSR, | |
317 | show_selfballoon_downhys, store_selfballoon_downhys); | |
318 | ||
319 | ||
320 | SELFBALLOON_SHOW(selfballoon_uphys, "%d\n", selfballoon_uphysteresis); | |
321 | ||
322 | static ssize_t store_selfballoon_uphys(struct sys_device *dev, | |
323 | struct sysdev_attribute *attr, | |
324 | const char *buf, | |
325 | size_t count) | |
326 | { | |
327 | unsigned long val; | |
328 | int err; | |
329 | ||
330 | if (!capable(CAP_SYS_ADMIN)) | |
331 | return -EPERM; | |
332 | err = strict_strtoul(buf, 10, &val); | |
333 | if (err || val == 0) | |
334 | return -EINVAL; | |
335 | selfballoon_uphysteresis = val; | |
336 | return count; | |
337 | } | |
338 | ||
339 | static SYSDEV_ATTR(selfballoon_uphysteresis, S_IRUGO | S_IWUSR, | |
340 | show_selfballoon_uphys, store_selfballoon_uphys); | |
341 | ||
342 | #ifdef CONFIG_FRONTSWAP | |
343 | SELFBALLOON_SHOW(frontswap_selfshrinking, "%d\n", frontswap_selfshrinking); | |
344 | ||
345 | static ssize_t store_frontswap_selfshrinking(struct sys_device *dev, | |
346 | struct sysdev_attribute *attr, | |
347 | const char *buf, | |
348 | size_t count) | |
349 | { | |
350 | bool was_enabled = frontswap_selfshrinking; | |
351 | unsigned long tmp; | |
352 | int err; | |
353 | ||
354 | if (!capable(CAP_SYS_ADMIN)) | |
355 | return -EPERM; | |
356 | err = strict_strtoul(buf, 10, &tmp); | |
357 | if (err || ((tmp != 0) && (tmp != 1))) | |
358 | return -EINVAL; | |
359 | frontswap_selfshrinking = !!tmp; | |
360 | if (!was_enabled && !xen_selfballooning_enabled && | |
361 | frontswap_selfshrinking) | |
362 | schedule_delayed_work(&selfballoon_worker, | |
363 | selfballoon_interval * HZ); | |
364 | ||
365 | return count; | |
366 | } | |
367 | ||
368 | static SYSDEV_ATTR(frontswap_selfshrinking, S_IRUGO | S_IWUSR, | |
369 | show_frontswap_selfshrinking, store_frontswap_selfshrinking); | |
370 | ||
371 | SELFBALLOON_SHOW(frontswap_inertia, "%d\n", frontswap_inertia); | |
372 | ||
373 | static ssize_t store_frontswap_inertia(struct sys_device *dev, | |
374 | struct sysdev_attribute *attr, | |
375 | const char *buf, | |
376 | size_t count) | |
377 | { | |
378 | unsigned long val; | |
379 | int err; | |
380 | ||
381 | if (!capable(CAP_SYS_ADMIN)) | |
382 | return -EPERM; | |
383 | err = strict_strtoul(buf, 10, &val); | |
384 | if (err || val == 0) | |
385 | return -EINVAL; | |
386 | frontswap_inertia = val; | |
387 | frontswap_inertia_counter = val; | |
388 | return count; | |
389 | } | |
390 | ||
391 | static SYSDEV_ATTR(frontswap_inertia, S_IRUGO | S_IWUSR, | |
392 | show_frontswap_inertia, store_frontswap_inertia); | |
393 | ||
394 | SELFBALLOON_SHOW(frontswap_hysteresis, "%d\n", frontswap_hysteresis); | |
395 | ||
396 | static ssize_t store_frontswap_hysteresis(struct sys_device *dev, | |
397 | struct sysdev_attribute *attr, | |
398 | const char *buf, | |
399 | size_t count) | |
400 | { | |
401 | unsigned long val; | |
402 | int err; | |
403 | ||
404 | if (!capable(CAP_SYS_ADMIN)) | |
405 | return -EPERM; | |
406 | err = strict_strtoul(buf, 10, &val); | |
407 | if (err || val == 0) | |
408 | return -EINVAL; | |
409 | frontswap_hysteresis = val; | |
410 | return count; | |
411 | } | |
412 | ||
413 | static SYSDEV_ATTR(frontswap_hysteresis, S_IRUGO | S_IWUSR, | |
414 | show_frontswap_hysteresis, store_frontswap_hysteresis); | |
415 | ||
416 | #endif /* CONFIG_FRONTSWAP */ | |
417 | ||
418 | static struct attribute *selfballoon_attrs[] = { | |
419 | &attr_selfballooning.attr, | |
420 | &attr_selfballoon_interval.attr, | |
421 | &attr_selfballoon_downhysteresis.attr, | |
422 | &attr_selfballoon_uphysteresis.attr, | |
423 | #ifdef CONFIG_FRONTSWAP | |
424 | &attr_frontswap_selfshrinking.attr, | |
425 | &attr_frontswap_hysteresis.attr, | |
426 | &attr_frontswap_inertia.attr, | |
427 | #endif | |
428 | NULL | |
429 | }; | |
430 | ||
431 | static struct attribute_group selfballoon_group = { | |
432 | .name = "selfballoon", | |
433 | .attrs = selfballoon_attrs | |
434 | }; | |
435 | #endif | |
436 | ||
/*
 * Create the "selfballoon" sysfs attribute group under @sysdev.
 *
 * Returns the result of sysfs_create_group() when CONFIG_SYSFS is set,
 * and -1 otherwise.
 */
int register_xen_selfballooning(struct sys_device *sysdev)
{
#ifdef CONFIG_SYSFS
	return sysfs_create_group(&sysdev->kobj, &selfballoon_group);
#else
	return -1;
#endif
}
EXPORT_SYMBOL(register_xen_selfballooning);
447 | ||
448 | static int __init xen_selfballoon_init(void) | |
449 | { | |
450 | bool enable = false; | |
451 | ||
452 | if (!xen_domain()) | |
453 | return -ENODEV; | |
454 | ||
455 | if (xen_initial_domain()) { | |
456 | pr_info("xen/balloon: Xen selfballooning driver " | |
457 | "disabled for domain0.\n"); | |
458 | return -ENODEV; | |
459 | } | |
460 | ||
461 | xen_selfballooning_enabled = tmem_enabled && use_selfballooning; | |
462 | if (xen_selfballooning_enabled) { | |
463 | pr_info("xen/balloon: Initializing Xen " | |
464 | "selfballooning driver.\n"); | |
465 | enable = true; | |
466 | } | |
467 | #ifdef CONFIG_FRONTSWAP | |
468 | frontswap_selfshrinking = tmem_enabled && use_frontswap_selfshrink; | |
469 | if (frontswap_selfshrinking) { | |
470 | pr_info("xen/balloon: Initializing frontswap " | |
471 | "selfshrinking driver.\n"); | |
472 | enable = true; | |
473 | } | |
474 | #endif | |
475 | if (!enable) | |
476 | return -ENODEV; | |
477 | ||
478 | schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ); | |
479 | ||
480 | return 0; | |
481 | } | |
482 | ||
483 | subsys_initcall(xen_selfballoon_init); | |
484 | ||
485 | MODULE_LICENSE("GPL"); |