Commit | Line | Data |
---|---|---|
a50777c7 DM |
1 | /****************************************************************************** |
2 | * Xen selfballoon driver (and optional frontswap self-shrinking driver) | |
3 | * | |
4 | * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. | |
5 | * | |
6 | * This code complements the cleancache and frontswap patchsets to optimize | |
7 | * support for Xen Transcendent Memory ("tmem"). The policy it implements | |
8 | * is rudimentary and will likely improve over time, but it does work well | |
9 | * enough today. | |
10 | * | |
11 | * Two functionalities are implemented here which both use "control theory" | |
12 | * (feedback) to optimize memory utilization. In a virtualized environment | |
13 | * such as Xen, RAM is often a scarce resource and we would like to ensure | |
14 | * that each of a possibly large number of virtual machines is using RAM | |
15 | * efficiently, i.e. using as little as possible when under light load | |
16 | * and obtaining as much as possible when memory demands are high. | |
17 | * Since RAM needs vary highly dynamically and sometimes dramatically, | |
18 | * "hysteresis" is used, that is, memory target is determined not just | |
19 | * on current data but also on past data stored in the system. | |
20 | * | |
21 | * "Selfballooning" creates memory pressure by managing the Xen balloon | |
22 | * driver to decrease and increase available kernel memory, driven | |
23 | * largely by the target value of "Committed_AS" (see /proc/meminfo). | |
24 | * Since Committed_AS does not account for clean mapped pages (i.e. pages | |
25 | * in RAM that are identical to pages on disk), selfballooning has the | |
26 | * effect of pushing less frequently used clean pagecache pages out of | |
27 | * kernel RAM and, presumably using cleancache, into Xen tmem where | |
28 | * Xen can more efficiently optimize RAM utilization for such pages. | |
29 | * | |
30 | * When kernel memory demand unexpectedly increases faster than Xen, via | |
31 | * the selfballoon driver, is able to (or chooses to) provide usable RAM, | |
32 | * the kernel may invoke swapping. In most cases, frontswap is able | |
33 | * to absorb this swapping into Xen tmem. However, due to the fact | |
34 | * that the kernel swap subsystem assumes swapping occurs to a disk, | |
35 | * swapped pages may sit on the disk for a very long time; even if | |
36 | * the kernel knows the page will never be used again. This is because | |
37 | * the disk space costs very little and can be overwritten when | |
38 | * necessary. When such stale pages are in frontswap, however, they | |
39 | * are taking up valuable real estate. "Frontswap selfshrinking" works | |
40 | * to resolve this: When frontswap activity is otherwise stable | |
41 | * and the guest kernel is not under memory pressure, the "frontswap | |
42 | * selfshrinking" accounts for this by providing pressure to remove some | |
43 | * pages from frontswap and return them to kernel memory. | |
44 | * | |
45 | * For both "selfballooning" and "frontswap-selfshrinking", a worker | |
46 | * thread is used and sysfs tunables are provided to adjust the frequency | |
47 | * and rate of adjustments to achieve the goal, as well as to disable one | |
48 | * or both functions independently. | |
49 | * | |
50 | * While some argue that this functionality can and should be implemented | |
51 | * in userspace, it has been observed that bad things happen (e.g. OOMs). | |
52 | * | |
53 | * System configuration note: Selfballooning should not be enabled on | |
54 | * systems without a sufficiently large swap device configured; for best | |
55 | * results, it is recommended that total swap be increased by the size | |
56 | * of the guest memory. Also, while technically not required to be | |
57 | * configured, it is highly recommended that frontswap also be configured | |
58 | * and enabled when selfballooning is running. So, selfballooning | |
59 | * is disabled by default if frontswap is not configured and can only | |
60 | * be enabled with the "selfballooning" kernel boot option; similarly | |
61 | * selfballooning is enabled by default if frontswap is configured and | |
62 | * can be disabled with the "noselfballooning" kernel boot option. Finally, | |
63 | * when frontswap is configured, frontswap-selfshrinking can be disabled | |
64 | * with the "noselfshrink" kernel boot option. | |
65 | * | |
66 | * Selfballooning is disallowed in domain0 and force-disabled. | |
67 | * | |
68 | */ | |
69 | ||
70 | #include <linux/kernel.h> | |
71 | #include <linux/mm.h> | |
72 | #include <linux/mman.h> | |
4fec0e0b | 73 | #include <linux/module.h> |
0642d2ed | 74 | #include <linux/workqueue.h> |
a50777c7 | 75 | #include <xen/balloon.h> |
a50777c7 | 76 | #include <xen/tmem.h> |
0642d2ed | 77 | #include <xen/xen.h> |
a50777c7 DM |
78 | |
79 | /* Enable/disable with sysfs. */ | |
80 | static int xen_selfballooning_enabled __read_mostly; | |
81 | ||
82 | /* | |
83 | * Controls rate at which memory target (this iteration) approaches | |
84 | * ultimate goal when memory need is increasing (up-hysteresis) or | |
85 | * decreasing (down-hysteresis). Higher values of hysteresis cause | |
86 | * slower increases/decreases. The default values for the various | |
87 | * parameters were deemed reasonable by experimentation, may be | |
88 | * workload-dependent, and can all be adjusted via sysfs. | |
89 | */ | |
90 | static unsigned int selfballoon_downhysteresis __read_mostly = 8; | |
91 | static unsigned int selfballoon_uphysteresis __read_mostly = 1; | |
92 | ||
93 | /* In HZ, controls frequency of worker invocation. */ | |
94 | static unsigned int selfballoon_interval __read_mostly = 5; | |
95 | ||
96 | static void selfballoon_process(struct work_struct *work); | |
97 | static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process); | |
98 | ||
99 | #ifdef CONFIG_FRONTSWAP | |
100 | #include <linux/frontswap.h> | |
101 | ||
102 | /* Enable/disable with sysfs. */ | |
103 | static bool frontswap_selfshrinking __read_mostly; | |
104 | ||
105 | /* Enable/disable with kernel boot option. */ | |
106 | static bool use_frontswap_selfshrink __initdata = true; | |
107 | ||
108 | /* | |
109 | * The default values for the following parameters were deemed reasonable | |
110 | * by experimentation, may be workload-dependent, and can all be | |
111 | * adjusted via sysfs. | |
112 | */ | |
113 | ||
114 | /* Control rate for frontswap shrinking. Higher hysteresis is slower. */ | |
115 | static unsigned int frontswap_hysteresis __read_mostly = 20; | |
116 | ||
117 | /* | |
118 | * Number of selfballoon worker invocations to wait before observing that | |
119 | * frontswap selfshrinking should commence. Note that selfshrinking does | |
120 | * not use a separate worker thread. | |
121 | */ | |
122 | static unsigned int frontswap_inertia __read_mostly = 3; | |
123 | ||
124 | /* Countdown to next invocation of frontswap_shrink() */ | |
125 | static unsigned long frontswap_inertia_counter; | |
126 | ||
127 | /* | |
128 | * Invoked by the selfballoon worker thread, uses current number of pages | |
129 | * in frontswap (frontswap_curr_pages()), previous status, and control | |
130 | * values (hysteresis and inertia) to determine if frontswap should be | |
131 | * shrunk and what the new frontswap size should be. Note that | |
132 | * frontswap_shrink is essentially a partial swapoff that immediately | |
133 | * transfers pages from the "swap device" (frontswap) back into kernel | |
134 | * RAM; despite the name, frontswap "shrinking" is very different from | |
135 | * the "shrinker" interface used by the kernel MM subsystem to reclaim | |
136 | * memory. | |
137 | */ | |
138 | static void frontswap_selfshrink(void) | |
139 | { | |
140 | static unsigned long cur_frontswap_pages; | |
141 | static unsigned long last_frontswap_pages; | |
142 | static unsigned long tgt_frontswap_pages; | |
143 | ||
144 | last_frontswap_pages = cur_frontswap_pages; | |
145 | cur_frontswap_pages = frontswap_curr_pages(); | |
146 | if (!cur_frontswap_pages || | |
147 | (cur_frontswap_pages > last_frontswap_pages)) { | |
148 | frontswap_inertia_counter = frontswap_inertia; | |
149 | return; | |
150 | } | |
151 | if (frontswap_inertia_counter && --frontswap_inertia_counter) | |
152 | return; | |
153 | if (cur_frontswap_pages <= frontswap_hysteresis) | |
154 | tgt_frontswap_pages = 0; | |
155 | else | |
156 | tgt_frontswap_pages = cur_frontswap_pages - | |
157 | (cur_frontswap_pages / frontswap_hysteresis); | |
158 | frontswap_shrink(tgt_frontswap_pages); | |
159 | } | |
160 | ||
161 | static int __init xen_nofrontswap_selfshrink_setup(char *s) | |
162 | { | |
163 | use_frontswap_selfshrink = false; | |
164 | return 1; | |
165 | } | |
166 | ||
167 | __setup("noselfshrink", xen_nofrontswap_selfshrink_setup); | |
168 | ||
169 | /* Disable with kernel boot option. */ | |
170 | static bool use_selfballooning __initdata = true; | |
171 | ||
172 | static int __init xen_noselfballooning_setup(char *s) | |
173 | { | |
174 | use_selfballooning = false; | |
175 | return 1; | |
176 | } | |
177 | ||
178 | __setup("noselfballooning", xen_noselfballooning_setup); | |
179 | #else /* !CONFIG_FRONTSWAP */ | |
180 | /* Enable with kernel boot option. */ | |
181 | static bool use_selfballooning __initdata = false; | |
182 | ||
183 | static int __init xen_selfballooning_setup(char *s) | |
184 | { | |
185 | use_selfballooning = true; | |
186 | return 1; | |
187 | } | |
188 | ||
189 | __setup("selfballooning", xen_selfballooning_setup); | |
190 | #endif /* CONFIG_FRONTSWAP */ | |
191 | ||
192 | /* | |
193 | * Use current balloon size, the goal (vm_committed_as), and hysteresis | |
194 | * parameters to set a new target balloon size | |
195 | */ | |
196 | static void selfballoon_process(struct work_struct *work) | |
197 | { | |
198 | unsigned long cur_pages, goal_pages, tgt_pages; | |
199 | bool reset_timer = false; | |
200 | ||
201 | if (xen_selfballooning_enabled) { | |
202 | cur_pages = balloon_stats.current_pages; | |
203 | tgt_pages = cur_pages; /* default is no change */ | |
204 | goal_pages = percpu_counter_read_positive(&vm_committed_as) + | |
205 | balloon_stats.current_pages - totalram_pages; | |
206 | #ifdef CONFIG_FRONTSWAP | |
207 | /* allow space for frontswap pages to be repatriated */ | |
208 | if (frontswap_selfshrinking && frontswap_enabled) | |
209 | goal_pages += frontswap_curr_pages(); | |
210 | #endif | |
211 | if (cur_pages > goal_pages) | |
212 | tgt_pages = cur_pages - | |
213 | ((cur_pages - goal_pages) / | |
214 | selfballoon_downhysteresis); | |
215 | else if (cur_pages < goal_pages) | |
216 | tgt_pages = cur_pages + | |
217 | ((goal_pages - cur_pages) / | |
218 | selfballoon_uphysteresis); | |
219 | /* else if cur_pages == goal_pages, no change */ | |
220 | balloon_set_new_target(tgt_pages); | |
221 | reset_timer = true; | |
222 | } | |
223 | #ifdef CONFIG_FRONTSWAP | |
224 | if (frontswap_selfshrinking && frontswap_enabled) { | |
225 | frontswap_selfshrink(); | |
226 | reset_timer = true; | |
227 | } | |
228 | #endif | |
229 | if (reset_timer) | |
230 | schedule_delayed_work(&selfballoon_worker, | |
231 | selfballoon_interval * HZ); | |
232 | } | |
233 | ||
234 | #ifdef CONFIG_SYSFS | |
235 | ||
236 | #include <linux/sysdev.h> | |
237 | #include <linux/capability.h> | |
238 | ||
/*
 * Define a sysdev "show" handler called show_<name> that sprintf()s the
 * given format and arguments into the output buffer and returns the
 * number of characters written.
 */
#define SELFBALLOON_SHOW(name, fmt, ...)				\
	static ssize_t show_##name(struct sys_device *dev,		\
				   struct sysdev_attribute *attr,	\
				   char *buf)				\
	{								\
		return sprintf(buf, fmt, ##__VA_ARGS__);		\
	}
246 | ||
247 | SELFBALLOON_SHOW(selfballooning, "%d\n", xen_selfballooning_enabled); | |
248 | ||
249 | static ssize_t store_selfballooning(struct sys_device *dev, | |
250 | struct sysdev_attribute *attr, | |
251 | const char *buf, | |
252 | size_t count) | |
253 | { | |
254 | bool was_enabled = xen_selfballooning_enabled; | |
255 | unsigned long tmp; | |
256 | int err; | |
257 | ||
258 | if (!capable(CAP_SYS_ADMIN)) | |
259 | return -EPERM; | |
260 | ||
261 | err = strict_strtoul(buf, 10, &tmp); | |
262 | if (err || ((tmp != 0) && (tmp != 1))) | |
263 | return -EINVAL; | |
264 | ||
265 | xen_selfballooning_enabled = !!tmp; | |
266 | if (!was_enabled && xen_selfballooning_enabled) | |
267 | schedule_delayed_work(&selfballoon_worker, | |
268 | selfballoon_interval * HZ); | |
269 | ||
270 | return count; | |
271 | } | |
272 | ||
273 | static SYSDEV_ATTR(selfballooning, S_IRUGO | S_IWUSR, | |
274 | show_selfballooning, store_selfballooning); | |
275 | ||
276 | SELFBALLOON_SHOW(selfballoon_interval, "%d\n", selfballoon_interval); | |
277 | ||
278 | static ssize_t store_selfballoon_interval(struct sys_device *dev, | |
279 | struct sysdev_attribute *attr, | |
280 | const char *buf, | |
281 | size_t count) | |
282 | { | |
283 | unsigned long val; | |
284 | int err; | |
285 | ||
286 | if (!capable(CAP_SYS_ADMIN)) | |
287 | return -EPERM; | |
288 | err = strict_strtoul(buf, 10, &val); | |
289 | if (err || val == 0) | |
290 | return -EINVAL; | |
291 | selfballoon_interval = val; | |
292 | return count; | |
293 | } | |
294 | ||
295 | static SYSDEV_ATTR(selfballoon_interval, S_IRUGO | S_IWUSR, | |
296 | show_selfballoon_interval, store_selfballoon_interval); | |
297 | ||
298 | SELFBALLOON_SHOW(selfballoon_downhys, "%d\n", selfballoon_downhysteresis); | |
299 | ||
300 | static ssize_t store_selfballoon_downhys(struct sys_device *dev, | |
301 | struct sysdev_attribute *attr, | |
302 | const char *buf, | |
303 | size_t count) | |
304 | { | |
305 | unsigned long val; | |
306 | int err; | |
307 | ||
308 | if (!capable(CAP_SYS_ADMIN)) | |
309 | return -EPERM; | |
310 | err = strict_strtoul(buf, 10, &val); | |
311 | if (err || val == 0) | |
312 | return -EINVAL; | |
313 | selfballoon_downhysteresis = val; | |
314 | return count; | |
315 | } | |
316 | ||
317 | static SYSDEV_ATTR(selfballoon_downhysteresis, S_IRUGO | S_IWUSR, | |
318 | show_selfballoon_downhys, store_selfballoon_downhys); | |
319 | ||
320 | ||
321 | SELFBALLOON_SHOW(selfballoon_uphys, "%d\n", selfballoon_uphysteresis); | |
322 | ||
323 | static ssize_t store_selfballoon_uphys(struct sys_device *dev, | |
324 | struct sysdev_attribute *attr, | |
325 | const char *buf, | |
326 | size_t count) | |
327 | { | |
328 | unsigned long val; | |
329 | int err; | |
330 | ||
331 | if (!capable(CAP_SYS_ADMIN)) | |
332 | return -EPERM; | |
333 | err = strict_strtoul(buf, 10, &val); | |
334 | if (err || val == 0) | |
335 | return -EINVAL; | |
336 | selfballoon_uphysteresis = val; | |
337 | return count; | |
338 | } | |
339 | ||
340 | static SYSDEV_ATTR(selfballoon_uphysteresis, S_IRUGO | S_IWUSR, | |
341 | show_selfballoon_uphys, store_selfballoon_uphys); | |
342 | ||
343 | #ifdef CONFIG_FRONTSWAP | |
344 | SELFBALLOON_SHOW(frontswap_selfshrinking, "%d\n", frontswap_selfshrinking); | |
345 | ||
346 | static ssize_t store_frontswap_selfshrinking(struct sys_device *dev, | |
347 | struct sysdev_attribute *attr, | |
348 | const char *buf, | |
349 | size_t count) | |
350 | { | |
351 | bool was_enabled = frontswap_selfshrinking; | |
352 | unsigned long tmp; | |
353 | int err; | |
354 | ||
355 | if (!capable(CAP_SYS_ADMIN)) | |
356 | return -EPERM; | |
357 | err = strict_strtoul(buf, 10, &tmp); | |
358 | if (err || ((tmp != 0) && (tmp != 1))) | |
359 | return -EINVAL; | |
360 | frontswap_selfshrinking = !!tmp; | |
361 | if (!was_enabled && !xen_selfballooning_enabled && | |
362 | frontswap_selfshrinking) | |
363 | schedule_delayed_work(&selfballoon_worker, | |
364 | selfballoon_interval * HZ); | |
365 | ||
366 | return count; | |
367 | } | |
368 | ||
369 | static SYSDEV_ATTR(frontswap_selfshrinking, S_IRUGO | S_IWUSR, | |
370 | show_frontswap_selfshrinking, store_frontswap_selfshrinking); | |
371 | ||
372 | SELFBALLOON_SHOW(frontswap_inertia, "%d\n", frontswap_inertia); | |
373 | ||
374 | static ssize_t store_frontswap_inertia(struct sys_device *dev, | |
375 | struct sysdev_attribute *attr, | |
376 | const char *buf, | |
377 | size_t count) | |
378 | { | |
379 | unsigned long val; | |
380 | int err; | |
381 | ||
382 | if (!capable(CAP_SYS_ADMIN)) | |
383 | return -EPERM; | |
384 | err = strict_strtoul(buf, 10, &val); | |
385 | if (err || val == 0) | |
386 | return -EINVAL; | |
387 | frontswap_inertia = val; | |
388 | frontswap_inertia_counter = val; | |
389 | return count; | |
390 | } | |
391 | ||
392 | static SYSDEV_ATTR(frontswap_inertia, S_IRUGO | S_IWUSR, | |
393 | show_frontswap_inertia, store_frontswap_inertia); | |
394 | ||
395 | SELFBALLOON_SHOW(frontswap_hysteresis, "%d\n", frontswap_hysteresis); | |
396 | ||
397 | static ssize_t store_frontswap_hysteresis(struct sys_device *dev, | |
398 | struct sysdev_attribute *attr, | |
399 | const char *buf, | |
400 | size_t count) | |
401 | { | |
402 | unsigned long val; | |
403 | int err; | |
404 | ||
405 | if (!capable(CAP_SYS_ADMIN)) | |
406 | return -EPERM; | |
407 | err = strict_strtoul(buf, 10, &val); | |
408 | if (err || val == 0) | |
409 | return -EINVAL; | |
410 | frontswap_hysteresis = val; | |
411 | return count; | |
412 | } | |
413 | ||
414 | static SYSDEV_ATTR(frontswap_hysteresis, S_IRUGO | S_IWUSR, | |
415 | show_frontswap_hysteresis, store_frontswap_hysteresis); | |
416 | ||
417 | #endif /* CONFIG_FRONTSWAP */ | |
418 | ||
419 | static struct attribute *selfballoon_attrs[] = { | |
420 | &attr_selfballooning.attr, | |
421 | &attr_selfballoon_interval.attr, | |
422 | &attr_selfballoon_downhysteresis.attr, | |
423 | &attr_selfballoon_uphysteresis.attr, | |
424 | #ifdef CONFIG_FRONTSWAP | |
425 | &attr_frontswap_selfshrinking.attr, | |
426 | &attr_frontswap_hysteresis.attr, | |
427 | &attr_frontswap_inertia.attr, | |
428 | #endif | |
429 | NULL | |
430 | }; | |
431 | ||
432 | static struct attribute_group selfballoon_group = { | |
433 | .name = "selfballoon", | |
434 | .attrs = selfballoon_attrs | |
435 | }; | |
436 | #endif | |
437 | ||
/*
 * Create the "selfballoon" attribute group under @sysdev's kobject.
 *
 * Returns whatever sysfs_create_group() returns, or -1 when the kernel
 * is built without CONFIG_SYSFS.
 */
int register_xen_selfballooning(struct sys_device *sysdev)
{
#ifdef CONFIG_SYSFS
	return sysfs_create_group(&sysdev->kobj, &selfballoon_group);
#else
	return -1;
#endif
}
EXPORT_SYMBOL(register_xen_selfballooning);
448 | ||
449 | static int __init xen_selfballoon_init(void) | |
450 | { | |
451 | bool enable = false; | |
452 | ||
453 | if (!xen_domain()) | |
454 | return -ENODEV; | |
455 | ||
456 | if (xen_initial_domain()) { | |
457 | pr_info("xen/balloon: Xen selfballooning driver " | |
458 | "disabled for domain0.\n"); | |
459 | return -ENODEV; | |
460 | } | |
461 | ||
462 | xen_selfballooning_enabled = tmem_enabled && use_selfballooning; | |
463 | if (xen_selfballooning_enabled) { | |
464 | pr_info("xen/balloon: Initializing Xen " | |
465 | "selfballooning driver.\n"); | |
466 | enable = true; | |
467 | } | |
468 | #ifdef CONFIG_FRONTSWAP | |
469 | frontswap_selfshrinking = tmem_enabled && use_frontswap_selfshrink; | |
470 | if (frontswap_selfshrinking) { | |
471 | pr_info("xen/balloon: Initializing frontswap " | |
472 | "selfshrinking driver.\n"); | |
473 | enable = true; | |
474 | } | |
475 | #endif | |
476 | if (!enable) | |
477 | return -ENODEV; | |
478 | ||
479 | schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ); | |
480 | ||
481 | return 0; | |
482 | } | |
483 | ||
484 | subsys_initcall(xen_selfballoon_init); | |
485 | ||
486 | MODULE_LICENSE("GPL"); |