/*
 * ramster.c
 *
 * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
 *
 * RAMster implements peer-to-peer transcendent memory, allowing a "cluster" of
 * kernels to dynamically pool their RAM so that a RAM-hungry workload on one
 * machine can temporarily and transparently utilize RAM on another machine
 * which is presumably idle or running a non-RAM-hungry workload.
 *
 * RAMster combines a clustering and messaging foundation based on the ocfs2
 * cluster layer with the in-kernel compression implementation of zcache, and
 * adds code to glue them together. When a page is "put" to RAMster, it is
 * compressed and stored locally. Periodically, a thread will "remotify" these
 * pages by sending them via messages to a remote machine. When the page is
 * later needed as indicated by a page fault, a "get" is issued. If the data
 * is local, it is decompressed and the fault is resolved. If the data is
 * remote, a message is sent to fetch the data and the faulting thread sleeps;
 * when the data arrives, the thread awakens, the data is decompressed and
 * the fault is resolved.
 *
 * As of V5, clusters up to eight nodes are supported; each node can remotify
 * pages to one specified node, so clusters can be configured as clients to
 * a "memory server". Some simple policy is in place that will need to be
 * refined over time. Larger clusters and fault-resistant protocols can also
 * be added over time.
 */
28 | ||
29 | #include <linux/module.h> | |
30 | #include <linux/cpu.h> | |
31 | #include <linux/highmem.h> | |
32 | #include <linux/list.h> | |
33 | #include <linux/lzo.h> | |
34 | #include <linux/slab.h> | |
35 | #include <linux/spinlock.h> | |
36 | #include <linux/types.h> | |
37 | #include <linux/atomic.h> | |
38 | #include <linux/frontswap.h> | |
39 | #include "../tmem.h" | |
40 | #include "../zcache.h" | |
41 | #include "../zbud.h" | |
42 | #include "ramster.h" | |
43 | #include "ramster_nodemanager.h" | |
44 | #include "tcp.h" | |
45 | ||

#define RAMSTER_TESTING

#ifndef CONFIG_SYSFS
#error "ramster needs sysfs to define cluster nodes to use"
#endif

static bool use_cleancache __read_mostly;
static bool use_frontswap __read_mostly;
static bool use_frontswap_exclusive_gets __read_mostly;

/* These must be sysfs not debugfs as they are checked/used by userland!! */
static unsigned long ramster_interface_revision __read_mostly =
	R2NM_API_VERSION; /* interface revision must match userspace! */
static unsigned long ramster_pers_remotify_enable __read_mostly;
static unsigned long ramster_eph_remotify_enable __read_mostly;
static atomic_t ramster_remote_pers_pages = ATOMIC_INIT(0);
#define MANUAL_NODES 8
static bool ramster_nodes_manual_up[MANUAL_NODES] __read_mostly;
static int ramster_remote_target_nodenum __read_mostly = -1;

/* these counters are made available via debugfs */
static long ramster_flnodes;
static atomic_t ramster_flnodes_atomic = ATOMIC_INIT(0);
static unsigned long ramster_flnodes_max;
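
/*
 * Each counter below pairs an atomic_t (the authoritative count) with a
 * plain shadow variable that debugfs can export, plus a high-water mark.
 * The unlocked shadow/max updates are racy, but close enough for
 * statistics.
 */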
static inline void inc_ramster_flnodes(void)
{
	ramster_flnodes = atomic_inc_return(&ramster_flnodes_atomic);
	if (ramster_flnodes > ramster_flnodes_max)
		ramster_flnodes_max = ramster_flnodes;
}
static ssize_t ramster_foreign_eph_pages;
static atomic_t ramster_foreign_eph_pages_atomic = ATOMIC_INIT(0);
static ssize_t ramster_foreign_eph_pages_max;
static inline void inc_ramster_foreign_eph_pages(void)
{
	ramster_foreign_eph_pages = atomic_inc_return(
		&ramster_foreign_eph_pages_atomic);
	if (ramster_foreign_eph_pages > ramster_foreign_eph_pages_max)
		ramster_foreign_eph_pages_max = ramster_foreign_eph_pages;
}
static ssize_t ramster_foreign_pers_pages;
static atomic_t ramster_foreign_pers_pages_atomic = ATOMIC_INIT(0);
static ssize_t ramster_foreign_pers_pages_max;
static inline void inc_ramster_foreign_pers_pages(void)
{
	ramster_foreign_pers_pages = atomic_inc_return(
		&ramster_foreign_pers_pages_atomic);
	if (ramster_foreign_pers_pages > ramster_foreign_pers_pages_max)
		ramster_foreign_pers_pages_max = ramster_foreign_pers_pages;
}
static ssize_t ramster_eph_pages_remoted;
static ssize_t ramster_pers_pages_remoted;
static ssize_t ramster_eph_pages_remote_failed;
static ssize_t ramster_pers_pages_remote_failed;
static ssize_t ramster_remote_eph_pages_succ_get;
static ssize_t ramster_remote_pers_pages_succ_get;
static ssize_t ramster_remote_eph_pages_unsucc_get;
static ssize_t ramster_remote_pers_pages_unsucc_get;
static ssize_t ramster_pers_pages_remote_nomem;
static ssize_t ramster_remote_objects_flushed;
static ssize_t ramster_remote_object_flushes_failed;
static ssize_t ramster_remote_pages_flushed;
static ssize_t ramster_remote_page_flushes_failed;

/* FIXME frontswap selfshrinking knobs in debugfs? */

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#define zdfs	debugfs_create_size_t
#define zdfs64	debugfs_create_u64
static int __init ramster_debugfs_init(void)
{
	struct dentry *root = debugfs_create_dir("ramster", NULL);

	if (root == NULL)
		return -ENXIO;

	zdfs("eph_pages_remoted", S_IRUGO, root, &ramster_eph_pages_remoted);
	zdfs("pers_pages_remoted", S_IRUGO, root, &ramster_pers_pages_remoted);
	zdfs("eph_pages_remote_failed", S_IRUGO, root,
		&ramster_eph_pages_remote_failed);
	zdfs("pers_pages_remote_failed", S_IRUGO, root,
		&ramster_pers_pages_remote_failed);
	zdfs("remote_eph_pages_succ_get", S_IRUGO, root,
		&ramster_remote_eph_pages_succ_get);
	zdfs("remote_pers_pages_succ_get", S_IRUGO, root,
		&ramster_remote_pers_pages_succ_get);
	zdfs("remote_eph_pages_unsucc_get", S_IRUGO, root,
		&ramster_remote_eph_pages_unsucc_get);
	zdfs("remote_pers_pages_unsucc_get", S_IRUGO, root,
		&ramster_remote_pers_pages_unsucc_get);
	zdfs("pers_pages_remote_nomem", S_IRUGO, root,
		&ramster_pers_pages_remote_nomem);
	zdfs("remote_objects_flushed", S_IRUGO, root,
		&ramster_remote_objects_flushed);
	zdfs("remote_pages_flushed", S_IRUGO, root,
		&ramster_remote_pages_flushed);
	zdfs("remote_object_flushes_failed", S_IRUGO, root,
		&ramster_remote_object_flushes_failed);
	zdfs("remote_page_flushes_failed", S_IRUGO, root,
		&ramster_remote_page_flushes_failed);
	zdfs("foreign_eph_pages", S_IRUGO, root,
		&ramster_foreign_eph_pages);
	zdfs("foreign_eph_pages_max", S_IRUGO, root,
		&ramster_foreign_eph_pages_max);
	zdfs("foreign_pers_pages", S_IRUGO, root,
		&ramster_foreign_pers_pages);
	zdfs("foreign_pers_pages_max", S_IRUGO, root,
		&ramster_foreign_pers_pages_max);
	return 0;
}
#undef zdfs
#undef zdfs64
#else
static inline int ramster_debugfs_init(void)
{
	return 0;
}
#endif
163 | ||
164 | static LIST_HEAD(ramster_rem_op_list); | |
165 | static DEFINE_SPINLOCK(ramster_rem_op_list_lock); | |
166 | static DEFINE_PER_CPU(struct ramster_preload, ramster_preloads); | |
167 | ||
168 | static DEFINE_PER_CPU(unsigned char *, ramster_remoteputmem1); | |
169 | static DEFINE_PER_CPU(unsigned char *, ramster_remoteputmem2); | |
170 | ||
171 | static struct kmem_cache *ramster_flnode_cache __read_mostly; | |
172 | ||
static struct flushlist_node *ramster_flnode_alloc(struct tmem_pool *pool)
{
	struct flushlist_node *flnode = NULL;
	struct ramster_preload *kp;

	kp = &__get_cpu_var(ramster_preloads);
	flnode = kp->flnode;
	BUG_ON(flnode == NULL);
	kp->flnode = NULL;
	inc_ramster_flnodes();
	return flnode;
}
185 | ||
186 | /* the "flush list" asynchronously collects pages to remotely flush */ | |
187 | #define FLUSH_ENTIRE_OBJECT ((uint32_t)-1) | |
188 | static void ramster_flnode_free(struct flushlist_node *flnode, | |
189 | struct tmem_pool *pool) | |
190 | { | |
191 | int flnodes; | |
192 | ||
193 | flnodes = atomic_dec_return(&ramster_flnodes_atomic); | |
194 | BUG_ON(flnodes < 0); | |
195 | kmem_cache_free(ramster_flnode_cache, flnode); | |
196 | } | |
197 | ||
int ramster_do_preload_flnode(struct tmem_pool *pool)
{
	struct ramster_preload *kp;
	struct flushlist_node *flnode;
	int ret = -ENOMEM;

	BUG_ON(!irqs_disabled());
	if (unlikely(ramster_flnode_cache == NULL))
		BUG();
	kp = &__get_cpu_var(ramster_preloads);
	flnode = kmem_cache_alloc(ramster_flnode_cache, GFP_ATOMIC);
	if (unlikely(flnode == NULL) && kp->flnode == NULL)
		BUG(); /* FIXME handle more gracefully, but how??? */
	else if (kp->flnode == NULL)
		kp->flnode = flnode;
	else if (flnode != NULL)
		/* already preloaded, so drop the extra allocation */
		kmem_cache_free(ramster_flnode_cache, flnode);
	ret = 0; /* a preloaded flnode is now guaranteed present */
	return ret;
}
217 | ||
218 | /* | |
219 | * Called by the message handler after a (still compressed) page has been | |
220 | * fetched from the remote machine in response to an "is_remote" tmem_get | |
221 | * or persistent tmem_localify. For a tmem_get, "extra" is the address of | |
222 | * the page that is to be filled to successfully resolve the tmem_get; for | |
223 | * a (persistent) tmem_localify, "extra" is NULL (as the data is placed only | |
224 | * in the local zcache). "data" points to "size" bytes of (compressed) data | |
225 | * passed in the message. In the case of a persistent remote get, if | |
226 | * pre-allocation was successful (see ramster_repatriate_preload), the page | |
227 | * is placed into both local zcache and at "extra". | |
228 | */ | |
int ramster_localify(int pool_id, struct tmem_oid *oidp, uint32_t index,
			char *data, unsigned int size, void *extra)
{
	int ret = -ENOENT;
	unsigned long flags;
	struct tmem_pool *pool;
	bool eph, delete = false;
	void *pampd, *saved_hb;
	struct tmem_obj *obj;

	pool = zcache_get_pool_by_id(LOCAL_CLIENT, pool_id);
	if (unlikely(pool == NULL))
		/* pool doesn't exist anymore */
		goto out;
	eph = is_ephemeral(pool);
	local_irq_save(flags); /* FIXME: maybe only disable softirqs? */
	pampd = tmem_localify_get_pampd(pool, oidp, index, &obj, &saved_hb);
	if (pampd == NULL) {
		/* hmmm... must have been a flush while waiting */
#ifdef RAMSTER_TESTING
		pr_err("UNTESTED pampd==NULL in ramster_localify\n");
#endif
		if (eph)
			ramster_remote_eph_pages_unsucc_get++;
		else
			ramster_remote_pers_pages_unsucc_get++;
		obj = NULL;
		goto finish;
	} else if (unlikely(!pampd_is_remote(pampd))) {
		/* hmmm... must have been a dup put while waiting */
#ifdef RAMSTER_TESTING
		pr_err("UNTESTED dup while waiting in ramster_localify\n");
#endif
		if (eph)
			ramster_remote_eph_pages_unsucc_get++;
		else
			ramster_remote_pers_pages_unsucc_get++;
		obj = NULL;
		pampd = NULL;
		ret = -EEXIST;
		goto finish;
	} else if (size == 0) {
		/* no remote data, delete the local is_remote pampd */
		pampd = NULL;
		if (eph)
			ramster_remote_eph_pages_unsucc_get++;
		else
			BUG();
		delete = true;
		goto finish;
	}
	if (pampd_is_intransit(pampd)) {
		/*
		 * a pampd is marked intransit if it is remote and space has
		 * been allocated for it locally (note, only happens for
		 * persistent pages, in which case the remote copy is freed)
		 */
		BUG_ON(eph);
		pampd = pampd_mask_intransit_and_remote(pampd);
		zbud_copy_to_zbud(pampd, data, size);
	} else {
		/*
		 * setting pampd to NULL tells tmem_localify_finish to leave
		 * pampd alone... meaning it is left pointing to the
		 * remote copy
		 */
		pampd = NULL;
		obj = NULL;
	}
	/*
	 * but in all cases, we decompress direct-to-memory to complete
	 * the localify and return success
	 */
	BUG_ON(extra == NULL);
	zcache_decompress_to_page(data, size, (struct page *)extra);
	if (eph)
		ramster_remote_eph_pages_succ_get++;
	else
		ramster_remote_pers_pages_succ_get++;
	ret = 0;
finish:
	tmem_localify_finish(obj, index, pampd, saved_hb, delete);
	zcache_put_pool(pool);
	local_irq_restore(flags);
out:
	return ret;
}
316 | ||
317 | void ramster_pampd_new_obj(struct tmem_obj *obj) | |
318 | { | |
319 | obj->extra = NULL; | |
320 | } | |
321 | ||
void ramster_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj,
				bool pool_destroy)
{
	struct flushlist_node *flnode;

	BUG_ON(preemptible());
	if (obj->extra == NULL)
		return;
	if (pool_destroy && is_ephemeral(pool))
		/* FIXME don't bother with remote eph data for now */
		return;
	BUG_ON(!pampd_is_remote(obj->extra));
	flnode = ramster_flnode_alloc(pool);
	flnode->xh.client_id = pampd_remote_node(obj->extra);
	flnode->xh.pool_id = pool->pool_id;
	flnode->xh.oid = obj->oid;
	flnode->xh.index = FLUSH_ENTIRE_OBJECT;
	flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_OBJ;
	spin_lock(&ramster_rem_op_list_lock);
	list_add(&flnode->rem_op.list, &ramster_rem_op_list);
	spin_unlock(&ramster_rem_op_list_lock);
}
344 | ||
345 | /* | |
346 | * Called on a remote persistent tmem_get to attempt to preallocate | |
347 | * local storage for the data contained in the remote persistent page. | |
348 | * If successfully preallocated, returns the pampd, marked as remote and | |
349 | * in_transit. Else returns NULL. Note that the appropriate tmem data | |
350 | * structure must be locked. | |
351 | */ | |
352 | void *ramster_pampd_repatriate_preload(void *pampd, struct tmem_pool *pool, | |
353 | struct tmem_oid *oidp, uint32_t index, | |
354 | bool *intransit) | |
355 | { | |
356 | int clen = pampd_remote_size(pampd), c; | |
357 | void *ret_pampd = NULL; | |
358 | unsigned long flags; | |
359 | struct tmem_handle th; | |
360 | ||
361 | BUG_ON(!pampd_is_remote(pampd)); | |
362 | BUG_ON(is_ephemeral(pool)); | |
363 | if (use_frontswap_exclusive_gets) | |
364 | /* don't need local storage */ | |
365 | goto out; | |
366 | if (pampd_is_intransit(pampd)) { | |
367 | /* | |
368 | * to avoid multiple allocations (and maybe a memory leak) | |
369 | * don't preallocate if already in the process of being | |
370 | * repatriated | |
371 | */ | |
372 | *intransit = true; | |
373 | goto out; | |
374 | } | |
375 | *intransit = false; | |
376 | local_irq_save(flags); | |
377 | th.client_id = pampd_remote_node(pampd); | |
378 | th.pool_id = pool->pool_id; | |
379 | th.oid = *oidp; | |
380 | th.index = index; | |
381 | ret_pampd = zcache_pampd_create(NULL, clen, true, false, &th); | |
382 | if (ret_pampd != NULL) { | |
383 | /* | |
384 | * a pampd is marked intransit if it is remote and space has | |
385 | * been allocated for it locally (note, only happens for | |
386 | * persistent pages, in which case the remote copy is freed) | |
387 | */ | |
388 | ret_pampd = pampd_mark_intransit(ret_pampd); | |
389 | c = atomic_dec_return(&ramster_remote_pers_pages); | |
390 | WARN_ON_ONCE(c < 0); | |
391 | } else { | |
392 | ramster_pers_pages_remote_nomem++; | |
393 | } | |
394 | local_irq_restore(flags); | |
395 | out: | |
396 | return ret_pampd; | |
397 | } | |
398 | ||
399 | /* | |
400 | * Called on a remote tmem_get to invoke a message to fetch the page. | |
401 | * Might sleep so no tmem locks can be held. "extra" is passed | |
402 | * all the way through the round-trip messaging to ramster_localify. | |
403 | */ | |
404 | int ramster_pampd_repatriate(void *fake_pampd, void *real_pampd, | |
405 | struct tmem_pool *pool, | |
406 | struct tmem_oid *oid, uint32_t index, | |
407 | bool free, void *extra) | |
408 | { | |
409 | struct tmem_xhandle xh; | |
410 | int ret; | |
411 | ||
412 | if (pampd_is_intransit(real_pampd)) | |
413 | /* have local space pre-reserved, so free remote copy */ | |
414 | free = true; | |
415 | xh = tmem_xhandle_fill(LOCAL_CLIENT, pool, oid, index); | |
416 | /* unreliable request/response for now */ | |
417 | ret = r2net_remote_async_get(&xh, free, | |
418 | pampd_remote_node(fake_pampd), | |
419 | pampd_remote_size(fake_pampd), | |
420 | pampd_remote_cksum(fake_pampd), | |
421 | extra); | |
422 | return ret; | |
423 | } | |
424 | ||
425 | bool ramster_pampd_is_remote(void *pampd) | |
426 | { | |
427 | return pampd_is_remote(pampd); | |
428 | } | |
429 | ||
430 | int ramster_pampd_replace_in_obj(void *new_pampd, struct tmem_obj *obj) | |
431 | { | |
432 | int ret = -1; | |
433 | ||
434 | if (new_pampd != NULL) { | |
435 | if (obj->extra == NULL) | |
436 | obj->extra = new_pampd; | |
437 | /* enforce that all remote pages in an object reside | |
438 | * in the same node! */ | |
439 | else if (pampd_remote_node(new_pampd) != | |
440 | pampd_remote_node((void *)(obj->extra))) | |
441 | BUG(); | |
442 | ret = 0; | |
443 | } | |
444 | return ret; | |
445 | } | |
446 | ||
void *ramster_pampd_free(void *pampd, struct tmem_pool *pool,
			struct tmem_oid *oid, uint32_t index, bool acct)
{
	bool eph = is_ephemeral(pool);
	void *local_pampd = NULL;
	int c;

	BUG_ON(preemptible());
	BUG_ON(!pampd_is_remote(pampd));
	WARN_ON(acct == false);
	if (oid == NULL) {
		/*
		 * a NULL oid means to ignore this pampd free
		 * as the remote freeing will be handled elsewhere
		 */
	} else if (eph) {
		/* FIXME remote flush optional but probably good idea */
	} else if (pampd_is_intransit(pampd)) {
		/* did a pers remote get_and_free, so just free local */
		local_pampd = pampd_mask_intransit_and_remote(pampd);
	} else {
		struct flushlist_node *flnode =
			ramster_flnode_alloc(pool);

		flnode->xh.client_id = pampd_remote_node(pampd);
		flnode->xh.pool_id = pool->pool_id;
		flnode->xh.oid = *oid;
		flnode->xh.index = index;
		flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_PAGE;
		spin_lock(&ramster_rem_op_list_lock);
		list_add(&flnode->rem_op.list, &ramster_rem_op_list);
		spin_unlock(&ramster_rem_op_list_lock);
		c = atomic_dec_return(&ramster_remote_pers_pages);
		WARN_ON_ONCE(c < 0);
	}
	return local_pampd;
}

void ramster_count_foreign_pages(bool eph, int count)
{
	int c;

	BUG_ON(count != 1 && count != -1);
	if (eph) {
		if (count > 0) {
			inc_ramster_foreign_eph_pages();
		} else {
			c = atomic_dec_return(
				&ramster_foreign_eph_pages_atomic);
			WARN_ON_ONCE(c < 0);
			ramster_foreign_eph_pages = c;
		}
	} else {
		if (count > 0) {
			inc_ramster_foreign_pers_pages();
		} else {
			c = atomic_dec_return(
				&ramster_foreign_pers_pages_atomic);
			WARN_ON_ONCE(c < 0);
			ramster_foreign_pers_pages = c;
		}
	}
}
509 | ||
510 | /* | |
511 | * For now, just push over a few pages every few seconds to | |
512 | * ensure that it basically works | |
513 | */ | |
514 | static struct workqueue_struct *ramster_remotify_workqueue; | |
515 | static void ramster_remotify_process(struct work_struct *work); | |
516 | static DECLARE_DELAYED_WORK(ramster_remotify_worker, | |
517 | ramster_remotify_process); | |
518 | ||
519 | static void ramster_remotify_queue_delayed_work(unsigned long delay) | |
520 | { | |
521 | if (!queue_delayed_work(ramster_remotify_workqueue, | |
522 | &ramster_remotify_worker, delay)) | |
523 | pr_err("ramster_remotify: bad workqueue\n"); | |
524 | } | |
525 | ||
526 | static void ramster_remote_flush_page(struct flushlist_node *flnode) | |
527 | { | |
528 | struct tmem_xhandle *xh; | |
529 | int remotenode, ret; | |
530 | ||
531 | preempt_disable(); | |
532 | xh = &flnode->xh; | |
533 | remotenode = flnode->xh.client_id; | |
534 | ret = r2net_remote_flush(xh, remotenode); | |
535 | if (ret >= 0) | |
536 | ramster_remote_pages_flushed++; | |
537 | else | |
538 | ramster_remote_page_flushes_failed++; | |
539 | preempt_enable_no_resched(); | |
540 | ramster_flnode_free(flnode, NULL); | |
541 | } | |
542 | ||
543 | static void ramster_remote_flush_object(struct flushlist_node *flnode) | |
544 | { | |
545 | struct tmem_xhandle *xh; | |
546 | int remotenode, ret; | |
547 | ||
548 | preempt_disable(); | |
549 | xh = &flnode->xh; | |
550 | remotenode = flnode->xh.client_id; | |
551 | ret = r2net_remote_flush_object(xh, remotenode); | |
552 | if (ret >= 0) | |
553 | ramster_remote_objects_flushed++; | |
554 | else | |
555 | ramster_remote_object_flushes_failed++; | |
556 | preempt_enable_no_resched(); | |
557 | ramster_flnode_free(flnode, NULL); | |
558 | } | |
559 | ||
int ramster_remotify_pageframe(bool eph)
{
	struct tmem_xhandle xh;
	unsigned int size;
	int remotenode, ret, zbuds;
	struct tmem_pool *pool;
	unsigned long flags;
	unsigned char cksum;
	char *p;
	int i, j;
	unsigned char *tmpmem[2];
	struct tmem_handle th[2];
	unsigned int zsize[2];

	tmpmem[0] = __get_cpu_var(ramster_remoteputmem1);
	tmpmem[1] = __get_cpu_var(ramster_remoteputmem2);
	local_bh_disable();
	zbuds = zbud_make_zombie_lru(&th[0], &tmpmem[0], &zsize[0], eph);
	/* now OK to release lock set in caller */
	local_bh_enable();
	if (zbuds == 0)
		goto out;
	BUG_ON(zbuds > 2);
	for (i = 0; i < zbuds; i++) {
		xh.client_id = th[i].client_id;
		xh.pool_id = th[i].pool_id;
		xh.oid = th[i].oid;
		xh.index = th[i].index;
		size = zsize[i];
		BUG_ON(size == 0 || size > zbud_max_buddy_size());
		for (p = tmpmem[i], cksum = 0, j = 0; j < size; j++)
			cksum += *p++;
		ret = r2net_remote_put(&xh, tmpmem[i], size, eph, &remotenode);
		if (ret != 0) {
			/*
			 * This is some form of a memory leak... if the remote
			 * put fails, there will never be another attempt to
			 * remotify this page. But since we've dropped the zv
			 * pointer, the page may have been freed or the data
			 * replaced so we can't just "put it back" in the
			 * remote op list. Even if we could, not sure where to
			 * put it in the list because there may be flushes
			 * that must be strictly ordered vs the put. So leave
			 * this as a FIXME for now. But count them so we know
			 * if it becomes a problem.
			 */
			if (eph)
				ramster_eph_pages_remote_failed++;
			else
				ramster_pers_pages_remote_failed++;
			break;
		} else {
			if (!eph)
				atomic_inc(&ramster_remote_pers_pages);
		}
		if (eph)
			ramster_eph_pages_remoted++;
		else
			ramster_pers_pages_remoted++;
		/*
		 * data was successfully remoted so change the local version
		 * to point to the remote node where it landed
		 */
		local_bh_disable();
		pool = zcache_get_pool_by_id(LOCAL_CLIENT, xh.pool_id);
		local_irq_save(flags);
		(void)tmem_replace(pool, &xh.oid, xh.index,
				pampd_make_remote(remotenode, size, cksum));
		local_irq_restore(flags);
		zcache_put_pool(pool);
		local_bh_enable();
	}
out:
	return zbuds;
}

static void zcache_do_remotify_flushes(void)
{
	struct ramster_remotify_hdr *rem_op;
	union remotify_list_node *u;

	while (1) {
		spin_lock(&ramster_rem_op_list_lock);
		if (list_empty(&ramster_rem_op_list)) {
			spin_unlock(&ramster_rem_op_list_lock);
			goto out;
		}
		rem_op = list_first_entry(&ramster_rem_op_list,
				struct ramster_remotify_hdr, list);
		list_del_init(&rem_op->list);
		spin_unlock(&ramster_rem_op_list_lock);
		u = (union remotify_list_node *)rem_op;
		switch (rem_op->op) {
		case RAMSTER_REMOTIFY_FLUSH_PAGE:
			ramster_remote_flush_page((struct flushlist_node *)u);
			break;
		case RAMSTER_REMOTIFY_FLUSH_OBJ:
			ramster_remote_flush_object((struct flushlist_node *)u);
			break;
		default:
			BUG();
		}
	}
out:
	return;
}
665 | ||
666 | static void ramster_remotify_process(struct work_struct *work) | |
667 | { | |
668 | static bool remotify_in_progress; | |
669 | int i; | |
670 | ||
671 | BUG_ON(irqs_disabled()); | |
672 | if (remotify_in_progress) | |
673 | goto requeue; | |
674 | if (ramster_remote_target_nodenum == -1) | |
675 | goto requeue; | |
676 | remotify_in_progress = true; | |
677 | if (use_cleancache && ramster_eph_remotify_enable) { | |
678 | for (i = 0; i < 100; i++) { | |
679 | zcache_do_remotify_flushes(); | |
680 | (void)ramster_remotify_pageframe(true); | |
681 | } | |
682 | } | |
683 | if (use_frontswap && ramster_pers_remotify_enable) { | |
684 | for (i = 0; i < 100; i++) { | |
685 | zcache_do_remotify_flushes(); | |
686 | (void)ramster_remotify_pageframe(false); | |
687 | } | |
688 | } | |
689 | remotify_in_progress = false; | |
690 | requeue: | |
691 | ramster_remotify_queue_delayed_work(HZ); | |
692 | } | |
693 | ||
694 | void __init ramster_remotify_init(void) | |
695 | { | |
696 | unsigned long n = 60UL; | |
697 | ramster_remotify_workqueue = | |
698 | create_singlethread_workqueue("ramster_remotify"); | |
699 | ramster_remotify_queue_delayed_work(n * HZ); | |
700 | } | |
701 | ||
static ssize_t ramster_manual_node_up_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	int i;
	char *p = buf;

	for (i = 0; i < MANUAL_NODES; i++)
		if (ramster_nodes_manual_up[i])
			p += sprintf(p, "%d ", i);
	p += sprintf(p, "\n");
	return p - buf;
}

static ssize_t ramster_manual_node_up_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	int err;
	unsigned long node_num;

	err = kstrtoul(buf, 10, &node_num);
	if (err) {
		pr_err("ramster: bad strtoul?\n");
		return -EINVAL;
	}
	if (node_num >= MANUAL_NODES) {
		pr_err("ramster: bad node_num=%lu?\n", node_num);
		return -EINVAL;
	}
	if (ramster_nodes_manual_up[node_num]) {
		pr_err("ramster: node %d already up, ignoring\n",
			(int)node_num);
	} else {
		ramster_nodes_manual_up[node_num] = true;
		r2net_hb_node_up_manual((int)node_num);
	}
	return count;
}

static struct kobj_attribute ramster_manual_node_up_attr = {
	.attr = { .name = "manual_node_up", .mode = 0644 },
	.show = ramster_manual_node_up_show,
	.store = ramster_manual_node_up_store,
};
744 | ||
745 | static ssize_t ramster_remote_target_nodenum_show(struct kobject *kobj, | |
746 | struct kobj_attribute *attr, char *buf) | |
747 | { | |
748 | if (ramster_remote_target_nodenum == -1UL) | |
749 | return sprintf(buf, "unset\n"); | |
750 | else | |
751 | return sprintf(buf, "%d\n", ramster_remote_target_nodenum); | |
752 | } | |
753 | ||
754 | static ssize_t ramster_remote_target_nodenum_store(struct kobject *kobj, | |
755 | struct kobj_attribute *attr, const char *buf, size_t count) | |
756 | { | |
757 | int err; | |
758 | unsigned long node_num; | |
759 | ||
760 | err = kstrtoul(buf, 10, &node_num); | |
761 | if (err) { | |
762 | pr_err("ramster: bad strtoul?\n"); | |
763 | return -EINVAL; | |
764 | } else if (node_num == -1UL) { | |
765 | pr_err("ramster: disabling all remotification, " | |
766 | "data may still reside on remote nodes however\n"); | |
767 | return -EINVAL; | |
768 | } else if (node_num >= MANUAL_NODES) { | |
769 | pr_err("ramster: bad node_num=%lu?\n", node_num); | |
770 | return -EINVAL; | |
771 | } else if (!ramster_nodes_manual_up[node_num]) { | |
772 | pr_err("ramster: node %d not up, ignoring setting " | |
773 | "of remotification target\n", (int)node_num); | |
774 | } else if (r2net_remote_target_node_set((int)node_num) >= 0) { | |
775 | pr_info("ramster: node %d set as remotification target\n", | |
776 | (int)node_num); | |
777 | ramster_remote_target_nodenum = (int)node_num; | |
778 | } else { | |
779 | pr_err("ramster: bad num to node node_num=%d?\n", | |
780 | (int)node_num); | |
781 | return -EINVAL; | |
782 | } | |
783 | return count; | |
784 | } | |
785 | ||
786 | static struct kobj_attribute ramster_remote_target_nodenum_attr = { | |
787 | .attr = { .name = "remote_target_nodenum", .mode = 0644 }, | |
788 | .show = ramster_remote_target_nodenum_show, | |
789 | .store = ramster_remote_target_nodenum_store, | |
790 | }; | |
791 | ||
#define RAMSTER_SYSFS_RO(_name) \
	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%lu\n", ramster_##_name); \
	} \
	static struct kobj_attribute ramster_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = ramster_##_name##_show, \
	}

#define RAMSTER_SYSFS_RW(_name) \
	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%lu\n", ramster_##_name); \
	} \
	static ssize_t ramster_##_name##_store(struct kobject *kobj, \
		struct kobj_attribute *attr, const char *buf, size_t count) \
	{ \
		int err; \
		unsigned long enable; \
		err = kstrtoul(buf, 10, &enable); \
		if (err) \
			return -EINVAL; \
		ramster_##_name = enable; \
		return count; \
	} \
	static struct kobj_attribute ramster_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0644 }, \
		.show = ramster_##_name##_show, \
		.store = ramster_##_name##_store, \
	}

#define RAMSTER_SYSFS_RO_ATOMIC(_name) \
	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%d\n", atomic_read(&ramster_##_name)); \
	} \
	static struct kobj_attribute ramster_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = ramster_##_name##_show, \
	}
836 | ||
837 | RAMSTER_SYSFS_RO(interface_revision); | |
838 | RAMSTER_SYSFS_RO_ATOMIC(remote_pers_pages); | |
839 | RAMSTER_SYSFS_RW(pers_remotify_enable); | |
840 | RAMSTER_SYSFS_RW(eph_remotify_enable); | |
841 | ||
842 | static struct attribute *ramster_attrs[] = { | |
843 | &ramster_interface_revision_attr.attr, | |
844 | &ramster_remote_pers_pages_attr.attr, | |
845 | &ramster_manual_node_up_attr.attr, | |
846 | &ramster_remote_target_nodenum_attr.attr, | |
847 | &ramster_pers_remotify_enable_attr.attr, | |
848 | &ramster_eph_remotify_enable_attr.attr, | |
849 | NULL, | |
850 | }; | |
851 | ||
852 | static struct attribute_group ramster_attr_group = { | |
853 | .attrs = ramster_attrs, | |
854 | .name = "ramster", | |
855 | }; | |
856 | ||
857 | /* | |
858 | * frontswap selfshrinking | |
859 | */ | |
860 | ||
861 | /* In HZ, controls frequency of worker invocation. */ | |
862 | static unsigned int selfshrink_interval __read_mostly = 5; | |
863 | /* Enable/disable with sysfs. */ | |
864 | static bool frontswap_selfshrinking __read_mostly; | |
865 | ||
866 | static void selfshrink_process(struct work_struct *work); | |
867 | static DECLARE_DELAYED_WORK(selfshrink_worker, selfshrink_process); | |
868 | ||
869 | /* Enable/disable with kernel boot option. */ | |
870 | static bool use_frontswap_selfshrink __initdata = true; | |
871 | ||
872 | /* | |
873 | * The default values for the following parameters were deemed reasonable | |
874 | * by experimentation, may be workload-dependent, and can all be | |
875 | * adjusted via sysfs. | |
876 | */ | |
877 | ||
878 | /* Control rate for frontswap shrinking. Higher hysteresis is slower. */ | |
879 | static unsigned int frontswap_hysteresis __read_mostly = 20; | |
880 | ||
881 | /* | |
882 | * Number of selfshrink worker invocations to wait before observing that | |
883 | * frontswap selfshrinking should commence. Note that selfshrinking does | |
884 | * not use a separate worker thread. | |
885 | */ | |
886 | static unsigned int frontswap_inertia __read_mostly = 3; | |
887 | ||
888 | /* Countdown to next invocation of frontswap_shrink() */ | |
889 | static unsigned long frontswap_inertia_counter; | |
890 | ||
891 | /* | |
892 | * Invoked by the selfshrink worker thread, uses current number of pages | |
893 | * in frontswap (frontswap_curr_pages()), previous status, and control | |
894 | * values (hysteresis and inertia) to determine if frontswap should be | |
895 | * shrunk and what the new frontswap size should be. Note that | |
896 | * frontswap_shrink is essentially a partial swapoff that immediately | |
897 | * transfers pages from the "swap device" (frontswap) back into kernel | |
898 | * RAM; despite the name, frontswap "shrinking" is very different from | |
899 | * the "shrinker" interface used by the kernel MM subsystem to reclaim | |
900 | * memory. | |
901 | */ | |
902 | static void frontswap_selfshrink(void) | |
903 | { | |
904 | static unsigned long cur_frontswap_pages; | |
905 | static unsigned long last_frontswap_pages; | |
906 | static unsigned long tgt_frontswap_pages; | |
907 | ||
908 | last_frontswap_pages = cur_frontswap_pages; | |
909 | cur_frontswap_pages = frontswap_curr_pages(); | |
910 | if (!cur_frontswap_pages || | |
911 | (cur_frontswap_pages > last_frontswap_pages)) { | |
912 | frontswap_inertia_counter = frontswap_inertia; | |
913 | return; | |
914 | } | |
915 | if (frontswap_inertia_counter && --frontswap_inertia_counter) | |
916 | return; | |
917 | if (cur_frontswap_pages <= frontswap_hysteresis) | |
918 | tgt_frontswap_pages = 0; | |
919 | else | |
920 | tgt_frontswap_pages = cur_frontswap_pages - | |
921 | (cur_frontswap_pages / frontswap_hysteresis); | |
922 | frontswap_shrink(tgt_frontswap_pages); | |
923 | } | |
924 | ||
925 | static int __init ramster_nofrontswap_selfshrink_setup(char *s) | |
926 | { | |
927 | use_frontswap_selfshrink = false; | |
928 | return 1; | |
929 | } | |
930 | ||
931 | __setup("noselfshrink", ramster_nofrontswap_selfshrink_setup); | |
932 | ||
933 | static void selfshrink_process(struct work_struct *work) | |
934 | { | |
935 | if (frontswap_selfshrinking && frontswap_enabled) { | |
936 | frontswap_selfshrink(); | |
937 | schedule_delayed_work(&selfshrink_worker, | |
938 | selfshrink_interval * HZ); | |
939 | } | |
940 | } | |
941 | ||
942 | void ramster_cpu_up(int cpu) | |
943 | { | |
944 | unsigned char *p1 = kzalloc(PAGE_SIZE, GFP_KERNEL | __GFP_REPEAT); | |
945 | unsigned char *p2 = kzalloc(PAGE_SIZE, GFP_KERNEL | __GFP_REPEAT); | |
946 | BUG_ON(!p1 || !p2); | |
947 | per_cpu(ramster_remoteputmem1, cpu) = p1; | |
948 | per_cpu(ramster_remoteputmem2, cpu) = p2; | |
949 | } | |
950 | ||
951 | void ramster_cpu_down(int cpu) | |
952 | { | |
953 | struct ramster_preload *kp; | |
954 | ||
955 | kfree(per_cpu(ramster_remoteputmem1, cpu)); | |
956 | per_cpu(ramster_remoteputmem1, cpu) = NULL; | |
957 | kfree(per_cpu(ramster_remoteputmem2, cpu)); | |
958 | per_cpu(ramster_remoteputmem2, cpu) = NULL; | |
959 | kp = &per_cpu(ramster_preloads, cpu); | |
960 | if (kp->flnode) { | |
961 | kmem_cache_free(ramster_flnode_cache, kp->flnode); | |
962 | kp->flnode = NULL; | |
963 | } | |
964 | } | |
965 | ||
void ramster_register_pamops(struct tmem_pamops *pamops)
{
	pamops->free_obj = ramster_pampd_free_obj;
	pamops->new_obj = ramster_pampd_new_obj;
	pamops->replace_in_obj = ramster_pampd_replace_in_obj;
	pamops->is_remote = ramster_pampd_is_remote;
	pamops->repatriate = ramster_pampd_repatriate;
	pamops->repatriate_preload = ramster_pampd_repatriate_preload;
}

void __init ramster_init(bool cleancache, bool frontswap,
				bool frontswap_exclusive_gets)
{
	int ret = 0;

	if (cleancache)
		use_cleancache = true;
	if (frontswap)
		use_frontswap = true;
	if (frontswap_exclusive_gets)
		use_frontswap_exclusive_gets = true;
	ramster_debugfs_init();
	ret = sysfs_create_group(mm_kobj, &ramster_attr_group);
	if (ret)
		pr_err("ramster: can't create sysfs for ramster\n");
	(void)r2net_register_handlers();
	INIT_LIST_HEAD(&ramster_rem_op_list);
	ramster_flnode_cache = kmem_cache_create("ramster_flnode",
				sizeof(struct flushlist_node), 0, 0, NULL);
	frontswap_selfshrinking = use_frontswap_selfshrink;
	if (frontswap_selfshrinking) {
		pr_info("ramster: Initializing frontswap selfshrink driver.\n");
		schedule_delayed_work(&selfshrink_worker,
			selfshrink_interval * HZ);
	}
	ramster_remotify_init();
}