/*
 * ramster.c
 *
 * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
 *
 * RAMster implements peer-to-peer transcendent memory, allowing a "cluster" of
 * kernels to dynamically pool their RAM so that a RAM-hungry workload on one
 * machine can temporarily and transparently utilize RAM on another machine
 * which is presumably idle or running a non-RAM-hungry workload.
 *
 * RAMster combines a clustering and messaging foundation based on the ocfs2
 * cluster layer with the in-kernel compression implementation of zcache, and
 * adds code to glue them together. When a page is "put" to RAMster, it is
 * compressed and stored locally. Periodically, a thread will "remotify" these
 * pages by sending them via messages to a remote machine. When the page is
 * later needed as indicated by a page fault, a "get" is issued. If the data
 * is local, it is uncompressed and the fault is resolved. If the data is
 * remote, a message is sent to fetch the data and the faulting thread sleeps;
 * when the data arrives, the thread awakens, the data is decompressed and
 * the fault is resolved.
 *
 * As of V5, clusters up to eight nodes are supported; each node can remotify
 * pages to one specified node, so clusters can be configured as clients to
 * a "memory server". Some simple policy is in place that will need to be
 * refined over time. Larger clusters and fault-resistant protocols can also
 * be added over time.
 */

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/list.h>
#include <linux/lzo.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/frontswap.h>
#include "../tmem.h"
#include "../zcache.h"
#include "../zbud.h"
#include "ramster.h"
#include "ramster_nodemanager.h"
#include "tcp.h"

#define RAMSTER_TESTING

#ifndef CONFIG_SYSFS
#error "ramster needs sysfs to define cluster nodes to use"
#endif

static bool use_cleancache __read_mostly;
static bool use_frontswap __read_mostly;
static bool use_frontswap_exclusive_gets __read_mostly;

/* These must be sysfs not debugfs as they are checked/used by userland!! */
static unsigned long ramster_interface_revision __read_mostly =
	R2NM_API_VERSION; /* interface revision must match userspace! */
static unsigned long ramster_pers_remotify_enable __read_mostly;
static unsigned long ramster_eph_remotify_enable __read_mostly;
static atomic_t ramster_remote_pers_pages = ATOMIC_INIT(0);
#define MANUAL_NODES 8
static bool ramster_nodes_manual_up[MANUAL_NODES] __read_mostly;
static int ramster_remote_target_nodenum __read_mostly = -1;

/* these counters are made available via debugfs */
static long ramster_flnodes;
static atomic_t ramster_flnodes_atomic = ATOMIC_INIT(0);
static unsigned long ramster_flnodes_max;
static inline void inc_ramster_flnodes(void)
{
	ramster_flnodes = atomic_inc_return(&ramster_flnodes_atomic);
	if (ramster_flnodes > ramster_flnodes_max)
		ramster_flnodes_max = ramster_flnodes;
}
static inline void dec_ramster_flnodes(void)
{
	ramster_flnodes = atomic_dec_return(&ramster_flnodes_atomic);
}
static ssize_t ramster_foreign_eph_pages;
static atomic_t ramster_foreign_eph_pages_atomic = ATOMIC_INIT(0);
static ssize_t ramster_foreign_eph_pages_max;
static inline void inc_ramster_foreign_eph_pages(void)
{
	ramster_foreign_eph_pages = atomic_inc_return(
		&ramster_foreign_eph_pages_atomic);
	if (ramster_foreign_eph_pages > ramster_foreign_eph_pages_max)
		ramster_foreign_eph_pages_max = ramster_foreign_eph_pages;
}
static inline void dec_ramster_foreign_eph_pages(void)
{
	ramster_foreign_eph_pages = atomic_dec_return(
		&ramster_foreign_eph_pages_atomic);
}
static ssize_t ramster_foreign_pers_pages;
static atomic_t ramster_foreign_pers_pages_atomic = ATOMIC_INIT(0);
static ssize_t ramster_foreign_pers_pages_max;
static inline void inc_ramster_foreign_pers_pages(void)
{
	ramster_foreign_pers_pages = atomic_inc_return(
		&ramster_foreign_pers_pages_atomic);
	if (ramster_foreign_pers_pages > ramster_foreign_pers_pages_max)
		ramster_foreign_pers_pages_max = ramster_foreign_pers_pages;
}
static inline void dec_ramster_foreign_pers_pages(void)
{
	ramster_foreign_pers_pages = atomic_dec_return(
		&ramster_foreign_pers_pages_atomic);
}
static ssize_t ramster_eph_pages_remoted;
static ssize_t ramster_pers_pages_remoted;
static ssize_t ramster_eph_pages_remote_failed;
static ssize_t ramster_pers_pages_remote_failed;
static ssize_t ramster_remote_eph_pages_succ_get;
static ssize_t ramster_remote_pers_pages_succ_get;
static ssize_t ramster_remote_eph_pages_unsucc_get;
static ssize_t ramster_remote_pers_pages_unsucc_get;
static ssize_t ramster_pers_pages_remote_nomem;
static ssize_t ramster_remote_objects_flushed;
static ssize_t ramster_remote_object_flushes_failed;
static ssize_t ramster_remote_pages_flushed;
static ssize_t ramster_remote_page_flushes_failed;
/* FIXME frontswap selfshrinking knobs in debugfs? */

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#define zdfs	debugfs_create_size_t
#define zdfs64	debugfs_create_u64
static int __init ramster_debugfs_init(void)
{
	struct dentry *root = debugfs_create_dir("ramster", NULL);
	if (root == NULL)
		return -ENXIO;

	zdfs("eph_pages_remoted", S_IRUGO, root, &ramster_eph_pages_remoted);
	zdfs("pers_pages_remoted", S_IRUGO, root, &ramster_pers_pages_remoted);
	zdfs("eph_pages_remote_failed", S_IRUGO, root,
			&ramster_eph_pages_remote_failed);
	zdfs("pers_pages_remote_failed", S_IRUGO, root,
			&ramster_pers_pages_remote_failed);
	zdfs("remote_eph_pages_succ_get", S_IRUGO, root,
			&ramster_remote_eph_pages_succ_get);
	zdfs("remote_pers_pages_succ_get", S_IRUGO, root,
			&ramster_remote_pers_pages_succ_get);
	zdfs("remote_eph_pages_unsucc_get", S_IRUGO, root,
			&ramster_remote_eph_pages_unsucc_get);
	zdfs("remote_pers_pages_unsucc_get", S_IRUGO, root,
			&ramster_remote_pers_pages_unsucc_get);
	zdfs("pers_pages_remote_nomem", S_IRUGO, root,
			&ramster_pers_pages_remote_nomem);
	zdfs("remote_objects_flushed", S_IRUGO, root,
			&ramster_remote_objects_flushed);
	zdfs("remote_pages_flushed", S_IRUGO, root,
			&ramster_remote_pages_flushed);
	zdfs("remote_object_flushes_failed", S_IRUGO, root,
			&ramster_remote_object_flushes_failed);
	zdfs("remote_page_flushes_failed", S_IRUGO, root,
			&ramster_remote_page_flushes_failed);
	zdfs("foreign_eph_pages", S_IRUGO, root,
			&ramster_foreign_eph_pages);
	zdfs("foreign_eph_pages_max", S_IRUGO, root,
			&ramster_foreign_eph_pages_max);
	zdfs("foreign_pers_pages", S_IRUGO, root,
			&ramster_foreign_pers_pages);
	zdfs("foreign_pers_pages_max", S_IRUGO, root,
			&ramster_foreign_pers_pages_max);
	return 0;
}
#undef zdfs
#undef zdfs64
#else
static inline int ramster_debugfs_init(void)
{
	return 0;
}
#endif

static LIST_HEAD(ramster_rem_op_list);
static DEFINE_SPINLOCK(ramster_rem_op_list_lock);
static DEFINE_PER_CPU(struct ramster_preload, ramster_preloads);

static DEFINE_PER_CPU(unsigned char *, ramster_remoteputmem1);
static DEFINE_PER_CPU(unsigned char *, ramster_remoteputmem2);

static struct kmem_cache *ramster_flnode_cache __read_mostly;

static struct flushlist_node *ramster_flnode_alloc(struct tmem_pool *pool)
{
	struct flushlist_node *flnode = NULL;
	struct ramster_preload *kp;

	kp = &__get_cpu_var(ramster_preloads);
	flnode = kp->flnode;
	BUG_ON(flnode == NULL);
	kp->flnode = NULL;
	inc_ramster_flnodes();
	return flnode;
}

/* the "flush list" asynchronously collects pages to remotely flush */
#define FLUSH_ENTIRE_OBJECT ((uint32_t)-1)
static void ramster_flnode_free(struct flushlist_node *flnode,
				struct tmem_pool *pool)
{
	dec_ramster_flnodes();
	BUG_ON(ramster_flnodes < 0);
	kmem_cache_free(ramster_flnode_cache, flnode);
}

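/*
 * Ensure this cpu's ramster_preloads has a flushlist_node preallocated so
 * that a subsequent ramster_flnode_alloc() in atomic context cannot fail.
 * Must be called with interrupts disabled.
 */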
int ramster_do_preload_flnode(struct tmem_pool *pool)
{
	struct ramster_preload *kp;
	struct flushlist_node *flnode;
	int ret = -ENOMEM;

	BUG_ON(!irqs_disabled());
	if (unlikely(ramster_flnode_cache == NULL))
		BUG();
	kp = &__get_cpu_var(ramster_preloads);
	flnode = kmem_cache_alloc(ramster_flnode_cache, GFP_ATOMIC);
	if (unlikely(flnode == NULL) && kp->flnode == NULL)
		BUG(); /* FIXME handle more gracefully, but how??? */
	else if (kp->flnode == NULL)
		kp->flnode = flnode;
	else
		kmem_cache_free(ramster_flnode_cache, flnode);
	return ret;
}

/*
 * Called by the message handler after a (still compressed) page has been
 * fetched from the remote machine in response to an "is_remote" tmem_get
 * or persistent tmem_localify. For a tmem_get, "extra" is the address of
 * the page that is to be filled to successfully resolve the tmem_get; for
 * a (persistent) tmem_localify, "extra" is NULL (as the data is placed only
 * in the local zcache). "data" points to "size" bytes of (compressed) data
 * passed in the message. In the case of a persistent remote get, if
 * pre-allocation was successful (see ramster_repatriate_preload), the page
 * is placed into both local zcache and at "extra".
 */
int ramster_localify(int pool_id, struct tmem_oid *oidp, uint32_t index,
			char *data, unsigned int size, void *extra)
{
	int ret = -ENOENT;
	unsigned long flags;
	struct tmem_pool *pool;
	bool eph, delete = false;
	void *pampd, *saved_hb;
	struct tmem_obj *obj;

	pool = zcache_get_pool_by_id(LOCAL_CLIENT, pool_id);
	if (unlikely(pool == NULL))
		/* pool doesn't exist anymore */
		goto out;
	eph = is_ephemeral(pool);
	local_irq_save(flags); /* FIXME: maybe only disable softirqs? */
	pampd = tmem_localify_get_pampd(pool, oidp, index, &obj, &saved_hb);
	if (pampd == NULL) {
		/* hmmm... must have been a flush while waiting */
#ifdef RAMSTER_TESTING
		pr_err("UNTESTED pampd==NULL in ramster_localify\n");
#endif
		if (eph)
			ramster_remote_eph_pages_unsucc_get++;
		else
			ramster_remote_pers_pages_unsucc_get++;
		obj = NULL;
		goto finish;
	} else if (unlikely(!pampd_is_remote(pampd))) {
		/* hmmm... must have been a dup put while waiting */
#ifdef RAMSTER_TESTING
		pr_err("UNTESTED dup while waiting in ramster_localify\n");
#endif
		if (eph)
			ramster_remote_eph_pages_unsucc_get++;
		else
			ramster_remote_pers_pages_unsucc_get++;
		obj = NULL;
		pampd = NULL;
		ret = -EEXIST;
		goto finish;
	} else if (size == 0) {
		/* no remote data, delete the local is_remote pampd */
		pampd = NULL;
		if (eph)
			ramster_remote_eph_pages_unsucc_get++;
		else
			BUG();
		delete = true;
		goto finish;
	}
	if (pampd_is_intransit(pampd)) {
		/*
		 * a pampd is marked intransit if it is remote and space has
		 * been allocated for it locally (note, only happens for
		 * persistent pages, in which case the remote copy is freed)
		 */
		BUG_ON(eph);
		pampd = pampd_mask_intransit_and_remote(pampd);
		zbud_copy_to_zbud(pampd, data, size);
	} else {
		/*
		 * setting pampd to NULL tells tmem_localify_finish to leave
		 * pampd alone... meaning it is left pointing to the
		 * remote copy
		 */
		pampd = NULL;
		obj = NULL;
	}
	/*
	 * but in all cases, we decompress direct-to-memory to complete
	 * the remotify and return success
	 */
	BUG_ON(extra == NULL);
	zcache_decompress_to_page(data, size, (struct page *)extra);
	if (eph)
		ramster_remote_eph_pages_succ_get++;
	else
		ramster_remote_pers_pages_succ_get++;
	ret = 0;
finish:
	tmem_localify_finish(obj, index, pampd, saved_hb, delete);
	zcache_put_pool(pool);
	local_irq_restore(flags);
out:
	return ret;
}

void ramster_pampd_new_obj(struct tmem_obj *obj)
{
	obj->extra = NULL;
}

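/*
 * Called when a tmem object is freed (or a whole pool is destroyed). If the
 * object has pages on a remote node (obj->extra set), queue a
 * RAMSTER_REMOTIFY_FLUSH_OBJ op (skipped for ephemeral data on pool destroy)
 * so the remote copies are flushed asynchronously by the remotify worker.
 */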
void ramster_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj,
				bool pool_destroy)
{
	struct flushlist_node *flnode;

	BUG_ON(preemptible());
	if (obj->extra == NULL)
		return;
	if (pool_destroy && is_ephemeral(pool))
		/* FIXME don't bother with remote eph data for now */
		return;
	BUG_ON(!pampd_is_remote(obj->extra));
	flnode = ramster_flnode_alloc(pool);
	flnode->xh.client_id = pampd_remote_node(obj->extra);
	flnode->xh.pool_id = pool->pool_id;
	flnode->xh.oid = obj->oid;
	flnode->xh.index = FLUSH_ENTIRE_OBJECT;
	flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_OBJ;
	spin_lock(&ramster_rem_op_list_lock);
	list_add(&flnode->rem_op.list, &ramster_rem_op_list);
	spin_unlock(&ramster_rem_op_list_lock);
}

/*
 * Called on a remote persistent tmem_get to attempt to preallocate
 * local storage for the data contained in the remote persistent page.
 * If successfully preallocated, returns the pampd, marked as remote and
 * in_transit. Else returns NULL. Note that the appropriate tmem data
 * structure must be locked.
 */
void *ramster_pampd_repatriate_preload(void *pampd, struct tmem_pool *pool,
					struct tmem_oid *oidp, uint32_t index,
					bool *intransit)
{
	int clen = pampd_remote_size(pampd), c;
	void *ret_pampd = NULL;
	unsigned long flags;
	struct tmem_handle th;

	BUG_ON(!pampd_is_remote(pampd));
	BUG_ON(is_ephemeral(pool));
	if (use_frontswap_exclusive_gets)
		/* don't need local storage */
		goto out;
	if (pampd_is_intransit(pampd)) {
		/*
		 * to avoid multiple allocations (and maybe a memory leak)
		 * don't preallocate if already in the process of being
		 * repatriated
		 */
		*intransit = true;
		goto out;
	}
	*intransit = false;
	local_irq_save(flags);
	th.client_id = pampd_remote_node(pampd);
	th.pool_id = pool->pool_id;
	th.oid = *oidp;
	th.index = index;
	ret_pampd = zcache_pampd_create(NULL, clen, true, false, &th);
	if (ret_pampd != NULL) {
		/*
		 * a pampd is marked intransit if it is remote and space has
		 * been allocated for it locally (note, only happens for
		 * persistent pages, in which case the remote copy is freed)
		 */
		ret_pampd = pampd_mark_intransit(ret_pampd);
		c = atomic_dec_return(&ramster_remote_pers_pages);
		WARN_ON_ONCE(c < 0);
	} else {
		ramster_pers_pages_remote_nomem++;
	}
	local_irq_restore(flags);
out:
	return ret_pampd;
}

/*
 * Called on a remote tmem_get to invoke a message to fetch the page.
 * Might sleep so no tmem locks can be held. "extra" is passed
 * all the way through the round-trip messaging to ramster_localify.
 */
int ramster_pampd_repatriate(void *fake_pampd, void *real_pampd,
				struct tmem_pool *pool,
				struct tmem_oid *oid, uint32_t index,
				bool free, void *extra)
{
	struct tmem_xhandle xh;
	int ret;

	if (pampd_is_intransit(real_pampd))
		/* have local space pre-reserved, so free remote copy */
		free = true;
	xh = tmem_xhandle_fill(LOCAL_CLIENT, pool, oid, index);
	/* unreliable request/response for now */
	ret = r2net_remote_async_get(&xh, free,
					pampd_remote_node(fake_pampd),
					pampd_remote_size(fake_pampd),
					pampd_remote_cksum(fake_pampd),
					extra);
	return ret;
}

bool ramster_pampd_is_remote(void *pampd)
{
	return pampd_is_remote(pampd);
}

int ramster_pampd_replace_in_obj(void *new_pampd, struct tmem_obj *obj)
{
	int ret = -1;

	if (new_pampd != NULL) {
		if (obj->extra == NULL)
			obj->extra = new_pampd;
		/* enforce that all remote pages in an object reside
		 * in the same node! */
		else if (pampd_remote_node(new_pampd) !=
				pampd_remote_node((void *)(obj->extra)))
			BUG();
		ret = 0;
	}
	return ret;
}

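/*
 * Free the local "remote marker" pampd. A NULL oid means the remote copy is
 * dealt with elsewhere; ephemeral remote data is simply abandoned (FIXME);
 * an intransit persistent page hands back its preallocated local pampd to be
 * freed; otherwise a remote page flush is queued for the remotify worker.
 */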
void *ramster_pampd_free(void *pampd, struct tmem_pool *pool,
			struct tmem_oid *oid, uint32_t index, bool acct)
{
	bool eph = is_ephemeral(pool);
	void *local_pampd = NULL;
	int c;

	BUG_ON(preemptible());
	BUG_ON(!pampd_is_remote(pampd));
	WARN_ON(acct == false);
	if (oid == NULL) {
		/*
		 * a NULL oid means to ignore this pampd free
		 * as the remote freeing will be handled elsewhere
		 */
	} else if (eph) {
		/* FIXME remote flush optional but probably good idea */
	} else if (pampd_is_intransit(pampd)) {
		/* did a pers remote get_and_free, so just free local */
		local_pampd = pampd_mask_intransit_and_remote(pampd);
	} else {
		struct flushlist_node *flnode =
			ramster_flnode_alloc(pool);

		flnode->xh.client_id = pampd_remote_node(pampd);
		flnode->xh.pool_id = pool->pool_id;
		flnode->xh.oid = *oid;
		flnode->xh.index = index;
		flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_PAGE;
		spin_lock(&ramster_rem_op_list_lock);
		list_add(&flnode->rem_op.list, &ramster_rem_op_list);
		spin_unlock(&ramster_rem_op_list_lock);
		c = atomic_dec_return(&ramster_remote_pers_pages);
		WARN_ON_ONCE(c < 0);
	}
	return local_pampd;
}

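/*
 * Maintain the count (and max watermark) of foreign pages -- pages stored
 * locally on behalf of a remote node -- for the debugfs counters. "count"
 * must be +1 or -1.
 */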
void ramster_count_foreign_pages(bool eph, int count)
{
	BUG_ON(count != 1 && count != -1);
	if (eph) {
		if (count > 0) {
			inc_ramster_foreign_eph_pages();
		} else {
			dec_ramster_foreign_eph_pages();
			WARN_ON_ONCE(ramster_foreign_eph_pages < 0);
		}
	} else {
		if (count > 0) {
			inc_ramster_foreign_pers_pages();
		} else {
			dec_ramster_foreign_pers_pages();
			WARN_ON_ONCE(ramster_foreign_pers_pages < 0);
		}
	}
}

/*
 * For now, just push over a few pages every few seconds to
 * ensure that it basically works
 */
static struct workqueue_struct *ramster_remotify_workqueue;
static void ramster_remotify_process(struct work_struct *work);
static DECLARE_DELAYED_WORK(ramster_remotify_worker,
		ramster_remotify_process);

static void ramster_remotify_queue_delayed_work(unsigned long delay)
{
	if (!queue_delayed_work(ramster_remotify_workqueue,
				&ramster_remotify_worker, delay))
		pr_err("ramster_remotify: bad workqueue\n");
}

static void ramster_remote_flush_page(struct flushlist_node *flnode)
{
	struct tmem_xhandle *xh;
	int remotenode, ret;

	preempt_disable();
	xh = &flnode->xh;
	remotenode = flnode->xh.client_id;
	ret = r2net_remote_flush(xh, remotenode);
	if (ret >= 0)
		ramster_remote_pages_flushed++;
	else
		ramster_remote_page_flushes_failed++;
	preempt_enable_no_resched();
	ramster_flnode_free(flnode, NULL);
}

static void ramster_remote_flush_object(struct flushlist_node *flnode)
{
	struct tmem_xhandle *xh;
	int remotenode, ret;

	preempt_disable();
	xh = &flnode->xh;
	remotenode = flnode->xh.client_id;
	ret = r2net_remote_flush_object(xh, remotenode);
	if (ret >= 0)
		ramster_remote_objects_flushed++;
	else
		ramster_remote_object_flushes_failed++;
	preempt_enable_no_resched();
	ramster_flnode_free(flnode, NULL);
}

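/*
 * Remotify up to two "zombie" zbuds: pull them from the local zbud LRU,
 * compute a simple additive checksum over each, send the (still compressed)
 * data to the remote target node via r2net_remote_put(), and on success
 * replace the local pampd with a "remote" marker recording the node, size
 * and checksum. Returns the number of zbuds processed.
 */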
int ramster_remotify_pageframe(bool eph)
{
	struct tmem_xhandle xh;
	unsigned int size;
	int remotenode, ret, zbuds;
	struct tmem_pool *pool;
	unsigned long flags;
	unsigned char cksum;
	char *p;
	int i, j;
	unsigned char *tmpmem[2];
	struct tmem_handle th[2];
	unsigned int zsize[2];

	tmpmem[0] = __get_cpu_var(ramster_remoteputmem1);
	tmpmem[1] = __get_cpu_var(ramster_remoteputmem2);
	local_bh_disable();
	zbuds = zbud_make_zombie_lru(&th[0], &tmpmem[0], &zsize[0], eph);
	/* now OK to release lock set in caller */
	local_bh_enable();
	if (zbuds == 0)
		goto out;
	BUG_ON(zbuds > 2);
	for (i = 0; i < zbuds; i++) {
		xh.client_id = th[i].client_id;
		xh.pool_id = th[i].pool_id;
		xh.oid = th[i].oid;
		xh.index = th[i].index;
		size = zsize[i];
		BUG_ON(size == 0 || size > zbud_max_buddy_size());
		for (p = tmpmem[i], cksum = 0, j = 0; j < size; j++)
			cksum += *p++;
		ret = r2net_remote_put(&xh, tmpmem[i], size, eph, &remotenode);
		if (ret != 0) {
			/*
			 * This is some form of a memory leak... if the remote
			 * put fails, there will never be another attempt to
			 * remotify this page. But since we've dropped the zv
			 * pointer, the page may have been freed or the data
			 * replaced so we can't just "put it back" in the
			 * remote op list. Even if we could, not sure where to
			 * put it in the list because there may be flushes
			 * that must be strictly ordered vs the put. So leave
			 * this as a FIXME for now. But count them so we know
			 * if it becomes a problem.
			 */
			if (eph)
				ramster_eph_pages_remote_failed++;
			else
				ramster_pers_pages_remote_failed++;
			break;
		} else {
			if (!eph)
				atomic_inc(&ramster_remote_pers_pages);
		}
		if (eph)
			ramster_eph_pages_remoted++;
		else
			ramster_pers_pages_remoted++;
		/*
		 * data was successfully remoted so change the local version
		 * to point to the remote node where it landed
		 */
		local_bh_disable();
		pool = zcache_get_pool_by_id(LOCAL_CLIENT, xh.pool_id);
		local_irq_save(flags);
		(void)tmem_replace(pool, &xh.oid, xh.index,
				pampd_make_remote(remotenode, size, cksum));
		local_irq_restore(flags);
		zcache_put_pool(pool);
		local_bh_enable();
	}
out:
	return zbuds;
}

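/* drain the queued remote page/object flush ops built up by the pamops */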
static void zcache_do_remotify_flushes(void)
{
	struct ramster_remotify_hdr *rem_op;
	union remotify_list_node *u;

	while (1) {
		spin_lock(&ramster_rem_op_list_lock);
		if (list_empty(&ramster_rem_op_list)) {
			spin_unlock(&ramster_rem_op_list_lock);
			goto out;
		}
		rem_op = list_first_entry(&ramster_rem_op_list,
				struct ramster_remotify_hdr, list);
		list_del_init(&rem_op->list);
		spin_unlock(&ramster_rem_op_list_lock);
		u = (union remotify_list_node *)rem_op;
		switch (rem_op->op) {
		case RAMSTER_REMOTIFY_FLUSH_PAGE:
			ramster_remote_flush_page((struct flushlist_node *)u);
			break;
		case RAMSTER_REMOTIFY_FLUSH_OBJ:
			ramster_remote_flush_object((struct flushlist_node *)u);
			break;
		default:
			BUG();
		}
	}
out:
	return;
}

static void ramster_remotify_process(struct work_struct *work)
{
	static bool remotify_in_progress;
	int i;

	BUG_ON(irqs_disabled());
	if (remotify_in_progress)
		goto requeue;
	if (ramster_remote_target_nodenum == -1)
		goto requeue;
	remotify_in_progress = true;
	if (use_cleancache && ramster_eph_remotify_enable) {
		for (i = 0; i < 100; i++) {
			zcache_do_remotify_flushes();
			(void)ramster_remotify_pageframe(true);
		}
	}
	if (use_frontswap && ramster_pers_remotify_enable) {
		for (i = 0; i < 100; i++) {
			zcache_do_remotify_flushes();
			(void)ramster_remotify_pageframe(false);
		}
	}
	remotify_in_progress = false;
requeue:
	ramster_remotify_queue_delayed_work(HZ);
}

void __init ramster_remotify_init(void)
{
	unsigned long n = 60UL;
	ramster_remotify_workqueue =
		create_singlethread_workqueue("ramster_remotify");
	ramster_remotify_queue_delayed_work(n * HZ);
}

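/*
 * sysfs knob: writing a node number to manual_node_up marks that node as up
 * and notifies the r2net heartbeat layer; reading lists the nodes that have
 * been manually brought up.
 */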
static ssize_t ramster_manual_node_up_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	int i;
	char *p = buf;
	for (i = 0; i < MANUAL_NODES; i++)
		if (ramster_nodes_manual_up[i])
			p += sprintf(p, "%d ", i);
	p += sprintf(p, "\n");
	return p - buf;
}

static ssize_t ramster_manual_node_up_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	int err;
	unsigned long node_num;

	err = kstrtoul(buf, 10, &node_num);
	if (err) {
		pr_err("ramster: bad strtoul?\n");
		return -EINVAL;
	}
	if (node_num >= MANUAL_NODES) {
		pr_err("ramster: bad node_num=%lu?\n", node_num);
		return -EINVAL;
	}
	if (ramster_nodes_manual_up[node_num]) {
		pr_err("ramster: node %d already up, ignoring\n",
							(int)node_num);
	} else {
		ramster_nodes_manual_up[node_num] = true;
		r2net_hb_node_up_manual((int)node_num);
	}
	return count;
}

static struct kobj_attribute ramster_manual_node_up_attr = {
	.attr = { .name = "manual_node_up", .mode = 0644 },
	.show = ramster_manual_node_up_show,
	.store = ramster_manual_node_up_store,
};

static ssize_t ramster_remote_target_nodenum_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	if (ramster_remote_target_nodenum == -1UL)
		return sprintf(buf, "unset\n");
	else
		return sprintf(buf, "%d\n", ramster_remote_target_nodenum);
}

static ssize_t ramster_remote_target_nodenum_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	int err;
	unsigned long node_num;

	err = kstrtoul(buf, 10, &node_num);
	if (err) {
		pr_err("ramster: bad strtoul?\n");
		return -EINVAL;
	} else if (node_num == -1UL) {
		pr_err("ramster: disabling all remotification, "
			"data may still reside on remote nodes however\n");
		return -EINVAL;
	} else if (node_num >= MANUAL_NODES) {
		pr_err("ramster: bad node_num=%lu?\n", node_num);
		return -EINVAL;
	} else if (!ramster_nodes_manual_up[node_num]) {
		pr_err("ramster: node %d not up, ignoring setting "
			"of remotification target\n", (int)node_num);
	} else if (r2net_remote_target_node_set((int)node_num) >= 0) {
		pr_info("ramster: node %d set as remotification target\n",
				(int)node_num);
		ramster_remote_target_nodenum = (int)node_num;
	} else {
		pr_err("ramster: bad num to node node_num=%d?\n",
				(int)node_num);
		return -EINVAL;
	}
	return count;
}

static struct kobj_attribute ramster_remote_target_nodenum_attr = {
	.attr = { .name = "remote_target_nodenum", .mode = 0644 },
	.show = ramster_remote_target_nodenum_show,
	.store = ramster_remote_target_nodenum_store,
};

#define RAMSTER_SYSFS_RO(_name) \
	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%lu\n", ramster_##_name); \
	} \
	static struct kobj_attribute ramster_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = ramster_##_name##_show, \
	}

#define RAMSTER_SYSFS_RW(_name) \
	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%lu\n", ramster_##_name); \
	} \
	static ssize_t ramster_##_name##_store(struct kobject *kobj, \
		struct kobj_attribute *attr, const char *buf, size_t count) \
	{ \
		int err; \
		unsigned long enable; \
		err = kstrtoul(buf, 10, &enable); \
		if (err) \
			return -EINVAL; \
		ramster_##_name = enable; \
		return count; \
	} \
	static struct kobj_attribute ramster_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0644 }, \
		.show = ramster_##_name##_show, \
		.store = ramster_##_name##_store, \
	}

#define RAMSTER_SYSFS_RO_ATOMIC(_name) \
	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%d\n", atomic_read(&ramster_##_name)); \
	} \
	static struct kobj_attribute ramster_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = ramster_##_name##_show, \
	}

RAMSTER_SYSFS_RO(interface_revision);
RAMSTER_SYSFS_RO_ATOMIC(remote_pers_pages);
RAMSTER_SYSFS_RW(pers_remotify_enable);
RAMSTER_SYSFS_RW(eph_remotify_enable);

static struct attribute *ramster_attrs[] = {
	&ramster_interface_revision_attr.attr,
	&ramster_remote_pers_pages_attr.attr,
	&ramster_manual_node_up_attr.attr,
	&ramster_remote_target_nodenum_attr.attr,
	&ramster_pers_remotify_enable_attr.attr,
	&ramster_eph_remotify_enable_attr.attr,
	NULL,
};

static struct attribute_group ramster_attr_group = {
	.attrs = ramster_attrs,
	.name = "ramster",
};

/*
 * frontswap selfshrinking
 */

/* In HZ, controls frequency of worker invocation. */
static unsigned int selfshrink_interval __read_mostly = 5;
/* Enable/disable with sysfs. */
static bool frontswap_selfshrinking __read_mostly;

static void selfshrink_process(struct work_struct *work);
static DECLARE_DELAYED_WORK(selfshrink_worker, selfshrink_process);

/* Enable/disable with kernel boot option. */
static bool use_frontswap_selfshrink __initdata = true;

/*
 * The default values for the following parameters were deemed reasonable
 * by experimentation, may be workload-dependent, and can all be
 * adjusted via sysfs.
 */

/* Control rate for frontswap shrinking. Higher hysteresis is slower. */
static unsigned int frontswap_hysteresis __read_mostly = 20;

/*
 * Number of selfshrink worker invocations to wait before observing that
 * frontswap selfshrinking should commence. Note that selfshrinking does
 * not use a separate worker thread.
 */
static unsigned int frontswap_inertia __read_mostly = 3;

/* Countdown to next invocation of frontswap_shrink() */
static unsigned long frontswap_inertia_counter;

/*
 * Invoked by the selfshrink worker thread, uses current number of pages
 * in frontswap (frontswap_curr_pages()), previous status, and control
 * values (hysteresis and inertia) to determine if frontswap should be
 * shrunk and what the new frontswap size should be. Note that
 * frontswap_shrink is essentially a partial swapoff that immediately
 * transfers pages from the "swap device" (frontswap) back into kernel
 * RAM; despite the name, frontswap "shrinking" is very different from
 * the "shrinker" interface used by the kernel MM subsystem to reclaim
 * memory.
 */
static void frontswap_selfshrink(void)
{
	static unsigned long cur_frontswap_pages;
	static unsigned long last_frontswap_pages;
	static unsigned long tgt_frontswap_pages;

	last_frontswap_pages = cur_frontswap_pages;
	cur_frontswap_pages = frontswap_curr_pages();
	if (!cur_frontswap_pages ||
			(cur_frontswap_pages > last_frontswap_pages)) {
		frontswap_inertia_counter = frontswap_inertia;
		return;
	}
	if (frontswap_inertia_counter && --frontswap_inertia_counter)
		return;
	if (cur_frontswap_pages <= frontswap_hysteresis)
		tgt_frontswap_pages = 0;
	else
		tgt_frontswap_pages = cur_frontswap_pages -
			(cur_frontswap_pages / frontswap_hysteresis);
	frontswap_shrink(tgt_frontswap_pages);
}

static int __init ramster_nofrontswap_selfshrink_setup(char *s)
{
	use_frontswap_selfshrink = false;
	return 1;
}

__setup("noselfshrink", ramster_nofrontswap_selfshrink_setup);

static void selfshrink_process(struct work_struct *work)
{
	if (frontswap_selfshrinking && frontswap_enabled) {
		frontswap_selfshrink();
		schedule_delayed_work(&selfshrink_worker,
			selfshrink_interval * HZ);
	}
}

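/*
 * Allocate (on cpu up) and free (on cpu down) the per-cpu page-sized staging
 * buffers used by ramster_remotify_pageframe(), and release any preloaded
 * flushlist_node when a cpu goes down.
 */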
void ramster_cpu_up(int cpu)
{
	unsigned char *p1 = kzalloc(PAGE_SIZE, GFP_KERNEL | __GFP_REPEAT);
	unsigned char *p2 = kzalloc(PAGE_SIZE, GFP_KERNEL | __GFP_REPEAT);
	BUG_ON(!p1 || !p2);
	per_cpu(ramster_remoteputmem1, cpu) = p1;
	per_cpu(ramster_remoteputmem2, cpu) = p2;
}

void ramster_cpu_down(int cpu)
{
	struct ramster_preload *kp;

	kfree(per_cpu(ramster_remoteputmem1, cpu));
	per_cpu(ramster_remoteputmem1, cpu) = NULL;
	kfree(per_cpu(ramster_remoteputmem2, cpu));
	per_cpu(ramster_remoteputmem2, cpu) = NULL;
	kp = &per_cpu(ramster_preloads, cpu);
	if (kp->flnode) {
		kmem_cache_free(ramster_flnode_cache, kp->flnode);
		kp->flnode = NULL;
	}
}

void ramster_register_pamops(struct tmem_pamops *pamops)
{
	pamops->free_obj = ramster_pampd_free_obj;
	pamops->new_obj = ramster_pampd_new_obj;
	pamops->replace_in_obj = ramster_pampd_replace_in_obj;
	pamops->is_remote = ramster_pampd_is_remote;
	pamops->repatriate = ramster_pampd_repatriate;
	pamops->repatriate_preload = ramster_pampd_repatriate_preload;
}

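/*
 * Initialize ramster: record which frontends are enabled, create the
 * debugfs/sysfs entries, register the r2net message handlers, create the
 * flushlist_node cache, optionally start frontswap selfshrinking, and kick
 * off the remotify worker.
 */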
void __init ramster_init(bool cleancache, bool frontswap,
				bool frontswap_exclusive_gets)
{
	int ret = 0;

	if (cleancache)
		use_cleancache = true;
	if (frontswap)
		use_frontswap = true;
	if (frontswap_exclusive_gets)
		use_frontswap_exclusive_gets = true;
	ramster_debugfs_init();
	ret = sysfs_create_group(mm_kobj, &ramster_attr_group);
	if (ret)
		pr_err("ramster: can't create sysfs for ramster\n");
	(void)r2net_register_handlers();
	INIT_LIST_HEAD(&ramster_rem_op_list);
	ramster_flnode_cache = kmem_cache_create("ramster_flnode",
				sizeof(struct flushlist_node), 0, 0, NULL);
	frontswap_selfshrinking = use_frontswap_selfshrink;
	if (frontswap_selfshrinking) {
		pr_info("ramster: Initializing frontswap selfshrink driver.\n");
		schedule_delayed_work(&selfshrink_worker,
				selfshrink_interval * HZ);
	}
	ramster_remotify_init();
}