/*
 * ramster.c
 *
 * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
 *
 * RAMster implements peer-to-peer transcendent memory, allowing a "cluster" of
 * kernels to dynamically pool their RAM so that a RAM-hungry workload on one
 * machine can temporarily and transparently utilize RAM on another machine
 * which is presumably idle or running a non-RAM-hungry workload.
 *
 * RAMster combines a clustering and messaging foundation based on the ocfs2
 * cluster layer with the in-kernel compression implementation of zcache, and
 * adds code to glue them together.  When a page is "put" to RAMster, it is
 * compressed and stored locally.  Periodically, a thread will "remotify" these
 * pages by sending them via messages to a remote machine.  When the page is
 * later needed as indicated by a page fault, a "get" is issued.  If the data
 * is local, it is uncompressed and the fault is resolved.  If the data is
 * remote, a message is sent to fetch the data and the faulting thread sleeps;
 * when the data arrives, the thread awakens, the data is decompressed and
 * the fault is resolved.
 *
 * As of V5, clusters up to eight nodes are supported; each node can remotify
 * pages to one specified node, so clusters can be configured as clients to
 * a "memory server".  Some simple policy is in place that will need to be
 * refined over time.  Larger clusters and fault-resistant protocols can also
 * be added over time.
 */
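
/*
 * Illustrative only (the sysfs paths assume the attribute group defined
 * below is created with name "ramster" under mm_kobj): once the r2net
 * cluster layer has been configured from userspace, a node might be
 * brought up and chosen as the remotification target roughly like this:
 *
 *	echo 1 > /sys/kernel/mm/ramster/manual_node_up
 *	echo 1 > /sys/kernel/mm/ramster/remote_target_nodenum
 */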

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/list.h>
#include <linux/lzo.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/frontswap.h>
#include "../tmem.h"
#include "../zbud.h"
#include "../zcache.h"
#include "ramster.h"
#include "ramster_nodemanager.h"
#include "tcp.h"

#define RAMSTER_TESTING

#ifndef CONFIG_SYSFS
#error "ramster needs sysfs to define cluster nodes to use"
#endif

static bool use_cleancache __read_mostly;
static bool use_frontswap __read_mostly;
static bool use_frontswap_exclusive_gets __read_mostly;

/* These must be sysfs not debugfs as they are checked/used by userland!! */
static unsigned long ramster_interface_revision __read_mostly =
	R2NM_API_VERSION; /* interface revision must match userspace! */
static unsigned long ramster_pers_remotify_enable __read_mostly;
static unsigned long ramster_eph_remotify_enable __read_mostly;
static atomic_t ramster_remote_pers_pages = ATOMIC_INIT(0);
#define MANUAL_NODES 8
static bool ramster_nodes_manual_up[MANUAL_NODES] __read_mostly;
static int ramster_remote_target_nodenum __read_mostly = -1;

/* these counters are made available via debugfs */
static long ramster_flnodes;
static atomic_t ramster_flnodes_atomic = ATOMIC_INIT(0);
static unsigned long ramster_flnodes_max;

static inline void inc_ramster_flnodes(void)
{
	ramster_flnodes = atomic_inc_return(&ramster_flnodes_atomic);
	if (ramster_flnodes > ramster_flnodes_max)
		ramster_flnodes_max = ramster_flnodes;
}

static inline void dec_ramster_flnodes(void)
{
	ramster_flnodes = atomic_dec_return(&ramster_flnodes_atomic);
}
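
/*
 * Note on the counter pattern above (repeated below for "foreign" pages):
 * the atomic_t is what inc/dec actually operate on, while the plain
 * long/ssize_t shadows its value so the counter and its high-water mark
 * can be exposed through debugfs_create_size_t below.  The shadow copy
 * and the max update are racy; that is acceptable for statistics.
 */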

static ssize_t ramster_foreign_eph_pages;
static atomic_t ramster_foreign_eph_pages_atomic = ATOMIC_INIT(0);
static ssize_t ramster_foreign_eph_pages_max;

static inline void inc_ramster_foreign_eph_pages(void)
{
	ramster_foreign_eph_pages = atomic_inc_return(
		&ramster_foreign_eph_pages_atomic);
	if (ramster_foreign_eph_pages > ramster_foreign_eph_pages_max)
		ramster_foreign_eph_pages_max = ramster_foreign_eph_pages;
}

static inline void dec_ramster_foreign_eph_pages(void)
{
	ramster_foreign_eph_pages = atomic_dec_return(
		&ramster_foreign_eph_pages_atomic);
}

static ssize_t ramster_foreign_pers_pages;
static atomic_t ramster_foreign_pers_pages_atomic = ATOMIC_INIT(0);
static ssize_t ramster_foreign_pers_pages_max;

static inline void inc_ramster_foreign_pers_pages(void)
{
	ramster_foreign_pers_pages = atomic_inc_return(
		&ramster_foreign_pers_pages_atomic);
	if (ramster_foreign_pers_pages > ramster_foreign_pers_pages_max)
		ramster_foreign_pers_pages_max = ramster_foreign_pers_pages;
}

static inline void dec_ramster_foreign_pers_pages(void)
{
	ramster_foreign_pers_pages = atomic_dec_return(
		&ramster_foreign_pers_pages_atomic);
}

static ssize_t ramster_eph_pages_remoted;
static ssize_t ramster_pers_pages_remoted;
static ssize_t ramster_eph_pages_remote_failed;
static ssize_t ramster_pers_pages_remote_failed;
static ssize_t ramster_remote_eph_pages_succ_get;
static ssize_t ramster_remote_pers_pages_succ_get;
static ssize_t ramster_remote_eph_pages_unsucc_get;
static ssize_t ramster_remote_pers_pages_unsucc_get;
static ssize_t ramster_pers_pages_remote_nomem;
static ssize_t ramster_remote_objects_flushed;
static ssize_t ramster_remote_object_flushes_failed;
static ssize_t ramster_remote_pages_flushed;
static ssize_t ramster_remote_page_flushes_failed;
/* FIXME frontswap selfshrinking knobs in debugfs? */

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#define	zdfs	debugfs_create_size_t
#define zdfs64	debugfs_create_u64

static int __init ramster_debugfs_init(void)
{
	struct dentry *root = debugfs_create_dir("ramster", NULL);

	if (root == NULL)
		return -ENXIO;

	zdfs("eph_pages_remoted", S_IRUGO, root, &ramster_eph_pages_remoted);
	zdfs("pers_pages_remoted", S_IRUGO, root, &ramster_pers_pages_remoted);
	zdfs("eph_pages_remote_failed", S_IRUGO, root,
		&ramster_eph_pages_remote_failed);
	zdfs("pers_pages_remote_failed", S_IRUGO, root,
		&ramster_pers_pages_remote_failed);
	zdfs("remote_eph_pages_succ_get", S_IRUGO, root,
		&ramster_remote_eph_pages_succ_get);
	zdfs("remote_pers_pages_succ_get", S_IRUGO, root,
		&ramster_remote_pers_pages_succ_get);
	zdfs("remote_eph_pages_unsucc_get", S_IRUGO, root,
		&ramster_remote_eph_pages_unsucc_get);
	zdfs("remote_pers_pages_unsucc_get", S_IRUGO, root,
		&ramster_remote_pers_pages_unsucc_get);
	zdfs("pers_pages_remote_nomem", S_IRUGO, root,
		&ramster_pers_pages_remote_nomem);
	zdfs("remote_objects_flushed", S_IRUGO, root,
		&ramster_remote_objects_flushed);
	zdfs("remote_pages_flushed", S_IRUGO, root,
		&ramster_remote_pages_flushed);
	zdfs("remote_object_flushes_failed", S_IRUGO, root,
		&ramster_remote_object_flushes_failed);
	zdfs("remote_page_flushes_failed", S_IRUGO, root,
		&ramster_remote_page_flushes_failed);
	zdfs("foreign_eph_pages", S_IRUGO, root,
		&ramster_foreign_eph_pages);
	zdfs("foreign_eph_pages_max", S_IRUGO, root,
		&ramster_foreign_eph_pages_max);
	zdfs("foreign_pers_pages", S_IRUGO, root,
		&ramster_foreign_pers_pages);
	zdfs("foreign_pers_pages_max", S_IRUGO, root,
		&ramster_foreign_pers_pages_max);
	return 0;
}
#else
static inline int ramster_debugfs_init(void)
{
	return 0;
}
#endif

static LIST_HEAD(ramster_rem_op_list);
static DEFINE_SPINLOCK(ramster_rem_op_list_lock);
static DEFINE_PER_CPU(struct ramster_preload, ramster_preloads);

static DEFINE_PER_CPU(unsigned char *, ramster_remoteputmem1);
static DEFINE_PER_CPU(unsigned char *, ramster_remoteputmem2);

static struct kmem_cache *ramster_flnode_cache __read_mostly;

static struct flushlist_node *ramster_flnode_alloc(struct tmem_pool *pool)
{
	struct flushlist_node *flnode = NULL;
	struct ramster_preload *kp;

	kp = &__get_cpu_var(ramster_preloads);
	flnode = kp->flnode;
	BUG_ON(flnode == NULL);
	kp->flnode = NULL;
	inc_ramster_flnodes();
	return flnode;
}

/* the "flush list" asynchronously collects pages to remotely flush */
#define FLUSH_ENTIRE_OBJECT ((uint32_t)-1)

static void ramster_flnode_free(struct flushlist_node *flnode,
				struct tmem_pool *pool)
{
	dec_ramster_flnodes();
	BUG_ON(ramster_flnodes < 0);
	kmem_cache_free(ramster_flnode_cache, flnode);
}

int ramster_do_preload_flnode(struct tmem_pool *pool)
{
	struct ramster_preload *kp;
	struct flushlist_node *flnode;
	int ret = -ENOMEM;

	BUG_ON(!irqs_disabled());
	if (unlikely(ramster_flnode_cache == NULL))
		BUG();
	kp = &__get_cpu_var(ramster_preloads);
	flnode = kmem_cache_alloc(ramster_flnode_cache, GFP_ATOMIC);
	if (unlikely(flnode == NULL) && kp->flnode == NULL)
		BUG(); /* FIXME handle more gracefully, but how??? */
	else if (kp->flnode == NULL)
		kp->flnode = flnode;
	else
		kmem_cache_free(ramster_flnode_cache, flnode);
	ret = 0;
	return ret;
}
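
/*
 * Preload pattern: ramster_do_preload_flnode() is called with irqs
 * disabled and parks one flushlist_node in the per-cpu ramster_preloads
 * slot, so that a later ramster_flnode_alloc() from atomic context can
 * simply consume the preallocated node instead of risking an allocation
 * failure it could not handle.
 */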

/*
 * Called by the message handler after a (still compressed) page has been
 * fetched from the remote machine in response to an "is_remote" tmem_get
 * or persistent tmem_localify.  For a tmem_get, "extra" is the address of
 * the page that is to be filled to successfully resolve the tmem_get; for
 * a (persistent) tmem_localify, "extra" is NULL (as the data is placed only
 * in the local zcache).  "data" points to "size" bytes of (compressed) data
 * passed in the message.  In the case of a persistent remote get, if
 * pre-allocation was successful (see ramster_repatriate_preload), the page
 * is placed into both local zcache and at "extra".
 */
int ramster_localify(int pool_id, struct tmem_oid *oidp, uint32_t index,
			char *data, unsigned int size, void *extra)
{
	int ret = -ENOENT;
	unsigned long flags;
	struct tmem_pool *pool;
	bool eph, delete = false;
	void *pampd, *saved_hb;
	struct tmem_obj *obj;

	pool = zcache_get_pool_by_id(LOCAL_CLIENT, pool_id);
	if (unlikely(pool == NULL))
		/* pool doesn't exist anymore */
		goto out;
	eph = is_ephemeral(pool);
	local_irq_save(flags); /* FIXME: maybe only disable softirqs? */
	pampd = tmem_localify_get_pampd(pool, oidp, index, &obj, &saved_hb);
	if (pampd == NULL) {
		/* hmmm... must have been a flush while waiting */
#ifdef RAMSTER_TESTING
		pr_err("UNTESTED pampd==NULL in ramster_localify\n");
#endif
		if (eph)
			ramster_remote_eph_pages_unsucc_get++;
		else
			ramster_remote_pers_pages_unsucc_get++;
		obj = NULL;
		goto finish;
	} else if (unlikely(!pampd_is_remote(pampd))) {
		/* hmmm... must have been a dup put while waiting */
#ifdef RAMSTER_TESTING
		pr_err("UNTESTED dup while waiting in ramster_localify\n");
#endif
		if (eph)
			ramster_remote_eph_pages_unsucc_get++;
		else
			ramster_remote_pers_pages_unsucc_get++;
		obj = NULL;
		pampd = NULL;
		ret = -EEXIST;
		goto finish;
	} else if (size == 0) {
		/* no remote data, delete the local is_remote pampd */
		pampd = NULL;
		if (eph)
			ramster_remote_eph_pages_unsucc_get++;
		else
			BUG();
		delete = true;
		goto finish;
	}
	if (pampd_is_intransit(pampd)) {
		/*
		 * a pampd is marked intransit if it is remote and space has
		 * been allocated for it locally (note, only happens for
		 * persistent pages, in which case the remote copy is freed)
		 */
		BUG_ON(eph);
		pampd = pampd_mask_intransit_and_remote(pampd);
		zbud_copy_to_zbud(pampd, data, size);
		/*
		 * setting pampd to NULL tells tmem_localify_finish to leave
		 * pampd alone... meaning it is left pointing to the
		 * local copy filled in just above
		 */
		pampd = NULL;
	}
	/*
	 * but in all cases, we decompress direct-to-memory to complete
	 * the remotify and return success
	 */
	BUG_ON(extra == NULL);
	zcache_decompress_to_page(data, size, (struct page *)extra);
	if (eph)
		ramster_remote_eph_pages_succ_get++;
	else
		ramster_remote_pers_pages_succ_get++;
	ret = 0;
finish:
	tmem_localify_finish(obj, index, pampd, saved_hb, delete);
	zcache_put_pool(pool);
	local_irq_restore(flags);
out:
	return ret;
}

void ramster_pampd_new_obj(struct tmem_obj *obj)
{
	obj->extra = NULL;
}

void ramster_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj,
				bool pool_destroy)
{
	struct flushlist_node *flnode;

	BUG_ON(preemptible());
	if (obj->extra == NULL)
		return;
	if (pool_destroy && is_ephemeral(pool))
		/* FIXME don't bother with remote eph data for now */
		return;
	BUG_ON(!pampd_is_remote(obj->extra));
	flnode = ramster_flnode_alloc(pool);
	flnode->xh.client_id = pampd_remote_node(obj->extra);
	flnode->xh.pool_id = pool->pool_id;
	flnode->xh.oid = obj->oid;
	flnode->xh.index = FLUSH_ENTIRE_OBJECT;
	flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_OBJ;
	spin_lock(&ramster_rem_op_list_lock);
	list_add(&flnode->rem_op.list, &ramster_rem_op_list);
	spin_unlock(&ramster_rem_op_list_lock);
}
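
/*
 * Note that free_obj runs in atomic context (see the BUG_ON(preemptible())
 * above), so the remote flush cannot be issued inline; it is queued on
 * ramster_rem_op_list here and sent later by the remotify worker, using a
 * flushlist_node preallocated via ramster_do_preload_flnode().
 */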

/*
 * Called on a remote persistent tmem_get to attempt to preallocate
 * local storage for the data contained in the remote persistent page.
 * If successfully preallocated, returns the pampd, marked as remote and
 * in_transit.  Else returns NULL.  Note that the appropriate tmem data
 * structure must be locked.
 */
void *ramster_pampd_repatriate_preload(void *pampd, struct tmem_pool *pool,
					struct tmem_oid *oidp, uint32_t index,
					bool *intransit)
{
	int clen = pampd_remote_size(pampd), c;
	void *ret_pampd = NULL;
	unsigned long flags;
	struct tmem_handle th;

	BUG_ON(!pampd_is_remote(pampd));
	BUG_ON(is_ephemeral(pool));
	if (use_frontswap_exclusive_gets)
		/* don't need local storage */
		goto out;
	if (pampd_is_intransit(pampd)) {
		/*
		 * to avoid multiple allocations (and maybe a memory leak)
		 * don't preallocate if already in the process of being
		 * repatriated
		 */
		*intransit = true;
		goto out;
	}
	*intransit = false;
	local_irq_save(flags);
	th.client_id = pampd_remote_node(pampd);
	th.pool_id = pool->pool_id;
	th.oid = *oidp;
	th.index = index;
	ret_pampd = zcache_pampd_create(NULL, clen, true, false, &th);
	if (ret_pampd != NULL) {
		/*
		 * a pampd is marked intransit if it is remote and space has
		 * been allocated for it locally (note, only happens for
		 * persistent pages, in which case the remote copy is freed)
		 */
		ret_pampd = pampd_mark_intransit(ret_pampd);
		c = atomic_dec_return(&ramster_remote_pers_pages);
		WARN_ON_ONCE(c < 0);
	} else {
		ramster_pers_pages_remote_nomem++;
	}
	local_irq_restore(flags);
out:
	return ret_pampd;
}

/*
 * Called on a remote tmem_get to invoke a message to fetch the page.
 * Might sleep so no tmem locks can be held.  "extra" is passed
 * all the way through the round-trip messaging to ramster_localify.
 */
int ramster_pampd_repatriate(void *fake_pampd, void *real_pampd,
				struct tmem_pool *pool,
				struct tmem_oid *oid, uint32_t index,
				bool free, void *extra)
{
	struct tmem_xhandle xh;
	int ret;

	if (pampd_is_intransit(real_pampd))
		/* have local space pre-reserved, so free remote copy */
		free = true;
	xh = tmem_xhandle_fill(LOCAL_CLIENT, pool, oid, index);
	/* unreliable request/response for now */
	ret = r2net_remote_async_get(&xh, free,
					pampd_remote_node(fake_pampd),
					pampd_remote_size(fake_pampd),
					pampd_remote_cksum(fake_pampd),
					extra);
	return ret;
}

bool ramster_pampd_is_remote(void *pampd)
{
	return pampd_is_remote(pampd);
}

int ramster_pampd_replace_in_obj(void *new_pampd, struct tmem_obj *obj)
{
	int ret = -1;

	if (new_pampd != NULL) {
		if (obj->extra == NULL)
			obj->extra = new_pampd;
		/* enforce that all remote pages in an object reside
		 * in the same node! */
		else if (pampd_remote_node(new_pampd) !=
				pampd_remote_node((void *)(obj->extra)))
			BUG();
		ret = 0;
	}
	return ret;
}

void *ramster_pampd_free(void *pampd, struct tmem_pool *pool,
			 struct tmem_oid *oid, uint32_t index, bool acct)
{
	bool eph = is_ephemeral(pool);
	void *local_pampd = NULL;
	int c;

	BUG_ON(preemptible());
	BUG_ON(!pampd_is_remote(pampd));
	WARN_ON(acct == false);
	if (oid == NULL) {
		/*
		 * a NULL oid means to ignore this pampd free
		 * as the remote freeing will be handled elsewhere
		 */
		/* FIXME remote flush optional but probably good idea */
	} else if (pampd_is_intransit(pampd)) {
		/* did a pers remote get_and_free, so just free local */
		local_pampd = pampd_mask_intransit_and_remote(pampd);
	} else {
		struct flushlist_node *flnode =
			ramster_flnode_alloc(pool);

		flnode->xh.client_id = pampd_remote_node(pampd);
		flnode->xh.pool_id = pool->pool_id;
		flnode->xh.oid = *oid;
		flnode->xh.index = index;
		flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_PAGE;
		spin_lock(&ramster_rem_op_list_lock);
		list_add(&flnode->rem_op.list, &ramster_rem_op_list);
		spin_unlock(&ramster_rem_op_list_lock);
		if (!eph) {
			c = atomic_dec_return(&ramster_remote_pers_pages);
			WARN_ON_ONCE(c < 0);
		}
	}
	return local_pampd;
}

void ramster_count_foreign_pages(bool eph, int count)
{
	BUG_ON(count != 1 && count != -1);
	if (eph) {
		if (count > 0) {
			inc_ramster_foreign_eph_pages();
		} else {
			dec_ramster_foreign_eph_pages();
			WARN_ON_ONCE(ramster_foreign_eph_pages < 0);
		}
	} else {
		if (count > 0) {
			inc_ramster_foreign_pers_pages();
		} else {
			dec_ramster_foreign_pers_pages();
			WARN_ON_ONCE(ramster_foreign_pers_pages < 0);
		}
	}
}

/*
 * For now, just push over a few pages every few seconds to
 * ensure that it basically works
 */
static struct workqueue_struct *ramster_remotify_workqueue;
static void ramster_remotify_process(struct work_struct *work);
static DECLARE_DELAYED_WORK(ramster_remotify_worker,
		ramster_remotify_process);

static void ramster_remotify_queue_delayed_work(unsigned long delay)
{
	if (!queue_delayed_work(ramster_remotify_workqueue,
				&ramster_remotify_worker, delay))
		pr_err("ramster_remotify: bad workqueue\n");
}

static void ramster_remote_flush_page(struct flushlist_node *flnode)
{
	struct tmem_xhandle *xh;
	int remotenode, ret;

	preempt_disable();
	xh = &flnode->xh;
	remotenode = flnode->xh.client_id;
	ret = r2net_remote_flush(xh, remotenode);
	if (ret >= 0)
		ramster_remote_pages_flushed++;
	else
		ramster_remote_page_flushes_failed++;
	preempt_enable_no_resched();
	ramster_flnode_free(flnode, NULL);
}

static void ramster_remote_flush_object(struct flushlist_node *flnode)
{
	struct tmem_xhandle *xh;
	int remotenode, ret;

	preempt_disable();
	xh = &flnode->xh;
	remotenode = flnode->xh.client_id;
	ret = r2net_remote_flush_object(xh, remotenode);
	if (ret >= 0)
		ramster_remote_objects_flushed++;
	else
		ramster_remote_object_flushes_failed++;
	preempt_enable_no_resched();
	ramster_flnode_free(flnode, NULL);
}

int ramster_remotify_pageframe(bool eph)
{
	struct tmem_xhandle xh;
	unsigned int size;
	int remotenode, ret, zbuds;
	struct tmem_pool *pool;
	unsigned long flags;
	unsigned char cksum;
	unsigned char *p;
	int i, j;
	unsigned char *tmpmem[2];
	struct tmem_handle th[2];
	unsigned int zsize[2];

	tmpmem[0] = __get_cpu_var(ramster_remoteputmem1);
	tmpmem[1] = __get_cpu_var(ramster_remoteputmem2);
	zbuds = zbud_make_zombie_lru(&th[0], &tmpmem[0], &zsize[0], eph);
	/* now OK to release lock set in caller */
	if (zbuds == 0)
		goto out;
	BUG_ON(zbuds > 2);
	for (i = 0; i < zbuds; i++) {
		xh.client_id = th[i].client_id;
		xh.pool_id = th[i].pool_id;
		xh.oid = th[i].oid;
		xh.index = th[i].index;
		size = zsize[i];
		BUG_ON(size == 0 || size > zbud_max_buddy_size());
		for (p = tmpmem[i], cksum = 0, j = 0; j < size; j++)
			cksum += *p++;
		ret = r2net_remote_put(&xh, tmpmem[i], size, eph, &remotenode);
		if (ret != 0) {
			/*
			 * This is some form of a memory leak... if the remote
			 * put fails, there will never be another attempt to
			 * remotify this page.  But since we've dropped the zv
			 * pointer, the page may have been freed or the data
			 * replaced so we can't just "put it back" in the
			 * remote op list.  Even if we could, not sure where to
			 * put it in the list because there may be flushes that
			 * must be strictly ordered vs the put.  So leave this
			 * as a FIXME for now.  But count them so we know if it
			 * becomes a problem.
			 */
			if (eph)
				ramster_eph_pages_remote_failed++;
			else
				ramster_pers_pages_remote_failed++;
			break;
		}
		if (!eph)
			atomic_inc(&ramster_remote_pers_pages);
		if (eph)
			ramster_eph_pages_remoted++;
		else
			ramster_pers_pages_remoted++;
		/*
		 * data was successfully remoted so change the local version to
		 * point to the remote node where it landed
		 */
		pool = zcache_get_pool_by_id(LOCAL_CLIENT, xh.pool_id);
		local_irq_save(flags);
		(void)tmem_replace(pool, &xh.oid, xh.index,
				pampd_make_remote(remotenode, size, cksum));
		local_irq_restore(flags);
		zcache_put_pool(pool);
	}
out:
	return 0;
}

static void zcache_do_remotify_flushes(void)
{
	struct ramster_remotify_hdr *rem_op;
	union remotify_list_node *u;

	spin_lock(&ramster_rem_op_list_lock);
	if (list_empty(&ramster_rem_op_list)) {
		spin_unlock(&ramster_rem_op_list_lock);
		return;
	}
	rem_op = list_first_entry(&ramster_rem_op_list,
			struct ramster_remotify_hdr, list);
	list_del_init(&rem_op->list);
	spin_unlock(&ramster_rem_op_list_lock);
	u = (union remotify_list_node *)rem_op;
	switch (rem_op->op) {
	case RAMSTER_REMOTIFY_FLUSH_PAGE:
		ramster_remote_flush_page((struct flushlist_node *)u);
		break;
	case RAMSTER_REMOTIFY_FLUSH_OBJ:
		ramster_remote_flush_object((struct flushlist_node *)u);
		break;
	default:
		BUG();
	}
}

static void ramster_remotify_process(struct work_struct *work)
{
	static bool remotify_in_progress;
	int i;

	BUG_ON(irqs_disabled());
	if (remotify_in_progress)
		goto requeue;
	if (ramster_remote_target_nodenum == -1)
		goto requeue;
	remotify_in_progress = true;
	if (use_cleancache && ramster_eph_remotify_enable) {
		for (i = 0; i < 100; i++) {
			zcache_do_remotify_flushes();
			(void)ramster_remotify_pageframe(true);
		}
	}
	if (use_frontswap && ramster_pers_remotify_enable) {
		for (i = 0; i < 100; i++) {
			zcache_do_remotify_flushes();
			(void)ramster_remotify_pageframe(false);
		}
	}
	remotify_in_progress = false;
requeue:
	ramster_remotify_queue_delayed_work(HZ);
}
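
/*
 * The bare remotify_in_progress flag above is not protected by any lock;
 * it relies on ramster_remotify_workqueue being created as a
 * singlethreaded workqueue (see ramster_remotify_init below), so at most
 * one instance of ramster_remotify_process can run at a time.
 */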

void __init ramster_remotify_init(void)
{
	unsigned long n = 60UL;

	ramster_remotify_workqueue =
		create_singlethread_workqueue("ramster_remotify");
	ramster_remotify_queue_delayed_work(n * HZ);
}

static ssize_t ramster_manual_node_up_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	int i;
	char *p = buf;

	for (i = 0; i < MANUAL_NODES; i++)
		if (ramster_nodes_manual_up[i])
			p += sprintf(p, "%d ", i);
	p += sprintf(p, "\n");
	return p - buf;
}

static ssize_t ramster_manual_node_up_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	int err;
	unsigned long node_num;

	err = kstrtoul(buf, 10, &node_num);
	if (err) {
		pr_err("ramster: bad strtoul?\n");
		return -EINVAL;
	}
	if (node_num >= MANUAL_NODES) {
		pr_err("ramster: bad node_num=%lu?\n", node_num);
		return -EINVAL;
	}
	if (ramster_nodes_manual_up[node_num]) {
		pr_err("ramster: node %d already up, ignoring\n",
						(int)node_num);
	} else {
		ramster_nodes_manual_up[node_num] = true;
		r2net_hb_node_up_manual((int)node_num);
	}
	return count;
}

static struct kobj_attribute ramster_manual_node_up_attr = {
	.attr = { .name = "manual_node_up", .mode = 0644 },
	.show = ramster_manual_node_up_show,
	.store = ramster_manual_node_up_store,
};

static ssize_t ramster_remote_target_nodenum_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	if (ramster_remote_target_nodenum == -1UL)
		return sprintf(buf, "unset\n");
	else
		return sprintf(buf, "%d\n", ramster_remote_target_nodenum);
}

static ssize_t ramster_remote_target_nodenum_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	int err;
	unsigned long node_num;

	err = kstrtoul(buf, 10, &node_num);
	if (err) {
		pr_err("ramster: bad strtoul?\n");
		return -EINVAL;
	} else if (node_num == -1UL) {
		pr_err("ramster: disabling all remotification, "
			"data may still reside on remote nodes however\n");
		return -EINVAL;
	} else if (node_num >= MANUAL_NODES) {
		pr_err("ramster: bad node_num=%lu?\n", node_num);
		return -EINVAL;
	} else if (!ramster_nodes_manual_up[node_num]) {
		pr_err("ramster: node %d not up, ignoring setting "
			"of remotification target\n", (int)node_num);
	} else if (r2net_remote_target_node_set((int)node_num) >= 0) {
		pr_info("ramster: node %d set as remotification target\n",
				(int)node_num);
		ramster_remote_target_nodenum = (int)node_num;
	} else {
		pr_err("ramster: bad num to node node_num=%d?\n",
				(int)node_num);
		return -EINVAL;
	}
	return count;
}

static struct kobj_attribute ramster_remote_target_nodenum_attr = {
	.attr = { .name = "remote_target_nodenum", .mode = 0644 },
	.show = ramster_remote_target_nodenum_show,
	.store = ramster_remote_target_nodenum_store,
};

#define RAMSTER_SYSFS_RO(_name) \
	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%lu\n", ramster_##_name); \
	} \
	static struct kobj_attribute ramster_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = ramster_##_name##_show, \
	}

#define RAMSTER_SYSFS_RW(_name) \
	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%lu\n", ramster_##_name); \
	} \
	static ssize_t ramster_##_name##_store(struct kobject *kobj, \
		struct kobj_attribute *attr, const char *buf, size_t count) \
	{ \
		int err; \
		unsigned long enable; \
		err = kstrtoul(buf, 10, &enable); \
		if (err) \
			return -EINVAL; \
		ramster_##_name = enable; \
		return count; \
	} \
	static struct kobj_attribute ramster_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0644 }, \
		.show = ramster_##_name##_show, \
		.store = ramster_##_name##_store, \
	}

#define RAMSTER_SYSFS_RO_ATOMIC(_name) \
	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%d\n", atomic_read(&ramster_##_name)); \
	} \
	static struct kobj_attribute ramster_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = ramster_##_name##_show, \
	}
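
/*
 * Illustrative only (paths assume the attribute group below is created
 * with name "ramster" under mm_kobj in ramster_init): the RW knobs
 * generated by RAMSTER_SYSFS_RW gate the remotify worker from userspace,
 * e.g.:
 *
 *	echo 1 > /sys/kernel/mm/ramster/eph_remotify_enable
 *	echo 1 > /sys/kernel/mm/ramster/pers_remotify_enable
 */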

RAMSTER_SYSFS_RO(interface_revision);
RAMSTER_SYSFS_RO_ATOMIC(remote_pers_pages);
RAMSTER_SYSFS_RW(pers_remotify_enable);
RAMSTER_SYSFS_RW(eph_remotify_enable);

static struct attribute *ramster_attrs[] = {
	&ramster_interface_revision_attr.attr,
	&ramster_remote_pers_pages_attr.attr,
	&ramster_manual_node_up_attr.attr,
	&ramster_remote_target_nodenum_attr.attr,
	&ramster_pers_remotify_enable_attr.attr,
	&ramster_eph_remotify_enable_attr.attr,
	NULL,
};

static struct attribute_group ramster_attr_group = {
	.attrs = ramster_attrs,
	.name = "ramster",
};

/*
 * frontswap selfshrinking
 */

/* In HZ, controls frequency of worker invocation. */
static unsigned int selfshrink_interval __read_mostly = 5;
/* Enable/disable with sysfs. */
static bool frontswap_selfshrinking __read_mostly;

static void selfshrink_process(struct work_struct *work);
static DECLARE_DELAYED_WORK(selfshrink_worker, selfshrink_process);

/* Enable/disable with kernel boot option. */
static bool use_frontswap_selfshrink __initdata = true;

/*
 * The default values for the following parameters were deemed reasonable
 * by experimentation, may be workload-dependent, and can all be
 * adjusted via sysfs.
 */

/* Control rate for frontswap shrinking. Higher hysteresis is slower. */
static unsigned int frontswap_hysteresis __read_mostly = 20;

/*
 * Number of selfshrink worker invocations to wait before observing that
 * frontswap selfshrinking should commence.  Note that selfshrinking does
 * not use a separate worker thread.
 */
static unsigned int frontswap_inertia __read_mostly = 3;

/* Countdown to next invocation of frontswap_shrink() */
static unsigned long frontswap_inertia_counter;

/*
 * Invoked by the selfshrink worker thread, uses current number of pages
 * in frontswap (frontswap_curr_pages()), previous status, and control
 * values (hysteresis and inertia) to determine if frontswap should be
 * shrunk and what the new frontswap size should be.  Note that
 * frontswap_shrink is essentially a partial swapoff that immediately
 * transfers pages from the "swap device" (frontswap) back into kernel
 * RAM; despite the name, frontswap "shrinking" is very different from
 * the "shrinker" interface used by the kernel MM subsystem to reclaim
 * memory.
 */
static void frontswap_selfshrink(void)
{
	static unsigned long cur_frontswap_pages;
	static unsigned long last_frontswap_pages;
	static unsigned long tgt_frontswap_pages;

	last_frontswap_pages = cur_frontswap_pages;
	cur_frontswap_pages = frontswap_curr_pages();
	if (!cur_frontswap_pages ||
			(cur_frontswap_pages > last_frontswap_pages)) {
		frontswap_inertia_counter = frontswap_inertia;
		return;
	}
	if (frontswap_inertia_counter && --frontswap_inertia_counter)
		return;
	if (cur_frontswap_pages <= frontswap_hysteresis)
		tgt_frontswap_pages = 0;
	else
		tgt_frontswap_pages = cur_frontswap_pages -
			(cur_frontswap_pages / frontswap_hysteresis);
	frontswap_shrink(tgt_frontswap_pages);
}
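
/*
 * Worked example: with the default frontswap_hysteresis of 20, a stable
 * or shrinking frontswap population of 10000 pages yields
 * tgt_frontswap_pages = 10000 - (10000 / 20) = 9500, i.e. roughly 5% is
 * reclaimed per pass, once per selfshrink_interval seconds after the
 * inertia countdown has expired.
 */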

static int __init ramster_nofrontswap_selfshrink_setup(char *s)
{
	use_frontswap_selfshrink = false;
	return 1;
}

__setup("noselfshrink", ramster_nofrontswap_selfshrink_setup);

static void selfshrink_process(struct work_struct *work)
{
	if (frontswap_selfshrinking && frontswap_enabled) {
		frontswap_selfshrink();
		schedule_delayed_work(&selfshrink_worker,
			selfshrink_interval * HZ);
	}
}

void ramster_cpu_up(int cpu)
{
	unsigned char *p1 = kzalloc(PAGE_SIZE, GFP_KERNEL | __GFP_REPEAT);
	unsigned char *p2 = kzalloc(PAGE_SIZE, GFP_KERNEL | __GFP_REPEAT);

	BUG_ON(!p1 || !p2);
	per_cpu(ramster_remoteputmem1, cpu) = p1;
	per_cpu(ramster_remoteputmem2, cpu) = p2;
}

void ramster_cpu_down(int cpu)
{
	struct ramster_preload *kp;

	kfree(per_cpu(ramster_remoteputmem1, cpu));
	per_cpu(ramster_remoteputmem1, cpu) = NULL;
	kfree(per_cpu(ramster_remoteputmem2, cpu));
	per_cpu(ramster_remoteputmem2, cpu) = NULL;
	kp = &per_cpu(ramster_preloads, cpu);
	if (kp->flnode) {
		kmem_cache_free(ramster_flnode_cache, kp->flnode);
		kp->flnode = NULL;
	}
}

void ramster_register_pamops(struct tmem_pamops *pamops)
{
	pamops->free_obj = ramster_pampd_free_obj;
	pamops->new_obj = ramster_pampd_new_obj;
	pamops->replace_in_obj = ramster_pampd_replace_in_obj;
	pamops->is_remote = ramster_pampd_is_remote;
	pamops->repatriate = ramster_pampd_repatriate;
	pamops->repatriate_preload = ramster_pampd_repatriate_preload;
}

void __init ramster_init(bool cleancache, bool frontswap,
				bool frontswap_exclusive_gets)
{
	int ret = 0;

	if (cleancache)
		use_cleancache = true;
	if (frontswap)
		use_frontswap = true;
	if (frontswap_exclusive_gets)
		use_frontswap_exclusive_gets = true;
	ramster_debugfs_init();
	ret = sysfs_create_group(mm_kobj, &ramster_attr_group);
	if (ret)
		pr_err("ramster: can't create sysfs for ramster\n");
	(void)r2net_register_handlers();
	INIT_LIST_HEAD(&ramster_rem_op_list);
	ramster_flnode_cache = kmem_cache_create("ramster_flnode",
				sizeof(struct flushlist_node), 0, 0, NULL);
	frontswap_selfshrinking = use_frontswap_selfshrink;
	if (frontswap_selfshrinking) {
		pr_info("ramster: Initializing frontswap selfshrink driver.\n");
		schedule_delayed_work(&selfshrink_worker,
				selfshrink_interval * HZ);
	}
	ramster_remotify_init();
}