staging: ramster: Provide accessory functions for counter increase
drivers/staging/zcache/ramster/ramster.c
/*
 * ramster.c
 *
 * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
 *
 * RAMster implements peer-to-peer transcendent memory, allowing a "cluster" of
 * kernels to dynamically pool their RAM so that a RAM-hungry workload on one
 * machine can temporarily and transparently utilize RAM on another machine
 * which is presumably idle or running a non-RAM-hungry workload.
 *
 * RAMster combines a clustering and messaging foundation based on the ocfs2
 * cluster layer with the in-kernel compression implementation of zcache, and
 * adds code to glue them together. When a page is "put" to RAMster, it is
 * compressed and stored locally. Periodically, a thread will "remotify" these
 * pages by sending them via messages to a remote machine. When the page is
 * later needed as indicated by a page fault, a "get" is issued. If the data
 * is local, it is uncompressed and the fault is resolved. If the data is
 * remote, a message is sent to fetch the data and the faulting thread sleeps;
 * when the data arrives, the thread awakens, the data is decompressed and
 * the fault is resolved.
 *
 * As of V5, clusters up to eight nodes are supported; each node can remotify
 * pages to one specified node, so clusters can be configured as clients to
 * a "memory server". Some simple policy is in place that will need to be
 * refined over time. Larger clusters and fault-resistant protocols can also
 * be added over time.
 */

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/list.h>
#include <linux/lzo.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/frontswap.h>
#include "../tmem.h"
#include "../zcache.h"
#include "../zbud.h"
#include "ramster.h"
#include "ramster_nodemanager.h"
#include "tcp.h"

#define RAMSTER_TESTING

#ifndef CONFIG_SYSFS
#error "ramster needs sysfs to define cluster nodes to use"
#endif

static bool use_cleancache __read_mostly;
static bool use_frontswap __read_mostly;
static bool use_frontswap_exclusive_gets __read_mostly;

/* These must be sysfs not debugfs as they are checked/used by userland!! */
static unsigned long ramster_interface_revision __read_mostly =
	R2NM_API_VERSION; /* interface revision must match userspace! */
static unsigned long ramster_pers_remotify_enable __read_mostly;
static unsigned long ramster_eph_remotify_enable __read_mostly;
static atomic_t ramster_remote_pers_pages = ATOMIC_INIT(0);
#define MANUAL_NODES 8
static bool ramster_nodes_manual_up[MANUAL_NODES] __read_mostly;
static int ramster_remote_target_nodenum __read_mostly = -1;

/* these counters are made available via debugfs */
static long ramster_flnodes;
static atomic_t ramster_flnodes_atomic = ATOMIC_INIT(0);
static unsigned long ramster_flnodes_max;
static inline void inc_ramster_flnodes(void)
{
	ramster_flnodes = atomic_inc_return(&ramster_flnodes_atomic);
	if (ramster_flnodes > ramster_flnodes_max)
		ramster_flnodes_max = ramster_flnodes;
}
static ssize_t ramster_foreign_eph_pages;
static atomic_t ramster_foreign_eph_pages_atomic = ATOMIC_INIT(0);
static ssize_t ramster_foreign_eph_pages_max;
static inline void inc_ramster_foreign_eph_pages(void)
{
	ramster_foreign_eph_pages = atomic_inc_return(
		&ramster_foreign_eph_pages_atomic);
	if (ramster_foreign_eph_pages > ramster_foreign_eph_pages_max)
		ramster_foreign_eph_pages_max = ramster_foreign_eph_pages;
}
static ssize_t ramster_foreign_pers_pages;
static atomic_t ramster_foreign_pers_pages_atomic = ATOMIC_INIT(0);
static ssize_t ramster_foreign_pers_pages_max;
static inline void inc_ramster_foreign_pers_pages(void)
{
	ramster_foreign_pers_pages = atomic_inc_return(
		&ramster_foreign_pers_pages_atomic);
	if (ramster_foreign_pers_pages > ramster_foreign_pers_pages_max)
		ramster_foreign_pers_pages_max = ramster_foreign_pers_pages;
}
static ssize_t ramster_eph_pages_remoted;
static ssize_t ramster_pers_pages_remoted;
static ssize_t ramster_eph_pages_remote_failed;
static ssize_t ramster_pers_pages_remote_failed;
static ssize_t ramster_remote_eph_pages_succ_get;
static ssize_t ramster_remote_pers_pages_succ_get;
static ssize_t ramster_remote_eph_pages_unsucc_get;
static ssize_t ramster_remote_pers_pages_unsucc_get;
static ssize_t ramster_pers_pages_remote_nomem;
static ssize_t ramster_remote_objects_flushed;
static ssize_t ramster_remote_object_flushes_failed;
static ssize_t ramster_remote_pages_flushed;
static ssize_t ramster_remote_page_flushes_failed;
/* FIXME frontswap selfshrinking knobs in debugfs? */

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#define zdfs debugfs_create_size_t
#define zdfs64 debugfs_create_u64
static int __init ramster_debugfs_init(void)
{
	struct dentry *root = debugfs_create_dir("ramster", NULL);
	if (root == NULL)
		return -ENXIO;

	zdfs("eph_pages_remoted", S_IRUGO, root, &ramster_eph_pages_remoted);
	zdfs("pers_pages_remoted", S_IRUGO, root, &ramster_pers_pages_remoted);
	zdfs("eph_pages_remote_failed", S_IRUGO, root,
			&ramster_eph_pages_remote_failed);
	zdfs("pers_pages_remote_failed", S_IRUGO, root,
			&ramster_pers_pages_remote_failed);
	zdfs("remote_eph_pages_succ_get", S_IRUGO, root,
			&ramster_remote_eph_pages_succ_get);
	zdfs("remote_pers_pages_succ_get", S_IRUGO, root,
			&ramster_remote_pers_pages_succ_get);
	zdfs("remote_eph_pages_unsucc_get", S_IRUGO, root,
			&ramster_remote_eph_pages_unsucc_get);
	zdfs("remote_pers_pages_unsucc_get", S_IRUGO, root,
			&ramster_remote_pers_pages_unsucc_get);
	zdfs("pers_pages_remote_nomem", S_IRUGO, root,
			&ramster_pers_pages_remote_nomem);
	zdfs("remote_objects_flushed", S_IRUGO, root,
			&ramster_remote_objects_flushed);
	zdfs("remote_pages_flushed", S_IRUGO, root,
			&ramster_remote_pages_flushed);
	zdfs("remote_object_flushes_failed", S_IRUGO, root,
			&ramster_remote_object_flushes_failed);
	zdfs("remote_page_flushes_failed", S_IRUGO, root,
			&ramster_remote_page_flushes_failed);
	zdfs("foreign_eph_pages", S_IRUGO, root,
			&ramster_foreign_eph_pages);
	zdfs("foreign_eph_pages_max", S_IRUGO, root,
			&ramster_foreign_eph_pages_max);
	zdfs("foreign_pers_pages", S_IRUGO, root,
			&ramster_foreign_pers_pages);
	zdfs("foreign_pers_pages_max", S_IRUGO, root,
			&ramster_foreign_pers_pages_max);
	return 0;
}
#undef zdfs
#undef zdfs64
#else
static inline int ramster_debugfs_init(void)
{
	return 0;
}
#endif

static LIST_HEAD(ramster_rem_op_list);
static DEFINE_SPINLOCK(ramster_rem_op_list_lock);
static DEFINE_PER_CPU(struct ramster_preload, ramster_preloads);

static DEFINE_PER_CPU(unsigned char *, ramster_remoteputmem1);
static DEFINE_PER_CPU(unsigned char *, ramster_remoteputmem2);

static struct kmem_cache *ramster_flnode_cache __read_mostly;

static struct flushlist_node *ramster_flnode_alloc(struct tmem_pool *pool)
{
	struct flushlist_node *flnode = NULL;
	struct ramster_preload *kp;

	kp = &__get_cpu_var(ramster_preloads);
	flnode = kp->flnode;
	BUG_ON(flnode == NULL);
	kp->flnode = NULL;
	inc_ramster_flnodes();
	return flnode;
}

/* the "flush list" asynchronously collects pages to remotely flush */
#define FLUSH_ENTIRE_OBJECT ((uint32_t)-1)
static void ramster_flnode_free(struct flushlist_node *flnode,
				struct tmem_pool *pool)
{
	int flnodes;

	flnodes = atomic_dec_return(&ramster_flnodes_atomic);
	BUG_ON(flnodes < 0);
	kmem_cache_free(ramster_flnode_cache, flnode);
}

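/*
 * Preload a flushlist_node into this cpu's ramster_preloads so that a
 * subsequent ramster_flnode_alloc() (called with interrupts off and tmem
 * locks held) is guaranteed to find one.
 */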
int ramster_do_preload_flnode(struct tmem_pool *pool)
{
	struct ramster_preload *kp;
	struct flushlist_node *flnode;
	int ret = -ENOMEM;

	BUG_ON(!irqs_disabled());
	if (unlikely(ramster_flnode_cache == NULL))
		BUG();
	kp = &__get_cpu_var(ramster_preloads);
	flnode = kmem_cache_alloc(ramster_flnode_cache, GFP_ATOMIC);
	if (unlikely(flnode == NULL) && kp->flnode == NULL)
		BUG(); /* FIXME handle more gracefully, but how??? */
	else if (kp->flnode == NULL)
		kp->flnode = flnode;
	else
		kmem_cache_free(ramster_flnode_cache, flnode);
	return ret;
}

/*
 * Called by the message handler after a (still compressed) page has been
 * fetched from the remote machine in response to an "is_remote" tmem_get
 * or persistent tmem_localify. For a tmem_get, "extra" is the address of
 * the page that is to be filled to successfully resolve the tmem_get; for
 * a (persistent) tmem_localify, "extra" is NULL (as the data is placed only
 * in the local zcache). "data" points to "size" bytes of (compressed) data
 * passed in the message. In the case of a persistent remote get, if
 * pre-allocation was successful (see ramster_repatriate_preload), the page
 * is placed into both local zcache and at "extra".
 */
int ramster_localify(int pool_id, struct tmem_oid *oidp, uint32_t index,
			char *data, unsigned int size, void *extra)
{
	int ret = -ENOENT;
	unsigned long flags;
	struct tmem_pool *pool;
	bool eph, delete = false;
	void *pampd, *saved_hb;
	struct tmem_obj *obj;

	pool = zcache_get_pool_by_id(LOCAL_CLIENT, pool_id);
	if (unlikely(pool == NULL))
		/* pool doesn't exist anymore */
		goto out;
	eph = is_ephemeral(pool);
	local_irq_save(flags); /* FIXME: maybe only disable softirqs? */
	pampd = tmem_localify_get_pampd(pool, oidp, index, &obj, &saved_hb);
	if (pampd == NULL) {
		/* hmmm... must have been a flush while waiting */
#ifdef RAMSTER_TESTING
		pr_err("UNTESTED pampd==NULL in ramster_localify\n");
#endif
		if (eph)
			ramster_remote_eph_pages_unsucc_get++;
		else
			ramster_remote_pers_pages_unsucc_get++;
		obj = NULL;
		goto finish;
	} else if (unlikely(!pampd_is_remote(pampd))) {
		/* hmmm... must have been a dup put while waiting */
#ifdef RAMSTER_TESTING
		pr_err("UNTESTED dup while waiting in ramster_localify\n");
#endif
		if (eph)
			ramster_remote_eph_pages_unsucc_get++;
		else
			ramster_remote_pers_pages_unsucc_get++;
		obj = NULL;
		pampd = NULL;
		ret = -EEXIST;
		goto finish;
	} else if (size == 0) {
		/* no remote data, delete the local is_remote pampd */
		pampd = NULL;
		if (eph)
			ramster_remote_eph_pages_unsucc_get++;
		else
			BUG();
		delete = true;
		goto finish;
	}
	if (pampd_is_intransit(pampd)) {
		/*
		 * a pampd is marked intransit if it is remote and space has
		 * been allocated for it locally (note, only happens for
		 * persistent pages, in which case the remote copy is freed)
		 */
		BUG_ON(eph);
		pampd = pampd_mask_intransit_and_remote(pampd);
		zbud_copy_to_zbud(pampd, data, size);
	} else {
		/*
		 * setting pampd to NULL tells tmem_localify_finish to leave
		 * pampd alone... meaning it is left pointing to the
		 * remote copy
		 */
		pampd = NULL;
		obj = NULL;
	}
	/*
	 * but in all cases, we decompress direct-to-memory to complete
	 * the remotify and return success
	 */
	BUG_ON(extra == NULL);
	zcache_decompress_to_page(data, size, (struct page *)extra);
	if (eph)
		ramster_remote_eph_pages_succ_get++;
	else
		ramster_remote_pers_pages_succ_get++;
	ret = 0;
finish:
	tmem_localify_finish(obj, index, pampd, saved_hb, delete);
	zcache_put_pool(pool);
	local_irq_restore(flags);
out:
	return ret;
}

void ramster_pampd_new_obj(struct tmem_obj *obj)
{
	obj->extra = NULL;
}

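/*
 * When a tmem object with a remote copy is freed, queue a "flush object"
 * operation so the remote node discards its copy too; the flush is sent
 * asynchronously by the remotify worker.
 */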
void ramster_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj,
				bool pool_destroy)
{
	struct flushlist_node *flnode;

	BUG_ON(preemptible());
	if (obj->extra == NULL)
		return;
	if (pool_destroy && is_ephemeral(pool))
		/* FIXME don't bother with remote eph data for now */
		return;
	BUG_ON(!pampd_is_remote(obj->extra));
	flnode = ramster_flnode_alloc(pool);
	flnode->xh.client_id = pampd_remote_node(obj->extra);
	flnode->xh.pool_id = pool->pool_id;
	flnode->xh.oid = obj->oid;
	flnode->xh.index = FLUSH_ENTIRE_OBJECT;
	flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_OBJ;
	spin_lock(&ramster_rem_op_list_lock);
	list_add(&flnode->rem_op.list, &ramster_rem_op_list);
	spin_unlock(&ramster_rem_op_list_lock);
}

/*
 * Called on a remote persistent tmem_get to attempt to preallocate
 * local storage for the data contained in the remote persistent page.
 * If successfully preallocated, returns the pampd, marked as remote and
 * in_transit. Else returns NULL. Note that the appropriate tmem data
 * structure must be locked.
 */
void *ramster_pampd_repatriate_preload(void *pampd, struct tmem_pool *pool,
					struct tmem_oid *oidp, uint32_t index,
					bool *intransit)
{
	int clen = pampd_remote_size(pampd), c;
	void *ret_pampd = NULL;
	unsigned long flags;
	struct tmem_handle th;

	BUG_ON(!pampd_is_remote(pampd));
	BUG_ON(is_ephemeral(pool));
	if (use_frontswap_exclusive_gets)
		/* don't need local storage */
		goto out;
	if (pampd_is_intransit(pampd)) {
		/*
		 * to avoid multiple allocations (and maybe a memory leak)
		 * don't preallocate if already in the process of being
		 * repatriated
		 */
		*intransit = true;
		goto out;
	}
	*intransit = false;
	local_irq_save(flags);
	th.client_id = pampd_remote_node(pampd);
	th.pool_id = pool->pool_id;
	th.oid = *oidp;
	th.index = index;
	ret_pampd = zcache_pampd_create(NULL, clen, true, false, &th);
	if (ret_pampd != NULL) {
		/*
		 * a pampd is marked intransit if it is remote and space has
		 * been allocated for it locally (note, only happens for
		 * persistent pages, in which case the remote copy is freed)
		 */
		ret_pampd = pampd_mark_intransit(ret_pampd);
		c = atomic_dec_return(&ramster_remote_pers_pages);
		WARN_ON_ONCE(c < 0);
	} else {
		ramster_pers_pages_remote_nomem++;
	}
	local_irq_restore(flags);
out:
	return ret_pampd;
}

/*
 * Called on a remote tmem_get to invoke a message to fetch the page.
 * Might sleep so no tmem locks can be held. "extra" is passed
 * all the way through the round-trip messaging to ramster_localify.
 */
int ramster_pampd_repatriate(void *fake_pampd, void *real_pampd,
				struct tmem_pool *pool,
				struct tmem_oid *oid, uint32_t index,
				bool free, void *extra)
{
	struct tmem_xhandle xh;
	int ret;

	if (pampd_is_intransit(real_pampd))
		/* have local space pre-reserved, so free remote copy */
		free = true;
	xh = tmem_xhandle_fill(LOCAL_CLIENT, pool, oid, index);
	/* unreliable request/response for now */
	ret = r2net_remote_async_get(&xh, free,
					pampd_remote_node(fake_pampd),
					pampd_remote_size(fake_pampd),
					pampd_remote_cksum(fake_pampd),
					extra);
	return ret;
}

bool ramster_pampd_is_remote(void *pampd)
{
	return pampd_is_remote(pampd);
}

int ramster_pampd_replace_in_obj(void *new_pampd, struct tmem_obj *obj)
{
	int ret = -1;

	if (new_pampd != NULL) {
		if (obj->extra == NULL)
			obj->extra = new_pampd;
		/*
		 * enforce that all remote pages in an object reside
		 * in the same node!
		 */
		else if (pampd_remote_node(new_pampd) !=
				pampd_remote_node((void *)(obj->extra)))
			BUG();
		ret = 0;
	}
	return ret;
}

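/*
 * Called when a remote pampd is freed locally. If local space was
 * preallocated for a persistent page (intransit), hand the local pampd
 * back to the caller; for other persistent pages, queue an asynchronous
 * remote page flush and decrement the remote persistent page count.
 * Ephemeral remote pages are simply abandoned (see FIXME below).
 */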
void *ramster_pampd_free(void *pampd, struct tmem_pool *pool,
			struct tmem_oid *oid, uint32_t index, bool acct)
{
	bool eph = is_ephemeral(pool);
	void *local_pampd = NULL;
	int c;

	BUG_ON(preemptible());
	BUG_ON(!pampd_is_remote(pampd));
	WARN_ON(acct == false);
	if (oid == NULL) {
		/*
		 * a NULL oid means to ignore this pampd free
		 * as the remote freeing will be handled elsewhere
		 */
	} else if (eph) {
		/* FIXME remote flush optional but probably good idea */
	} else if (pampd_is_intransit(pampd)) {
		/* did a pers remote get_and_free, so just free local */
		local_pampd = pampd_mask_intransit_and_remote(pampd);
	} else {
		struct flushlist_node *flnode =
			ramster_flnode_alloc(pool);

		flnode->xh.client_id = pampd_remote_node(pampd);
		flnode->xh.pool_id = pool->pool_id;
		flnode->xh.oid = *oid;
		flnode->xh.index = index;
		flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_PAGE;
		spin_lock(&ramster_rem_op_list_lock);
		list_add(&flnode->rem_op.list, &ramster_rem_op_list);
		spin_unlock(&ramster_rem_op_list_lock);
		c = atomic_dec_return(&ramster_remote_pers_pages);
		WARN_ON_ONCE(c < 0);
	}
	return local_pampd;
}

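/*
 * Account for a "foreign" page (a page held on behalf of a remote node)
 * being added (count == 1) or removed (count == -1), keeping separate
 * ephemeral and persistent counters and high-water marks for debugfs.
 */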
void ramster_count_foreign_pages(bool eph, int count)
{
	int c;

	BUG_ON(count != 1 && count != -1);
	if (eph) {
		if (count > 0) {
			inc_ramster_foreign_eph_pages();
		} else {
			c = atomic_dec_return(&ramster_foreign_eph_pages_atomic);
			WARN_ON_ONCE(c < 0);
			ramster_foreign_eph_pages = c;
		}
	} else {
		if (count > 0) {
			inc_ramster_foreign_pers_pages();
		} else {
			c = atomic_dec_return(
				&ramster_foreign_pers_pages_atomic);
			WARN_ON_ONCE(c < 0);
			ramster_foreign_pers_pages = c;
		}
	}
}

/*
 * For now, just push over a few pages every few seconds to
 * ensure that it basically works
 */
static struct workqueue_struct *ramster_remotify_workqueue;
static void ramster_remotify_process(struct work_struct *work);
static DECLARE_DELAYED_WORK(ramster_remotify_worker,
		ramster_remotify_process);

static void ramster_remotify_queue_delayed_work(unsigned long delay)
{
	if (!queue_delayed_work(ramster_remotify_workqueue,
				&ramster_remotify_worker, delay))
		pr_err("ramster_remotify: bad workqueue\n");
}

static void ramster_remote_flush_page(struct flushlist_node *flnode)
{
	struct tmem_xhandle *xh;
	int remotenode, ret;

	preempt_disable();
	xh = &flnode->xh;
	remotenode = flnode->xh.client_id;
	ret = r2net_remote_flush(xh, remotenode);
	if (ret >= 0)
		ramster_remote_pages_flushed++;
	else
		ramster_remote_page_flushes_failed++;
	preempt_enable_no_resched();
	ramster_flnode_free(flnode, NULL);
}

static void ramster_remote_flush_object(struct flushlist_node *flnode)
{
	struct tmem_xhandle *xh;
	int remotenode, ret;

	preempt_disable();
	xh = &flnode->xh;
	remotenode = flnode->xh.client_id;
	ret = r2net_remote_flush_object(xh, remotenode);
	if (ret >= 0)
		ramster_remote_objects_flushed++;
	else
		ramster_remote_object_flushes_failed++;
	preempt_enable_no_resched();
	ramster_flnode_free(flnode, NULL);
}

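/*
 * Grab up to two compressed "zombie" zbuds from the local LRU, push each
 * to a remote node via r2net_remote_put(), and on success replace the
 * local pampd with a remote marker recording the node, size and checksum.
 * Returns the number of zbuds pulled off the LRU.
 */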
int ramster_remotify_pageframe(bool eph)
{
	struct tmem_xhandle xh;
	unsigned int size;
	int remotenode, ret, zbuds;
	struct tmem_pool *pool;
	unsigned long flags;
	unsigned char cksum;
	char *p;
	int i, j;
	unsigned char *tmpmem[2];
	struct tmem_handle th[2];
	unsigned int zsize[2];

	tmpmem[0] = __get_cpu_var(ramster_remoteputmem1);
	tmpmem[1] = __get_cpu_var(ramster_remoteputmem2);
	local_bh_disable();
	zbuds = zbud_make_zombie_lru(&th[0], &tmpmem[0], &zsize[0], eph);
	/* now OK to release lock set in caller */
	local_bh_enable();
	if (zbuds == 0)
		goto out;
	BUG_ON(zbuds > 2);
	for (i = 0; i < zbuds; i++) {
		xh.client_id = th[i].client_id;
		xh.pool_id = th[i].pool_id;
		xh.oid = th[i].oid;
		xh.index = th[i].index;
		size = zsize[i];
		BUG_ON(size == 0 || size > zbud_max_buddy_size());
		for (p = tmpmem[i], cksum = 0, j = 0; j < size; j++)
			cksum += *p++;
		ret = r2net_remote_put(&xh, tmpmem[i], size, eph, &remotenode);
		if (ret != 0) {
			/*
			 * This is some form of a memory leak... if the remote put
			 * fails, there will never be another attempt to remotify
			 * this page. But since we've dropped the zv pointer,
			 * the page may have been freed or the data replaced
			 * so we can't just "put it back" in the remote op list.
			 * Even if we could, not sure where to put it in the list
			 * because there may be flushes that must be strictly
			 * ordered vs the put. So leave this as a FIXME for now.
			 * But count them so we know if it becomes a problem.
			 */
			if (eph)
				ramster_eph_pages_remote_failed++;
			else
				ramster_pers_pages_remote_failed++;
			break;
		} else {
			if (!eph)
				atomic_inc(&ramster_remote_pers_pages);
		}
		if (eph)
			ramster_eph_pages_remoted++;
		else
			ramster_pers_pages_remoted++;
		/*
		 * data was successfully remoted so change the local version to
		 * point to the remote node where it landed
		 */
		local_bh_disable();
		pool = zcache_get_pool_by_id(LOCAL_CLIENT, xh.pool_id);
		local_irq_save(flags);
		(void)tmem_replace(pool, &xh.oid, xh.index,
				pampd_make_remote(remotenode, size, cksum));
		local_irq_restore(flags);
		zcache_put_pool(pool);
		local_bh_enable();
	}
out:
	return zbuds;
}

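/*
 * Drain the pending remote-op list, issuing a remote page or object
 * flush for each queued flushlist_node.
 */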
static void zcache_do_remotify_flushes(void)
{
	struct ramster_remotify_hdr *rem_op;
	union remotify_list_node *u;

	while (1) {
		spin_lock(&ramster_rem_op_list_lock);
		if (list_empty(&ramster_rem_op_list)) {
			spin_unlock(&ramster_rem_op_list_lock);
			goto out;
		}
		rem_op = list_first_entry(&ramster_rem_op_list,
				struct ramster_remotify_hdr, list);
		list_del_init(&rem_op->list);
		spin_unlock(&ramster_rem_op_list_lock);
		u = (union remotify_list_node *)rem_op;
		switch (rem_op->op) {
		case RAMSTER_REMOTIFY_FLUSH_PAGE:
			ramster_remote_flush_page((struct flushlist_node *)u);
			break;
		case RAMSTER_REMOTIFY_FLUSH_OBJ:
			ramster_remote_flush_object((struct flushlist_node *)u);
			break;
		default:
			BUG();
		}
	}
out:
	return;
}

static void ramster_remotify_process(struct work_struct *work)
{
	static bool remotify_in_progress;
	int i;

	BUG_ON(irqs_disabled());
	if (remotify_in_progress)
		goto requeue;
	if (ramster_remote_target_nodenum == -1)
		goto requeue;
	remotify_in_progress = true;
	if (use_cleancache && ramster_eph_remotify_enable) {
		for (i = 0; i < 100; i++) {
			zcache_do_remotify_flushes();
			(void)ramster_remotify_pageframe(true);
		}
	}
	if (use_frontswap && ramster_pers_remotify_enable) {
		for (i = 0; i < 100; i++) {
			zcache_do_remotify_flushes();
			(void)ramster_remotify_pageframe(false);
		}
	}
	remotify_in_progress = false;
requeue:
	ramster_remotify_queue_delayed_work(HZ);
}

void __init ramster_remotify_init(void)
{
	unsigned long n = 60UL;
	ramster_remotify_workqueue =
		create_singlethread_workqueue("ramster_remotify");
	ramster_remotify_queue_delayed_work(n * HZ);
}

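/*
 * sysfs knobs (exposed via the "ramster" attribute group on mm_kobj):
 * "manual_node_up" lists and brings up cluster nodes by number;
 * "remote_target_nodenum" selects the node that pages are remotified to.
 */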
static ssize_t ramster_manual_node_up_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	int i;
	char *p = buf;
	for (i = 0; i < MANUAL_NODES; i++)
		if (ramster_nodes_manual_up[i])
			p += sprintf(p, "%d ", i);
	p += sprintf(p, "\n");
	return p - buf;
}

static ssize_t ramster_manual_node_up_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	int err;
	unsigned long node_num;

	err = kstrtoul(buf, 10, &node_num);
	if (err) {
		pr_err("ramster: bad strtoul?\n");
		return -EINVAL;
	}
	if (node_num >= MANUAL_NODES) {
		pr_err("ramster: bad node_num=%lu?\n", node_num);
		return -EINVAL;
	}
	if (ramster_nodes_manual_up[node_num]) {
		pr_err("ramster: node %d already up, ignoring\n",
						(int)node_num);
	} else {
		ramster_nodes_manual_up[node_num] = true;
		r2net_hb_node_up_manual((int)node_num);
	}
	return count;
}

static struct kobj_attribute ramster_manual_node_up_attr = {
	.attr = { .name = "manual_node_up", .mode = 0644 },
	.show = ramster_manual_node_up_show,
	.store = ramster_manual_node_up_store,
};

static ssize_t ramster_remote_target_nodenum_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	if (ramster_remote_target_nodenum == -1)
		return sprintf(buf, "unset\n");
	else
		return sprintf(buf, "%d\n", ramster_remote_target_nodenum);
}

static ssize_t ramster_remote_target_nodenum_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	int err;
	unsigned long node_num;

	err = kstrtoul(buf, 10, &node_num);
	if (err) {
		pr_err("ramster: bad strtoul?\n");
		return -EINVAL;
	} else if (node_num == -1UL) {
		pr_err("ramster: disabling all remotification, "
			"data may still reside on remote nodes however\n");
		return -EINVAL;
	} else if (node_num >= MANUAL_NODES) {
		pr_err("ramster: bad node_num=%lu?\n", node_num);
		return -EINVAL;
	} else if (!ramster_nodes_manual_up[node_num]) {
		pr_err("ramster: node %d not up, ignoring setting "
			"of remotification target\n", (int)node_num);
	} else if (r2net_remote_target_node_set((int)node_num) >= 0) {
		pr_info("ramster: node %d set as remotification target\n",
				(int)node_num);
		ramster_remote_target_nodenum = (int)node_num;
	} else {
		pr_err("ramster: bad num to node node_num=%d?\n",
				(int)node_num);
		return -EINVAL;
	}
	return count;
}

static struct kobj_attribute ramster_remote_target_nodenum_attr = {
	.attr = { .name = "remote_target_nodenum", .mode = 0644 },
	.show = ramster_remote_target_nodenum_show,
	.store = ramster_remote_target_nodenum_store,
};

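/*
 * Boilerplate macros generating show (and, for RW, store) functions plus
 * the kobj_attribute for simple unsigned long and atomic_t counters.
 */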
#define RAMSTER_SYSFS_RO(_name) \
	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%lu\n", ramster_##_name); \
	} \
	static struct kobj_attribute ramster_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = ramster_##_name##_show, \
	}

#define RAMSTER_SYSFS_RW(_name) \
	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%lu\n", ramster_##_name); \
	} \
	static ssize_t ramster_##_name##_store(struct kobject *kobj, \
		struct kobj_attribute *attr, const char *buf, size_t count) \
	{ \
		int err; \
		unsigned long enable; \
		err = kstrtoul(buf, 10, &enable); \
		if (err) \
			return -EINVAL; \
		ramster_##_name = enable; \
		return count; \
	} \
	static struct kobj_attribute ramster_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0644 }, \
		.show = ramster_##_name##_show, \
		.store = ramster_##_name##_store, \
	}

#define RAMSTER_SYSFS_RO_ATOMIC(_name) \
	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%d\n", atomic_read(&ramster_##_name)); \
	} \
	static struct kobj_attribute ramster_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = ramster_##_name##_show, \
	}

RAMSTER_SYSFS_RO(interface_revision);
RAMSTER_SYSFS_RO_ATOMIC(remote_pers_pages);
RAMSTER_SYSFS_RW(pers_remotify_enable);
RAMSTER_SYSFS_RW(eph_remotify_enable);

static struct attribute *ramster_attrs[] = {
	&ramster_interface_revision_attr.attr,
	&ramster_remote_pers_pages_attr.attr,
	&ramster_manual_node_up_attr.attr,
	&ramster_remote_target_nodenum_attr.attr,
	&ramster_pers_remotify_enable_attr.attr,
	&ramster_eph_remotify_enable_attr.attr,
	NULL,
};

static struct attribute_group ramster_attr_group = {
	.attrs = ramster_attrs,
	.name = "ramster",
};

/*
 * frontswap selfshrinking
 */

/* In HZ, controls frequency of worker invocation. */
static unsigned int selfshrink_interval __read_mostly = 5;
/* Enable/disable with sysfs. */
static bool frontswap_selfshrinking __read_mostly;

static void selfshrink_process(struct work_struct *work);
static DECLARE_DELAYED_WORK(selfshrink_worker, selfshrink_process);

/* Enable/disable with kernel boot option. */
static bool use_frontswap_selfshrink __initdata = true;

/*
 * The default values for the following parameters were deemed reasonable
 * by experimentation, may be workload-dependent, and can all be
 * adjusted via sysfs.
 */

/* Control rate for frontswap shrinking. Higher hysteresis is slower. */
static unsigned int frontswap_hysteresis __read_mostly = 20;

/*
 * Number of selfshrink worker invocations to wait before observing that
 * frontswap selfshrinking should commence. Note that selfshrinking does
 * not use a separate worker thread.
 */
static unsigned int frontswap_inertia __read_mostly = 3;

/* Countdown to next invocation of frontswap_shrink() */
static unsigned long frontswap_inertia_counter;

/*
 * Invoked by the selfshrink worker thread, uses current number of pages
 * in frontswap (frontswap_curr_pages()), previous status, and control
 * values (hysteresis and inertia) to determine if frontswap should be
 * shrunk and what the new frontswap size should be. Note that
 * frontswap_shrink is essentially a partial swapoff that immediately
 * transfers pages from the "swap device" (frontswap) back into kernel
 * RAM; despite the name, frontswap "shrinking" is very different from
 * the "shrinker" interface used by the kernel MM subsystem to reclaim
 * memory.
 */
static void frontswap_selfshrink(void)
{
	static unsigned long cur_frontswap_pages;
	static unsigned long last_frontswap_pages;
	static unsigned long tgt_frontswap_pages;

	last_frontswap_pages = cur_frontswap_pages;
	cur_frontswap_pages = frontswap_curr_pages();
	if (!cur_frontswap_pages ||
			(cur_frontswap_pages > last_frontswap_pages)) {
		frontswap_inertia_counter = frontswap_inertia;
		return;
	}
	if (frontswap_inertia_counter && --frontswap_inertia_counter)
		return;
	if (cur_frontswap_pages <= frontswap_hysteresis)
		tgt_frontswap_pages = 0;
	else
		tgt_frontswap_pages = cur_frontswap_pages -
			(cur_frontswap_pages / frontswap_hysteresis);
	frontswap_shrink(tgt_frontswap_pages);
}

static int __init ramster_nofrontswap_selfshrink_setup(char *s)
{
	use_frontswap_selfshrink = false;
	return 1;
}

__setup("noselfshrink", ramster_nofrontswap_selfshrink_setup);

static void selfshrink_process(struct work_struct *work)
{
	if (frontswap_selfshrinking && frontswap_enabled) {
		frontswap_selfshrink();
		schedule_delayed_work(&selfshrink_worker,
			selfshrink_interval * HZ);
	}
}

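/*
 * Per-cpu hotplug helpers: each cpu gets two page-sized staging buffers
 * used by ramster_remotify_pageframe() when pushing compressed pages to a
 * remote node; cpu-down frees them and any preloaded flushlist_node.
 */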
void ramster_cpu_up(int cpu)
{
	unsigned char *p1 = kzalloc(PAGE_SIZE, GFP_KERNEL | __GFP_REPEAT);
	unsigned char *p2 = kzalloc(PAGE_SIZE, GFP_KERNEL | __GFP_REPEAT);
	BUG_ON(!p1 || !p2);
	per_cpu(ramster_remoteputmem1, cpu) = p1;
	per_cpu(ramster_remoteputmem2, cpu) = p2;
}

void ramster_cpu_down(int cpu)
{
	struct ramster_preload *kp;

	kfree(per_cpu(ramster_remoteputmem1, cpu));
	per_cpu(ramster_remoteputmem1, cpu) = NULL;
	kfree(per_cpu(ramster_remoteputmem2, cpu));
	per_cpu(ramster_remoteputmem2, cpu) = NULL;
	kp = &per_cpu(ramster_preloads, cpu);
	if (kp->flnode) {
		kmem_cache_free(ramster_flnode_cache, kp->flnode);
		kp->flnode = NULL;
	}
}

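/*
 * Fill in the RAMster-specific pampd operations (object lifecycle,
 * remote checks and repatriation hooks) for zcache to install into tmem.
 */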
void ramster_register_pamops(struct tmem_pamops *pamops)
{
	pamops->free_obj = ramster_pampd_free_obj;
	pamops->new_obj = ramster_pampd_new_obj;
	pamops->replace_in_obj = ramster_pampd_replace_in_obj;
	pamops->is_remote = ramster_pampd_is_remote;
	pamops->repatriate = ramster_pampd_repatriate;
	pamops->repatriate_preload = ramster_pampd_repatriate_preload;
}

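/*
 * Called at init: record which frontends are enabled, create debugfs and
 * sysfs entries, register r2net message handlers, create the flushlist
 * slab, optionally start frontswap selfshrinking, and kick off the
 * remotify worker.
 */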
void __init ramster_init(bool cleancache, bool frontswap,
				bool frontswap_exclusive_gets)
{
	int ret = 0;

	if (cleancache)
		use_cleancache = true;
	if (frontswap)
		use_frontswap = true;
	if (frontswap_exclusive_gets)
		use_frontswap_exclusive_gets = true;
	ramster_debugfs_init();
	ret = sysfs_create_group(mm_kobj, &ramster_attr_group);
	if (ret)
		pr_err("ramster: can't create sysfs for ramster\n");
	(void)r2net_register_handlers();
	INIT_LIST_HEAD(&ramster_rem_op_list);
	ramster_flnode_cache = kmem_cache_create("ramster_flnode",
				sizeof(struct flushlist_node), 0, 0, NULL);
	frontswap_selfshrinking = use_frontswap_selfshrink;
	if (frontswap_selfshrinking) {
		pr_info("ramster: Initializing frontswap selfshrink driver.\n");
		schedule_delayed_work(&selfshrink_worker,
					selfshrink_interval * HZ);
	}
	ramster_remotify_init();
}