Commit | Line | Data |
---|---|---|
faca2ef7 DM |
1 | /* |
2 | * zcache.c | |
3 | * | |
4 | * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp. | |
5 | * Copyright (c) 2010,2011, Nitin Gupta | |
6 | * | |
7 | * Zcache provides an in-kernel "host implementation" for transcendent memory | |
8 | * ("tmem") and, thus indirectly, for cleancache and frontswap. Zcache uses | |
9 | * lzo1x compression to improve density and an embedded allocator called | |
10 | * "zbud" which "buddies" two compressed pages semi-optimally in each physical | |
11 | * pageframe. Zbud is integrally tied into tmem to allow pageframes to | |
12 | * be "reclaimed" efficiently. | |
13 | */ | |
14 | ||
15 | #include <linux/module.h> | |
16 | #include <linux/cpu.h> | |
17 | #include <linux/highmem.h> | |
18 | #include <linux/list.h> | |
19 | #include <linux/slab.h> | |
20 | #include <linux/spinlock.h> | |
21 | #include <linux/types.h> | |
aeac64aa | 22 | #include <linux/string.h> |
faca2ef7 DM |
23 | #include <linux/atomic.h> |
24 | #include <linux/math64.h> | |
25 | #include <linux/crypto.h> | |
76426daf DM |
26 | #include <linux/swap.h> |
27 | #include <linux/swapops.h> | |
28 | #include <linux/pagemap.h> | |
29 | #include <linux/writeback.h> | |
faca2ef7 DM |
30 | |
31 | #include <linux/cleancache.h> | |
32 | #include <linux/frontswap.h> | |
33 | #include "tmem.h" | |
34 | #include "zcache.h" | |
35 | #include "zbud.h" | |
36 | #include "ramster.h" | |
95bdaee2 | 37 | #include "debug.h" |
faca2ef7 | 38 | #ifdef CONFIG_RAMSTER |
7937d74a | 39 | static bool ramster_enabled __read_mostly; |
835f2f51 | 40 | static int disable_frontswap_selfshrink; |
faca2ef7 | 41 | #else |
7937d74a | 42 | #define ramster_enabled false |
835f2f51 | 43 | #define disable_frontswap_selfshrink 0 |
faca2ef7 DM |
44 | #endif |
45 | ||
46 | #ifndef __PG_WAS_ACTIVE | |
47 | static inline bool PageWasActive(struct page *page) | |
48 | { | |
49 | return true; | |
50 | } | |
51 | ||
/* Fallback used when __PG_WAS_ACTIVE is not defined: intentionally a no-op. */
static inline void SetPageWasActive(struct page *page)
{
}
55 | #endif | |
56 | ||
57 | #ifdef FRONTSWAP_HAS_EXCLUSIVE_GETS | |
58 | static bool frontswap_has_exclusive_gets __read_mostly = true; | |
59 | #else | |
60 | static bool frontswap_has_exclusive_gets __read_mostly; | |
61 | static inline void frontswap_tmem_exclusive_gets(bool b) | |
62 | { | |
63 | } | |
64 | #endif | |
65 | ||
8762c7dd WL |
66 | /* |
67 | * mark the pampd with a special value so that a later | |
68 | * retrieve can identify zero-filled pages | |
69 | */ | |
70 | #define ZERO_FILLED 0x2 | |
71 | ||
76426daf DM |
72 | /* enable (or fix code) when Seth's patches are accepted upstream */ |
73 | #define zcache_writeback_enabled 0 | |
74 | ||
7937d74a KRW |
75 | static bool zcache_enabled __read_mostly; |
76 | static bool disable_cleancache __read_mostly; | |
77 | static bool disable_frontswap __read_mostly; | |
78 | static bool disable_frontswap_ignore_nonactive __read_mostly; | |
79 | static bool disable_cleancache_ignore_nonactive __read_mostly; | |
faca2ef7 DM |
80 | static char *namestr __read_mostly = "zcache"; |
81 | ||
82 | #define ZCACHE_GFP_MASK \ | |
83 | (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC) | |
84 | ||
faca2ef7 | 85 | /* crypto API for zcache */ |
835f2f51 DM |
86 | #ifdef CONFIG_ZCACHE_MODULE |
87 | static char *zcache_comp_name = "lzo"; | |
88 | #else | |
faca2ef7 DM |
89 | #define ZCACHE_COMP_NAME_SZ CRYPTO_MAX_ALG_NAME |
90 | static char zcache_comp_name[ZCACHE_COMP_NAME_SZ] __read_mostly; | |
835f2f51 | 91 | #endif |
faca2ef7 DM |
92 | static struct crypto_comp * __percpu *zcache_comp_pcpu_tfms __read_mostly; |
93 | ||
94 | enum comp_op { | |
95 | ZCACHE_COMPOP_COMPRESS, | |
96 | ZCACHE_COMPOP_DECOMPRESS | |
97 | }; | |
98 | ||
99 | static inline int zcache_comp_op(enum comp_op op, | |
100 | const u8 *src, unsigned int slen, | |
101 | u8 *dst, unsigned int *dlen) | |
102 | { | |
103 | struct crypto_comp *tfm; | |
104 | int ret = -1; | |
105 | ||
106 | BUG_ON(!zcache_comp_pcpu_tfms); | |
107 | tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, get_cpu()); | |
108 | BUG_ON(!tfm); | |
109 | switch (op) { | |
110 | case ZCACHE_COMPOP_COMPRESS: | |
111 | ret = crypto_comp_compress(tfm, src, slen, dst, dlen); | |
112 | break; | |
113 | case ZCACHE_COMPOP_DECOMPRESS: | |
114 | ret = crypto_comp_decompress(tfm, src, slen, dst, dlen); | |
115 | break; | |
116 | default: | |
117 | ret = -EINVAL; | |
118 | } | |
119 | put_cpu(); | |
120 | return ret; | |
121 | } | |
122 | ||
123 | /* | |
124 | * policy parameters | |
125 | */ | |
126 | ||
127 | /* | |
128 | * byte count defining poor compression; pages with greater zsize will be | |
129 | * rejected | |
130 | */ | |
131 | static unsigned int zbud_max_zsize __read_mostly = (PAGE_SIZE / 8) * 7; | |
132 | /* | |
133 | * byte count defining poor *mean* compression; pages with greater zsize | |
134 | * will be rejected until sufficient better-compressed pages are accepted | |
135 | * driving the mean below this threshold | |
136 | */ | |
137 | static unsigned int zbud_max_mean_zsize __read_mostly = (PAGE_SIZE / 8) * 5; | |
138 | ||
139 | /* | |
140 | * for now, use named slabs so usage can be tracked easily; later we can | |
141 | * either just use kmalloc, or perhaps add a slab-like allocator | |
142 | * to more carefully manage total memory utilization | |
143 | */ | |
144 | static struct kmem_cache *zcache_objnode_cache; | |
145 | static struct kmem_cache *zcache_obj_cache; | |
146 | ||
147 | static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, }; | |
148 | ||
95bdaee2 KRW |
149 | /* Used by debug.c */ |
150 | ssize_t zcache_pers_zpages; | |
151 | u64 zcache_pers_zbytes; | |
152 | ssize_t zcache_eph_pageframes; | |
153 | ssize_t zcache_pers_pageframes; | |
e0d11aed | 154 | |
95bdaee2 | 155 | /* Used by this code. */ |
86d7de66 KRW |
156 | ssize_t zcache_last_active_file_pageframes; |
157 | ssize_t zcache_last_inactive_file_pageframes; | |
158 | ssize_t zcache_last_active_anon_pageframes; | |
159 | ssize_t zcache_last_inactive_anon_pageframes; | |
95bdaee2 | 160 | #ifdef CONFIG_ZCACHE_WRITEBACK |
86d7de66 KRW |
161 | ssize_t zcache_writtenback_pages; |
162 | ssize_t zcache_outstanding_writeback_pages; | |
faca2ef7 | 163 | #endif |
faca2ef7 DM |
164 | /* |
165 | * zcache core code starts here | |
166 | */ | |
167 | ||
168 | static struct zcache_client zcache_host; | |
169 | static struct zcache_client zcache_clients[MAX_CLIENTS]; | |
170 | ||
171 | static inline bool is_local_client(struct zcache_client *cli) | |
172 | { | |
173 | return cli == &zcache_host; | |
174 | } | |
175 | ||
176 | static struct zcache_client *zcache_get_client_by_id(uint16_t cli_id) | |
177 | { | |
178 | struct zcache_client *cli = &zcache_host; | |
179 | ||
180 | if (cli_id != LOCAL_CLIENT) { | |
181 | if (cli_id >= MAX_CLIENTS) | |
182 | goto out; | |
183 | cli = &zcache_clients[cli_id]; | |
184 | } | |
185 | out: | |
186 | return cli; | |
187 | } | |
188 | ||
189 | /* | |
190 | * Tmem operations assume the poolid implies the invoking client. | |
191 | * Zcache only has one client (the kernel itself): LOCAL_CLIENT. | |
192 | * RAMster has each client numbered by cluster node, and a KVM version | |
193 | * of zcache would have one client per guest and each client might | |
194 | * have a poolid==N. | |
195 | */ | |
196 | struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid) | |
197 | { | |
198 | struct tmem_pool *pool = NULL; | |
199 | struct zcache_client *cli = NULL; | |
200 | ||
201 | cli = zcache_get_client_by_id(cli_id); | |
202 | if (cli == NULL) | |
203 | goto out; | |
204 | if (!is_local_client(cli)) | |
205 | atomic_inc(&cli->refcount); | |
206 | if (poolid < MAX_POOLS_PER_CLIENT) { | |
207 | pool = cli->tmem_pools[poolid]; | |
208 | if (pool != NULL) | |
209 | atomic_inc(&pool->refcount); | |
210 | } | |
211 | out: | |
212 | return pool; | |
213 | } | |
214 | ||
215 | void zcache_put_pool(struct tmem_pool *pool) | |
216 | { | |
217 | struct zcache_client *cli = NULL; | |
218 | ||
219 | if (pool == NULL) | |
220 | BUG(); | |
221 | cli = pool->client; | |
222 | atomic_dec(&pool->refcount); | |
223 | if (!is_local_client(cli)) | |
224 | atomic_dec(&cli->refcount); | |
225 | } | |
226 | ||
227 | int zcache_new_client(uint16_t cli_id) | |
228 | { | |
229 | struct zcache_client *cli; | |
230 | int ret = -1; | |
231 | ||
232 | cli = zcache_get_client_by_id(cli_id); | |
233 | if (cli == NULL) | |
234 | goto out; | |
235 | if (cli->allocated) | |
236 | goto out; | |
237 | cli->allocated = 1; | |
238 | ret = 0; | |
239 | out: | |
240 | return ret; | |
241 | } | |
242 | ||
243 | /* | |
244 | * zcache implementation for tmem host ops | |
245 | */ | |
246 | ||
247 | static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool) | |
248 | { | |
249 | struct tmem_objnode *objnode = NULL; | |
250 | struct zcache_preload *kp; | |
251 | int i; | |
252 | ||
253 | kp = &__get_cpu_var(zcache_preloads); | |
254 | for (i = 0; i < ARRAY_SIZE(kp->objnodes); i++) { | |
255 | objnode = kp->objnodes[i]; | |
256 | if (objnode != NULL) { | |
257 | kp->objnodes[i] = NULL; | |
258 | break; | |
259 | } | |
260 | } | |
261 | BUG_ON(objnode == NULL); | |
3f007ca4 | 262 | inc_zcache_objnode_count(); |
faca2ef7 DM |
263 | return objnode; |
264 | } | |
265 | ||
266 | static void zcache_objnode_free(struct tmem_objnode *objnode, | |
267 | struct tmem_pool *pool) | |
268 | { | |
6f4336fb | 269 | dec_zcache_objnode_count(); |
faca2ef7 DM |
270 | kmem_cache_free(zcache_objnode_cache, objnode); |
271 | } | |
272 | ||
273 | static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool) | |
274 | { | |
275 | struct tmem_obj *obj = NULL; | |
276 | struct zcache_preload *kp; | |
277 | ||
278 | kp = &__get_cpu_var(zcache_preloads); | |
279 | obj = kp->obj; | |
280 | BUG_ON(obj == NULL); | |
281 | kp->obj = NULL; | |
3f007ca4 | 282 | inc_zcache_obj_count(); |
faca2ef7 DM |
283 | return obj; |
284 | } | |
285 | ||
286 | static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool) | |
287 | { | |
6f4336fb | 288 | dec_zcache_obj_count(); |
faca2ef7 DM |
289 | kmem_cache_free(zcache_obj_cache, obj); |
290 | } | |
291 | ||
8762c7dd WL |
292 | /* |
293 | * Compressing zero-filled pages will waste memory and introduce | |
294 | * serious fragmentation, skip it to avoid overhead. | |
295 | */ | |
296 | static bool page_is_zero_filled(struct page *p) | |
bdcde42d WL |
297 | { |
298 | unsigned int pos; | |
8762c7dd | 299 | char *page; |
bdcde42d | 300 | |
8762c7dd | 301 | page = kmap_atomic(p); |
bdcde42d | 302 | for (pos = 0; pos < PAGE_SIZE / sizeof(*page); pos++) { |
8762c7dd WL |
303 | if (page[pos]) { |
304 | kunmap_atomic(page); | |
bdcde42d | 305 | return false; |
8762c7dd | 306 | } |
bdcde42d | 307 | } |
8762c7dd | 308 | kunmap_atomic(page); |
bdcde42d WL |
309 | |
310 | return true; | |
311 | } | |
312 | ||
313 | static void handle_zero_filled_page(void *p) | |
314 | { | |
315 | void *user_mem; | |
316 | struct page *page = (struct page *)p; | |
317 | ||
318 | user_mem = kmap_atomic(page); | |
319 | memset(user_mem, 0, PAGE_SIZE); | |
320 | kunmap_atomic(user_mem); | |
321 | ||
322 | flush_dcache_page(page); | |
323 | } | |
324 | ||
faca2ef7 DM |
325 | static struct tmem_hostops zcache_hostops = { |
326 | .obj_alloc = zcache_obj_alloc, | |
327 | .obj_free = zcache_obj_free, | |
328 | .objnode_alloc = zcache_objnode_alloc, | |
329 | .objnode_free = zcache_objnode_free, | |
330 | }; | |
331 | ||
332 | static struct page *zcache_alloc_page(void) | |
333 | { | |
334 | struct page *page = alloc_page(ZCACHE_GFP_MASK); | |
335 | ||
336 | if (page != NULL) | |
3f007ca4 | 337 | inc_zcache_pageframes_alloced(); |
faca2ef7 DM |
338 | return page; |
339 | } | |
340 | ||
faca2ef7 DM |
341 | static void zcache_free_page(struct page *page) |
342 | { | |
343 | long curr_pageframes; | |
7892e560 | 344 | static long max_pageframes, min_pageframes; |
faca2ef7 DM |
345 | |
346 | if (page == NULL) | |
347 | BUG(); | |
348 | __free_page(page); | |
3f007ca4 | 349 | inc_zcache_pageframes_freed(); |
e0d11aed | 350 | curr_pageframes = curr_pageframes_count(); |
faca2ef7 DM |
351 | if (curr_pageframes > max_pageframes) |
352 | max_pageframes = curr_pageframes; | |
353 | if (curr_pageframes < min_pageframes) | |
354 | min_pageframes = curr_pageframes; | |
67e2cba4 | 355 | #ifdef CONFIG_ZCACHE_DEBUG |
faca2ef7 DM |
356 | if (curr_pageframes > 2L || curr_pageframes < -2L) { |
357 | /* pr_info here */ | |
358 | } | |
359 | #endif | |
360 | } | |
361 | ||
362 | /* | |
363 | * zcache implementations for PAM page descriptor ops | |
364 | */ | |
365 | ||
366 | /* forward reference */ | |
367 | static void zcache_compress(struct page *from, | |
368 | void **out_va, unsigned *out_len); | |
369 | ||
370 | static struct page *zcache_evict_eph_pageframe(void); | |
371 | ||
372 | static void *zcache_pampd_eph_create(char *data, size_t size, bool raw, | |
373 | struct tmem_handle *th) | |
374 | { | |
375 | void *pampd = NULL, *cdata = data; | |
376 | unsigned clen = size; | |
8762c7dd | 377 | bool zero_filled = false; |
faca2ef7 DM |
378 | struct page *page = (struct page *)(data), *newpage; |
379 | ||
8762c7dd WL |
380 | if (page_is_zero_filled(page)) { |
381 | clen = 0; | |
382 | zero_filled = true; | |
834e3a1c | 383 | inc_zcache_zero_filled_pages(); |
8762c7dd WL |
384 | goto got_pampd; |
385 | } | |
386 | ||
faca2ef7 DM |
387 | if (!raw) { |
388 | zcache_compress(page, &cdata, &clen); | |
389 | if (clen > zbud_max_buddy_size()) { | |
86d7de66 | 390 | inc_zcache_compress_poor(); |
faca2ef7 DM |
391 | goto out; |
392 | } | |
393 | } else { | |
394 | BUG_ON(clen > zbud_max_buddy_size()); | |
395 | } | |
396 | ||
397 | /* look for space via an existing match first */ | |
398 | pampd = (void *)zbud_match_prep(th, true, cdata, clen); | |
399 | if (pampd != NULL) | |
400 | goto got_pampd; | |
401 | ||
402 | /* no match, now we need to find (or free up) a full page */ | |
403 | newpage = zcache_alloc_page(); | |
404 | if (newpage != NULL) | |
405 | goto create_in_new_page; | |
406 | ||
86d7de66 | 407 | inc_zcache_failed_getfreepages(); |
faca2ef7 DM |
408 | /* can't allocate a page, evict an ephemeral page via LRU */ |
409 | newpage = zcache_evict_eph_pageframe(); | |
410 | if (newpage == NULL) { | |
86d7de66 | 411 | inc_zcache_eph_ate_tail_failed(); |
faca2ef7 DM |
412 | goto out; |
413 | } | |
86d7de66 | 414 | inc_zcache_eph_ate_tail(); |
faca2ef7 DM |
415 | |
416 | create_in_new_page: | |
417 | pampd = (void *)zbud_create_prep(th, true, cdata, clen, newpage); | |
418 | BUG_ON(pampd == NULL); | |
3f007ca4 | 419 | inc_zcache_eph_pageframes(); |
faca2ef7 DM |
420 | |
421 | got_pampd: | |
3f007ca4 KRW |
422 | inc_zcache_eph_zbytes(clen); |
423 | inc_zcache_eph_zpages(); | |
55345fb9 | 424 | if (ramster_enabled && raw && !zero_filled) |
faca2ef7 | 425 | ramster_count_foreign_pages(true, 1); |
8762c7dd WL |
426 | if (zero_filled) |
427 | pampd = (void *)ZERO_FILLED; | |
faca2ef7 DM |
428 | out: |
429 | return pampd; | |
430 | } | |
431 | ||
432 | static void *zcache_pampd_pers_create(char *data, size_t size, bool raw, | |
433 | struct tmem_handle *th) | |
434 | { | |
435 | void *pampd = NULL, *cdata = data; | |
436 | unsigned clen = size; | |
8762c7dd | 437 | bool zero_filled = false; |
faca2ef7 DM |
438 | struct page *page = (struct page *)(data), *newpage; |
439 | unsigned long zbud_mean_zsize; | |
440 | unsigned long curr_pers_zpages, total_zsize; | |
441 | ||
442 | if (data == NULL) { | |
443 | BUG_ON(!ramster_enabled); | |
444 | goto create_pampd; | |
445 | } | |
8762c7dd WL |
446 | |
447 | if (page_is_zero_filled(page)) { | |
448 | clen = 0; | |
449 | zero_filled = true; | |
834e3a1c | 450 | inc_zcache_zero_filled_pages(); |
8762c7dd WL |
451 | goto got_pampd; |
452 | } | |
453 | ||
faca2ef7 DM |
454 | curr_pers_zpages = zcache_pers_zpages; |
455 | /* FIXME CONFIG_RAMSTER... subtract atomic remote_pers_pages here? */ | |
456 | if (!raw) | |
457 | zcache_compress(page, &cdata, &clen); | |
458 | /* reject if compression is too poor */ | |
459 | if (clen > zbud_max_zsize) { | |
86d7de66 | 460 | inc_zcache_compress_poor(); |
faca2ef7 DM |
461 | goto out; |
462 | } | |
463 | /* reject if mean compression is too poor */ | |
464 | if ((clen > zbud_max_mean_zsize) && (curr_pers_zpages > 0)) { | |
465 | total_zsize = zcache_pers_zbytes; | |
466 | if ((long)total_zsize < 0) | |
467 | total_zsize = 0; | |
468 | zbud_mean_zsize = div_u64(total_zsize, | |
469 | curr_pers_zpages); | |
470 | if (zbud_mean_zsize > zbud_max_mean_zsize) { | |
86d7de66 | 471 | inc_zcache_mean_compress_poor(); |
faca2ef7 DM |
472 | goto out; |
473 | } | |
474 | } | |
475 | ||
476 | create_pampd: | |
477 | /* look for space via an existing match first */ | |
478 | pampd = (void *)zbud_match_prep(th, false, cdata, clen); | |
479 | if (pampd != NULL) | |
480 | goto got_pampd; | |
481 | ||
482 | /* no match, now we need to find (or free up) a full page */ | |
483 | newpage = zcache_alloc_page(); | |
484 | if (newpage != NULL) | |
485 | goto create_in_new_page; | |
486 | /* | |
487 | * FIXME do the following only if eph is oversized? | |
488 | * if (zcache_eph_pageframes > | |
489 | * (global_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE) + | |
490 | * global_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE))) | |
491 | */ | |
86d7de66 | 492 | inc_zcache_failed_getfreepages(); |
faca2ef7 DM |
493 | /* can't allocate a page, evict an ephemeral page via LRU */ |
494 | newpage = zcache_evict_eph_pageframe(); | |
495 | if (newpage == NULL) { | |
86d7de66 | 496 | inc_zcache_pers_ate_eph_failed(); |
faca2ef7 DM |
497 | goto out; |
498 | } | |
86d7de66 | 499 | inc_zcache_pers_ate_eph(); |
faca2ef7 DM |
500 | |
501 | create_in_new_page: | |
502 | pampd = (void *)zbud_create_prep(th, false, cdata, clen, newpage); | |
503 | BUG_ON(pampd == NULL); | |
3f007ca4 | 504 | inc_zcache_pers_pageframes(); |
faca2ef7 DM |
505 | |
506 | got_pampd: | |
3f007ca4 KRW |
507 | inc_zcache_pers_zpages(); |
508 | inc_zcache_pers_zbytes(clen); | |
55345fb9 | 509 | if (ramster_enabled && raw && !zero_filled) |
faca2ef7 | 510 | ramster_count_foreign_pages(false, 1); |
8762c7dd WL |
511 | if (zero_filled) |
512 | pampd = (void *)ZERO_FILLED; | |
faca2ef7 DM |
513 | out: |
514 | return pampd; | |
515 | } | |
516 | ||
517 | /* | |
518 | * This is called directly from zcache_put_page to pre-allocate space | |
519 | * to store a zpage. | |
520 | */ | |
521 | void *zcache_pampd_create(char *data, unsigned int size, bool raw, | |
522 | int eph, struct tmem_handle *th) | |
523 | { | |
524 | void *pampd = NULL; | |
525 | struct zcache_preload *kp; | |
526 | struct tmem_objnode *objnode; | |
527 | struct tmem_obj *obj; | |
528 | int i; | |
529 | ||
530 | BUG_ON(!irqs_disabled()); | |
531 | /* pre-allocate per-cpu metadata */ | |
532 | BUG_ON(zcache_objnode_cache == NULL); | |
533 | BUG_ON(zcache_obj_cache == NULL); | |
534 | kp = &__get_cpu_var(zcache_preloads); | |
535 | for (i = 0; i < ARRAY_SIZE(kp->objnodes); i++) { | |
536 | objnode = kp->objnodes[i]; | |
537 | if (objnode == NULL) { | |
538 | objnode = kmem_cache_alloc(zcache_objnode_cache, | |
539 | ZCACHE_GFP_MASK); | |
540 | if (unlikely(objnode == NULL)) { | |
86d7de66 | 541 | inc_zcache_failed_alloc(); |
faca2ef7 DM |
542 | goto out; |
543 | } | |
544 | kp->objnodes[i] = objnode; | |
545 | } | |
546 | } | |
547 | if (kp->obj == NULL) { | |
548 | obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK); | |
549 | kp->obj = obj; | |
550 | } | |
551 | if (unlikely(kp->obj == NULL)) { | |
86d7de66 | 552 | inc_zcache_failed_alloc(); |
faca2ef7 DM |
553 | goto out; |
554 | } | |
555 | /* | |
556 | * ok, have all the metadata pre-allocated, now do the data | |
557 | * but since how we allocate the data is dependent on ephemeral | |
558 | * or persistent, we split the call here to different sub-functions | |
559 | */ | |
560 | if (eph) | |
561 | pampd = zcache_pampd_eph_create(data, size, raw, th); | |
562 | else | |
563 | pampd = zcache_pampd_pers_create(data, size, raw, th); | |
564 | out: | |
565 | return pampd; | |
566 | } | |
567 | ||
568 | /* | |
569 | * This is a pamops called via tmem_put and is necessary to "finish" | |
570 | * a pampd creation. | |
571 | */ | |
572 | void zcache_pampd_create_finish(void *pampd, bool eph) | |
573 | { | |
8762c7dd WL |
574 | if (pampd != (void *)ZERO_FILLED) |
575 | zbud_create_finish((struct zbudref *)pampd, eph); | |
faca2ef7 DM |
576 | } |
577 | ||
578 | /* | |
579 | * This is passed as a function parameter to zbud_decompress so that | |
580 | * zbud need not be familiar with the details of crypto. It assumes that | |
581 | * the bytes from_va and to_va through from_va+size-1 and to_va+size-1 are | |
582 | * kmapped. It must be successful, else there is a logic bug somewhere. | |
583 | */ | |
584 | static void zcache_decompress(char *from_va, unsigned int size, char *to_va) | |
585 | { | |
586 | int ret; | |
587 | unsigned int outlen = PAGE_SIZE; | |
588 | ||
589 | ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, from_va, size, | |
590 | to_va, &outlen); | |
591 | BUG_ON(ret); | |
592 | BUG_ON(outlen != PAGE_SIZE); | |
593 | } | |
594 | ||
595 | /* | |
596 | * Decompress from the kernel va to a pageframe | |
597 | */ | |
/*
 * Map @to_page with kmap_atomic(), decompress @size bytes from @from_va
 * into it, then unmap.
 */
void zcache_decompress_to_page(char *from_va, unsigned int size,
					struct page *to_page)
{
	char *dest_va;

	dest_va = kmap_atomic(to_page);
	zcache_decompress(from_va, size, dest_va);
	kunmap_atomic(dest_va);
}
605 | ||
606 | /* | |
607 | * fill the pageframe corresponding to the struct page with the data | |
608 | * from the passed pampd | |
609 | */ | |
610 | static int zcache_pampd_get_data(char *data, size_t *sizep, bool raw, | |
611 | void *pampd, struct tmem_pool *pool, | |
612 | struct tmem_oid *oid, uint32_t index) | |
613 | { | |
614 | int ret; | |
615 | bool eph = !is_persistent(pool); | |
616 | ||
617 | BUG_ON(preemptible()); | |
618 | BUG_ON(eph); /* fix later if shared pools get implemented */ | |
619 | BUG_ON(pampd_is_remote(pampd)); | |
8762c7dd WL |
620 | |
621 | if (pampd == (void *)ZERO_FILLED) { | |
622 | handle_zero_filled_page(data); | |
623 | if (!raw) | |
624 | *sizep = PAGE_SIZE; | |
625 | return 0; | |
626 | } | |
627 | ||
faca2ef7 DM |
628 | if (raw) |
629 | ret = zbud_copy_from_zbud(data, (struct zbudref *)pampd, | |
630 | sizep, eph); | |
631 | else { | |
632 | ret = zbud_decompress((struct page *)(data), | |
633 | (struct zbudref *)pampd, false, | |
634 | zcache_decompress); | |
635 | *sizep = PAGE_SIZE; | |
636 | } | |
637 | return ret; | |
638 | } | |
639 | ||
640 | /* | |
641 | * fill the pageframe corresponding to the struct page with the data | |
642 | * from the passed pampd | |
643 | */ | |
644 | static int zcache_pampd_get_data_and_free(char *data, size_t *sizep, bool raw, | |
645 | void *pampd, struct tmem_pool *pool, | |
646 | struct tmem_oid *oid, uint32_t index) | |
647 | { | |
8762c7dd WL |
648 | int ret = 0; |
649 | bool eph = !is_persistent(pool), zero_filled = false; | |
faca2ef7 DM |
650 | struct page *page = NULL; |
651 | unsigned int zsize, zpages; | |
652 | ||
653 | BUG_ON(preemptible()); | |
654 | BUG_ON(pampd_is_remote(pampd)); | |
8762c7dd WL |
655 | |
656 | if (pampd == (void *)ZERO_FILLED) { | |
657 | handle_zero_filled_page(data); | |
658 | zero_filled = true; | |
b0749e77 WL |
659 | zsize = 0; |
660 | zpages = 1; | |
8762c7dd WL |
661 | if (!raw) |
662 | *sizep = PAGE_SIZE; | |
834e3a1c | 663 | dec_zcache_zero_filled_pages(); |
8762c7dd WL |
664 | goto zero_fill; |
665 | } | |
666 | ||
faca2ef7 DM |
667 | if (raw) |
668 | ret = zbud_copy_from_zbud(data, (struct zbudref *)pampd, | |
669 | sizep, eph); | |
670 | else { | |
671 | ret = zbud_decompress((struct page *)(data), | |
672 | (struct zbudref *)pampd, eph, | |
673 | zcache_decompress); | |
674 | *sizep = PAGE_SIZE; | |
675 | } | |
676 | page = zbud_free_and_delist((struct zbudref *)pampd, eph, | |
677 | &zsize, &zpages); | |
8762c7dd | 678 | zero_fill: |
faca2ef7 DM |
679 | if (eph) { |
680 | if (page) | |
6f4336fb KRW |
681 | dec_zcache_eph_pageframes(); |
682 | dec_zcache_eph_zpages(zpages); | |
683 | dec_zcache_eph_zbytes(zsize); | |
faca2ef7 DM |
684 | } else { |
685 | if (page) | |
6f4336fb KRW |
686 | dec_zcache_pers_pageframes(); |
687 | dec_zcache_pers_zpages(zpages); | |
688 | dec_zcache_pers_zbytes(zsize); | |
faca2ef7 | 689 | } |
55345fb9 | 690 | if (!is_local_client(pool->client) && !zero_filled) |
faca2ef7 | 691 | ramster_count_foreign_pages(eph, -1); |
8762c7dd | 692 | if (page && !zero_filled) |
faca2ef7 DM |
693 | zcache_free_page(page); |
694 | return ret; | |
695 | } | |
696 | ||
697 | /* | |
698 | * free the pampd and remove it from any zcache lists | |
699 | * pampd must no longer be pointed to from any tmem data structures! | |
700 | */ | |
701 | static void zcache_pampd_free(void *pampd, struct tmem_pool *pool, | |
702 | struct tmem_oid *oid, uint32_t index, bool acct) | |
703 | { | |
704 | struct page *page = NULL; | |
705 | unsigned int zsize, zpages; | |
8762c7dd | 706 | bool zero_filled = false; |
faca2ef7 DM |
707 | |
708 | BUG_ON(preemptible()); | |
8762c7dd | 709 | |
b0749e77 | 710 | if (pampd == (void *)ZERO_FILLED) { |
8762c7dd | 711 | zero_filled = true; |
b0749e77 WL |
712 | zsize = 0; |
713 | zpages = 1; | |
834e3a1c | 714 | dec_zcache_zero_filled_pages(); |
b0749e77 | 715 | } |
8762c7dd WL |
716 | |
717 | if (pampd_is_remote(pampd) && !zero_filled) { | |
faca2ef7 DM |
718 | BUG_ON(!ramster_enabled); |
719 | pampd = ramster_pampd_free(pampd, pool, oid, index, acct); | |
720 | if (pampd == NULL) | |
721 | return; | |
722 | } | |
723 | if (is_ephemeral(pool)) { | |
8762c7dd WL |
724 | if (!zero_filled) |
725 | page = zbud_free_and_delist((struct zbudref *)pampd, | |
faca2ef7 DM |
726 | true, &zsize, &zpages); |
727 | if (page) | |
6f4336fb KRW |
728 | dec_zcache_eph_pageframes(); |
729 | dec_zcache_eph_zpages(zpages); | |
730 | dec_zcache_eph_zbytes(zsize); | |
faca2ef7 DM |
731 | /* FIXME CONFIG_RAMSTER... check acct parameter? */ |
732 | } else { | |
8762c7dd WL |
733 | if (!zero_filled) |
734 | page = zbud_free_and_delist((struct zbudref *)pampd, | |
faca2ef7 DM |
735 | false, &zsize, &zpages); |
736 | if (page) | |
6f4336fb KRW |
737 | dec_zcache_pers_pageframes(); |
738 | dec_zcache_pers_zpages(zpages); | |
739 | dec_zcache_pers_zbytes(zsize); | |
faca2ef7 | 740 | } |
55345fb9 | 741 | if (!is_local_client(pool->client) && !zero_filled) |
faca2ef7 | 742 | ramster_count_foreign_pages(is_ephemeral(pool), -1); |
8762c7dd | 743 | if (page && !zero_filled) |
faca2ef7 DM |
744 | zcache_free_page(page); |
745 | } | |
746 | ||
747 | static struct tmem_pamops zcache_pamops = { | |
748 | .create_finish = zcache_pampd_create_finish, | |
749 | .get_data = zcache_pampd_get_data, | |
750 | .get_data_and_free = zcache_pampd_get_data_and_free, | |
751 | .free = zcache_pampd_free, | |
752 | }; | |
753 | ||
754 | /* | |
755 | * zcache compression/decompression and related per-cpu stuff | |
756 | */ | |
757 | ||
758 | static DEFINE_PER_CPU(unsigned char *, zcache_dstmem); | |
759 | #define ZCACHE_DSTMEM_ORDER 1 | |
760 | ||
761 | static void zcache_compress(struct page *from, void **out_va, unsigned *out_len) | |
762 | { | |
763 | int ret; | |
764 | unsigned char *dmem = __get_cpu_var(zcache_dstmem); | |
765 | char *from_va; | |
766 | ||
767 | BUG_ON(!irqs_disabled()); | |
768 | /* no buffer or no compressor so can't compress */ | |
769 | BUG_ON(dmem == NULL); | |
770 | *out_len = PAGE_SIZE << ZCACHE_DSTMEM_ORDER; | |
771 | from_va = kmap_atomic(from); | |
772 | mb(); | |
773 | ret = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, from_va, PAGE_SIZE, dmem, | |
774 | out_len); | |
775 | BUG_ON(ret); | |
776 | *out_va = dmem; | |
777 | kunmap_atomic(from_va); | |
778 | } | |
779 | ||
780 | static int zcache_comp_cpu_up(int cpu) | |
781 | { | |
782 | struct crypto_comp *tfm; | |
783 | ||
784 | tfm = crypto_alloc_comp(zcache_comp_name, 0, 0); | |
785 | if (IS_ERR(tfm)) | |
786 | return NOTIFY_BAD; | |
787 | *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = tfm; | |
788 | return NOTIFY_OK; | |
789 | } | |
790 | ||
791 | static void zcache_comp_cpu_down(int cpu) | |
792 | { | |
793 | struct crypto_comp *tfm; | |
794 | ||
795 | tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu); | |
796 | crypto_free_comp(tfm); | |
797 | *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL; | |
798 | } | |
799 | ||
800 | static int zcache_cpu_notifier(struct notifier_block *nb, | |
801 | unsigned long action, void *pcpu) | |
802 | { | |
803 | int ret, i, cpu = (long)pcpu; | |
804 | struct zcache_preload *kp; | |
805 | ||
806 | switch (action) { | |
807 | case CPU_UP_PREPARE: | |
808 | ret = zcache_comp_cpu_up(cpu); | |
809 | if (ret != NOTIFY_OK) { | |
810 | pr_err("%s: can't allocate compressor xform\n", | |
811 | namestr); | |
812 | return ret; | |
813 | } | |
814 | per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages( | |
815 | GFP_KERNEL | __GFP_REPEAT, ZCACHE_DSTMEM_ORDER); | |
816 | if (ramster_enabled) | |
817 | ramster_cpu_up(cpu); | |
818 | break; | |
819 | case CPU_DEAD: | |
820 | case CPU_UP_CANCELED: | |
821 | zcache_comp_cpu_down(cpu); | |
822 | free_pages((unsigned long)per_cpu(zcache_dstmem, cpu), | |
823 | ZCACHE_DSTMEM_ORDER); | |
824 | per_cpu(zcache_dstmem, cpu) = NULL; | |
825 | kp = &per_cpu(zcache_preloads, cpu); | |
826 | for (i = 0; i < ARRAY_SIZE(kp->objnodes); i++) { | |
827 | if (kp->objnodes[i]) | |
828 | kmem_cache_free(zcache_objnode_cache, | |
829 | kp->objnodes[i]); | |
830 | } | |
831 | if (kp->obj) { | |
832 | kmem_cache_free(zcache_obj_cache, kp->obj); | |
833 | kp->obj = NULL; | |
834 | } | |
835 | if (ramster_enabled) | |
836 | ramster_cpu_down(cpu); | |
837 | break; | |
838 | default: | |
839 | break; | |
840 | } | |
841 | return NOTIFY_OK; | |
842 | } | |
843 | ||
844 | static struct notifier_block zcache_cpu_notifier_block = { | |
845 | .notifier_call = zcache_cpu_notifier | |
846 | }; | |
847 | ||
848 | /* | |
849 | * The following code interacts with the zbud eviction and zbud | |
850 | * zombify code to access LRU pages | |
851 | */ | |
852 | ||
853 | static struct page *zcache_evict_eph_pageframe(void) | |
854 | { | |
855 | struct page *page; | |
856 | unsigned int zsize = 0, zpages = 0; | |
857 | ||
858 | page = zbud_evict_pageframe_lru(&zsize, &zpages); | |
859 | if (page == NULL) | |
860 | goto out; | |
6f4336fb KRW |
861 | dec_zcache_eph_zbytes(zsize); |
862 | dec_zcache_eph_zpages(zpages); | |
86d7de66 | 863 | inc_zcache_evicted_eph_zpages(zpages); |
6f4336fb | 864 | dec_zcache_eph_pageframes(); |
86d7de66 | 865 | inc_zcache_evicted_eph_pageframes(); |
faca2ef7 DM |
866 | out: |
867 | return page; | |
868 | } | |
869 | ||
76426daf DM |
870 | #ifdef CONFIG_ZCACHE_WRITEBACK |
871 | ||
872 | static atomic_t zcache_outstanding_writeback_pages_atomic = ATOMIC_INIT(0); | |
873 | ||
3f007ca4 KRW |
874 | static inline void inc_zcache_outstanding_writeback_pages(void) |
875 | { | |
876 | zcache_outstanding_writeback_pages = | |
877 | atomic_inc_return(&zcache_outstanding_writeback_pages_atomic); | |
878 | } | |
6f4336fb KRW |
879 | static inline void dec_zcache_outstanding_writeback_pages(void) |
880 | { | |
881 | zcache_outstanding_writeback_pages = | |
882 | atomic_dec_return(&zcache_outstanding_writeback_pages_atomic); | |
883 | }; | |
faca2ef7 DM |
884 | static void unswiz(struct tmem_oid oid, u32 index, |
885 | unsigned *type, pgoff_t *offset); | |
7892e560 | 886 | |
faca2ef7 | 887 | /* |
76426daf DM |
888 | * Choose an LRU persistent pageframe and attempt to write it back to |
889 | * the backing swap disk by calling frontswap_writeback on both zpages. | |
faca2ef7 DM |
890 | * |
891 | * This is work-in-progress. | |
892 | */ | |
893 | ||
76426daf DM |
894 | static void zcache_end_swap_write(struct bio *bio, int err) |
895 | { | |
896 | end_swap_bio_write(bio, err); | |
6f4336fb | 897 | dec_zcache_outstanding_writeback_pages(); |
76426daf DM |
898 | zcache_writtenback_pages++; |
899 | } | |
900 | ||
901 | /* | |
902 | * zcache_get_swap_cache_page | |
903 | * | |
904 | * This is an adaption of read_swap_cache_async() | |
905 | * | |
906 | * If success, page is returned in retpage | |
907 | * Returns 0 if page was already in the swap cache, page is not locked | |
908 | * Returns 1 if the new page needs to be populated, page is locked | |
909 | */ | |
910 | static int zcache_get_swap_cache_page(int type, pgoff_t offset, | |
911 | struct page *new_page) | |
912 | { | |
913 | struct page *found_page; | |
914 | swp_entry_t entry = swp_entry(type, offset); | |
915 | int err; | |
916 | ||
917 | BUG_ON(new_page == NULL); | |
918 | do { | |
919 | /* | |
920 | * First check the swap cache. Since this is normally | |
921 | * called after lookup_swap_cache() failed, re-calling | |
922 | * that would confuse statistics. | |
923 | */ | |
924 | found_page = find_get_page(&swapper_space, entry.val); | |
925 | if (found_page) | |
926 | return 0; | |
927 | ||
928 | /* | |
929 | * call radix_tree_preload() while we can wait. | |
930 | */ | |
931 | err = radix_tree_preload(GFP_KERNEL); | |
932 | if (err) | |
933 | break; | |
934 | ||
935 | /* | |
936 | * Swap entry may have been freed since our caller observed it. | |
937 | */ | |
938 | err = swapcache_prepare(entry); | |
939 | if (err == -EEXIST) { /* seems racy */ | |
940 | radix_tree_preload_end(); | |
941 | continue; | |
942 | } | |
943 | if (err) { /* swp entry is obsolete ? */ | |
944 | radix_tree_preload_end(); | |
945 | break; | |
946 | } | |
947 | ||
948 | /* May fail (-ENOMEM) if radix-tree node allocation failed. */ | |
949 | __set_page_locked(new_page); | |
950 | SetPageSwapBacked(new_page); | |
951 | err = __add_to_swap_cache(new_page, entry); | |
952 | if (likely(!err)) { | |
953 | radix_tree_preload_end(); | |
954 | lru_cache_add_anon(new_page); | |
955 | return 1; | |
956 | } | |
957 | radix_tree_preload_end(); | |
958 | ClearPageSwapBacked(new_page); | |
959 | __clear_page_locked(new_page); | |
960 | /* | |
961 | * add_to_swap_cache() doesn't return -EEXIST, so we can safely | |
962 | * clear SWAP_HAS_CACHE flag. | |
963 | */ | |
964 | swapcache_free(entry, NULL); | |
965 | /* FIXME: is it possible to get here without err==-ENOMEM? | |
966 | * If not, we can dispense with the do loop, use goto retry */ | |
967 | } while (err != -ENOMEM); | |
968 | ||
969 | return -ENOMEM; | |
970 | } | |
971 | ||
972 | /* | |
973 | * Given a frontswap zpage in zcache (identified by type/offset) and | |
974 | * an empty page, put the page into the swap cache, use frontswap | |
975 | * to get the page from zcache into the empty page, then give it | |
976 | * to the swap subsystem to send to disk (carefully avoiding the | |
977 | * possibility that frontswap might snatch it back). | |
978 | * Returns < 0 if error, 0 if successful, and 1 if successful but | |
979 | * the newpage passed in not needed and should be freed. | |
980 | */ | |
981 | static int zcache_frontswap_writeback_zpage(int type, pgoff_t offset, | |
982 | struct page *newpage) | |
983 | { | |
984 | struct page *page = newpage; | |
985 | int ret; | |
986 | struct writeback_control wbc = { | |
987 | .sync_mode = WB_SYNC_NONE, | |
988 | }; | |
989 | ||
990 | ret = zcache_get_swap_cache_page(type, offset, page); | |
991 | if (ret < 0) | |
992 | return ret; | |
993 | else if (ret == 0) { | |
994 | /* more uptodate page is already in swapcache */ | |
995 | __frontswap_invalidate_page(type, offset); | |
996 | return 1; | |
997 | } | |
998 | ||
999 | BUG_ON(!frontswap_has_exclusive_gets); /* load must also invalidate */ | |
1000 | /* FIXME: how is it possible to get here when page is unlocked? */ | |
1001 | __frontswap_load(page); | |
1002 | SetPageUptodate(page); /* above does SetPageDirty, is that enough? */ | |
1003 | ||
1004 | /* start writeback */ | |
1005 | SetPageReclaim(page); | |
1006 | /* | |
1007 | * Return value is ignored here because it doesn't change anything | |
1008 | * for us. Page is returned unlocked. | |
1009 | */ | |
1010 | (void)__swap_writepage(page, &wbc, zcache_end_swap_write); | |
1011 | page_cache_release(page); | |
3f007ca4 | 1012 | inc_zcache_outstanding_writeback_pages(); |
76426daf DM |
1013 | |
1014 | return 0; | |
1015 | } | |
1016 | ||
1017 | /* | |
1018 | * The following is still a magic number... we want to allow forward progress | |
1019 | * for writeback because it clears out needed RAM when under pressure, but | |
1020 | * we don't want to allow writeback to absorb and queue too many GFP_KERNEL | |
1021 | * pages if the swap device is very slow. | |
1022 | */ | |
1023 | #define ZCACHE_MAX_OUTSTANDING_WRITEBACK_PAGES 6400 | |
1024 | ||
1025 | /* | |
1026 | * Try to allocate two free pages, first using a non-aggressive alloc, | |
1027 | * then by evicting zcache ephemeral (clean pagecache) pages, and last | |
1028 | * by aggressive GFP_KERNEL alloc. We allow zbud to choose a pageframe | |
1029 | * consisting of 1-2 zbuds/zpages, then call the writeback_zpage helper | |
1030 | * function above for each. | |
1031 | */ | |
1032 | static int zcache_frontswap_writeback(void) | |
faca2ef7 DM |
1033 | { |
1034 | struct tmem_handle th[2]; | |
76426daf DM |
1035 | int ret = 0; |
1036 | int nzbuds, writeback_ret; | |
faca2ef7 | 1037 | unsigned type; |
76426daf | 1038 | struct page *znewpage1 = NULL, *znewpage2 = NULL; |
faca2ef7 | 1039 | struct page *evictpage1 = NULL, *evictpage2 = NULL; |
76426daf DM |
1040 | struct page *newpage1 = NULL, *newpage2 = NULL; |
1041 | struct page *page1 = NULL, *page2 = NULL; | |
faca2ef7 DM |
1042 | pgoff_t offset; |
1043 | ||
76426daf DM |
1044 | znewpage1 = alloc_page(ZCACHE_GFP_MASK); |
1045 | znewpage2 = alloc_page(ZCACHE_GFP_MASK); | |
1046 | if (znewpage1 == NULL) | |
faca2ef7 | 1047 | evictpage1 = zcache_evict_eph_pageframe(); |
76426daf | 1048 | if (znewpage2 == NULL) |
faca2ef7 | 1049 | evictpage2 = zcache_evict_eph_pageframe(); |
76426daf DM |
1050 | |
1051 | if ((evictpage1 == NULL || evictpage2 == NULL) && | |
1052 | atomic_read(&zcache_outstanding_writeback_pages_atomic) > | |
1053 | ZCACHE_MAX_OUTSTANDING_WRITEBACK_PAGES) { | |
faca2ef7 | 1054 | goto free_and_out; |
76426daf DM |
1055 | } |
1056 | if (znewpage1 == NULL && evictpage1 == NULL) | |
1057 | newpage1 = alloc_page(GFP_KERNEL); | |
1058 | if (znewpage2 == NULL && evictpage2 == NULL) | |
1059 | newpage2 = alloc_page(GFP_KERNEL); | |
1060 | if (newpage1 == NULL || newpage2 == NULL) | |
1061 | goto free_and_out; | |
1062 | ||
1063 | /* ok, we have two pageframes pre-allocated, get a pair of zbuds */ | |
faca2ef7 DM |
1064 | nzbuds = zbud_make_zombie_lru(&th[0], NULL, NULL, false); |
1065 | if (nzbuds == 0) { | |
1066 | ret = -ENOENT; | |
1067 | goto free_and_out; | |
1068 | } | |
76426daf DM |
1069 | |
1070 | /* process the first zbud */ | |
faca2ef7 | 1071 | unswiz(th[0].oid, th[0].index, &type, &offset); |
76426daf DM |
1072 | page1 = (znewpage1 != NULL) ? znewpage1 : |
1073 | ((newpage1 != NULL) ? newpage1 : evictpage1); | |
1074 | writeback_ret = zcache_frontswap_writeback_zpage(type, offset, page1); | |
1075 | if (writeback_ret < 0) { | |
1076 | ret = -ENOMEM; | |
faca2ef7 | 1077 | goto free_and_out; |
faca2ef7 | 1078 | } |
76426daf DM |
1079 | if (evictpage1 != NULL) |
1080 | zcache_pageframes_freed = | |
1081 | atomic_inc_return(&zcache_pageframes_freed_atomic); | |
1082 | if (writeback_ret == 0) { | |
1083 | /* zcache_get_swap_cache_page will free, don't double free */ | |
1084 | znewpage1 = NULL; | |
1085 | newpage1 = NULL; | |
1086 | evictpage1 = NULL; | |
1087 | } | |
1088 | if (nzbuds < 2) | |
1089 | goto free_and_out; | |
1090 | ||
1091 | /* if there is a second zbud, process it */ | |
1092 | unswiz(th[1].oid, th[1].index, &type, &offset); | |
1093 | page2 = (znewpage2 != NULL) ? znewpage2 : | |
1094 | ((newpage2 != NULL) ? newpage2 : evictpage2); | |
1095 | writeback_ret = zcache_frontswap_writeback_zpage(type, offset, page2); | |
1096 | if (writeback_ret < 0) { | |
1097 | ret = -ENOMEM; | |
1098 | goto free_and_out; | |
1099 | } | |
1100 | if (evictpage2 != NULL) | |
1101 | zcache_pageframes_freed = | |
1102 | atomic_inc_return(&zcache_pageframes_freed_atomic); | |
1103 | if (writeback_ret == 0) { | |
1104 | znewpage2 = NULL; | |
1105 | newpage2 = NULL; | |
1106 | evictpage2 = NULL; | |
1107 | } | |
faca2ef7 DM |
1108 | |
1109 | free_and_out: | |
76426daf DM |
1110 | if (znewpage1 != NULL) |
1111 | page_cache_release(znewpage1); | |
1112 | if (znewpage2 != NULL) | |
1113 | page_cache_release(znewpage2); | |
faca2ef7 | 1114 | if (newpage1 != NULL) |
76426daf | 1115 | page_cache_release(newpage1); |
faca2ef7 | 1116 | if (newpage2 != NULL) |
76426daf | 1117 | page_cache_release(newpage2); |
faca2ef7 DM |
1118 | if (evictpage1 != NULL) |
1119 | zcache_free_page(evictpage1); | |
1120 | if (evictpage2 != NULL) | |
1121 | zcache_free_page(evictpage2); | |
faca2ef7 DM |
1122 | return ret; |
1123 | } | |
76426daf | 1124 | #endif /* CONFIG_ZCACHE_WRITEBACK */ |
faca2ef7 DM |
1125 | |
/*
 * Freeze flag.  While zcache is disabled ("frozen"), pools may still be
 * created and destroyed, but every put (and so every other operation that
 * needs to allocate memory) must fail.  If zcache is unfrozen, accepts
 * puts, and is then frozen again, data consistency requires that all puts
 * arriving while frozen be converted into flushes.
 */
static bool zcache_freeze;
1134 | ||
1135 | /* | |
1136 | * This zcache shrinker interface reduces the number of ephemeral pageframes | |
1137 | * used by zcache to approximately the same as the total number of LRU_FILE | |
76426daf DM |
1138 | * pageframes in use, and now also reduces the number of persistent pageframes |
1139 | * used by zcache to approximately the same as the total number of LRU_ANON | |
1140 | * pageframes in use. FIXME POLICY: Probably the writeback should only occur | |
1141 | * if the eviction doesn't free enough pages. | |
faca2ef7 DM |
1142 | */ |
1143 | static int shrink_zcache_memory(struct shrinker *shrink, | |
1144 | struct shrink_control *sc) | |
1145 | { | |
1146 | static bool in_progress; | |
1147 | int ret = -1; | |
1148 | int nr = sc->nr_to_scan; | |
1149 | int nr_evict = 0; | |
76426daf | 1150 | int nr_writeback = 0; |
faca2ef7 | 1151 | struct page *page; |
76426daf | 1152 | int file_pageframes_inuse, anon_pageframes_inuse; |
faca2ef7 DM |
1153 | |
1154 | if (nr <= 0) | |
1155 | goto skip_evict; | |
1156 | ||
1157 | /* don't allow more than one eviction thread at a time */ | |
1158 | if (in_progress) | |
1159 | goto skip_evict; | |
1160 | ||
1161 | in_progress = true; | |
1162 | ||
1163 | /* we are going to ignore nr, and target a different value */ | |
1164 | zcache_last_active_file_pageframes = | |
1165 | global_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE); | |
1166 | zcache_last_inactive_file_pageframes = | |
1167 | global_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE); | |
76426daf DM |
1168 | file_pageframes_inuse = zcache_last_active_file_pageframes + |
1169 | zcache_last_inactive_file_pageframes; | |
1170 | if (zcache_eph_pageframes > file_pageframes_inuse) | |
1171 | nr_evict = zcache_eph_pageframes - file_pageframes_inuse; | |
1172 | else | |
1173 | nr_evict = 0; | |
faca2ef7 DM |
1174 | while (nr_evict-- > 0) { |
1175 | page = zcache_evict_eph_pageframe(); | |
1176 | if (page == NULL) | |
1177 | break; | |
1178 | zcache_free_page(page); | |
1179 | } | |
1180 | ||
1181 | zcache_last_active_anon_pageframes = | |
1182 | global_page_state(NR_LRU_BASE + LRU_ACTIVE_ANON); | |
1183 | zcache_last_inactive_anon_pageframes = | |
1184 | global_page_state(NR_LRU_BASE + LRU_INACTIVE_ANON); | |
76426daf DM |
1185 | anon_pageframes_inuse = zcache_last_active_anon_pageframes + |
1186 | zcache_last_inactive_anon_pageframes; | |
1187 | if (zcache_pers_pageframes > anon_pageframes_inuse) | |
1188 | nr_writeback = zcache_pers_pageframes - anon_pageframes_inuse; | |
1189 | else | |
1190 | nr_writeback = 0; | |
1191 | while (nr_writeback-- > 0) { | |
1192 | #ifdef CONFIG_ZCACHE_WRITEBACK | |
1193 | int writeback_ret; | |
1194 | writeback_ret = zcache_frontswap_writeback(); | |
1195 | if (writeback_ret == -ENOMEM) | |
1196 | #endif | |
faca2ef7 DM |
1197 | break; |
1198 | } | |
faca2ef7 DM |
1199 | in_progress = false; |
1200 | ||
1201 | skip_evict: | |
1202 | /* resample: has changed, but maybe not all the way yet */ | |
1203 | zcache_last_active_file_pageframes = | |
1204 | global_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE); | |
1205 | zcache_last_inactive_file_pageframes = | |
1206 | global_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE); | |
1207 | ret = zcache_eph_pageframes - zcache_last_active_file_pageframes + | |
1208 | zcache_last_inactive_file_pageframes; | |
1209 | if (ret < 0) | |
1210 | ret = 0; | |
1211 | return ret; | |
1212 | } | |
1213 | ||
1214 | static struct shrinker zcache_shrinker = { | |
1215 | .shrink = shrink_zcache_memory, | |
1216 | .seeks = DEFAULT_SEEKS, | |
1217 | }; | |
1218 | ||
1219 | /* | |
1220 | * zcache shims between cleancache/frontswap ops and tmem | |
1221 | */ | |
1222 | ||
1223 | /* FIXME rename these core routines to zcache_tmemput etc? */ | |
1224 | int zcache_put_page(int cli_id, int pool_id, struct tmem_oid *oidp, | |
1225 | uint32_t index, void *page, | |
1226 | unsigned int size, bool raw, int ephemeral) | |
1227 | { | |
1228 | struct tmem_pool *pool; | |
1229 | struct tmem_handle th; | |
1230 | int ret = -1; | |
1231 | void *pampd = NULL; | |
1232 | ||
1233 | BUG_ON(!irqs_disabled()); | |
1234 | pool = zcache_get_pool_by_id(cli_id, pool_id); | |
1235 | if (unlikely(pool == NULL)) | |
1236 | goto out; | |
1237 | if (!zcache_freeze) { | |
1238 | ret = 0; | |
1239 | th.client_id = cli_id; | |
1240 | th.pool_id = pool_id; | |
1241 | th.oid = *oidp; | |
1242 | th.index = index; | |
1243 | pampd = zcache_pampd_create((char *)page, size, raw, | |
1244 | ephemeral, &th); | |
1245 | if (pampd == NULL) { | |
1246 | ret = -ENOMEM; | |
1247 | if (ephemeral) | |
86d7de66 | 1248 | inc_zcache_failed_eph_puts(); |
faca2ef7 | 1249 | else |
86d7de66 | 1250 | inc_zcache_failed_pers_puts(); |
faca2ef7 DM |
1251 | } else { |
1252 | if (ramster_enabled) | |
1253 | ramster_do_preload_flnode(pool); | |
1254 | ret = tmem_put(pool, oidp, index, 0, pampd); | |
1255 | if (ret < 0) | |
1256 | BUG(); | |
1257 | } | |
1258 | zcache_put_pool(pool); | |
1259 | } else { | |
86d7de66 | 1260 | inc_zcache_put_to_flush(); |
faca2ef7 DM |
1261 | if (ramster_enabled) |
1262 | ramster_do_preload_flnode(pool); | |
1263 | if (atomic_read(&pool->obj_count) > 0) | |
1264 | /* the put fails whether the flush succeeds or not */ | |
1265 | (void)tmem_flush_page(pool, oidp, index); | |
1266 | zcache_put_pool(pool); | |
1267 | } | |
1268 | out: | |
1269 | return ret; | |
1270 | } | |
1271 | ||
1272 | int zcache_get_page(int cli_id, int pool_id, struct tmem_oid *oidp, | |
1273 | uint32_t index, void *page, | |
1274 | size_t *sizep, bool raw, int get_and_free) | |
1275 | { | |
1276 | struct tmem_pool *pool; | |
1277 | int ret = -1; | |
1278 | bool eph; | |
1279 | ||
1280 | if (!raw) { | |
1281 | BUG_ON(irqs_disabled()); | |
1282 | BUG_ON(in_softirq()); | |
1283 | } | |
1284 | pool = zcache_get_pool_by_id(cli_id, pool_id); | |
1285 | eph = is_ephemeral(pool); | |
1286 | if (likely(pool != NULL)) { | |
1287 | if (atomic_read(&pool->obj_count) > 0) | |
1288 | ret = tmem_get(pool, oidp, index, (char *)(page), | |
1289 | sizep, raw, get_and_free); | |
1290 | zcache_put_pool(pool); | |
1291 | } | |
1292 | WARN_ONCE((!is_ephemeral(pool) && (ret != 0)), | |
1293 | "zcache_get fails on persistent pool, " | |
1294 | "bad things are very likely to happen soon\n"); | |
1295 | #ifdef RAMSTER_TESTING | |
1296 | if (ret != 0 && ret != -1 && !(ret == -EINVAL && is_ephemeral(pool))) | |
1297 | pr_err("TESTING zcache_get tmem_get returns ret=%d\n", ret); | |
1298 | #endif | |
1299 | return ret; | |
1300 | } | |
1301 | ||
1302 | int zcache_flush_page(int cli_id, int pool_id, | |
1303 | struct tmem_oid *oidp, uint32_t index) | |
1304 | { | |
1305 | struct tmem_pool *pool; | |
1306 | int ret = -1; | |
1307 | unsigned long flags; | |
1308 | ||
1309 | local_irq_save(flags); | |
86d7de66 | 1310 | inc_zcache_flush_total(); |
faca2ef7 DM |
1311 | pool = zcache_get_pool_by_id(cli_id, pool_id); |
1312 | if (ramster_enabled) | |
1313 | ramster_do_preload_flnode(pool); | |
1314 | if (likely(pool != NULL)) { | |
1315 | if (atomic_read(&pool->obj_count) > 0) | |
1316 | ret = tmem_flush_page(pool, oidp, index); | |
1317 | zcache_put_pool(pool); | |
1318 | } | |
1319 | if (ret >= 0) | |
86d7de66 | 1320 | inc_zcache_flush_found(); |
faca2ef7 DM |
1321 | local_irq_restore(flags); |
1322 | return ret; | |
1323 | } | |
1324 | ||
1325 | int zcache_flush_object(int cli_id, int pool_id, | |
1326 | struct tmem_oid *oidp) | |
1327 | { | |
1328 | struct tmem_pool *pool; | |
1329 | int ret = -1; | |
1330 | unsigned long flags; | |
1331 | ||
1332 | local_irq_save(flags); | |
86d7de66 | 1333 | inc_zcache_flobj_total(); |
faca2ef7 DM |
1334 | pool = zcache_get_pool_by_id(cli_id, pool_id); |
1335 | if (ramster_enabled) | |
1336 | ramster_do_preload_flnode(pool); | |
1337 | if (likely(pool != NULL)) { | |
1338 | if (atomic_read(&pool->obj_count) > 0) | |
1339 | ret = tmem_flush_object(pool, oidp); | |
1340 | zcache_put_pool(pool); | |
1341 | } | |
1342 | if (ret >= 0) | |
86d7de66 | 1343 | inc_zcache_flobj_found(); |
faca2ef7 DM |
1344 | local_irq_restore(flags); |
1345 | return ret; | |
1346 | } | |
1347 | ||
1348 | static int zcache_client_destroy_pool(int cli_id, int pool_id) | |
1349 | { | |
1350 | struct tmem_pool *pool = NULL; | |
1351 | struct zcache_client *cli = NULL; | |
1352 | int ret = -1; | |
1353 | ||
1354 | if (pool_id < 0) | |
1355 | goto out; | |
1356 | if (cli_id == LOCAL_CLIENT) | |
1357 | cli = &zcache_host; | |
1358 | else if ((unsigned int)cli_id < MAX_CLIENTS) | |
1359 | cli = &zcache_clients[cli_id]; | |
1360 | if (cli == NULL) | |
1361 | goto out; | |
1362 | atomic_inc(&cli->refcount); | |
1363 | pool = cli->tmem_pools[pool_id]; | |
1364 | if (pool == NULL) | |
1365 | goto out; | |
1366 | cli->tmem_pools[pool_id] = NULL; | |
1367 | /* wait for pool activity on other cpus to quiesce */ | |
1368 | while (atomic_read(&pool->refcount) != 0) | |
1369 | ; | |
1370 | atomic_dec(&cli->refcount); | |
1371 | local_bh_disable(); | |
1372 | ret = tmem_destroy_pool(pool); | |
1373 | local_bh_enable(); | |
1374 | kfree(pool); | |
1375 | if (cli_id == LOCAL_CLIENT) | |
1376 | pr_info("%s: destroyed local pool id=%d\n", namestr, pool_id); | |
1377 | else | |
1378 | pr_info("%s: destroyed pool id=%d, client=%d\n", | |
1379 | namestr, pool_id, cli_id); | |
1380 | out: | |
1381 | return ret; | |
1382 | } | |
1383 | ||
1384 | int zcache_new_pool(uint16_t cli_id, uint32_t flags) | |
1385 | { | |
1386 | int poolid = -1; | |
1387 | struct tmem_pool *pool; | |
1388 | struct zcache_client *cli = NULL; | |
1389 | ||
1390 | if (cli_id == LOCAL_CLIENT) | |
1391 | cli = &zcache_host; | |
1392 | else if ((unsigned int)cli_id < MAX_CLIENTS) | |
1393 | cli = &zcache_clients[cli_id]; | |
1394 | if (cli == NULL) | |
1395 | goto out; | |
1396 | atomic_inc(&cli->refcount); | |
1397 | pool = kmalloc(sizeof(struct tmem_pool), GFP_ATOMIC); | |
78110bb8 | 1398 | if (pool == NULL) |
faca2ef7 | 1399 | goto out; |
faca2ef7 DM |
1400 | |
1401 | for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++) | |
1402 | if (cli->tmem_pools[poolid] == NULL) | |
1403 | break; | |
1404 | if (poolid >= MAX_POOLS_PER_CLIENT) { | |
1405 | pr_info("%s: pool creation failed: max exceeded\n", namestr); | |
1406 | kfree(pool); | |
1407 | poolid = -1; | |
1408 | goto out; | |
1409 | } | |
1410 | atomic_set(&pool->refcount, 0); | |
1411 | pool->client = cli; | |
1412 | pool->pool_id = poolid; | |
1413 | tmem_new_pool(pool, flags); | |
1414 | cli->tmem_pools[poolid] = pool; | |
1415 | if (cli_id == LOCAL_CLIENT) | |
1416 | pr_info("%s: created %s local tmem pool, id=%d\n", namestr, | |
1417 | flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", | |
1418 | poolid); | |
1419 | else | |
1420 | pr_info("%s: created %s tmem pool, id=%d, client=%d\n", namestr, | |
1421 | flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", | |
1422 | poolid, cli_id); | |
1423 | out: | |
1424 | if (cli != NULL) | |
1425 | atomic_dec(&cli->refcount); | |
1426 | return poolid; | |
1427 | } | |
1428 | ||
1429 | static int zcache_local_new_pool(uint32_t flags) | |
1430 | { | |
1431 | return zcache_new_pool(LOCAL_CLIENT, flags); | |
1432 | } | |
1433 | ||
f0290de2 | 1434 | int zcache_autocreate_pool(unsigned int cli_id, unsigned int pool_id, bool eph) |
faca2ef7 DM |
1435 | { |
1436 | struct tmem_pool *pool; | |
6fd3d05a | 1437 | struct zcache_client *cli = NULL; |
faca2ef7 DM |
1438 | uint32_t flags = eph ? 0 : TMEM_POOL_PERSIST; |
1439 | int ret = -1; | |
1440 | ||
1441 | BUG_ON(!ramster_enabled); | |
1442 | if (cli_id == LOCAL_CLIENT) | |
1443 | goto out; | |
1444 | if (pool_id >= MAX_POOLS_PER_CLIENT) | |
1445 | goto out; | |
f0290de2 DC |
1446 | if (cli_id >= MAX_CLIENTS) |
1447 | goto out; | |
1448 | ||
1449 | cli = &zcache_clients[cli_id]; | |
faca2ef7 DM |
1450 | if ((eph && disable_cleancache) || (!eph && disable_frontswap)) { |
1451 | pr_err("zcache_autocreate_pool: pool type disabled\n"); | |
1452 | goto out; | |
1453 | } | |
1454 | if (!cli->allocated) { | |
1455 | if (zcache_new_client(cli_id)) { | |
1456 | pr_err("zcache_autocreate_pool: can't create client\n"); | |
1457 | goto out; | |
1458 | } | |
1459 | cli = &zcache_clients[cli_id]; | |
1460 | } | |
1461 | atomic_inc(&cli->refcount); | |
1462 | pool = cli->tmem_pools[pool_id]; | |
1463 | if (pool != NULL) { | |
1464 | if (pool->persistent && eph) { | |
1465 | pr_err("zcache_autocreate_pool: type mismatch\n"); | |
1466 | goto out; | |
1467 | } | |
1468 | ret = 0; | |
1469 | goto out; | |
1470 | } | |
1471 | pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL); | |
78110bb8 | 1472 | if (pool == NULL) |
faca2ef7 | 1473 | goto out; |
78110bb8 | 1474 | |
faca2ef7 DM |
1475 | atomic_set(&pool->refcount, 0); |
1476 | pool->client = cli; | |
1477 | pool->pool_id = pool_id; | |
1478 | tmem_new_pool(pool, flags); | |
1479 | cli->tmem_pools[pool_id] = pool; | |
1480 | pr_info("%s: AUTOcreated %s tmem poolid=%d, for remote client=%d\n", | |
1481 | namestr, flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", | |
1482 | pool_id, cli_id); | |
1483 | ret = 0; | |
1484 | out: | |
1485 | if (cli != NULL) | |
1486 | atomic_dec(&cli->refcount); | |
1487 | return ret; | |
1488 | } | |
1489 | ||
1490 | /********** | |
1491 | * Two kernel functionalities currently can be layered on top of tmem. | |
1492 | * These are "cleancache" which is used as a second-chance cache for clean | |
1493 | * page cache pages; and "frontswap" which is used for swap pages | |
1494 | * to avoid writes to disk. A generic "shim" is provided here for each | |
1495 | * to translate in-kernel semantics to zcache semantics. | |
1496 | */ | |
1497 | ||
1498 | static void zcache_cleancache_put_page(int pool_id, | |
1499 | struct cleancache_filekey key, | |
1500 | pgoff_t index, struct page *page) | |
1501 | { | |
1502 | u32 ind = (u32) index; | |
1503 | struct tmem_oid oid = *(struct tmem_oid *)&key; | |
1504 | ||
1505 | if (!disable_cleancache_ignore_nonactive && !PageWasActive(page)) { | |
86d7de66 | 1506 | inc_zcache_eph_nonactive_puts_ignored(); |
faca2ef7 DM |
1507 | return; |
1508 | } | |
1509 | if (likely(ind == index)) | |
1510 | (void)zcache_put_page(LOCAL_CLIENT, pool_id, &oid, index, | |
1511 | page, PAGE_SIZE, false, 1); | |
1512 | } | |
1513 | ||
1514 | static int zcache_cleancache_get_page(int pool_id, | |
1515 | struct cleancache_filekey key, | |
1516 | pgoff_t index, struct page *page) | |
1517 | { | |
1518 | u32 ind = (u32) index; | |
1519 | struct tmem_oid oid = *(struct tmem_oid *)&key; | |
1520 | size_t size; | |
1521 | int ret = -1; | |
1522 | ||
1523 | if (likely(ind == index)) { | |
1524 | ret = zcache_get_page(LOCAL_CLIENT, pool_id, &oid, index, | |
1525 | page, &size, false, 0); | |
1526 | BUG_ON(ret >= 0 && size != PAGE_SIZE); | |
1527 | if (ret == 0) | |
1528 | SetPageWasActive(page); | |
1529 | } | |
1530 | return ret; | |
1531 | } | |
1532 | ||
1533 | static void zcache_cleancache_flush_page(int pool_id, | |
1534 | struct cleancache_filekey key, | |
1535 | pgoff_t index) | |
1536 | { | |
1537 | u32 ind = (u32) index; | |
1538 | struct tmem_oid oid = *(struct tmem_oid *)&key; | |
1539 | ||
1540 | if (likely(ind == index)) | |
1541 | (void)zcache_flush_page(LOCAL_CLIENT, pool_id, &oid, ind); | |
1542 | } | |
1543 | ||
1544 | static void zcache_cleancache_flush_inode(int pool_id, | |
1545 | struct cleancache_filekey key) | |
1546 | { | |
1547 | struct tmem_oid oid = *(struct tmem_oid *)&key; | |
1548 | ||
1549 | (void)zcache_flush_object(LOCAL_CLIENT, pool_id, &oid); | |
1550 | } | |
1551 | ||
1552 | static void zcache_cleancache_flush_fs(int pool_id) | |
1553 | { | |
1554 | if (pool_id >= 0) | |
1555 | (void)zcache_client_destroy_pool(LOCAL_CLIENT, pool_id); | |
1556 | } | |
1557 | ||
1558 | static int zcache_cleancache_init_fs(size_t pagesize) | |
1559 | { | |
1560 | BUG_ON(sizeof(struct cleancache_filekey) != | |
1561 | sizeof(struct tmem_oid)); | |
1562 | BUG_ON(pagesize != PAGE_SIZE); | |
1563 | return zcache_local_new_pool(0); | |
1564 | } | |
1565 | ||
1566 | static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize) | |
1567 | { | |
1568 | /* shared pools are unsupported and map to private */ | |
1569 | BUG_ON(sizeof(struct cleancache_filekey) != | |
1570 | sizeof(struct tmem_oid)); | |
1571 | BUG_ON(pagesize != PAGE_SIZE); | |
1572 | return zcache_local_new_pool(0); | |
1573 | } | |
1574 | ||
1575 | static struct cleancache_ops zcache_cleancache_ops = { | |
1576 | .put_page = zcache_cleancache_put_page, | |
1577 | .get_page = zcache_cleancache_get_page, | |
1578 | .invalidate_page = zcache_cleancache_flush_page, | |
1579 | .invalidate_inode = zcache_cleancache_flush_inode, | |
1580 | .invalidate_fs = zcache_cleancache_flush_fs, | |
1581 | .init_shared_fs = zcache_cleancache_init_shared_fs, | |
1582 | .init_fs = zcache_cleancache_init_fs | |
1583 | }; | |
1584 | ||
833f8662 | 1585 | struct cleancache_ops *zcache_cleancache_register_ops(void) |
faca2ef7 | 1586 | { |
833f8662 | 1587 | struct cleancache_ops *old_ops = |
faca2ef7 DM |
1588 | cleancache_register_ops(&zcache_cleancache_ops); |
1589 | ||
1590 | return old_ops; | |
1591 | } | |
1592 | ||
1593 | /* a single tmem poolid is used for all frontswap "types" (swapfiles) */ | |
1594 | static int zcache_frontswap_poolid __read_mostly = -1; | |
1595 | ||
/*
 * Swizzling increases objects per swaptype, increasing tmem concurrency
 * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS
 * Setting SWIZ_BITS to 27 basically reconstructs the swap entry from
 * frontswap_get_page(), but has side-effects. Hence using 8.
 *
 * _oswiz() packs the swap type above the low SWIZ_BITS bits of the index;
 * iswiz() yields the remaining high bits of the index.  Arguments are
 * parenthesized so that expressions such as "a | b" expand correctly.
 */
#define SWIZ_BITS		8
#define SWIZ_MASK		((1 << SWIZ_BITS) - 1)
#define _oswiz(_type, _ind)	(((_type) << SWIZ_BITS) | ((_ind) & SWIZ_MASK))
#define iswiz(_ind)		((_ind) >> SWIZ_BITS)
1606 | ||
1607 | static inline struct tmem_oid oswiz(unsigned type, u32 ind) | |
1608 | { | |
1609 | struct tmem_oid oid = { .oid = { 0 } }; | |
1610 | oid.oid[0] = _oswiz(type, ind); | |
1611 | return oid; | |
1612 | } | |
1613 | ||
76426daf | 1614 | #ifdef CONFIG_ZCACHE_WRITEBACK |
faca2ef7 DM |
1615 | static void unswiz(struct tmem_oid oid, u32 index, |
1616 | unsigned *type, pgoff_t *offset) | |
1617 | { | |
1618 | *type = (unsigned)(oid.oid[0] >> SWIZ_BITS); | |
1619 | *offset = (pgoff_t)((index << SWIZ_BITS) | | |
1620 | (oid.oid[0] & SWIZ_MASK)); | |
1621 | } | |
7892e560 | 1622 | #endif |
faca2ef7 DM |
1623 | |
1624 | static int zcache_frontswap_put_page(unsigned type, pgoff_t offset, | |
1625 | struct page *page) | |
1626 | { | |
1627 | u64 ind64 = (u64)offset; | |
1628 | u32 ind = (u32)offset; | |
1629 | struct tmem_oid oid = oswiz(type, ind); | |
1630 | int ret = -1; | |
1631 | unsigned long flags; | |
faca2ef7 DM |
1632 | |
1633 | BUG_ON(!PageLocked(page)); | |
1634 | if (!disable_frontswap_ignore_nonactive && !PageWasActive(page)) { | |
86d7de66 | 1635 | inc_zcache_pers_nonactive_puts_ignored(); |
faca2ef7 DM |
1636 | ret = -ERANGE; |
1637 | goto out; | |
1638 | } | |
1639 | if (likely(ind64 == ind)) { | |
1640 | local_irq_save(flags); | |
1641 | ret = zcache_put_page(LOCAL_CLIENT, zcache_frontswap_poolid, | |
1642 | &oid, iswiz(ind), | |
1643 | page, PAGE_SIZE, false, 0); | |
1644 | local_irq_restore(flags); | |
1645 | } | |
1646 | out: | |
1647 | return ret; | |
1648 | } | |
1649 | ||
1650 | /* returns 0 if the page was successfully gotten from frontswap, -1 if | |
1651 | * was not present (should never happen!) */ | |
1652 | static int zcache_frontswap_get_page(unsigned type, pgoff_t offset, | |
1653 | struct page *page) | |
1654 | { | |
1655 | u64 ind64 = (u64)offset; | |
1656 | u32 ind = (u32)offset; | |
1657 | struct tmem_oid oid = oswiz(type, ind); | |
1658 | size_t size; | |
1659 | int ret = -1, get_and_free; | |
1660 | ||
1661 | if (frontswap_has_exclusive_gets) | |
1662 | get_and_free = 1; | |
1663 | else | |
1664 | get_and_free = -1; | |
1665 | BUG_ON(!PageLocked(page)); | |
1666 | if (likely(ind64 == ind)) { | |
1667 | ret = zcache_get_page(LOCAL_CLIENT, zcache_frontswap_poolid, | |
1668 | &oid, iswiz(ind), | |
1669 | page, &size, false, get_and_free); | |
1670 | BUG_ON(ret >= 0 && size != PAGE_SIZE); | |
1671 | } | |
1672 | return ret; | |
1673 | } | |
1674 | ||
1675 | /* flush a single page from frontswap */ | |
1676 | static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset) | |
1677 | { | |
1678 | u64 ind64 = (u64)offset; | |
1679 | u32 ind = (u32)offset; | |
1680 | struct tmem_oid oid = oswiz(type, ind); | |
1681 | ||
1682 | if (likely(ind64 == ind)) | |
1683 | (void)zcache_flush_page(LOCAL_CLIENT, zcache_frontswap_poolid, | |
1684 | &oid, iswiz(ind)); | |
1685 | } | |
1686 | ||
1687 | /* flush all pages from the passed swaptype */ | |
1688 | static void zcache_frontswap_flush_area(unsigned type) | |
1689 | { | |
1690 | struct tmem_oid oid; | |
1691 | int ind; | |
1692 | ||
1693 | for (ind = SWIZ_MASK; ind >= 0; ind--) { | |
1694 | oid = oswiz(type, ind); | |
1695 | (void)zcache_flush_object(LOCAL_CLIENT, | |
1696 | zcache_frontswap_poolid, &oid); | |
1697 | } | |
1698 | } | |
1699 | ||
1700 | static void zcache_frontswap_init(unsigned ignored) | |
1701 | { | |
1702 | /* a single tmem poolid is used for all frontswap "types" (swapfiles) */ | |
1703 | if (zcache_frontswap_poolid < 0) | |
1704 | zcache_frontswap_poolid = | |
1705 | zcache_local_new_pool(TMEM_POOL_PERSIST); | |
1706 | } | |
1707 | ||
1708 | static struct frontswap_ops zcache_frontswap_ops = { | |
1709 | .store = zcache_frontswap_put_page, | |
1710 | .load = zcache_frontswap_get_page, | |
1711 | .invalidate_page = zcache_frontswap_flush_page, | |
1712 | .invalidate_area = zcache_frontswap_flush_area, | |
1713 | .init = zcache_frontswap_init | |
1714 | }; | |
1715 | ||
1e01c968 | 1716 | struct frontswap_ops *zcache_frontswap_register_ops(void) |
faca2ef7 | 1717 | { |
1e01c968 | 1718 | struct frontswap_ops *old_ops = |
faca2ef7 DM |
1719 | frontswap_register_ops(&zcache_frontswap_ops); |
1720 | ||
1721 | return old_ops; | |
1722 | } | |
1723 | ||
1724 | /* | |
1725 | * zcache initialization | |
1726 | * NOTE FOR NOW zcache or ramster MUST BE PROVIDED AS A KERNEL BOOT PARAMETER | |
1727 | * OR NOTHING HAPPENS! | |
1728 | */ | |
1729 | ||
835f2f51 | 1730 | #ifndef CONFIG_ZCACHE_MODULE |
faca2ef7 DM |
1731 | static int __init enable_zcache(char *s) |
1732 | { | |
7937d74a | 1733 | zcache_enabled = true; |
faca2ef7 DM |
1734 | return 1; |
1735 | } | |
1736 | __setup("zcache", enable_zcache); | |
1737 | ||
1738 | static int __init enable_ramster(char *s) | |
1739 | { | |
7937d74a | 1740 | zcache_enabled = true; |
faca2ef7 | 1741 | #ifdef CONFIG_RAMSTER |
7937d74a | 1742 | ramster_enabled = true; |
faca2ef7 DM |
1743 | #endif |
1744 | return 1; | |
1745 | } | |
1746 | __setup("ramster", enable_ramster); | |
1747 | ||
1748 | /* allow independent dynamic disabling of cleancache and frontswap */ | |
1749 | ||
1750 | static int __init no_cleancache(char *s) | |
1751 | { | |
7937d74a | 1752 | disable_cleancache = true; |
faca2ef7 DM |
1753 | return 1; |
1754 | } | |
1755 | ||
1756 | __setup("nocleancache", no_cleancache); | |
1757 | ||
1758 | static int __init no_frontswap(char *s) | |
1759 | { | |
7937d74a | 1760 | disable_frontswap = true; |
faca2ef7 DM |
1761 | return 1; |
1762 | } | |
1763 | ||
1764 | __setup("nofrontswap", no_frontswap); | |
1765 | ||
1766 | static int __init no_frontswap_exclusive_gets(char *s) | |
1767 | { | |
1768 | frontswap_has_exclusive_gets = false; | |
1769 | return 1; | |
1770 | } | |
1771 | ||
1772 | __setup("nofrontswapexclusivegets", no_frontswap_exclusive_gets); | |
1773 | ||
1774 | static int __init no_frontswap_ignore_nonactive(char *s) | |
1775 | { | |
7937d74a | 1776 | disable_frontswap_ignore_nonactive = true; |
faca2ef7 DM |
1777 | return 1; |
1778 | } | |
1779 | ||
1780 | __setup("nofrontswapignorenonactive", no_frontswap_ignore_nonactive); | |
1781 | ||
1782 | static int __init no_cleancache_ignore_nonactive(char *s) | |
1783 | { | |
7937d74a | 1784 | disable_cleancache_ignore_nonactive = true; |
faca2ef7 DM |
1785 | return 1; |
1786 | } | |
1787 | ||
1788 | __setup("nocleancacheignorenonactive", no_cleancache_ignore_nonactive); | |
1789 | ||
1790 | static int __init enable_zcache_compressor(char *s) | |
1791 | { | |
aeac64aa | 1792 | strlcpy(zcache_comp_name, s, sizeof(zcache_comp_name)); |
7937d74a | 1793 | zcache_enabled = true; |
faca2ef7 DM |
1794 | return 1; |
1795 | } | |
1796 | __setup("zcache=", enable_zcache_compressor); | |
835f2f51 | 1797 | #endif |
faca2ef7 DM |
1798 | |
1799 | ||
835f2f51 | 1800 | static int zcache_comp_init(void) |
faca2ef7 DM |
1801 | { |
1802 | int ret = 0; | |
1803 | ||
1804 | /* check crypto algorithm */ | |
835f2f51 DM |
1805 | #ifdef CONFIG_ZCACHE_MODULE |
1806 | ret = crypto_has_comp(zcache_comp_name, 0, 0); | |
1807 | if (!ret) { | |
1808 | ret = -1; | |
1809 | goto out; | |
1810 | } | |
1811 | #else | |
faca2ef7 DM |
1812 | if (*zcache_comp_name != '\0') { |
1813 | ret = crypto_has_comp(zcache_comp_name, 0, 0); | |
02073798 | 1814 | if (!ret) { |
faca2ef7 DM |
1815 | pr_info("zcache: %s not supported\n", |
1816 | zcache_comp_name); | |
02073798 PS |
1817 | ret = 1; |
1818 | goto out; | |
1819 | } | |
faca2ef7 DM |
1820 | } |
1821 | if (!ret) | |
1822 | strcpy(zcache_comp_name, "lzo"); | |
1823 | ret = crypto_has_comp(zcache_comp_name, 0, 0); | |
1824 | if (!ret) { | |
1825 | ret = 1; | |
1826 | goto out; | |
1827 | } | |
835f2f51 | 1828 | #endif |
faca2ef7 DM |
1829 | pr_info("zcache: using %s compressor\n", zcache_comp_name); |
1830 | ||
1831 | /* alloc percpu transforms */ | |
1832 | ret = 0; | |
1833 | zcache_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *); | |
1834 | if (!zcache_comp_pcpu_tfms) | |
1835 | ret = 1; | |
1836 | out: | |
1837 | return ret; | |
1838 | } | |
1839 | ||
835f2f51 | 1840 | static int zcache_init(void) |
faca2ef7 DM |
1841 | { |
1842 | int ret = 0; | |
1843 | ||
835f2f51 DM |
1844 | #ifdef CONFIG_ZCACHE_MODULE |
1845 | zcache_enabled = 1; | |
1846 | #endif | |
faca2ef7 DM |
1847 | if (ramster_enabled) { |
1848 | namestr = "ramster"; | |
1849 | ramster_register_pamops(&zcache_pamops); | |
1850 | } | |
faca2ef7 | 1851 | zcache_debugfs_init(); |
faca2ef7 DM |
1852 | if (zcache_enabled) { |
1853 | unsigned int cpu; | |
1854 | ||
1855 | tmem_register_hostops(&zcache_hostops); | |
1856 | tmem_register_pamops(&zcache_pamops); | |
1857 | ret = register_cpu_notifier(&zcache_cpu_notifier_block); | |
1858 | if (ret) { | |
1859 | pr_err("%s: can't register cpu notifier\n", namestr); | |
1860 | goto out; | |
1861 | } | |
1862 | ret = zcache_comp_init(); | |
1863 | if (ret) { | |
1864 | pr_err("%s: compressor initialization failed\n", | |
1865 | namestr); | |
1866 | goto out; | |
1867 | } | |
1868 | for_each_online_cpu(cpu) { | |
1869 | void *pcpu = (void *)(long)cpu; | |
1870 | zcache_cpu_notifier(&zcache_cpu_notifier_block, | |
1871 | CPU_UP_PREPARE, pcpu); | |
1872 | } | |
1873 | } | |
1874 | zcache_objnode_cache = kmem_cache_create("zcache_objnode", | |
1875 | sizeof(struct tmem_objnode), 0, 0, NULL); | |
1876 | zcache_obj_cache = kmem_cache_create("zcache_obj", | |
1877 | sizeof(struct tmem_obj), 0, 0, NULL); | |
1878 | ret = zcache_new_client(LOCAL_CLIENT); | |
1879 | if (ret) { | |
1880 | pr_err("%s: can't create client\n", namestr); | |
1881 | goto out; | |
1882 | } | |
1883 | zbud_init(); | |
1884 | if (zcache_enabled && !disable_cleancache) { | |
833f8662 | 1885 | struct cleancache_ops *old_ops; |
faca2ef7 DM |
1886 | |
1887 | register_shrinker(&zcache_shrinker); | |
1888 | old_ops = zcache_cleancache_register_ops(); | |
1889 | pr_info("%s: cleancache enabled using kernel transcendent " | |
1890 | "memory and compression buddies\n", namestr); | |
67e2cba4 | 1891 | #ifdef CONFIG_ZCACHE_DEBUG |
faca2ef7 DM |
1892 | pr_info("%s: cleancache: ignorenonactive = %d\n", |
1893 | namestr, !disable_cleancache_ignore_nonactive); | |
1894 | #endif | |
833f8662 | 1895 | if (old_ops != NULL) |
faca2ef7 DM |
1896 | pr_warn("%s: cleancache_ops overridden\n", namestr); |
1897 | } | |
1898 | if (zcache_enabled && !disable_frontswap) { | |
1e01c968 | 1899 | struct frontswap_ops *old_ops; |
faca2ef7 DM |
1900 | |
1901 | old_ops = zcache_frontswap_register_ops(); | |
1902 | if (frontswap_has_exclusive_gets) | |
1903 | frontswap_tmem_exclusive_gets(true); | |
1904 | pr_info("%s: frontswap enabled using kernel transcendent " | |
1905 | "memory and compression buddies\n", namestr); | |
67e2cba4 | 1906 | #ifdef CONFIG_ZCACHE_DEBUG |
faca2ef7 DM |
1907 | pr_info("%s: frontswap: excl gets = %d active only = %d\n", |
1908 | namestr, frontswap_has_exclusive_gets, | |
1909 | !disable_frontswap_ignore_nonactive); | |
1910 | #endif | |
f42158fe KRW |
1911 | if (IS_ERR(old_ops) || old_ops) { |
1912 | if (IS_ERR(old_ops)) | |
1913 | return PTR_RET(old_ops); | |
faca2ef7 | 1914 | pr_warn("%s: frontswap_ops overridden\n", namestr); |
f42158fe | 1915 | } |
faca2ef7 DM |
1916 | } |
1917 | if (ramster_enabled) | |
1918 | ramster_init(!disable_cleancache, !disable_frontswap, | |
835f2f51 DM |
1919 | frontswap_has_exclusive_gets, |
1920 | !disable_frontswap_selfshrink); | |
faca2ef7 DM |
1921 | out: |
1922 | return ret; | |
1923 | } | |
1924 | ||
835f2f51 DM |
#ifdef CONFIG_ZCACHE_MODULE
/*
 * Module build: expose the configuration switches as module parameters
 * (all world-readable, none writable) and run zcache_init() at load time.
 */
#ifdef CONFIG_RAMSTER
module_param(ramster_enabled, bool, 0444);
module_param(disable_frontswap_selfshrink, int, 0444);
#endif
module_param(disable_cleancache, bool, 0444);
module_param(disable_frontswap, bool, 0444);
#ifdef FRONTSWAP_HAS_EXCLUSIVE_GETS
module_param(frontswap_has_exclusive_gets, bool, 0444);
#endif
module_param(disable_frontswap_ignore_nonactive, bool, 0444);
module_param(zcache_comp_name, charp, 0444);
module_init(zcache_init);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Dan Magenheimer <dan.magenheimer@oracle.com>");
MODULE_DESCRIPTION("In-kernel compression of cleancache/frontswap pages");
#else
/* Built-in: zcache_init() runs as a late initcall. */
late_initcall(zcache_init);
#endif