zcache/zbud: Fix __init mismatch
drivers/staging/zcache/zbud.c
1/*
2 * zbud.c - Compression buddies allocator
3 *
4 * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
5 *
6 * Compression buddies ("zbud") provides for efficiently packing two
7 * (or, possibly in the future, more) compressed pages ("zpages") into
8 * a single "raw" pageframe and for tracking both zpages and pageframes
9 * so that whole pageframes can be easily reclaimed in LRU-like order.
10 * It is designed to be used in conjunction with transcendent memory
11 * ("tmem"); for example separate LRU lists are maintained for persistent
12 * vs. ephemeral pages.
13 *
14 * A zbudpage is an overlay for a struct page and thus each zbudpage
15 * refers to a physical pageframe of RAM. When the caller passes a
16 * struct page from the kernel's page allocator, zbud "transforms" it
17 * to a zbudpage which sets/uses a different set of fields than the
18 * struct-page and thus must "untransform" it back by reinitializing
19 * certain fields before the struct-page can be freed. The fields
20 * of a zbudpage include a page lock for controlling access to the
21 * corresponding pageframe, and there is a size field for each zpage.
22 * Each zbudpage also lives on two linked lists: a "budlist" which is
23 * used to support efficient buddying of zpages; and an "lru" which
24 * is used for reclaiming pageframes in approximately least-recently-used
25 * order.
26 *
27 * A zbudpageframe is a pageframe divided up into aligned 64-byte "chunks"
28 * which contain the compressed data for zero, one, or two zbuds. Contained
29 * within the compressed data is a tmem_handle, which is a key that allows
30 * the same data to be found via the tmem interface so the zpage can
31 * be invalidated (for ephemeral pages) or repatriated to the swap cache
32 * (for persistent pages). The contents of a zbudpageframe must never
33 * be accessed without holding the page lock for the corresponding
34 * zbudpage and, to accommodate highmem machines, the contents may
35 * only be examined or changed when kmapped. Thus, when in use, a
36 * kmapped zbudpageframe is referred to in the zbud code as "void *zbpg".
37 *
38 * Note that the term "zbud" refers to the combination of a zpage and
39 * a tmem_handle that is stored as one of possibly two "buddied" zpages;
40 * it also generically refers to this allocator... sorry for any confusion.
41 *
42 * A zbudref is a pointer to a struct zbudpage (which can be cast to a
43 * struct page), with the LSB either cleared or set to indicate, respectively,
44 * the first or second zpage in the zbudpageframe. Since a zbudref can be
45 * cast to a pointer, it is used as the tmem "pampd" pointer and uniquely
46 * references a stored tmem page and so is the only zbud data structure
47 * externally visible to zbud.c/zbud.h.
48 *
49 * Since we wish to reclaim entire pageframes but zpages may be randomly
50 * added to and deleted from any given pageframe, we approximate LRU by
51 * promoting a pageframe to MRU when a zpage is added to it, but
52 * leaving it at the current place in the list when a zpage is deleted
53 * from it. As a side effect, zpages that are difficult to buddy (e.g.
54 * very large pages) will be reclaimed faster than average, which seems
55 * reasonable.
56 *
57 * In the current implementation, no more than two zpages may be stored in
58 * any pageframe and no zpage ever crosses a pageframe boundary. While
59 * other zpage allocation mechanisms may allow greater density, this two
60 * zpage-per-pageframe limit both ensures simple reclaim of pageframes
61 * (including garbage collection of references to the contents of those
62 * pageframes from tmem data structures) AND avoids the need for compaction.
63 * With additional complexity, zbud could be modified to support storing
64 * up to three zpages per pageframe or, to handle larger average zpages,
65 * up to three zpages per pair of pageframes, but it is not clear if the
66 * additional complexity would be worth it. So consider it an exercise
67 * for future developers.
68 *
69 * Note also that zbud does no page allocation or freeing. This is so
70 * that the caller has complete control over, and (for accounting purposes)
71 * full visibility into, if and when pages are allocated and freed.
72 *
73 * Finally, note that zbud limits the size of zpages it can store; the
74 * caller must check the zpage size with zbud_max_buddy_size before
75 * storing it, else BUGs will result. User beware.
76 */
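
/*
 * Example (illustrative only, not part of this file): how a caller such
 * as zcache might drive the "put" path described above.  Only the zbud_*
 * calls and alloc_page() are real; my_th, my_cdata and my_clen are
 * hypothetical placeholders for the caller's tmem handle, compressed
 * buffer and compressed length.
 *
 *	struct zbudref *zref;
 *	struct page *newpage;
 *
 *	if (my_clen > zbud_max_buddy_size())
 *		return -EINVAL;
 *	zref = zbud_match_prep(&my_th, true, my_cdata, my_clen);
 *	if (zref == NULL) {
 *		newpage = alloc_page(GFP_ATOMIC);
 *		if (newpage == NULL)
 *			return -ENOMEM;
 *		zref = zbud_create_prep(&my_th, true, my_cdata, my_clen,
 *					newpage);
 *	}
 *	if (zref != NULL)
 *		zbud_create_finish(zref, true);
 *
 * zbud_match_prep() tries to buddy with a half-full pageframe first; only
 * if that fails does the caller allocate a fresh pageframe for
 * zbud_create_prep(), and zbud_create_finish() then drops the
 * "unevictable" hold taken by either prep call.
 */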
77
78#include <linux/module.h>
79#include <linux/highmem.h>
80#include <linux/list.h>
81#include <linux/spinlock.h>
82#include <linux/pagemap.h>
83#include <linux/atomic.h>
84#include <linux/bug.h>
85#include "tmem.h"
86#include "zcache.h"
87#include "zbud.h"
88
89/*
90 * We need to ensure that a struct zbudpage is never larger than a
91 * struct page. This is checked with a BUG_ON in zbud_init.
92 *
93 * The unevictable field indicates that a zbud is being added to the
94 * zbudpage. Since this is a two-phase process (due to tmem locking),
95 * this field locks the zbudpage against eviction when a zbud match
96 * or creation is in process. Since this addition process may occur
97 * in parallel for two zbuds in one zbudpage, the field is a counter
98 * that must not exceed two.
99 */
100struct zbudpage {
101 union {
102 struct page page;
103 struct {
104 unsigned long space_for_flags;
105 struct {
106 unsigned zbud0_size: PAGE_SHIFT;
107 unsigned zbud1_size: PAGE_SHIFT;
108 unsigned unevictable:2;
109 };
110 struct list_head budlist;
111 struct list_head lru;
112 };
113 };
114};
115#if (PAGE_SHIFT * 2) + 2 > BITS_PER_LONG
116#error "zbud won't work for this arch, PAGE_SIZE is too large"
117#endif
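
/*
 * For example, with 4K pages (PAGE_SHIFT == 12) the two size fields plus
 * the two-bit unevictable counter need 12 + 12 + 2 = 26 bits, which fits
 * on both 32-bit and 64-bit builds; a 64K page size (PAGE_SHIFT == 16)
 * would need 34 bits and so is only usable when BITS_PER_LONG is 64.
 */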
118
119struct zbudref {
120 union {
121 struct zbudpage *zbudpage;
122 unsigned long zbudref;
123 };
124};
125
126#define CHUNK_SHIFT 6
127#define CHUNK_SIZE (1 << CHUNK_SHIFT)
128#define CHUNK_MASK (~(CHUNK_SIZE-1))
129#define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
130#define MAX_CHUNK (NCHUNKS-1)
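
/*
 * For example, with a 4K PAGE_SIZE: CHUNK_SIZE is 64 bytes, NCHUNKS is
 * 64, MAX_CHUNK is 63, and zbud_max_size() below works out to
 * 63 * 64 = 4032 bytes, so a single zbud (tmem_handle included) can
 * occupy at most 4032 of the 4096 bytes in a pageframe.
 */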
131
132/*
133 * The following functions deal with the difference between struct
134 * page and struct zbudpage. Note the hack of using the pageflags
135 * from struct page; this is to avoid duplicating all the complex
136 * pageflag macros.
137 */
138static inline void zbudpage_spin_lock(struct zbudpage *zbudpage)
139{
140 struct page *page = (struct page *)zbudpage;
141
142 while (unlikely(test_and_set_bit_lock(PG_locked, &page->flags))) {
143 do {
144 cpu_relax();
145 } while (test_bit(PG_locked, &page->flags));
146 }
147}
148
149static inline void zbudpage_spin_unlock(struct zbudpage *zbudpage)
150{
151 struct page *page = (struct page *)zbudpage;
152
153 clear_bit(PG_locked, &page->flags);
154}
155
156static inline int zbudpage_spin_trylock(struct zbudpage *zbudpage)
157{
158 return trylock_page((struct page *)zbudpage);
159}
160
161static inline int zbudpage_is_locked(struct zbudpage *zbudpage)
162{
163 return PageLocked((struct page *)zbudpage);
164}
165
166static inline void *kmap_zbudpage_atomic(struct zbudpage *zbudpage)
167{
168 return kmap_atomic((struct page *)zbudpage);
169}
170
171/*
172 * A dying zbudpage is an ephemeral page in the process of being evicted.
173 * Any data contained in the zbudpage is invalid and we are just waiting for
174 * the tmem pampds to be invalidated before freeing the page
175 */
176static inline int zbudpage_is_dying(struct zbudpage *zbudpage)
177{
178 struct page *page = (struct page *)zbudpage;
179
180 return test_bit(PG_reclaim, &page->flags);
181}
182
183static inline void zbudpage_set_dying(struct zbudpage *zbudpage)
184{
185 struct page *page = (struct page *)zbudpage;
186
187 set_bit(PG_reclaim, &page->flags);
188}
189
190static inline void zbudpage_clear_dying(struct zbudpage *zbudpage)
191{
192 struct page *page = (struct page *)zbudpage;
193
194 clear_bit(PG_reclaim, &page->flags);
195}
196
197/*
198 * A zombie zbudpage is a persistent page in the process of being evicted.
199 * The data contained in the zbudpage is valid and we are just waiting for
200 * the tmem pampds to be invalidated before freeing the page
201 */
202static inline int zbudpage_is_zombie(struct zbudpage *zbudpage)
203{
204 struct page *page = (struct page *)zbudpage;
205
206 return test_bit(PG_dirty, &page->flags);
207}
208
209static inline void zbudpage_set_zombie(struct zbudpage *zbudpage)
210{
211 struct page *page = (struct page *)zbudpage;
212
213 set_bit(PG_dirty, &page->flags);
214}
215
216static inline void zbudpage_clear_zombie(struct zbudpage *zbudpage)
217{
218 struct page *page = (struct page *)zbudpage;
219
220 clear_bit(PG_dirty, &page->flags);
221}
222
223static inline void kunmap_zbudpage_atomic(void *zbpg)
224{
225 kunmap_atomic(zbpg);
226}
227
228/*
229 * zbud "translation" and helper functions
230 */
231
232static inline struct zbudpage *zbudref_to_zbudpage(struct zbudref *zref)
233{
234 unsigned long zbud = (unsigned long)zref;
235 zbud &= ~1UL;
236 return (struct zbudpage *)zbud;
237}
238
239static inline struct zbudref *zbudpage_to_zbudref(struct zbudpage *zbudpage,
240 unsigned budnum)
241{
242 unsigned long zbud = (unsigned long)zbudpage;
243 BUG_ON(budnum > 1);
244 zbud |= budnum;
245 return (struct zbudref *)zbud;
246}
247
248static inline int zbudref_budnum(struct zbudref *zbudref)
249{
250 unsigned long zbud = (unsigned long)zbudref;
251 return zbud & 1UL;
252}
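
/*
 * Example of the encoding: if a zbudpage lives at (say) 0xffff880012345000,
 * its second zpage (budnum 1) is referenced as 0xffff880012345001;
 * zbudref_budnum() recovers the 1 and zbudref_to_zbudpage() masks it off
 * to recover the zbudpage.  This works because a struct page is always at
 * least 2-byte aligned, so the LSB of a zbudpage pointer is free for use.
 */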
253
254static inline unsigned zbud_max_size(void)
255{
256 return MAX_CHUNK << CHUNK_SHIFT;
257}
258
259static inline unsigned zbud_size_to_chunks(unsigned size)
260{
261 BUG_ON(size == 0 || size > zbud_max_size());
262 return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
263}
264
265/* can only be used between kmap_zbudpage_atomic/kunmap_zbudpage_atomic! */
266static inline char *zbud_data(void *zbpg,
267 unsigned budnum, unsigned size)
268{
269 char *p;
270
271 BUG_ON(size == 0 || size > zbud_max_size());
272 p = (char *)zbpg;
273 if (budnum == 1)
274 p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
275 return p;
276}
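
/*
 * Resulting layout: zbud 0 always starts at offset 0 of the pageframe,
 * while zbud 1 is placed so that it ends exactly at PAGE_SIZE.  For
 * example, with 4K pages a 700-byte zbud 1 is rounded up to 704 bytes
 * (11 chunks) and therefore starts at offset 4096 - 704 = 3392.
 */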
277
278/*
279 * These are all informative and exposed through debugfs... except for
280 * the arrays... anyone know how to do that? To avoid confusion for
281 * debugfs viewers, some of these should also be atomic_long_t, but
282 * I don't know how to expose atomics via debugfs either...
283 */
284static ssize_t zbud_eph_pageframes;
285static ssize_t zbud_pers_pageframes;
286static ssize_t zbud_eph_zpages;
287static ssize_t zbud_pers_zpages;
288static u64 zbud_eph_zbytes;
289static u64 zbud_pers_zbytes;
290static ssize_t zbud_eph_evicted_pageframes;
291static ssize_t zbud_pers_evicted_pageframes;
292static ssize_t zbud_eph_cumul_zpages;
293static ssize_t zbud_pers_cumul_zpages;
294static u64 zbud_eph_cumul_zbytes;
295static u64 zbud_pers_cumul_zbytes;
296static ssize_t zbud_eph_cumul_chunk_counts[NCHUNKS];
297static ssize_t zbud_pers_cumul_chunk_counts[NCHUNKS];
298static ssize_t zbud_eph_buddied_count;
299static ssize_t zbud_pers_buddied_count;
300static ssize_t zbud_eph_unbuddied_count;
301static ssize_t zbud_pers_unbuddied_count;
302static ssize_t zbud_eph_zombie_count;
303static ssize_t zbud_pers_zombie_count;
304static atomic_t zbud_eph_zombie_atomic;
305static atomic_t zbud_pers_zombie_atomic;
306
307#ifdef CONFIG_DEBUG_FS
308#include <linux/debugfs.h>
309#define zdfs debugfs_create_size_t
310#define zdfs64 debugfs_create_u64
311static int zbud_debugfs_init(void)
312{
313 struct dentry *root = debugfs_create_dir("zbud", NULL);
314 if (root == NULL)
315 return -ENXIO;
316
317 /*
318 * would be nice to dump the sizes of the unbuddied
319 * arrays, like was done with sysfs, but it doesn't
320 * look like debugfs is flexible enough to do that
321 */
322 zdfs64("eph_zbytes", S_IRUGO, root, &zbud_eph_zbytes);
323 zdfs64("eph_cumul_zbytes", S_IRUGO, root, &zbud_eph_cumul_zbytes);
324 zdfs64("pers_zbytes", S_IRUGO, root, &zbud_pers_zbytes);
325 zdfs64("pers_cumul_zbytes", S_IRUGO, root, &zbud_pers_cumul_zbytes);
326 zdfs("eph_cumul_zpages", S_IRUGO, root, &zbud_eph_cumul_zpages);
327 zdfs("eph_evicted_pageframes", S_IRUGO, root,
328 &zbud_eph_evicted_pageframes);
329 zdfs("eph_zpages", S_IRUGO, root, &zbud_eph_zpages);
330 zdfs("eph_pageframes", S_IRUGO, root, &zbud_eph_pageframes);
331 zdfs("eph_buddied_count", S_IRUGO, root, &zbud_eph_buddied_count);
332 zdfs("eph_unbuddied_count", S_IRUGO, root, &zbud_eph_unbuddied_count);
333 zdfs("pers_cumul_zpages", S_IRUGO, root, &zbud_pers_cumul_zpages);
334 zdfs("pers_evicted_pageframes", S_IRUGO, root,
335 &zbud_pers_evicted_pageframes);
336 zdfs("pers_zpages", S_IRUGO, root, &zbud_pers_zpages);
337 zdfs("pers_pageframes", S_IRUGO, root, &zbud_pers_pageframes);
338 zdfs("pers_buddied_count", S_IRUGO, root, &zbud_pers_buddied_count);
339 zdfs("pers_unbuddied_count", S_IRUGO, root, &zbud_pers_unbuddied_count);
340 zdfs("pers_zombie_count", S_IRUGO, root, &zbud_pers_zombie_count);
341 return 0;
342}
343#undef zdfs
344#undef zdfs64
345#endif
346
347/* protects the buddied list and all unbuddied lists */
348static DEFINE_SPINLOCK(zbud_eph_lists_lock);
349static DEFINE_SPINLOCK(zbud_pers_lists_lock);
350
351struct zbud_unbuddied {
352 struct list_head list;
353 unsigned count;
354};
355
356/* list N contains pages with N chunks USED and NCHUNKS-N unused */
357/* element 0 is never used but optimizing that isn't worth it */
358static struct zbud_unbuddied zbud_eph_unbuddied[NCHUNKS];
359static struct zbud_unbuddied zbud_pers_unbuddied[NCHUNKS];
360static LIST_HEAD(zbud_eph_lru_list);
361static LIST_HEAD(zbud_pers_lru_list);
362static LIST_HEAD(zbud_eph_buddied_list);
363static LIST_HEAD(zbud_pers_buddied_list);
364static LIST_HEAD(zbud_eph_zombie_list);
365static LIST_HEAD(zbud_pers_zombie_list);
366
367/*
368 * Given a struct page, transform it to a zbudpage so that it can be
369 * used by zbud and initialize fields as necessary.
370 */
371static inline struct zbudpage *zbud_init_zbudpage(struct page *page, bool eph)
372{
373 struct zbudpage *zbudpage = (struct zbudpage *)page;
374
375 BUG_ON(page == NULL);
376 INIT_LIST_HEAD(&zbudpage->budlist);
377 INIT_LIST_HEAD(&zbudpage->lru);
378 zbudpage->zbud0_size = 0;
379 zbudpage->zbud1_size = 0;
380 zbudpage->unevictable = 0;
381 if (eph)
382 zbud_eph_pageframes++;
383 else
384 zbud_pers_pageframes++;
385 return zbudpage;
386}
387
388/* "Transform" a zbudpage back to a struct page suitable to free. */
389static inline struct page *zbud_unuse_zbudpage(struct zbudpage *zbudpage,
390 bool eph)
391{
392 struct page *page = (struct page *)zbudpage;
393
394 BUG_ON(!list_empty(&zbudpage->budlist));
395 BUG_ON(!list_empty(&zbudpage->lru));
396 BUG_ON(zbudpage->zbud0_size != 0);
397 BUG_ON(zbudpage->zbud1_size != 0);
398 BUG_ON(!PageLocked(page));
399 BUG_ON(zbudpage->unevictable != 0);
400 BUG_ON(zbudpage_is_dying(zbudpage));
401 BUG_ON(zbudpage_is_zombie(zbudpage));
402 if (eph)
403 zbud_eph_pageframes--;
404 else
405 zbud_pers_pageframes--;
406 zbudpage_spin_unlock(zbudpage);
407 reset_page_mapcount(page);
408 init_page_count(page);
409 page->index = 0;
410 return page;
411}
412
413/* Mark a zbud as unused and do accounting */
414static inline void zbud_unuse_zbud(struct zbudpage *zbudpage,
415 int budnum, bool eph)
416{
417 unsigned size;
418
419 BUG_ON(!zbudpage_is_locked(zbudpage));
420 if (budnum == 0) {
421 size = zbudpage->zbud0_size;
422 zbudpage->zbud0_size = 0;
423 } else {
424 size = zbudpage->zbud1_size;
425 zbudpage->zbud1_size = 0;
426 }
427 if (eph) {
428 zbud_eph_zbytes -= size;
429 zbud_eph_zpages--;
430 } else {
431 zbud_pers_zbytes -= size;
432 zbud_pers_zpages--;
433 }
434}
435
436/*
437 * Given a zbudpage/budnum/size, a tmem handle, and a kmapped pointer
438 * to some data, set up the zbud appropriately including data copying
439 * and accounting. Note that if cdata is NULL, the data copying is
440 * skipped. (This is useful for lazy writes such as for RAMster.)
441 */
442static void zbud_init_zbud(struct zbudpage *zbudpage, struct tmem_handle *th,
443 bool eph, void *cdata,
444 unsigned budnum, unsigned size)
445{
446 char *to;
447 void *zbpg;
448 struct tmem_handle *to_th;
449 unsigned nchunks = zbud_size_to_chunks(size);
450
451 BUG_ON(!zbudpage_is_locked(zbudpage));
452 zbpg = kmap_zbudpage_atomic(zbudpage);
453 to = zbud_data(zbpg, budnum, size);
454 to_th = (struct tmem_handle *)to;
455 to_th->index = th->index;
456 to_th->oid = th->oid;
457 to_th->pool_id = th->pool_id;
458 to_th->client_id = th->client_id;
459 to += sizeof(struct tmem_handle);
460 if (cdata != NULL)
461 memcpy(to, cdata, size - sizeof(struct tmem_handle));
462 kunmap_zbudpage_atomic(zbpg);
463 if (budnum == 0)
464 zbudpage->zbud0_size = size;
465 else
466 zbudpage->zbud1_size = size;
467 if (eph) {
468 zbud_eph_cumul_chunk_counts[nchunks]++;
469 zbud_eph_zpages++;
470 zbud_eph_cumul_zpages++;
471 zbud_eph_zbytes += size;
472 zbud_eph_cumul_zbytes += size;
473 } else {
474 zbud_pers_cumul_chunk_counts[nchunks]++;
475 zbud_pers_zpages++;
476 zbud_pers_cumul_zpages++;
477 zbud_pers_zbytes += size;
478 zbud_pers_cumul_zbytes += size;
479 }
480}
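
/*
 * The data laid down by zbud_init_zbud() for a zbud of "size" bytes is
 * thus, within one chunk-aligned region:
 *
 *	+--------------------------+------------------------------------+
 *	| struct tmem_handle       | compressed data                    |
 *	| (client/pool/oid/index)  | (size - sizeof(tmem_handle) bytes) |
 *	+--------------------------+------------------------------------+
 *
 * which is why callers must budget sizeof(struct tmem_handle) on top of
 * the compressed length (see zbud_max_buddy_size() below).
 */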
481
482/*
483 * Given a locked dying zbudpage, read out the tmem handles from the data,
484 * unlock the page, then use the handles to tell tmem to flush out its
485 * references
486 */
487static void zbud_evict_tmem(struct zbudpage *zbudpage)
488{
489 int i, j;
490 uint32_t pool_id[2], client_id[2];
491 uint32_t index[2];
492 struct tmem_oid oid[2];
493 struct tmem_pool *pool;
494 void *zbpg;
495 struct tmem_handle *th;
496 unsigned size;
497
498 /* read out the tmem handles from the data and set aside */
499 zbpg = kmap_zbudpage_atomic(zbudpage);
500 for (i = 0, j = 0; i < 2; i++) {
501 size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
502 if (size) {
503 th = (struct tmem_handle *)zbud_data(zbpg, i, size);
504 client_id[j] = th->client_id;
505 pool_id[j] = th->pool_id;
506 oid[j] = th->oid;
507 index[j] = th->index;
508 j++;
509 zbud_unuse_zbud(zbudpage, i, true);
510 }
511 }
512 kunmap_zbudpage_atomic(zbpg);
513 zbudpage_spin_unlock(zbudpage);
514 /* zbudpage is now an unlocked dying... tell tmem to flush pointers */
515 for (i = 0; i < j; i++) {
516 pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
517 if (pool != NULL) {
518 tmem_flush_page(pool, &oid[i], index[i]);
519 zcache_put_pool(pool);
520 }
521 }
522}
523
524/*
525 * Externally callable zbud handling routines.
526 */
527
528/*
529 * Return the maximum size compressed page that can be stored (secretly
530 * setting aside space for the tmem handle.
531 */
532unsigned int zbud_max_buddy_size(void)
533{
534 return zbud_max_size() - sizeof(struct tmem_handle);
535}
536
537/*
538 * Given a zbud reference, free the corresponding zbud from all lists,
539 * mark it as unused, do accounting, and if the freeing of the zbud
540 * frees up an entire pageframe, return it to the caller (else NULL).
541 */
542struct page *zbud_free_and_delist(struct zbudref *zref, bool eph,
543 unsigned int *zsize, unsigned int *zpages)
544{
545 unsigned long budnum = zbudref_budnum(zref);
546 struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
547 struct page *page = NULL;
548 unsigned chunks, bud_size, other_bud_size;
549 spinlock_t *lists_lock =
550 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
551 struct zbud_unbuddied *unbud =
552 eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;
553
554
555 spin_lock(lists_lock);
556 zbudpage_spin_lock(zbudpage);
557 if (zbudpage_is_dying(zbudpage)) {
558 /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
559 zbudpage_spin_unlock(zbudpage);
560 spin_unlock(lists_lock);
561 *zpages = 0;
562 *zsize = 0;
563 goto out;
564 }
565 if (budnum == 0) {
566 bud_size = zbudpage->zbud0_size;
567 other_bud_size = zbudpage->zbud1_size;
568 } else {
569 bud_size = zbudpage->zbud1_size;
570 other_bud_size = zbudpage->zbud0_size;
571 }
572 *zsize = bud_size - sizeof(struct tmem_handle);
573 *zpages = 1;
574 zbud_unuse_zbud(zbudpage, budnum, eph);
575 if (other_bud_size == 0) { /* was unbuddied: unlist and free */
576 chunks = zbud_size_to_chunks(bud_size);
577 if (zbudpage_is_zombie(zbudpage)) {
578 if (eph)
579 zbud_eph_zombie_count =
580 atomic_dec_return(&zbud_eph_zombie_atomic);
581 else
582 zbud_pers_zombie_count =
583 atomic_dec_return(&zbud_pers_zombie_atomic);
584 zbudpage_clear_zombie(zbudpage);
585 } else {
586 BUG_ON(list_empty(&unbud[chunks].list));
587 list_del_init(&zbudpage->budlist);
588 unbud[chunks].count--;
589 }
590 list_del_init(&zbudpage->lru);
591 spin_unlock(lists_lock);
592 if (eph)
593 zbud_eph_unbuddied_count--;
594 else
595 zbud_pers_unbuddied_count--;
596 page = zbud_unuse_zbudpage(zbudpage, eph);
597 } else { /* was buddied: move remaining buddy to unbuddied list */
598 chunks = zbud_size_to_chunks(other_bud_size);
599 if (!zbudpage_is_zombie(zbudpage)) {
600 list_del_init(&zbudpage->budlist);
601 list_add_tail(&zbudpage->budlist, &unbud[chunks].list);
602 unbud[chunks].count++;
603 }
604 if (eph) {
605 zbud_eph_buddied_count--;
606 zbud_eph_unbuddied_count++;
607 } else {
608 zbud_pers_unbuddied_count++;
609 zbud_pers_buddied_count--;
610 }
611 /* don't mess with lru, no need to move it */
612 zbudpage_spin_unlock(zbudpage);
613 spin_unlock(lists_lock);
614 }
615out:
616 return page;
617}
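
/*
 * Illustrative caller-side use (not from this file): since zbud never
 * frees pageframes itself, a caller that drops its last reference is
 * expected to do something like
 *
 *	unsigned int zsize, zpages;
 *	struct page *page;
 *
 *	page = zbud_free_and_delist(zref, true, &zsize, &zpages);
 *	if (page != NULL)
 *		__free_page(page);
 *
 * where a non-NULL return means the whole pageframe is now empty.
 */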
618
619/*
620 * Given a tmem handle, and a kmapped pointer to compressed data of
621 * the given size, try to find an unbuddied zbudpage in which to
622 * create a zbud. If found, put it there, mark the zbudpage unevictable,
623 * and return a zbudref to it. Else return NULL.
624 */
625struct zbudref *zbud_match_prep(struct tmem_handle *th, bool eph,
626 void *cdata, unsigned size)
627{
628 struct zbudpage *zbudpage = NULL, *zbudpage2;
629 unsigned long budnum = 0UL;
630 unsigned nchunks;
631 int i, found_good_buddy = 0;
632 spinlock_t *lists_lock =
633 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
634 struct zbud_unbuddied *unbud =
635 eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;
636
637 size += sizeof(struct tmem_handle);
638 nchunks = zbud_size_to_chunks(size);
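	/*
	 * A zbudpage on unbud[i] has i chunks in use and NCHUNKS - i free,
	 * so the new zbud (nchunks chunks) fits iff i <= MAX_CHUNK + 1 - nchunks;
	 * start at the fullest list that can still hold it and work down.
	 */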
639 for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
640 spin_lock(lists_lock);
641 if (!list_empty(&unbud[i].list)) {
642 list_for_each_entry_safe(zbudpage, zbudpage2,
643 &unbud[i].list, budlist) {
644 if (zbudpage_spin_trylock(zbudpage)) {
645 found_good_buddy = i;
646 goto found_unbuddied;
647 }
648 }
649 }
650 spin_unlock(lists_lock);
651 }
652 zbudpage = NULL;
653 goto out;
654
655found_unbuddied:
656 BUG_ON(!zbudpage_is_locked(zbudpage));
657 BUG_ON(!((zbudpage->zbud0_size == 0) ^ (zbudpage->zbud1_size == 0)));
658 if (zbudpage->zbud0_size == 0)
659 budnum = 0UL;
660 else if (zbudpage->zbud1_size == 0)
661 budnum = 1UL;
662 list_del_init(&zbudpage->budlist);
663 if (eph) {
664 list_add_tail(&zbudpage->budlist, &zbud_eph_buddied_list);
665 unbud[found_good_buddy].count--;
666 zbud_eph_unbuddied_count--;
667 zbud_eph_buddied_count++;
668 /* "promote" raw zbudpage to most-recently-used */
669 list_del_init(&zbudpage->lru);
670 list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
671 } else {
672 list_add_tail(&zbudpage->budlist, &zbud_pers_buddied_list);
673 unbud[found_good_buddy].count--;
674 zbud_pers_unbuddied_count--;
675 zbud_pers_buddied_count++;
676 /* "promote" raw zbudpage to most-recently-used */
677 list_del_init(&zbudpage->lru);
678 list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
679 }
680 zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
681 zbudpage->unevictable++;
682 BUG_ON(zbudpage->unevictable == 3);
683 zbudpage_spin_unlock(zbudpage);
684 spin_unlock(lists_lock);
685out:
686 return zbudpage_to_zbudref(zbudpage, budnum);
687
688}
689
690/*
691 * Given a tmem handle, and a kmapped pointer to compressed data of
692 * the given size, and a newly allocated struct page, create an unevictable
693 * zbud in that new page and return a zbudref to it.
694 */
695struct zbudref *zbud_create_prep(struct tmem_handle *th, bool eph,
696 void *cdata, unsigned size,
697 struct page *newpage)
698{
699 struct zbudpage *zbudpage;
700 unsigned long budnum = 0;
701 unsigned nchunks;
702 spinlock_t *lists_lock =
703 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
704 struct zbud_unbuddied *unbud =
705 eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;
706
707#if 0
708 /* this may be worth it later to support decompress-in-place? */
709 static unsigned long counter;
710 budnum = counter++ & 1; /* alternate using zbud0 and zbud1 */
711#endif
712
713 if (size > zbud_max_buddy_size())
714 return NULL;
715 if (newpage == NULL)
716 return NULL;
717
718 size += sizeof(struct tmem_handle);
719 nchunks = zbud_size_to_chunks(size);
720 spin_lock(lists_lock);
721 zbudpage = zbud_init_zbudpage(newpage, eph);
722 zbudpage_spin_lock(zbudpage);
723 list_add_tail(&zbudpage->budlist, &unbud[nchunks].list);
724 if (eph) {
725 list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
726 zbud_eph_unbuddied_count++;
727 } else {
728 list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
729 zbud_pers_unbuddied_count++;
730 }
731 unbud[nchunks].count++;
732 zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
733 zbudpage->unevictable++;
734 BUG_ON(zbudpage->unevictable == 3);
735 zbudpage_spin_unlock(zbudpage);
736 spin_unlock(lists_lock);
737 return zbudpage_to_zbudref(zbudpage, budnum);
738}
739
740/*
741 * Finish creation of a zbud by marking it evictable, assuming another
742 * zbud isn't being created in parallel.
743 */
744void zbud_create_finish(struct zbudref *zref, bool eph)
745{
746 struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
747 spinlock_t *lists_lock =
748 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
749
750 spin_lock(lists_lock);
751 zbudpage_spin_lock(zbudpage);
752 BUG_ON(zbudpage_is_dying(zbudpage));
753 zbudpage->unevictable--;
754 BUG_ON((int)zbudpage->unevictable < 0);
755 zbudpage_spin_unlock(zbudpage);
756 spin_unlock(lists_lock);
757}
758
759/*
760 * Given a zbudref and a struct page, decompress the data from
761 * the zbud into the physical page represented by the struct page
762 * by upcalling to zcache_decompress
763 */
764int zbud_decompress(struct page *data_page, struct zbudref *zref, bool eph,
765 void (*decompress)(char *, unsigned int, char *))
766{
767 struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
768 unsigned long budnum = zbudref_budnum(zref);
769 void *zbpg;
770 char *to_va, *from_va;
771 unsigned size;
772 int ret = -1;
773 spinlock_t *lists_lock =
774 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
775
776 spin_lock(lists_lock);
777 zbudpage_spin_lock(zbudpage);
778 if (zbudpage_is_dying(zbudpage)) {
779 /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
780 goto out;
781 }
782 zbpg = kmap_zbudpage_atomic(zbudpage);
783 to_va = kmap_atomic(data_page);
784 if (budnum == 0)
785 size = zbudpage->zbud0_size;
786 else
787 size = zbudpage->zbud1_size;
788 BUG_ON(size == 0 || size > zbud_max_size());
789 from_va = zbud_data(zbpg, budnum, size);
790 from_va += sizeof(struct tmem_handle);
791 size -= sizeof(struct tmem_handle);
792 decompress(from_va, size, to_va);
793 kunmap_atomic(to_va);
794 kunmap_zbudpage_atomic(zbpg);
795 ret = 0;
796out:
797 zbudpage_spin_unlock(zbudpage);
798 spin_unlock(lists_lock);
799 return ret;
800}
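
/*
 * Sketch of a decompress callback a caller might pass (hypothetical; in
 * zcache the real callback wraps its chosen compressor).  The contract is
 * simply: read "len" bytes of compressed data at "from" and write one
 * decompressed page to "to", e.g. roughly
 *
 *	static void my_decompress(char *from, unsigned int len, char *to)
 *	{
 *		size_t outlen = PAGE_SIZE;
 *
 *		lzo1x_decompress_safe(from, len, to, &outlen);
 *	}
 *
 *	...
 *	ret = zbud_decompress(data_page, zref, true, my_decompress);
 *
 * (error handling and unsigned-char casts omitted for brevity).
 */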
801
802/*
803 * Given a zbudref and a kernel pointer, copy the data from
804 * the zbud to the kernel pointer.
805 */
806int zbud_copy_from_zbud(char *to_va, struct zbudref *zref,
807 size_t *sizep, bool eph)
808{
809 struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
810 unsigned long budnum = zbudref_budnum(zref);
811 void *zbpg;
812 char *from_va;
813 unsigned size;
814 int ret = -1;
815 spinlock_t *lists_lock =
816 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
817
818 spin_lock(lists_lock);
819 zbudpage_spin_lock(zbudpage);
820 if (zbudpage_is_dying(zbudpage)) {
821 /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
822 goto out;
823 }
824 zbpg = kmap_zbudpage_atomic(zbudpage);
825 if (budnum == 0)
826 size = zbudpage->zbud0_size;
827 else
828 size = zbudpage->zbud1_size;
829 BUG_ON(size == 0 || size > zbud_max_size());
830 from_va = zbud_data(zbpg, budnum, size);
831 from_va += sizeof(struct tmem_handle);
832 size -= sizeof(struct tmem_handle);
833 *sizep = size;
834 memcpy(to_va, from_va, size);
835
836 kunmap_zbudpage_atomic(zbpg);
837 ret = 0;
838out:
839 zbudpage_spin_unlock(zbudpage);
840 spin_unlock(lists_lock);
841 return ret;
842}
843
844/*
845 * Given a zbudref and a kernel pointer, copy the data from
846 * the kernel pointer to the zbud.
847 */
848int zbud_copy_to_zbud(struct zbudref *zref, char *from_va, bool eph)
849{
850 struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
851 unsigned long budnum = zbudref_budnum(zref);
852 void *zbpg;
853 char *to_va;
854 unsigned size;
855 int ret = -1;
856 spinlock_t *lists_lock =
857 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
858
859 spin_lock(lists_lock);
860 zbudpage_spin_lock(zbudpage);
861 if (zbudpage_is_dying(zbudpage)) {
862 /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
863 goto out;
864 }
865 zbpg = kmap_zbudpage_atomic(zbudpage);
866 if (budnum == 0)
867 size = zbudpage->zbud0_size;
868 else
869 size = zbudpage->zbud1_size;
870 BUG_ON(size == 0 || size > zbud_max_size());
871 to_va = zbud_data(zbpg, budnum, size);
872 to_va += sizeof(struct tmem_handle);
873 size -= sizeof(struct tmem_handle);
874 memcpy(to_va, from_va, size);
875
876 kunmap_zbudpage_atomic(zbpg);
877 ret = 0;
878out:
879 zbudpage_spin_unlock(zbudpage);
880 spin_unlock(lists_lock);
881 return ret;
882}
883
884/*
885 * Choose an ephemeral LRU zbudpage that is evictable (not locked), ensure
886 * there are no references to it remaining, and return the now unused
887 * (and re-init'ed) struct page and the total amount of compressed
888 * data that was evicted.
889 */
890struct page *zbud_evict_pageframe_lru(unsigned int *zsize, unsigned int *zpages)
891{
892 struct zbudpage *zbudpage = NULL, *zbudpage2;
893 struct zbud_unbuddied *unbud = zbud_eph_unbuddied;
894 struct page *page = NULL;
895 bool irqs_disabled = irqs_disabled();
896
897 /*
898 * Since this can be called indirectly from cleancache_put, which
899 * has interrupts disabled, as well as frontswap_put, which does not,
900 * we need to be able to handle both cases, even though it is ugly.
901 */
902 if (irqs_disabled)
903 spin_lock(&zbud_eph_lists_lock);
904 else
905 spin_lock_bh(&zbud_eph_lists_lock);
906 *zsize = 0;
907 if (list_empty(&zbud_eph_lru_list))
908 goto unlock_out;
909 list_for_each_entry_safe(zbudpage, zbudpage2, &zbud_eph_lru_list, lru) {
910 /* skip a locked zbudpage */
911 if (unlikely(!zbudpage_spin_trylock(zbudpage)))
912 continue;
913 /* skip an unevictable zbudpage */
914 if (unlikely(zbudpage->unevictable != 0)) {
915 zbudpage_spin_unlock(zbudpage);
916 continue;
917 }
918 /* got a locked evictable page */
919 goto evict_page;
920
921 }
922unlock_out:
923 /* no unlocked evictable pages, give up */
924 if (irqs_disabled)
925 spin_unlock(&zbud_eph_lists_lock);
926 else
927 spin_unlock_bh(&zbud_eph_lists_lock);
928 goto out;
929
930evict_page:
931 list_del_init(&zbudpage->budlist);
932 list_del_init(&zbudpage->lru);
933 zbudpage_set_dying(zbudpage);
934 /*
935 * the zbudpage is now "dying" and attempts to read, write,
936 * or delete data from it will be ignored
937 */
938 if (zbudpage->zbud0_size != 0 && zbudpage->zbud1_size != 0) {
939 *zsize = zbudpage->zbud0_size + zbudpage->zbud1_size -
940 (2 * sizeof(struct tmem_handle));
941 *zpages = 2;
942 } else if (zbudpage->zbud0_size != 0) {
943 unbud[zbud_size_to_chunks(zbudpage->zbud0_size)].count--;
944 *zsize = zbudpage->zbud0_size - sizeof(struct tmem_handle);
945 *zpages = 1;
946 } else if (zbudpage->zbud1_size != 0) {
947 unbud[zbud_size_to_chunks(zbudpage->zbud1_size)].count--;
948 *zsize = zbudpage->zbud1_size - sizeof(struct tmem_handle);
949 *zpages = 1;
950 } else {
951 BUG();
952 }
953 spin_unlock(&zbud_eph_lists_lock);
954 zbud_eph_evicted_pageframes++;
955 if (*zpages == 1)
956 zbud_eph_unbuddied_count--;
957 else
958 zbud_eph_buddied_count--;
959 zbud_evict_tmem(zbudpage);
960 zbudpage_spin_lock(zbudpage);
961 zbudpage_clear_dying(zbudpage);
962 page = zbud_unuse_zbudpage(zbudpage, true);
963 if (!irqs_disabled)
964 local_bh_enable();
965out:
966 return page;
967}
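
/*
 * Illustrative reclaim loop (not from this file): under memory pressure a
 * caller could repeatedly evict ephemeral pageframes and hand them back to
 * the page allocator, e.g.
 *
 *	unsigned int zsize, zpages;
 *	struct page *page;
 *
 *	while (need_more_memory()) {
 *		page = zbud_evict_pageframe_lru(&zsize, &zpages);
 *		if (page == NULL)
 *			break;
 *		__free_page(page);
 *	}
 *
 * where need_more_memory() stands in for whatever policy the caller uses.
 */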
968
969/*
970 * Choose a persistent LRU zbudpage that is evictable (not locked), zombify it,
971 * read the tmem_handle(s) out of it into the passed array, and return the
972 * number of zbuds. Caller must perform necessary tmem functions and,
973 * indirectly, zbud functions to fetch any valid data and cause the
974 * now-zombified zbudpage to eventually be freed. We track the zombified
975 * zbudpage count so it is possible to observe if there is a leak.
976 * FIXME: describe (ramster) case where data pointers are passed in for memcpy
977 */
978unsigned int zbud_make_zombie_lru(struct tmem_handle *th, unsigned char **data,
979 unsigned int *zsize, bool eph)
980{
981 struct zbudpage *zbudpage = NULL, *zbudpag2;
982 struct tmem_handle *thfrom;
983 char *from_va;
984 void *zbpg;
985 unsigned size;
986 int ret = 0, i;
987 spinlock_t *lists_lock =
988 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
989 struct list_head *lru_list =
990 eph ? &zbud_eph_lru_list : &zbud_pers_lru_list;
991
992 spin_lock_bh(lists_lock);
993 if (list_empty(lru_list))
994 goto out;
995 list_for_each_entry_safe(zbudpage, zbudpag2, lru_list, lru) {
996 /* skip a locked zbudpage */
997 if (unlikely(!zbudpage_spin_trylock(zbudpage)))
998 continue;
999 /* skip an unevictable zbudpage */
1000 if (unlikely(zbudpage->unevictable != 0)) {
1001 zbudpage_spin_unlock(zbudpage);
1002 continue;
1003 }
1004 /* got a locked evictable page */
1005 goto zombify_page;
1006 }
1007 /* no unlocked evictable pages, give up */
1008 goto out;
1009
1010zombify_page:
1011 /* got an unlocked evictable page, zombify it */
1012 list_del_init(&zbudpage->budlist);
1013 zbudpage_set_zombie(zbudpage);
1014 /* FIXME what accounting do I need to do here? */
1015 list_del_init(&zbudpage->lru);
1016 if (eph) {
1017 list_add_tail(&zbudpage->lru, &zbud_eph_zombie_list);
1018 zbud_eph_zombie_count =
1019 atomic_inc_return(&zbud_eph_zombie_atomic);
1020 } else {
1021 list_add_tail(&zbudpage->lru, &zbud_pers_zombie_list);
1022 zbud_pers_zombie_count =
1023 atomic_inc_return(&zbud_pers_zombie_atomic);
1024 }
1025 /* FIXME what accounting do I need to do here? */
1026 zbpg = kmap_zbudpage_atomic(zbudpage);
1027 for (i = 0; i < 2; i++) {
1028 size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
1029 if (size) {
1030 from_va = zbud_data(zbpg, i, size);
1031 thfrom = (struct tmem_handle *)from_va;
1032 from_va += sizeof(struct tmem_handle);
1033 size -= sizeof(struct tmem_handle);
1034 if (th != NULL)
1035 th[ret] = *thfrom;
1036 if (data != NULL)
1037 memcpy(data[ret], from_va, size);
1038 if (zsize != NULL)
1039 *zsize++ = size;
1040 ret++;
1041 }
1042 }
1043 kunmap_zbudpage_atomic(zbpg);
1044 zbudpage_spin_unlock(zbudpage);
1045out:
1046 spin_unlock_bh(lists_lock);
1047 return ret;
1048}
1049
1050void zbud_init(void)
1051{
1052 int i;
1053
1054#ifdef CONFIG_DEBUG_FS
1055 zbud_debugfs_init();
1056#endif
1057 BUG_ON((sizeof(struct tmem_handle) * 2 > CHUNK_SIZE));
1058 BUG_ON(sizeof(struct zbudpage) > sizeof(struct page));
1059 for (i = 0; i < NCHUNKS; i++) {
1060 INIT_LIST_HEAD(&zbud_eph_unbuddied[i].list);
1061 INIT_LIST_HEAD(&zbud_pers_unbuddied[i].list);
1062 }
1063}