/*
 * zbud.c - Compression buddies allocator
 *
 * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
 *
 * Compression buddies ("zbud") provides for efficiently packing two
 * (or, possibly in the future, more) compressed pages ("zpages") into
 * a single "raw" pageframe and for tracking both zpages and pageframes
 * so that whole pageframes can be easily reclaimed in LRU-like order.
 * It is designed to be used in conjunction with transcendent memory
 * ("tmem"); for example, separate LRU lists are maintained for persistent
 * vs. ephemeral pages.
 *
 * A zbudpage is an overlay for a struct page and thus each zbudpage
 * refers to a physical pageframe of RAM.  When the caller passes a
 * struct page from the kernel's page allocator, zbud "transforms" it
 * to a zbudpage which sets/uses a different set of fields than the
 * struct-page and thus must "untransform" it back by reinitializing
 * certain fields before the struct-page can be freed.  The fields
 * of a zbudpage include a page lock for controlling access to the
 * corresponding pageframe, and there is a size field for each zpage.
 * Each zbudpage also lives on two linked lists: a "budlist" which is
 * used to support efficient buddying of zpages; and an "lru" which
 * is used for reclaiming pageframes in approximately least-recently-used
 * order.
 *
 * A zbudpageframe is a pageframe divided up into aligned 64-byte "chunks"
 * which contain the compressed data for zero, one, or two zbuds.  Contained
 * within the compressed data is a tmem_handle, which is a key that allows
 * the same data to be found via the tmem interface so the zpage can
 * be invalidated (for ephemeral pages) or repatriated to the swap cache
 * (for persistent pages).  The contents of a zbudpageframe must never
 * be accessed without holding the page lock for the corresponding
 * zbudpage and, to accommodate highmem machines, the contents may
 * only be examined or changed when kmapped.  Thus, when in use, a
 * kmapped zbudpageframe is referred to in the zbud code as "void *zbpg".
 *
 * Note that the term "zbud" refers to the combination of a zpage and
 * a tmem_handle that is stored as one of possibly two "buddied" zpages;
 * it also generically refers to this allocator... sorry for any confusion.
 *
 * A zbudref is a pointer to a struct zbudpage (which can be cast to a
 * struct page), with the LSB either cleared or set to indicate, respectively,
 * the first or second zpage in the zbudpageframe.  Since a zbudref can be
 * cast to a pointer, it is used as the tmem "pampd" pointer and uniquely
 * references a stored tmem page and so is the only zbud data structure
 * externally visible to zbud.c/zbud.h.
 *
 * Since we wish to reclaim entire pageframes but zpages may be randomly
 * added and deleted to any given pageframe, we approximate LRU by
 * promoting a pageframe to MRU when a zpage is added to it, but
 * leaving it at the current place in the list when a zpage is deleted
 * from it.  As a side effect, zpages that are difficult to buddy (e.g.
 * very large zpages) will be reclaimed faster than average, which seems
 * like a reasonable tradeoff.
 *
 * In the current implementation, no more than two zpages may be stored in
 * any pageframe and no zpage ever crosses a pageframe boundary.  While
 * other zpage allocation mechanisms may allow greater density, this two
 * zpage-per-pageframe limit both ensures simple reclaim of pageframes
 * (including garbage collection of references to the contents of those
 * pageframes from tmem data structures) AND avoids the need for compaction.
 * With additional complexity, zbud could be modified to support storing
 * up to three zpages per pageframe or, to handle larger average zpages,
 * up to three zpages per pair of pageframes, but it is not clear if the
 * additional complexity would be worth it.  So consider it an exercise
 * for future developers.
 *
 * Note also that zbud does no page allocation or freeing.  This is so
 * that the caller has complete control over, and for accounting, visibility
 * into, if/when pages are allocated and freed.
 *
 * Finally, note that zbud limits the size of zpages it can store; the
 * caller must check the zpage size with zbud_max_buddy_size before
 * storing it, else BUGs will result.  User beware.
 */
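/*
 * Illustrative store-path sketch (hypothetical caller code, not taken
 * verbatim from zcache; "my_compress" and "buf" are made-up names):
 *
 *	clen = my_compress(from_page, buf);
 *	if (clen == 0 || clen > zbud_max_buddy_size())
 *		reject, or store the page uncompressed;
 *	zref = zbud_match_prep(th, true, buf, clen);
 *	if (zref == NULL) {
 *		newpage = alloc_page(GFP_ATOMIC);	(caller allocates)
 *		if (newpage != NULL)
 *			zref = zbud_create_prep(th, true, buf, clen, newpage);
 *	}
 *	if (zref != NULL)
 *		zbud_create_finish(zref, true);
 */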
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/bug.h>
#include "tmem.h"
#include "zcache.h"
#include "zbud.h"
/*
 * We need to ensure that a struct zbudpage is never larger than a
 * struct page.  This is checked with a BUG_ON in zbud_init.
 *
 * The unevictable field indicates that a zbud is being added to the
 * zbudpage.  Since this is a two-phase process (due to tmem locking),
 * this field locks the zbudpage against eviction when a zbud match
 * or creation is in process.  Since this addition process may occur
 * in parallel for two zbuds in one zbudpage, the field is a counter
 * that must not exceed two.
 */
struct zbudpage {
	union {
		struct page page;
		struct {
			unsigned long space_for_flags;
			struct {
				unsigned zbud0_size: PAGE_SHIFT;
				unsigned zbud1_size: PAGE_SHIFT;
				unsigned unevictable:2;
			};
			struct list_head budlist;
			struct list_head lru;
		};
	};
};
#if (PAGE_SHIFT * 2) + 2 > BITS_PER_LONG
#error "zbud won't work for this arch, PAGE_SIZE is too large"
#endif
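/*
 * Worked illustration of the check above (not an additional constraint):
 * with 4K pages, PAGE_SHIFT is 12, so the two size bitfields plus the two
 * unevictable bits occupy 12 + 12 + 2 = 26 bits, which fits in one
 * unsigned long on both 32-bit and 64-bit architectures.  With 64K pages
 * (PAGE_SHIFT == 16) on a 32-bit architecture, 16 + 16 + 2 = 34 > 32,
 * which is exactly what the #error rejects.
 */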
struct zbudref {
	union {
		struct zbudpage *zbudpage;
		unsigned long zbudref;
	};
};
#define CHUNK_SHIFT	6
#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
#define CHUNK_MASK	(~(CHUNK_SIZE-1))
#define NCHUNKS		(PAGE_SIZE >> CHUNK_SHIFT)
#define MAX_CHUNK	(NCHUNKS-1)
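/*
 * Worked example (illustrative only): with PAGE_SIZE == 4096, CHUNK_SIZE
 * is 64 bytes, NCHUNKS is 64, and MAX_CHUNK is 63, so zbud_max_size()
 * below returns 63 * 64 = 4032 bytes, i.e. a single zbud may occupy at
 * most all but one chunk of a pageframe.
 */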
/*
 * The following functions deal with the difference between struct
 * page and struct zbudpage.  Note the hack of using the pageflags
 * from struct page; this is to avoid duplicating all the complex
 * pageflag macros.
 */
static inline void zbudpage_spin_lock(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	while (unlikely(test_and_set_bit_lock(PG_locked, &page->flags))) {
		do {
			cpu_relax();
		} while (test_bit(PG_locked, &page->flags));
	}
}
static inline void zbudpage_spin_unlock(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	clear_bit(PG_locked, &page->flags);
}
static inline int zbudpage_spin_trylock(struct zbudpage *zbudpage)
{
	return trylock_page((struct page *)zbudpage);
}
static inline int zbudpage_is_locked(struct zbudpage *zbudpage)
{
	return PageLocked((struct page *)zbudpage);
}
static inline void *kmap_zbudpage_atomic(struct zbudpage *zbudpage)
{
	return kmap_atomic((struct page *)zbudpage);
}
/*
 * A dying zbudpage is an ephemeral page in the process of being evicted.
 * Any data contained in the zbudpage is invalid and we are just waiting for
 * the tmem pampds to be invalidated before freeing the page.
 */
static inline int zbudpage_is_dying(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	return test_bit(PG_reclaim, &page->flags);
}
static inline void zbudpage_set_dying(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	set_bit(PG_reclaim, &page->flags);
}
static inline void zbudpage_clear_dying(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	clear_bit(PG_reclaim, &page->flags);
}
/*
 * A zombie zbudpage is a persistent page in the process of being evicted.
 * The data contained in the zbudpage is valid and we are just waiting for
 * the tmem pampds to be invalidated before freeing the page.
 */
static inline int zbudpage_is_zombie(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	return test_bit(PG_dirty, &page->flags);
}
static inline void zbudpage_set_zombie(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	set_bit(PG_dirty, &page->flags);
}
static inline void zbudpage_clear_zombie(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	clear_bit(PG_dirty, &page->flags);
}
static inline void kunmap_zbudpage_atomic(void *zbpg)
{
	kunmap_atomic(zbpg);
}

/*
 * zbud "translation" and helper functions
 */

static inline struct zbudpage *zbudref_to_zbudpage(struct zbudref *zref)
{
	unsigned long zbud = (unsigned long)zref;
	zbud &= ~1UL;
	return (struct zbudpage *)zbud;
}
static inline struct zbudref *zbudpage_to_zbudref(struct zbudpage *zbudpage,
							unsigned budnum)
{
	unsigned long zbud = (unsigned long)zbudpage;

	zbud |= budnum;
	return (struct zbudref *)zbud;
}
static inline int zbudref_budnum(struct zbudref *zbudref)
{
	unsigned long zbud = (unsigned long)zbudref;
	return zbud & 1UL;
}
static inline unsigned zbud_max_size(void)
{
	return MAX_CHUNK << CHUNK_SHIFT;
}
static inline unsigned zbud_size_to_chunks(unsigned size)
{
	BUG_ON(size == 0 || size > zbud_max_size());
	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}
/* can only be used between kmap_zbudpage_atomic/kunmap_zbudpage_atomic! */
static inline char *zbud_data(void *zbpg,
			unsigned budnum, unsigned size)
{
	char *p;

	BUG_ON(size == 0 || size > zbud_max_size());
	p = (char *)zbpg;
	if (budnum == 1)
		p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
	return p;
}
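/*
 * Layout sketch (following the placement math above): zbud 0 starts at
 * the beginning of the pageframe and zbud 1 ends flush with the end of
 * the pageframe, each rounded up to whole 64-byte chunks, leaving any
 * unused chunks in the middle:
 *
 *	+-------------+------------------+-------------+
 *	| zbud 0 data |  unused chunks   | zbud 1 data |
 *	+-------------+------------------+-------------+
 *	0                                       PAGE_SIZE
 */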
/*
 * These are all informative and exposed through debugfs... except for
 * the arrays... anyone know how to do that?  To avoid confusion for
 * debugfs viewers, some of these should also be atomic_long_t, but
 * I don't know how to expose atomics via debugfs either...
 */
static ssize_t zbud_eph_pageframes;
static ssize_t zbud_pers_pageframes;
static ssize_t zbud_eph_zpages;
static ssize_t zbud_pers_zpages;
static u64 zbud_eph_zbytes;
static u64 zbud_pers_zbytes;
static ssize_t zbud_eph_evicted_pageframes;
static ssize_t zbud_pers_evicted_pageframes;
static ssize_t zbud_eph_cumul_zpages;
static ssize_t zbud_pers_cumul_zpages;
static u64 zbud_eph_cumul_zbytes;
static u64 zbud_pers_cumul_zbytes;
static ssize_t zbud_eph_cumul_chunk_counts[NCHUNKS];
static ssize_t zbud_pers_cumul_chunk_counts[NCHUNKS];
static ssize_t zbud_eph_buddied_count;
static ssize_t zbud_pers_buddied_count;
static ssize_t zbud_eph_unbuddied_count;
static ssize_t zbud_pers_unbuddied_count;
static ssize_t zbud_eph_zombie_count;
static ssize_t zbud_pers_zombie_count;
static atomic_t zbud_eph_zombie_atomic;
static atomic_t zbud_pers_zombie_atomic;
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#define	zdfs	debugfs_create_size_t
#define	zdfs64	debugfs_create_u64
static int zbud_debugfs_init(void)
{
	struct dentry *root = debugfs_create_dir("zbud", NULL);

	if (root == NULL)
		return -ENXIO;

	/*
	 * would be nice to dump the sizes of the unbuddied
	 * arrays, like was done with sysfs, but it doesn't
	 * look like debugfs is flexible enough to do that
	 */
	zdfs64("eph_zbytes", S_IRUGO, root, &zbud_eph_zbytes);
	zdfs64("eph_cumul_zbytes", S_IRUGO, root, &zbud_eph_cumul_zbytes);
	zdfs64("pers_zbytes", S_IRUGO, root, &zbud_pers_zbytes);
	zdfs64("pers_cumul_zbytes", S_IRUGO, root, &zbud_pers_cumul_zbytes);
	zdfs("eph_cumul_zpages", S_IRUGO, root, &zbud_eph_cumul_zpages);
	zdfs("eph_evicted_pageframes", S_IRUGO, root,
				&zbud_eph_evicted_pageframes);
	zdfs("eph_zpages", S_IRUGO, root, &zbud_eph_zpages);
	zdfs("eph_pageframes", S_IRUGO, root, &zbud_eph_pageframes);
	zdfs("eph_buddied_count", S_IRUGO, root, &zbud_eph_buddied_count);
	zdfs("eph_unbuddied_count", S_IRUGO, root, &zbud_eph_unbuddied_count);
	zdfs("pers_cumul_zpages", S_IRUGO, root, &zbud_pers_cumul_zpages);
	zdfs("pers_evicted_pageframes", S_IRUGO, root,
				&zbud_pers_evicted_pageframes);
	zdfs("pers_zpages", S_IRUGO, root, &zbud_pers_zpages);
	zdfs("pers_pageframes", S_IRUGO, root, &zbud_pers_pageframes);
	zdfs("pers_buddied_count", S_IRUGO, root, &zbud_pers_buddied_count);
	zdfs("pers_unbuddied_count", S_IRUGO, root, &zbud_pers_unbuddied_count);
	zdfs("pers_zombie_count", S_IRUGO, root, &zbud_pers_zombie_count);

	return 0;
}
#endif
/* protects the buddied list and all unbuddied lists */
static DEFINE_SPINLOCK(zbud_eph_lists_lock);
static DEFINE_SPINLOCK(zbud_pers_lists_lock);
struct zbud_unbuddied {
	struct list_head list;
	unsigned count;
};

/* list N contains pages with N chunks USED and NCHUNKS-N unused */
/* element 0 is never used but optimizing that isn't worth it */
static struct zbud_unbuddied zbud_eph_unbuddied[NCHUNKS];
static struct zbud_unbuddied zbud_pers_unbuddied[NCHUNKS];
static LIST_HEAD(zbud_eph_lru_list);
static LIST_HEAD(zbud_pers_lru_list);
static LIST_HEAD(zbud_eph_buddied_list);
static LIST_HEAD(zbud_pers_buddied_list);
static LIST_HEAD(zbud_eph_zombie_list);
static LIST_HEAD(zbud_pers_zombie_list);
/*
 * Given a struct page, transform it to a zbudpage so that it can be
 * used by zbud and initialize fields as necessary.
 */
static inline struct zbudpage *zbud_init_zbudpage(struct page *page, bool eph)
{
	struct zbudpage *zbudpage = (struct zbudpage *)page;

	BUG_ON(page == NULL);
	INIT_LIST_HEAD(&zbudpage->budlist);
	INIT_LIST_HEAD(&zbudpage->lru);
	zbudpage->zbud0_size = 0;
	zbudpage->zbud1_size = 0;
	zbudpage->unevictable = 0;
	if (eph)
		zbud_eph_pageframes++;
	else
		zbud_pers_pageframes++;
	return zbudpage;
}
388 /* "Transform" a zbudpage back to a struct page suitable to free. */
389 static inline struct page
*zbud_unuse_zbudpage(struct zbudpage
*zbudpage
,
392 struct page
*page
= (struct page
*)zbudpage
;
394 BUG_ON(!list_empty(&zbudpage
->budlist
));
395 BUG_ON(!list_empty(&zbudpage
->lru
));
396 BUG_ON(zbudpage
->zbud0_size
!= 0);
397 BUG_ON(zbudpage
->zbud1_size
!= 0);
398 BUG_ON(!PageLocked(page
));
399 BUG_ON(zbudpage
->unevictable
!= 0);
400 BUG_ON(zbudpage_is_dying(zbudpage
));
401 BUG_ON(zbudpage_is_zombie(zbudpage
));
403 zbud_eph_pageframes
--;
405 zbud_pers_pageframes
--;
406 zbudpage_spin_unlock(zbudpage
);
407 reset_page_mapcount(page
);
408 init_page_count(page
);
/* Mark a zbud as unused and do accounting */
static inline void zbud_unuse_zbud(struct zbudpage *zbudpage,
					int budnum, bool eph)
{
	unsigned size;

	BUG_ON(!zbudpage_is_locked(zbudpage));
	if (budnum == 0) {
		size = zbudpage->zbud0_size;
		zbudpage->zbud0_size = 0;
	} else {
		size = zbudpage->zbud1_size;
		zbudpage->zbud1_size = 0;
	}
	if (eph) {
		zbud_eph_zbytes -= size;
		zbud_eph_zpages--;
	} else {
		zbud_pers_zbytes -= size;
		zbud_pers_zpages--;
	}
}
/*
 * Given a zbudpage/budnum/size, a tmem handle, and a kmapped pointer
 * to some data, set up the zbud appropriately including data copying
 * and accounting.  Note that if cdata is NULL, the data copying is
 * skipped.  (This is useful for lazy writes such as for RAMster.)
 */
static void zbud_init_zbud(struct zbudpage *zbudpage, struct tmem_handle *th,
				bool eph, void *cdata,
				unsigned budnum, unsigned size)
{
	char *to;
	void *zbpg;
	struct tmem_handle *to_th;
	unsigned nchunks = zbud_size_to_chunks(size);

	BUG_ON(!zbudpage_is_locked(zbudpage));
	zbpg = kmap_zbudpage_atomic(zbudpage);
	to = zbud_data(zbpg, budnum, size);
	to_th = (struct tmem_handle *)to;
	to_th->index = th->index;
	to_th->oid = th->oid;
	to_th->pool_id = th->pool_id;
	to_th->client_id = th->client_id;
	to += sizeof(struct tmem_handle);
	if (cdata != NULL)
		memcpy(to, cdata, size - sizeof(struct tmem_handle));
	kunmap_zbudpage_atomic(zbpg);
	if (budnum == 0)
		zbudpage->zbud0_size = size;
	else
		zbudpage->zbud1_size = size;
	if (eph) {
		zbud_eph_cumul_chunk_counts[nchunks]++;
		zbud_eph_zpages++;
		zbud_eph_cumul_zpages++;
		zbud_eph_zbytes += size;
		zbud_eph_cumul_zbytes += size;
	} else {
		zbud_pers_cumul_chunk_counts[nchunks]++;
		zbud_pers_zpages++;
		zbud_pers_cumul_zpages++;
		zbud_pers_zbytes += size;
		zbud_pers_cumul_zbytes += size;
	}
}
/*
 * Given a locked dying zbudpage, read out the tmem handles from the data,
 * unlock the page, then use the handles to tell tmem to flush out its
 * references.
 */
static void zbud_evict_tmem(struct zbudpage *zbudpage)
{
	int i, j;
	uint32_t pool_id[2], client_id[2];
	uint32_t index[2];
	struct tmem_oid oid[2];
	struct tmem_pool *pool;
	void *zbpg;
	struct tmem_handle *th;
	unsigned size;

	/* read out the tmem handles from the data and set aside */
	zbpg = kmap_zbudpage_atomic(zbudpage);
	for (i = 0, j = 0; i < 2; i++) {
		size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
		if (size) {
			th = (struct tmem_handle *)zbud_data(zbpg, i, size);
			client_id[j] = th->client_id;
			pool_id[j] = th->pool_id;
			oid[j] = th->oid;
			index[j] = th->index;
			j++;
			zbud_unuse_zbud(zbudpage, i, true);
		}
	}
	kunmap_zbudpage_atomic(zbpg);
	zbudpage_spin_unlock(zbudpage);
	/* zbudpage is now an unlocked dying... tell tmem to flush pointers */
	for (i = 0; i < j; i++) {
		pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
		if (pool != NULL) {
			tmem_flush_page(pool, &oid[i], index[i]);
			zcache_put_pool(pool);
		}
	}
}
/*
 * Externally callable zbud handling routines.
 */

/*
 * Return the maximum size compressed page that can be stored (secretly
 * setting aside space for the tmem handle).
 */
unsigned int zbud_max_buddy_size(void)
{
	return zbud_max_size() - sizeof(struct tmem_handle);
}
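/*
 * Worked example (the handle size here is illustrative): with 4K pages,
 * zbud_max_size() is 4032 bytes; if sizeof(struct tmem_handle) were, say,
 * 24 bytes, the largest zpage a caller could store would be
 * 4032 - 24 = 4008 bytes.
 */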
/*
 * Given a zbud reference, free the corresponding zbud from all lists,
 * mark it as unused, do accounting, and if the freeing of the zbud
 * frees up an entire pageframe, return it to the caller (else NULL).
 */
struct page *zbud_free_and_delist(struct zbudref *zref, bool eph,
				  unsigned int *zsize, unsigned int *zpages)
{
	unsigned long budnum = zbudref_budnum(zref);
	struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
	struct page *page = NULL;
	unsigned chunks, bud_size, other_bud_size;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
	struct zbud_unbuddied *unbud =
		eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;

	spin_lock(lists_lock);
	zbudpage_spin_lock(zbudpage);
	if (zbudpage_is_dying(zbudpage)) {
		/* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
		zbudpage_spin_unlock(zbudpage);
		spin_unlock(lists_lock);
		*zpages = 0;
		*zsize = 0;
		goto out;
	}
	if (budnum == 0) {
		bud_size = zbudpage->zbud0_size;
		other_bud_size = zbudpage->zbud1_size;
	} else {
		bud_size = zbudpage->zbud1_size;
		other_bud_size = zbudpage->zbud0_size;
	}
	*zsize = bud_size - sizeof(struct tmem_handle);
	*zpages = 1;
	zbud_unuse_zbud(zbudpage, budnum, eph);
	if (other_bud_size == 0) { /* was unbuddied: unlist and free */
		chunks = zbud_size_to_chunks(bud_size);
		if (zbudpage_is_zombie(zbudpage)) {
			if (eph)
				zbud_eph_zombie_count =
				    atomic_dec_return(&zbud_eph_zombie_atomic);
			else
				zbud_pers_zombie_count =
				    atomic_dec_return(&zbud_pers_zombie_atomic);
			zbudpage_clear_zombie(zbudpage);
		} else {
			BUG_ON(list_empty(&unbud[chunks].list));
			list_del_init(&zbudpage->budlist);
			unbud[chunks].count--;
		}
		list_del_init(&zbudpage->lru);
		spin_unlock(lists_lock);
		if (eph)
			zbud_eph_unbuddied_count--;
		else
			zbud_pers_unbuddied_count--;
		page = zbud_unuse_zbudpage(zbudpage, eph);
	} else { /* was buddied: move remaining buddy to unbuddied list */
		chunks = zbud_size_to_chunks(other_bud_size);
		if (!zbudpage_is_zombie(zbudpage)) {
			list_del_init(&zbudpage->budlist);
			list_add_tail(&zbudpage->budlist, &unbud[chunks].list);
			unbud[chunks].count++;
		}
		if (eph) {
			zbud_eph_buddied_count--;
			zbud_eph_unbuddied_count++;
		} else {
			zbud_pers_unbuddied_count++;
			zbud_pers_buddied_count--;
		}
		/* don't mess with lru, no need to move it */
		zbudpage_spin_unlock(zbudpage);
		spin_unlock(lists_lock);
	}
out:
	return page;
}
/*
 * Given a tmem handle, and a kmapped pointer to compressed data of
 * the given size, try to find an unbuddied zbudpage in which to
 * create a zbud.  If found, put it there, mark the zbudpage unevictable,
 * and return a zbudref to it.  Else return NULL.
 */
struct zbudref *zbud_match_prep(struct tmem_handle *th, bool eph,
				void *cdata, unsigned size)
{
	struct zbudpage *zbudpage = NULL, *zbudpage2;
	unsigned long budnum = 0UL;
	unsigned nchunks;
	int i, found_good_buddy = 0;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
	struct zbud_unbuddied *unbud =
		eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;

	size += sizeof(struct tmem_handle);
	nchunks = zbud_size_to_chunks(size);
	for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
		spin_lock(lists_lock);
		if (!list_empty(&unbud[i].list)) {
			list_for_each_entry_safe(zbudpage, zbudpage2,
				    &unbud[i].list, budlist) {
				if (zbudpage_spin_trylock(zbudpage)) {
					found_good_buddy = i;
					goto found_unbuddied;
				}
			}
		}
		spin_unlock(lists_lock);
	}
	zbudpage = NULL;
	goto out;

found_unbuddied:
	BUG_ON(!zbudpage_is_locked(zbudpage));
	BUG_ON(!((zbudpage->zbud0_size == 0) ^ (zbudpage->zbud1_size == 0)));
	if (zbudpage->zbud0_size == 0)
		budnum = 0UL;
	else if (zbudpage->zbud1_size == 0)
		budnum = 1UL;
	list_del_init(&zbudpage->budlist);
	if (eph) {
		list_add_tail(&zbudpage->budlist, &zbud_eph_buddied_list);
		unbud[found_good_buddy].count--;
		zbud_eph_unbuddied_count--;
		zbud_eph_buddied_count++;
		/* "promote" raw zbudpage to most-recently-used */
		list_del_init(&zbudpage->lru);
		list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
	} else {
		list_add_tail(&zbudpage->budlist, &zbud_pers_buddied_list);
		unbud[found_good_buddy].count--;
		zbud_pers_unbuddied_count--;
		zbud_pers_buddied_count++;
		/* "promote" raw zbudpage to most-recently-used */
		list_del_init(&zbudpage->lru);
		list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
	}
	zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
	zbudpage->unevictable++;
	BUG_ON(zbudpage->unevictable == 3);
	zbudpage_spin_unlock(zbudpage);
	spin_unlock(lists_lock);
out:
	return zbudpage_to_zbudref(zbudpage, budnum);
}
/*
 * Given a tmem handle, and a kmapped pointer to compressed data of
 * the given size, and a newly allocated struct page, create an unevictable
 * zbud in that new page and return a zbudref to it.
 */
struct zbudref *zbud_create_prep(struct tmem_handle *th, bool eph,
					void *cdata, unsigned size,
					struct page *newpage)
{
	struct zbudpage *zbudpage;
	unsigned long budnum = 0;
	unsigned nchunks;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
	struct zbud_unbuddied *unbud =
		eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;

#if 0
	/* this may be worth it later to support decompress-in-place? */
	static unsigned long counter;
	budnum = counter++ & 1;	/* alternate using zbud0 and zbud1 */
#endif

	if (size > zbud_max_buddy_size())
		return NULL;
	if (newpage == NULL)
		return NULL;

	size += sizeof(struct tmem_handle);
	nchunks = zbud_size_to_chunks(size);
	spin_lock(lists_lock);
	zbudpage = zbud_init_zbudpage(newpage, eph);
	zbudpage_spin_lock(zbudpage);
	list_add_tail(&zbudpage->budlist, &unbud[nchunks].list);
	if (eph) {
		list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
		zbud_eph_unbuddied_count++;
	} else {
		list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
		zbud_pers_unbuddied_count++;
	}
	unbud[nchunks].count++;
	zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
	zbudpage->unevictable++;
	BUG_ON(zbudpage->unevictable == 3);
	zbudpage_spin_unlock(zbudpage);
	spin_unlock(lists_lock);
	return zbudpage_to_zbudref(zbudpage, budnum);
}
/*
 * Finish creation of a zbud by, assuming another zbud isn't being created
 * in parallel, marking it evictable.
 */
void zbud_create_finish(struct zbudref *zref, bool eph)
{
	struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

	spin_lock(lists_lock);
	zbudpage_spin_lock(zbudpage);
	BUG_ON(zbudpage_is_dying(zbudpage));
	zbudpage->unevictable--;
	BUG_ON((int)zbudpage->unevictable < 0);
	zbudpage_spin_unlock(zbudpage);
	spin_unlock(lists_lock);
}
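/*
 * Timeline of the unevictable counter (illustrative): zbud_match_prep()
 * and zbud_create_prep() each increment it under the page lock, so it can
 * briefly reach 2 when both buddies of one pageframe are being added in
 * parallel; each zbud_create_finish() decrements it, and the eviction
 * routines below skip any zbudpage whose counter is nonzero.
 */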
/*
 * Given a zbudref and a struct page, decompress the data from
 * the zbud into the physical page represented by the struct page
 * by upcalling to zcache_decompress.
 */
int zbud_decompress(struct page *data_page, struct zbudref *zref, bool eph,
			void (*decompress)(char *, unsigned int, char *))
{
	struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
	unsigned long budnum = zbudref_budnum(zref);
	void *zbpg;
	char *to_va, *from_va;
	unsigned size;
	int ret = -1;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

	spin_lock(lists_lock);
	zbudpage_spin_lock(zbudpage);
	if (zbudpage_is_dying(zbudpage)) {
		/* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
		goto out;
	}
	zbpg = kmap_zbudpage_atomic(zbudpage);
	to_va = kmap_atomic(data_page);
	if (budnum == 0)
		size = zbudpage->zbud0_size;
	else
		size = zbudpage->zbud1_size;
	BUG_ON(size == 0 || size > zbud_max_size());
	from_va = zbud_data(zbpg, budnum, size);
	from_va += sizeof(struct tmem_handle);
	size -= sizeof(struct tmem_handle);
	decompress(from_va, size, to_va);
	kunmap_atomic(to_va);
	kunmap_zbudpage_atomic(zbpg);
	ret = 0;
out:
	zbudpage_spin_unlock(zbudpage);
	spin_unlock(lists_lock);
	return ret;
}
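/*
 * Example decompress callback (a hypothetical sketch; the real caller,
 * zcache, supplies its own wrapper around its compressor):
 *
 *	static void my_decompress(char *from, unsigned int clen, char *to)
 *	{
 *		size_t outlen = PAGE_SIZE;
 *
 *		lzo1x_decompress_safe((unsigned char *)from, clen,
 *				      (unsigned char *)to, &outlen);
 *	}
 */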
/*
 * Given a zbudref and a kernel pointer, copy the data from
 * the zbud to the kernel pointer.
 */
int zbud_copy_from_zbud(char *to_va, struct zbudref *zref,
			size_t *sizep, bool eph)
{
	struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
	unsigned long budnum = zbudref_budnum(zref);
	void *zbpg;
	char *from_va;
	unsigned size;
	int ret = -1;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

	spin_lock(lists_lock);
	zbudpage_spin_lock(zbudpage);
	if (zbudpage_is_dying(zbudpage)) {
		/* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
		goto out;
	}
	zbpg = kmap_zbudpage_atomic(zbudpage);
	if (budnum == 0)
		size = zbudpage->zbud0_size;
	else
		size = zbudpage->zbud1_size;
	BUG_ON(size == 0 || size > zbud_max_size());
	from_va = zbud_data(zbpg, budnum, size);
	from_va += sizeof(struct tmem_handle);
	size -= sizeof(struct tmem_handle);
	*sizep = size;
	memcpy(to_va, from_va, size);
	ret = 0;
	kunmap_zbudpage_atomic(zbpg);
out:
	zbudpage_spin_unlock(zbudpage);
	spin_unlock(lists_lock);
	return ret;
}
/*
 * Given a zbudref and a kernel pointer, copy the data from
 * the kernel pointer to the zbud.
 */
int zbud_copy_to_zbud(struct zbudref *zref, char *from_va, bool eph)
{
	struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
	unsigned long budnum = zbudref_budnum(zref);
	void *zbpg;
	char *to_va;
	unsigned size;
	int ret = -1;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

	spin_lock(lists_lock);
	zbudpage_spin_lock(zbudpage);
	if (zbudpage_is_dying(zbudpage)) {
		/* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
		goto out;
	}
	zbpg = kmap_zbudpage_atomic(zbudpage);
	if (budnum == 0)
		size = zbudpage->zbud0_size;
	else
		size = zbudpage->zbud1_size;
	BUG_ON(size == 0 || size > zbud_max_size());
	to_va = zbud_data(zbpg, budnum, size);
	to_va += sizeof(struct tmem_handle);
	size -= sizeof(struct tmem_handle);
	memcpy(to_va, from_va, size);
	ret = 0;
	kunmap_zbudpage_atomic(zbpg);
out:
	zbudpage_spin_unlock(zbudpage);
	spin_unlock(lists_lock);
	return ret;
}
/*
 * Choose an ephemeral LRU zbudpage that is evictable (not locked), ensure
 * there are no references to it remaining, and return the now unused
 * (and re-init'ed) struct page and the total amount of compressed
 * data that was evicted.
 */
struct page *zbud_evict_pageframe_lru(unsigned int *zsize, unsigned int *zpages)
{
	struct zbudpage *zbudpage = NULL, *zbudpage2;
	struct zbud_unbuddied *unbud = zbud_eph_unbuddied;
	struct page *page = NULL;
	bool irqs_disabled = irqs_disabled();

	/*
	 * Since this can be called indirectly from cleancache_put, which
	 * has interrupts disabled, as well as frontswap_put, which does not,
	 * we need to be able to handle both cases, even though it is ugly.
	 */
	if (irqs_disabled)
		spin_lock(&zbud_eph_lists_lock);
	else
		spin_lock_bh(&zbud_eph_lists_lock);
	*zsize = 0;
	if (list_empty(&zbud_eph_lru_list))
		goto unlock_out;
	list_for_each_entry_safe(zbudpage, zbudpage2, &zbud_eph_lru_list, lru) {
		/* skip a locked zbudpage */
		if (unlikely(!zbudpage_spin_trylock(zbudpage)))
			continue;
		/* skip an unevictable zbudpage */
		if (unlikely(zbudpage->unevictable != 0)) {
			zbudpage_spin_unlock(zbudpage);
			continue;
		}
		/* got a locked evictable page */
		goto evict_page;
	}
unlock_out:
	/* no unlocked evictable pages, give up */
	if (irqs_disabled)
		spin_unlock(&zbud_eph_lists_lock);
	else
		spin_unlock_bh(&zbud_eph_lists_lock);
	goto out;

evict_page:
	list_del_init(&zbudpage->budlist);
	list_del_init(&zbudpage->lru);
	zbudpage_set_dying(zbudpage);
	/*
	 * the zbudpage is now "dying" and attempts to read, write,
	 * or delete data from it will be ignored
	 */
	if (zbudpage->zbud0_size != 0 && zbudpage->zbud1_size != 0) {
		*zsize = zbudpage->zbud0_size + zbudpage->zbud1_size -
				(2 * sizeof(struct tmem_handle));
		*zpages = 2;
	} else if (zbudpage->zbud0_size != 0) {
		unbud[zbud_size_to_chunks(zbudpage->zbud0_size)].count--;
		*zsize = zbudpage->zbud0_size - sizeof(struct tmem_handle);
		*zpages = 1;
	} else if (zbudpage->zbud1_size != 0) {
		unbud[zbud_size_to_chunks(zbudpage->zbud1_size)].count--;
		*zsize = zbudpage->zbud1_size - sizeof(struct tmem_handle);
		*zpages = 1;
	} else {
		BUG();
	}
	spin_unlock(&zbud_eph_lists_lock);
	zbud_eph_evicted_pageframes++;
	if (*zpages == 1)
		zbud_eph_unbuddied_count--;
	else
		zbud_eph_buddied_count--;
	zbud_evict_tmem(zbudpage);
	zbudpage_spin_lock(zbudpage);
	zbudpage_clear_dying(zbudpage);
	page = zbud_unuse_zbudpage(zbudpage, true);
	if (!irqs_disabled)
		local_bh_enable();
out:
	return page;
}
/*
 * Choose a persistent LRU zbudpage that is evictable (not locked), zombify it,
 * read the tmem_handle(s) out of it into the passed array, and return the
 * number of zbuds.  Caller must perform necessary tmem functions and,
 * indirectly, zbud functions to fetch any valid data and cause the
 * now-zombified zbudpage to eventually be freed.  We track the zombified
 * zbudpage count so it is possible to observe if there is a leak.
 * FIXME: describe (ramster) case where data pointers are passed in for memcpy
 */
unsigned int zbud_make_zombie_lru(struct tmem_handle *th, unsigned char **data,
					unsigned int *zsize, bool eph)
{
	struct zbudpage *zbudpage = NULL, *zbudpag2;
	struct tmem_handle *thfrom;
	char *from_va;
	void *zbpg;
	unsigned size;
	int i, ret = 0;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
	struct list_head *lru_list =
		eph ? &zbud_eph_lru_list : &zbud_pers_lru_list;

	spin_lock_bh(lists_lock);
	if (list_empty(lru_list))
		goto out;
	list_for_each_entry_safe(zbudpage, zbudpag2, lru_list, lru) {
		/* skip a locked zbudpage */
		if (unlikely(!zbudpage_spin_trylock(zbudpage)))
			continue;
		/* skip an unevictable zbudpage */
		if (unlikely(zbudpage->unevictable != 0)) {
			zbudpage_spin_unlock(zbudpage);
			continue;
		}
		/* got a locked evictable page */
		goto zombify_page;
	}
	/* no unlocked evictable pages, give up */
	goto out;

zombify_page:
	/* got an unlocked evictable page, zombify it */
	list_del_init(&zbudpage->budlist);
	zbudpage_set_zombie(zbudpage);
	/* FIXME what accounting do I need to do here? */
	list_del_init(&zbudpage->lru);
	if (eph) {
		list_add_tail(&zbudpage->lru, &zbud_eph_zombie_list);
		zbud_eph_zombie_count =
				atomic_inc_return(&zbud_eph_zombie_atomic);
	} else {
		list_add_tail(&zbudpage->lru, &zbud_pers_zombie_list);
		zbud_pers_zombie_count =
				atomic_inc_return(&zbud_pers_zombie_atomic);
	}
	/* FIXME what accounting do I need to do here? */
	zbpg = kmap_zbudpage_atomic(zbudpage);
	for (i = 0; i < 2; i++) {
		size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
		if (size) {
			from_va = zbud_data(zbpg, i, size);
			thfrom = (struct tmem_handle *)from_va;
			from_va += sizeof(struct tmem_handle);
			size -= sizeof(struct tmem_handle);
			th[ret] = *thfrom;
			zsize[ret] = size;
			if (data != NULL)
				memcpy(data[ret], from_va, size);
			ret++;
		}
	}
	kunmap_zbudpage_atomic(zbpg);
	zbudpage_spin_unlock(zbudpage);
out:
	spin_unlock_bh(lists_lock);
	return ret;
}
void __init zbud_init(void)
{
	int i;

#ifdef CONFIG_DEBUG_FS
	zbud_debugfs_init();
#endif
	BUG_ON((sizeof(struct tmem_handle) * 2 > CHUNK_SIZE));
	BUG_ON(sizeof(struct zbudpage) > sizeof(struct page));
	for (i = 0; i < NCHUNKS; i++) {
		INIT_LIST_HEAD(&zbud_eph_unbuddied[i].list);
		INIT_LIST_HEAD(&zbud_pers_unbuddied[i].list);
	}
}