/*
 * zbud.c - Compression buddies allocator
 *
 * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
 *
 * Compression buddies ("zbud") provides for efficiently packing two
 * (or, possibly in the future, more) compressed pages ("zpages") into
 * a single "raw" pageframe and for tracking both zpages and pageframes
 * so that whole pageframes can be easily reclaimed in LRU-like order.
 * It is designed to be used in conjunction with transcendent memory
 * ("tmem"); for example, separate LRU lists are maintained for persistent
 * vs. ephemeral pages.
 *
 * A zbudpage is an overlay for a struct page and thus each zbudpage
 * refers to a physical pageframe of RAM.  When the caller passes a
 * struct page from the kernel's page allocator, zbud "transforms" it
 * to a zbudpage which sets/uses a different set of fields than the
 * struct-page and thus must "untransform" it back by reinitializing
 * certain fields before the struct-page can be freed.  The fields
 * of a zbudpage include a page lock for controlling access to the
 * corresponding pageframe, and there is a size field for each zpage.
 * Each zbudpage also lives on two linked lists: a "budlist" which is
 * used to support efficient buddying of zpages; and an "lru" which
 * is used for reclaiming pageframes in approximately least-recently-used
 * order.
 *
 * A zbudpageframe is a pageframe divided up into aligned 64-byte "chunks"
 * which contain the compressed data for zero, one, or two zbuds.  Stored
 * with the compressed data is a tmem_handle, a key that allows the same
 * data to be found via the tmem interface so the zpage can
 * be invalidated (for ephemeral pages) or repatriated to the swap cache
 * (for persistent pages).  The contents of a zbudpageframe must never
 * be accessed without holding the page lock for the corresponding
 * zbudpage and, to accommodate highmem machines, the contents may
 * only be examined or changed when kmapped.  Thus, when in use, a
 * kmapped zbudpageframe is referred to in the zbud code as "void *zbpg".
 *
 * Note that the term "zbud" refers to the combination of a zpage and
 * a tmem_handle that is stored as one of possibly two "buddied" zpages;
 * it also generically refers to this allocator... sorry for any confusion.
 *
 * A zbudref is a pointer to a struct zbudpage (which can be cast to a
 * struct page), with the LSB either cleared or set to indicate, respectively,
 * the first or second zpage in the zbudpageframe.  Since a zbudref can be
 * cast to a pointer, it is used as the tmem "pampd" pointer and uniquely
 * references a stored tmem page and so is the only zbud data structure
 * externally visible to zbud.c/zbud.h.
 *
 * Since we wish to reclaim entire pageframes but zpages may be randomly
 * added to and deleted from any given pageframe, we approximate LRU by
 * promoting a pageframe to MRU when a zpage is added to it, but
 * leaving it at the current place in the list when a zpage is deleted
 * from it.  As a side effect, zpages that are difficult to buddy (e.g.
 * very large pages) will be reclaimed faster than average, which seems
 * reasonable.
 *
 * In the current implementation, no more than two zpages may be stored in
 * any pageframe and no zpage ever crosses a pageframe boundary.  While
 * other zpage allocation mechanisms may allow greater density, this two
 * zpage-per-pageframe limit both ensures simple reclaim of pageframes
 * (including garbage collection of references to the contents of those
 * pageframes from tmem data structures) AND avoids the need for compaction.
 * With additional complexity, zbud could be modified to support storing
 * up to three zpages per pageframe or, to handle larger average zpages,
 * up to three zpages per pair of pageframes, but it is not clear if the
 * additional complexity would be worth it.  So consider it an exercise
 * for future developers.
 *
 * Note also that zbud does no page allocation or freeing.  This is so
 * that the caller has complete control over and, for accounting, visibility
 * into if/when pages are allocated and freed.
 *
 * Finally, note that zbud limits the size of zpages it can store; the
 * caller must check the zpage size with zbud_max_buddy_size() before
 * storing it, else BUGs will result.  User beware.
 */

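/*
 * A minimal sketch of the expected calling sequence, inferred from the
 * routines below (the actual callers live in zcache/ramster and are not
 * shown in this file):
 *
 *	if (size > zbud_max_buddy_size())
 *		reject;				// zpage too big for zbud
 *	zref = zbud_match_prep(th, eph, cdata, size);
 *	if (zref == NULL) {
 *		page = alloc_page(...);		// caller allocates pages
 *		zref = zbud_create_prep(th, eph, cdata, size, page);
 *	}
 *	if (zref != NULL)
 *		zbud_create_finish(zref, eph);	// make it evictable
 *
 * Data is later read back with zbud_decompress() or zbud_copy_from_zbud()
 * and freed with zbud_free_and_delist(), which returns the struct page
 * to the caller once a pageframe becomes empty.
 */
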
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/bug.h>
#include "tmem.h"
#include "zcache.h"
#include "zbud.h"

/*
 * We need to ensure that a struct zbudpage is never larger than a
 * struct page.  This is checked with a BUG_ON in zbud_init.
 *
 * The unevictable field indicates that a zbud is being added to the
 * zbudpage.  Since this is a two-phase process (due to tmem locking),
 * this field locks the zbudpage against eviction when a zbud match
 * or creation is in process.  Since this addition process may occur
 * in parallel for two zbuds in one zbudpage, the field is a counter
 * that must not exceed two.
 */
struct zbudpage {
        union {
                struct page page;
                struct {
                        unsigned long space_for_flags;
                        struct {
                                unsigned zbud0_size:12;
                                unsigned zbud1_size:12;
                                unsigned unevictable:2;
                        };
                        struct list_head budlist;
                        struct list_head lru;
                };
        };
};

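/*
 * Note: space_for_flags overlays page->flags in the union above; it is
 * what lets the helpers below (ab)use PG_locked, PG_reclaim and PG_dirty
 * on a zbudpage without clobbering the size/unevictable bitfields.
 */
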
struct zbudref {
        union {
                struct zbudpage *zbudpage;
                unsigned long zbudref;
        };
};

#define CHUNK_SHIFT	6
#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
#define CHUNK_MASK	(~(CHUNK_SIZE-1))
#define NCHUNKS		(PAGE_SIZE >> CHUNK_SHIFT)
#define MAX_CHUNK	(NCHUNKS-1)

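/*
 * Worked example (illustrative, not from the original source): with
 * PAGE_SIZE == 4096, CHUNK_SIZE is 64, NCHUNKS is 64 and MAX_CHUNK is 63,
 * so zbud_max_size() below is 63 * 64 = 4032 bytes, i.e. one chunk less
 * than a full page.
 */
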
/*
 * The following functions deal with the difference between struct
 * page and struct zbudpage.  Note the hack of using the pageflags
 * from struct page; this is to avoid duplicating all the complex
 * pageflag macros.
 */
static inline void zbudpage_spin_lock(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        while (unlikely(test_and_set_bit_lock(PG_locked, &page->flags))) {
                do {
                        cpu_relax();
                } while (test_bit(PG_locked, &page->flags));
        }
}

static inline void zbudpage_spin_unlock(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        clear_bit(PG_locked, &page->flags);
}

static inline int zbudpage_spin_trylock(struct zbudpage *zbudpage)
{
        return trylock_page((struct page *)zbudpage);
}

static inline int zbudpage_is_locked(struct zbudpage *zbudpage)
{
        return PageLocked((struct page *)zbudpage);
}

static inline void *kmap_zbudpage_atomic(struct zbudpage *zbudpage)
{
        return kmap_atomic((struct page *)zbudpage);
}

/*
 * A dying zbudpage is an ephemeral page in the process of being evicted.
 * Any data contained in the zbudpage is invalid and we are just waiting
 * for the tmem pampds to be invalidated before freeing the page.
 */
static inline int zbudpage_is_dying(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        return test_bit(PG_reclaim, &page->flags);
}

static inline void zbudpage_set_dying(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        set_bit(PG_reclaim, &page->flags);
}

static inline void zbudpage_clear_dying(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        clear_bit(PG_reclaim, &page->flags);
}

/*
 * A zombie zbudpage is a persistent page in the process of being evicted.
 * The data contained in the zbudpage is valid and we are just waiting
 * for the tmem pampds to be invalidated before freeing the page.
 */
static inline int zbudpage_is_zombie(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        return test_bit(PG_dirty, &page->flags);
}

static inline void zbudpage_set_zombie(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        set_bit(PG_dirty, &page->flags);
}

static inline void zbudpage_clear_zombie(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        clear_bit(PG_dirty, &page->flags);
}

static inline void kunmap_zbudpage_atomic(void *zbpg)
{
        kunmap_atomic(zbpg);
}

/*
 * zbud "translation" and helper functions
 */

static inline struct zbudpage *zbudref_to_zbudpage(struct zbudref *zref)
{
        unsigned long zbud = (unsigned long)zref;
        zbud &= ~1UL;
        return (struct zbudpage *)zbud;
}

static inline struct zbudref *zbudpage_to_zbudref(struct zbudpage *zbudpage,
                                                  unsigned budnum)
{
        unsigned long zbud = (unsigned long)zbudpage;
        BUG_ON(budnum > 1);
        zbud |= budnum;
        return (struct zbudref *)zbud;
}

static inline int zbudref_budnum(struct zbudref *zbudref)
{
        unsigned long zbud = (unsigned long)zbudref;
        return zbud & 1UL;
}

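/*
 * Illustration (hypothetical address, not from the original source): for
 * a zbudpage at 0xffffea0000001000, zbudpage_to_zbudref(p, 0) yields
 * 0xffffea0000001000 and zbudpage_to_zbudref(p, 1) yields
 * 0xffffea0000001001; since a struct page is always at least word
 * aligned, the low bit is free to carry the buddy number.
 */
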
static inline unsigned zbud_max_size(void)
{
        return MAX_CHUNK << CHUNK_SHIFT;
}

static inline unsigned zbud_size_to_chunks(unsigned size)
{
        BUG_ON(size == 0 || size > zbud_max_size());
        return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}

/* can only be used between kmap_zbudpage_atomic/kunmap_zbudpage_atomic! */
static inline char *zbud_data(void *zbpg,
                              unsigned budnum, unsigned size)
{
        char *p;

        BUG_ON(size == 0 || size > zbud_max_size());
        p = (char *)zbpg;
        if (budnum == 1)
                p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
        return p;
}

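/*
 * Worked example (assuming a 4K page, not from the original source):
 * zbud 0 always starts at offset 0; for a 700-byte zbud 1, the size is
 * rounded up to 704 bytes (11 chunks), so zbud_data() returns offset
 * 4096 - 704 = 3392.  The two buddies thus grow toward each other from
 * opposite ends of the pageframe.
 */
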
/*
 * These are all informative and exposed through debugfs... except for
 * the arrays... anyone know how to do that?  To avoid confusion for
 * debugfs viewers, some of these should also be atomic_long_t, but
 * I don't know how to expose atomics via debugfs either...
 */
static unsigned long zbud_eph_pageframes;
static unsigned long zbud_pers_pageframes;
static unsigned long zbud_eph_zpages;
static unsigned long zbud_pers_zpages;
static u64 zbud_eph_zbytes;
static u64 zbud_pers_zbytes;
static unsigned long zbud_eph_evicted_pageframes;
static unsigned long zbud_pers_evicted_pageframes;
static unsigned long zbud_eph_cumul_zpages;
static unsigned long zbud_pers_cumul_zpages;
static u64 zbud_eph_cumul_zbytes;
static u64 zbud_pers_cumul_zbytes;
static unsigned long zbud_eph_cumul_chunk_counts[NCHUNKS];
static unsigned long zbud_pers_cumul_chunk_counts[NCHUNKS];
static unsigned long zbud_eph_buddied_count;
static unsigned long zbud_pers_buddied_count;
static unsigned long zbud_eph_unbuddied_count;
static unsigned long zbud_pers_unbuddied_count;
static unsigned long zbud_eph_zombie_count;
static unsigned long zbud_pers_zombie_count;
static atomic_t zbud_eph_zombie_atomic;
static atomic_t zbud_pers_zombie_atomic;

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#define zdfs	debugfs_create_size_t
#define zdfs64	debugfs_create_u64
static int zbud_debugfs_init(void)
{
        struct dentry *root = debugfs_create_dir("zbud", NULL);
        if (root == NULL)
                return -ENXIO;

        /*
         * would be nice to dump the sizes of the unbuddied
         * arrays, like was done with sysfs, but it doesn't
         * look like debugfs is flexible enough to do that
         */
        zdfs64("eph_zbytes", S_IRUGO, root, &zbud_eph_zbytes);
        zdfs64("eph_cumul_zbytes", S_IRUGO, root, &zbud_eph_cumul_zbytes);
        zdfs64("pers_zbytes", S_IRUGO, root, &zbud_pers_zbytes);
        zdfs64("pers_cumul_zbytes", S_IRUGO, root, &zbud_pers_cumul_zbytes);
        zdfs("eph_cumul_zpages", S_IRUGO, root, &zbud_eph_cumul_zpages);
        zdfs("eph_evicted_pageframes", S_IRUGO, root,
                &zbud_eph_evicted_pageframes);
        zdfs("eph_zpages", S_IRUGO, root, &zbud_eph_zpages);
        zdfs("eph_pageframes", S_IRUGO, root, &zbud_eph_pageframes);
        zdfs("eph_buddied_count", S_IRUGO, root, &zbud_eph_buddied_count);
        zdfs("eph_unbuddied_count", S_IRUGO, root, &zbud_eph_unbuddied_count);
        zdfs("pers_cumul_zpages", S_IRUGO, root, &zbud_pers_cumul_zpages);
        zdfs("pers_evicted_pageframes", S_IRUGO, root,
                &zbud_pers_evicted_pageframes);
        zdfs("pers_zpages", S_IRUGO, root, &zbud_pers_zpages);
        zdfs("pers_pageframes", S_IRUGO, root, &zbud_pers_pageframes);
        zdfs("pers_buddied_count", S_IRUGO, root, &zbud_pers_buddied_count);
        zdfs("pers_unbuddied_count", S_IRUGO, root, &zbud_pers_unbuddied_count);
        zdfs("pers_zombie_count", S_IRUGO, root, &zbud_pers_zombie_count);
        return 0;
}
#undef zdfs
#undef zdfs64
#endif
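
/*
 * With debugfs mounted in the usual place (an assumption about the
 * system configuration, not part of this file), the counters above can
 * be read with, e.g.:
 *
 *	cat /sys/kernel/debug/zbud/eph_pageframes
 *	cat /sys/kernel/debug/zbud/pers_zbytes
 */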

/* protects the buddied list and all unbuddied lists */
static DEFINE_SPINLOCK(zbud_eph_lists_lock);
static DEFINE_SPINLOCK(zbud_pers_lists_lock);

struct zbud_unbuddied {
        struct list_head list;
        unsigned count;
};

/* list N contains pages with N chunks USED and NCHUNKS-N unused */
/* element 0 is never used but optimizing that isn't worth it */
static struct zbud_unbuddied zbud_eph_unbuddied[NCHUNKS];
static struct zbud_unbuddied zbud_pers_unbuddied[NCHUNKS];
static LIST_HEAD(zbud_eph_lru_list);
static LIST_HEAD(zbud_pers_lru_list);
static LIST_HEAD(zbud_eph_buddied_list);
static LIST_HEAD(zbud_pers_buddied_list);
static LIST_HEAD(zbud_eph_zombie_list);
static LIST_HEAD(zbud_pers_zombie_list);

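/*
 * Summary of the list topology (one copy each for ephemeral and
 * persistent pages): an unbuddied zbudpage has its budlist on exactly
 * one unbuddied[nchunks] list, a full zbudpage has its budlist on the
 * buddied list, and every in-use zbudpage is on an lru list until it
 * is zombified, at which point its lru field moves to a zombie list
 * while it awaits tmem invalidation.
 */
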
/*
 * Given a struct page, transform it to a zbudpage so that it can be
 * used by zbud and initialize fields as necessary.
 */
static inline struct zbudpage *zbud_init_zbudpage(struct page *page, bool eph)
{
        struct zbudpage *zbudpage = (struct zbudpage *)page;

        BUG_ON(page == NULL);
        INIT_LIST_HEAD(&zbudpage->budlist);
        INIT_LIST_HEAD(&zbudpage->lru);
        zbudpage->zbud0_size = 0;
        zbudpage->zbud1_size = 0;
        zbudpage->unevictable = 0;
        if (eph)
                zbud_eph_pageframes++;
        else
                zbud_pers_pageframes++;
        return zbudpage;
}

/* "Transform" a zbudpage back to a struct page suitable to free. */
static inline struct page *zbud_unuse_zbudpage(struct zbudpage *zbudpage,
                                               bool eph)
{
        struct page *page = (struct page *)zbudpage;

        BUG_ON(!list_empty(&zbudpage->budlist));
        BUG_ON(!list_empty(&zbudpage->lru));
        BUG_ON(zbudpage->zbud0_size != 0);
        BUG_ON(zbudpage->zbud1_size != 0);
        BUG_ON(!PageLocked(page));
        BUG_ON(zbudpage->unevictable != 0);
        BUG_ON(zbudpage_is_dying(zbudpage));
        BUG_ON(zbudpage_is_zombie(zbudpage));
        if (eph)
                zbud_eph_pageframes--;
        else
                zbud_pers_pageframes--;
        zbudpage_spin_unlock(zbudpage);
        reset_page_mapcount(page);
        init_page_count(page);
        page->index = 0;
        return page;
}

/* Mark a zbud as unused and do accounting */
static inline void zbud_unuse_zbud(struct zbudpage *zbudpage,
                                   int budnum, bool eph)
{
        unsigned size;

        BUG_ON(!zbudpage_is_locked(zbudpage));
        if (budnum == 0) {
                size = zbudpage->zbud0_size;
                zbudpage->zbud0_size = 0;
        } else {
                size = zbudpage->zbud1_size;
                zbudpage->zbud1_size = 0;
        }
        if (eph) {
                zbud_eph_zbytes -= size;
                zbud_eph_zpages--;
        } else {
                zbud_pers_zbytes -= size;
                zbud_pers_zpages--;
        }
}

/*
 * Given a zbudpage/budnum/size, a tmem handle, and a kmapped pointer
 * to some data, set up the zbud appropriately including data copying
 * and accounting.  Note that if cdata is NULL, the data copying is
 * skipped.  (This is useful for lazy writes such as for RAMster.)
 */
static void zbud_init_zbud(struct zbudpage *zbudpage, struct tmem_handle *th,
                           bool eph, void *cdata,
                           unsigned budnum, unsigned size)
{
        char *to;
        void *zbpg;
        struct tmem_handle *to_th;
        unsigned nchunks = zbud_size_to_chunks(size);

        BUG_ON(!zbudpage_is_locked(zbudpage));
        zbpg = kmap_zbudpage_atomic(zbudpage);
        to = zbud_data(zbpg, budnum, size);
        to_th = (struct tmem_handle *)to;
        to_th->index = th->index;
        to_th->oid = th->oid;
        to_th->pool_id = th->pool_id;
        to_th->client_id = th->client_id;
        to += sizeof(struct tmem_handle);
        if (cdata != NULL)
                memcpy(to, cdata, size - sizeof(struct tmem_handle));
        kunmap_zbudpage_atomic(zbpg);
        if (budnum == 0)
                zbudpage->zbud0_size = size;
        else
                zbudpage->zbud1_size = size;
        if (eph) {
                zbud_eph_cumul_chunk_counts[nchunks]++;
                zbud_eph_zpages++;
                zbud_eph_cumul_zpages++;
                zbud_eph_zbytes += size;
                zbud_eph_cumul_zbytes += size;
        } else {
                zbud_pers_cumul_chunk_counts[nchunks]++;
                zbud_pers_zpages++;
                zbud_pers_cumul_zpages++;
                zbud_pers_zbytes += size;
                zbud_pers_cumul_zbytes += size;
        }
}

/*
 * Given a locked dying zbudpage, read out the tmem handles from the data,
 * unlock the page, then use the handles to tell tmem to flush out its
 * references.
 */
static void zbud_evict_tmem(struct zbudpage *zbudpage)
{
        int i, j;
        uint32_t pool_id[2], client_id[2];
        uint32_t index[2];
        struct tmem_oid oid[2];
        struct tmem_pool *pool;
        void *zbpg;
        struct tmem_handle *th;
        unsigned size;

        /* read out the tmem handles from the data and set aside */
        zbpg = kmap_zbudpage_atomic(zbudpage);
        for (i = 0, j = 0; i < 2; i++) {
                size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
                if (size) {
                        th = (struct tmem_handle *)zbud_data(zbpg, i, size);
                        client_id[j] = th->client_id;
                        pool_id[j] = th->pool_id;
                        oid[j] = th->oid;
                        index[j] = th->index;
                        j++;
                        zbud_unuse_zbud(zbudpage, i, true);
                }
        }
        kunmap_zbudpage_atomic(zbpg);
        zbudpage_spin_unlock(zbudpage);
        /* zbudpage is now an unlocked dying... tell tmem to flush pointers */
        for (i = 0; i < j; i++) {
                pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
                if (pool != NULL) {
                        tmem_flush_page(pool, &oid[i], index[i]);
                        zcache_put_pool(pool);
                }
        }
}

/*
 * Externally callable zbud handling routines.
 */

/*
 * Return the maximum size compressed page that can be stored (secretly
 * setting aside space for the tmem handle).
 */
unsigned int zbud_max_buddy_size(void)
{
        return zbud_max_size() - sizeof(struct tmem_handle);
}

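/*
 * For example (assuming a 4K page): zbud_max_size() is 4032, so the
 * largest storable zpage is 4032 bytes minus sizeof(struct tmem_handle),
 * whose exact value is defined in tmem.h.
 */
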
/*
 * Given a zbud reference, free the corresponding zbud from all lists,
 * mark it as unused, do accounting, and if the freeing of the zbud
 * frees up an entire pageframe, return it to the caller (else NULL).
 */
struct page *zbud_free_and_delist(struct zbudref *zref, bool eph,
                                  unsigned int *zsize, unsigned int *zpages)
{
        unsigned long budnum = zbudref_budnum(zref);
        struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
        struct page *page = NULL;
        unsigned chunks, bud_size, other_bud_size;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
        struct zbud_unbuddied *unbud =
                eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;

        spin_lock(lists_lock);
        zbudpage_spin_lock(zbudpage);
        if (zbudpage_is_dying(zbudpage)) {
                /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
                zbudpage_spin_unlock(zbudpage);
                spin_unlock(lists_lock);
                *zpages = 0;
                *zsize = 0;
                goto out;
        }
        if (budnum == 0) {
                bud_size = zbudpage->zbud0_size;
                other_bud_size = zbudpage->zbud1_size;
        } else {
                bud_size = zbudpage->zbud1_size;
                other_bud_size = zbudpage->zbud0_size;
        }
        *zsize = bud_size - sizeof(struct tmem_handle);
        *zpages = 1;
        zbud_unuse_zbud(zbudpage, budnum, eph);
        if (other_bud_size == 0) { /* was unbuddied: unlist and free */
                chunks = zbud_size_to_chunks(bud_size);
                if (zbudpage_is_zombie(zbudpage)) {
                        if (eph)
                                zbud_eph_zombie_count =
                                  atomic_dec_return(&zbud_eph_zombie_atomic);
                        else
                                zbud_pers_zombie_count =
                                  atomic_dec_return(&zbud_pers_zombie_atomic);
                        zbudpage_clear_zombie(zbudpage);
                } else {
                        BUG_ON(list_empty(&unbud[chunks].list));
                        list_del_init(&zbudpage->budlist);
                        unbud[chunks].count--;
                }
                list_del_init(&zbudpage->lru);
                spin_unlock(lists_lock);
                if (eph)
                        zbud_eph_unbuddied_count--;
                else
                        zbud_pers_unbuddied_count--;
                page = zbud_unuse_zbudpage(zbudpage, eph);
        } else { /* was buddied: move remaining buddy to unbuddied list */
                chunks = zbud_size_to_chunks(other_bud_size);
                if (!zbudpage_is_zombie(zbudpage)) {
                        list_del_init(&zbudpage->budlist);
                        list_add_tail(&zbudpage->budlist, &unbud[chunks].list);
                        unbud[chunks].count++;
                }
                if (eph) {
                        zbud_eph_buddied_count--;
                        zbud_eph_unbuddied_count++;
                } else {
                        zbud_pers_unbuddied_count++;
                        zbud_pers_buddied_count--;
                }
                /* don't mess with lru, no need to move it */
                zbudpage_spin_unlock(zbudpage);
                spin_unlock(lists_lock);
        }
out:
        return page;
}

/*
 * Given a tmem handle, and a kmapped pointer to compressed data of
 * the given size, try to find an unbuddied zbudpage in which to
 * create a zbud.  If found, put it there, mark the zbudpage unevictable,
 * and return a zbudref to it.  Else return NULL.
 */
struct zbudref *zbud_match_prep(struct tmem_handle *th, bool eph,
                                void *cdata, unsigned size)
{
        struct zbudpage *zbudpage = NULL, *zbudpage2;
        unsigned long budnum = 0UL;
        unsigned nchunks;
        int i, found_good_buddy = 0;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
        struct zbud_unbuddied *unbud =
                eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;

        size += sizeof(struct tmem_handle);
        nchunks = zbud_size_to_chunks(size);
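        /*
         * Scan the unbuddied lists for a page whose free space can hold
         * nchunks, trying the fullest candidates (largest used-chunk
         * count, i.e. tightest fit) first.
         */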
        for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
                spin_lock(lists_lock);
                if (!list_empty(&unbud[i].list)) {
                        list_for_each_entry_safe(zbudpage, zbudpage2,
                                                 &unbud[i].list, budlist) {
                                if (zbudpage_spin_trylock(zbudpage)) {
                                        found_good_buddy = i;
                                        goto found_unbuddied;
                                }
                        }
                }
                spin_unlock(lists_lock);
        }
        zbudpage = NULL;
        goto out;

found_unbuddied:
        BUG_ON(!zbudpage_is_locked(zbudpage));
        BUG_ON(!((zbudpage->zbud0_size == 0) ^ (zbudpage->zbud1_size == 0)));
        if (zbudpage->zbud0_size == 0)
                budnum = 0UL;
        else if (zbudpage->zbud1_size == 0)
                budnum = 1UL;
        list_del_init(&zbudpage->budlist);
        if (eph) {
                list_add_tail(&zbudpage->budlist, &zbud_eph_buddied_list);
                unbud[found_good_buddy].count--;
                zbud_eph_unbuddied_count--;
                zbud_eph_buddied_count++;
                /* "promote" raw zbudpage to most-recently-used */
                list_del_init(&zbudpage->lru);
                list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
        } else {
                list_add_tail(&zbudpage->budlist, &zbud_pers_buddied_list);
                unbud[found_good_buddy].count--;
                zbud_pers_unbuddied_count--;
                zbud_pers_buddied_count++;
                /* "promote" raw zbudpage to most-recently-used */
                list_del_init(&zbudpage->lru);
                list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
        }
        zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
        zbudpage->unevictable++;
        BUG_ON(zbudpage->unevictable == 3);
        zbudpage_spin_unlock(zbudpage);
        spin_unlock(lists_lock);
out:
        return zbudpage_to_zbudref(zbudpage, budnum);
}

/*
 * Given a tmem handle, and a kmapped pointer to compressed data of
 * the given size, and a newly allocated struct page, create an unevictable
 * zbud in that new page and return a zbudref to it.
 */
struct zbudref *zbud_create_prep(struct tmem_handle *th, bool eph,
                                 void *cdata, unsigned size,
                                 struct page *newpage)
{
        struct zbudpage *zbudpage;
        unsigned long budnum = 0;
        unsigned nchunks;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
        struct zbud_unbuddied *unbud =
                eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;

#if 0
        /* this may be worth it later to support decompress-in-place? */
        static unsigned long counter;
        budnum = counter++ & 1; /* alternate using zbud0 and zbud1 */
#endif

        if (size > zbud_max_buddy_size())
                return NULL;
        if (newpage == NULL)
                return NULL;

        size += sizeof(struct tmem_handle);
        nchunks = zbud_size_to_chunks(size);
        spin_lock(lists_lock);
        zbudpage = zbud_init_zbudpage(newpage, eph);
        zbudpage_spin_lock(zbudpage);
        list_add_tail(&zbudpage->budlist, &unbud[nchunks].list);
        if (eph) {
                list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
                zbud_eph_unbuddied_count++;
        } else {
                list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
                zbud_pers_unbuddied_count++;
        }
        unbud[nchunks].count++;
        zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
        zbudpage->unevictable++;
        BUG_ON(zbudpage->unevictable == 3);
        zbudpage_spin_unlock(zbudpage);
        spin_unlock(lists_lock);
        return zbudpage_to_zbudref(zbudpage, budnum);
}

/*
 * Finish creation of a zbud by, assuming another zbud isn't being created
 * in parallel, marking it evictable.
 */
void zbud_create_finish(struct zbudref *zref, bool eph)
{
        struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

        spin_lock(lists_lock);
        zbudpage_spin_lock(zbudpage);
        BUG_ON(zbudpage_is_dying(zbudpage));
        zbudpage->unevictable--;
        BUG_ON((int)zbudpage->unevictable < 0);
        zbudpage_spin_unlock(zbudpage);
        spin_unlock(lists_lock);
}

/*
 * Given a zbudref and a struct page, decompress the data from
 * the zbud into the physical page represented by the struct page
 * by upcalling to zcache_decompress.
 */
int zbud_decompress(struct page *data_page, struct zbudref *zref, bool eph,
                    void (*decompress)(char *, unsigned int, char *))
{
        struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
        unsigned long budnum = zbudref_budnum(zref);
        void *zbpg;
        char *to_va, *from_va;
        unsigned size;
        int ret = -1;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

        spin_lock(lists_lock);
        zbudpage_spin_lock(zbudpage);
        if (zbudpage_is_dying(zbudpage)) {
                /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
                goto out;
        }
        zbpg = kmap_zbudpage_atomic(zbudpage);
        to_va = kmap_atomic(data_page);
        if (budnum == 0)
                size = zbudpage->zbud0_size;
        else
                size = zbudpage->zbud1_size;
        BUG_ON(size == 0 || size > zbud_max_size());
        from_va = zbud_data(zbpg, budnum, size);
        from_va += sizeof(struct tmem_handle);
        size -= sizeof(struct tmem_handle);
        decompress(from_va, size, to_va);
        kunmap_atomic(to_va);
        kunmap_zbudpage_atomic(zbpg);
        ret = 0;
out:
        zbudpage_spin_unlock(zbudpage);
        spin_unlock(lists_lock);
        return ret;
}

/*
 * Given a zbudref and a kernel pointer, copy the data from
 * the zbud to the kernel pointer.
 */
int zbud_copy_from_zbud(char *to_va, struct zbudref *zref,
                        size_t *sizep, bool eph)
{
        struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
        unsigned long budnum = zbudref_budnum(zref);
        void *zbpg;
        char *from_va;
        unsigned size;
        int ret = -1;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

        spin_lock(lists_lock);
        zbudpage_spin_lock(zbudpage);
        if (zbudpage_is_dying(zbudpage)) {
                /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
                goto out;
        }
        zbpg = kmap_zbudpage_atomic(zbudpage);
        if (budnum == 0)
                size = zbudpage->zbud0_size;
        else
                size = zbudpage->zbud1_size;
        BUG_ON(size == 0 || size > zbud_max_size());
        from_va = zbud_data(zbpg, budnum, size);
        from_va += sizeof(struct tmem_handle);
        size -= sizeof(struct tmem_handle);
        *sizep = size;
        memcpy(to_va, from_va, size);
        kunmap_zbudpage_atomic(zbpg);
        ret = 0;
out:
        zbudpage_spin_unlock(zbudpage);
        spin_unlock(lists_lock);
        return ret;
}

/*
 * Given a zbudref and a kernel pointer, copy the data from
 * the kernel pointer to the zbud.
 */
int zbud_copy_to_zbud(struct zbudref *zref, char *from_va, bool eph)
{
        struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
        unsigned long budnum = zbudref_budnum(zref);
        void *zbpg;
        char *to_va;
        unsigned size;
        int ret = -1;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

        spin_lock(lists_lock);
        zbudpage_spin_lock(zbudpage);
        if (zbudpage_is_dying(zbudpage)) {
                /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
                goto out;
        }
        zbpg = kmap_zbudpage_atomic(zbudpage);
        if (budnum == 0)
                size = zbudpage->zbud0_size;
        else
                size = zbudpage->zbud1_size;
        BUG_ON(size == 0 || size > zbud_max_size());
        to_va = zbud_data(zbpg, budnum, size);
        to_va += sizeof(struct tmem_handle);
        size -= sizeof(struct tmem_handle);
        memcpy(to_va, from_va, size);
        kunmap_zbudpage_atomic(zbpg);
        ret = 0;
out:
        zbudpage_spin_unlock(zbudpage);
        spin_unlock(lists_lock);
        return ret;
}

/*
 * Choose an ephemeral LRU zbudpage that is evictable (not locked), ensure
 * there are no references to it remaining, and return the now unused
 * (and re-init'ed) struct page and the total amount of compressed
 * data that was evicted.
 */
struct page *zbud_evict_pageframe_lru(unsigned int *zsize, unsigned int *zpages)
{
        struct zbudpage *zbudpage = NULL, *zbudpage2;
        struct zbud_unbuddied *unbud = zbud_eph_unbuddied;
        struct page *page = NULL;
        bool irqs_disabled = irqs_disabled();

        /*
         * Since this can be called indirectly from cleancache_put, which
         * has interrupts disabled, as well as frontswap_put, which does not,
         * we need to be able to handle both cases, even though it is ugly.
         */
        if (irqs_disabled)
                spin_lock(&zbud_eph_lists_lock);
        else
                spin_lock_bh(&zbud_eph_lists_lock);
        *zsize = 0;
        if (list_empty(&zbud_eph_lru_list))
                goto unlock_out;
        list_for_each_entry_safe(zbudpage, zbudpage2, &zbud_eph_lru_list, lru) {
                /* skip a locked zbudpage */
                if (unlikely(!zbudpage_spin_trylock(zbudpage)))
                        continue;
                /* skip an unevictable zbudpage */
                if (unlikely(zbudpage->unevictable != 0)) {
                        zbudpage_spin_unlock(zbudpage);
                        continue;
                }
                /* got a locked evictable page */
                goto evict_page;
        }
unlock_out:
        /* no unlocked evictable pages, give up */
        if (irqs_disabled)
                spin_unlock(&zbud_eph_lists_lock);
        else
                spin_unlock_bh(&zbud_eph_lists_lock);
        goto out;

evict_page:
        list_del_init(&zbudpage->budlist);
        list_del_init(&zbudpage->lru);
        zbudpage_set_dying(zbudpage);
        /*
         * the zbudpage is now "dying" and attempts to read, write,
         * or delete data from it will be ignored
         */
        if (zbudpage->zbud0_size != 0 && zbudpage->zbud1_size != 0) {
                *zsize = zbudpage->zbud0_size + zbudpage->zbud1_size -
                                (2 * sizeof(struct tmem_handle));
                *zpages = 2;
        } else if (zbudpage->zbud0_size != 0) {
                unbud[zbud_size_to_chunks(zbudpage->zbud0_size)].count--;
                *zsize = zbudpage->zbud0_size - sizeof(struct tmem_handle);
                *zpages = 1;
        } else if (zbudpage->zbud1_size != 0) {
                unbud[zbud_size_to_chunks(zbudpage->zbud1_size)].count--;
                *zsize = zbudpage->zbud1_size - sizeof(struct tmem_handle);
                *zpages = 1;
        } else {
                BUG();
        }
        spin_unlock(&zbud_eph_lists_lock);
        zbud_eph_evicted_pageframes++;
        if (*zpages == 1)
                zbud_eph_unbuddied_count--;
        else
                zbud_eph_buddied_count--;
        zbud_evict_tmem(zbudpage);
        zbudpage_spin_lock(zbudpage);
        zbudpage_clear_dying(zbudpage);
        page = zbud_unuse_zbudpage(zbudpage, true);
        if (!irqs_disabled)
                local_bh_enable();
out:
        return page;
}

/*
 * Choose a persistent LRU zbudpage that is evictable (not locked), zombify it,
 * read the tmem_handle(s) out of it into the passed array, and return the
 * number of zbuds.  Caller must perform necessary tmem functions and,
 * indirectly, zbud functions to fetch any valid data and cause the
 * now-zombified zbudpage to eventually be freed.  We track the zombified
 * zbudpage count so it is possible to observe if there is a leak.
 * FIXME: describe (ramster) case where data pointers are passed in for memcpy
 */
unsigned int zbud_make_zombie_lru(struct tmem_handle *th, unsigned char **data,
                                  unsigned int *zsize, bool eph)
{
        struct zbudpage *zbudpage = NULL, *zbudpage2;
        struct tmem_handle *thfrom;
        char *from_va;
        void *zbpg;
        unsigned size;
        int ret = 0, i;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
        struct list_head *lru_list =
                eph ? &zbud_eph_lru_list : &zbud_pers_lru_list;

        spin_lock_bh(lists_lock);
        if (list_empty(lru_list))
                goto out;
        list_for_each_entry_safe(zbudpage, zbudpage2, lru_list, lru) {
                /* skip a locked zbudpage */
                if (unlikely(!zbudpage_spin_trylock(zbudpage)))
                        continue;
                /* skip an unevictable zbudpage */
                if (unlikely(zbudpage->unevictable != 0)) {
                        zbudpage_spin_unlock(zbudpage);
                        continue;
                }
                /* got a locked evictable page */
                goto zombify_page;
        }
        /* no unlocked evictable pages, give up */
        goto out;

zombify_page:
        /* got an unlocked evictable page, zombify it */
        list_del_init(&zbudpage->budlist);
        zbudpage_set_zombie(zbudpage);
        /* FIXME what accounting do I need to do here? */
        list_del_init(&zbudpage->lru);
        if (eph) {
                list_add_tail(&zbudpage->lru, &zbud_eph_zombie_list);
                zbud_eph_zombie_count =
                                atomic_inc_return(&zbud_eph_zombie_atomic);
        } else {
                list_add_tail(&zbudpage->lru, &zbud_pers_zombie_list);
                zbud_pers_zombie_count =
                                atomic_inc_return(&zbud_pers_zombie_atomic);
        }
        /* FIXME what accounting do I need to do here? */
        zbpg = kmap_zbudpage_atomic(zbudpage);
        for (i = 0; i < 2; i++) {
                size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
                if (size) {
                        from_va = zbud_data(zbpg, i, size);
                        thfrom = (struct tmem_handle *)from_va;
                        from_va += sizeof(struct tmem_handle);
                        size -= sizeof(struct tmem_handle);
                        if (th != NULL)
                                th[ret] = *thfrom;
                        if (data != NULL)
                                memcpy(data[ret], from_va, size);
                        if (zsize != NULL)
                                *zsize++ = size;
                        ret++;
                }
        }
        kunmap_zbudpage_atomic(zbpg);
        zbudpage_spin_unlock(zbudpage);
out:
        spin_unlock_bh(lists_lock);
        return ret;
}

void __init zbud_init(void)
{
        int i;

#ifdef CONFIG_DEBUG_FS
        zbud_debugfs_init();
#endif
        BUG_ON((sizeof(struct tmem_handle) * 2 > CHUNK_SIZE));
        BUG_ON(sizeof(struct zbudpage) > sizeof(struct page));
        for (i = 0; i < NCHUNKS; i++) {
                INIT_LIST_HEAD(&zbud_eph_unbuddied[i].list);
                INIT_LIST_HEAD(&zbud_pers_unbuddied[i].list);
        }
}