/*
 * zsmalloc memory allocator
 *
 * Copyright (C) 2011  Nitin Gupta
 *
 * This code is released using a dual license strategy: BSD/GPL
 * You can choose the license that better fits your requirements.
 *
 * Released under the terms of 3-clause BSD License
 * Released under the terms of GNU General Public License Version 2.0
 */
/*
 * This allocator is designed for use with zcache and zram. Thus, the
 * allocator is supposed to work well under low memory conditions. In
 * particular, it never attempts higher order page allocation which is
 * very likely to fail under memory pressure. On the other hand, if we
 * just use single (0-order) pages, it would suffer from very high
 * fragmentation -- any object of size PAGE_SIZE/2 or larger would occupy
 * an entire page. This was one of the major issues with its predecessor
 * (xvmalloc).
 *
 * To overcome these issues, zsmalloc allocates a bunch of 0-order pages
 * and links them together using various 'struct page' fields. These linked
 * pages act as a single higher-order page i.e. an object can span 0-order
 * page boundaries. The code refers to these linked pages as a single entity
 * called zspage.
 *
 * Following is how we use various fields and flags of underlying
 * struct page(s) to form a zspage.
 *
 * Usage of struct page fields:
 *	page->first_page: points to the first component (0-order) page
 *	page->index (union with page->freelist): offset of the first object
 *		starting in this page. For the first page, this is
 *		always 0, so we use this field (aka freelist) to point
 *		to the first free object in zspage.
 *	page->lru: links together all component pages (except the first page)
 *		of a zspage
 *
 *	For _first_ page only:
 *
 *	page->private (union with page->first_page): refers to the
 *		component page after the first page
 *	page->freelist: points to the first free object in zspage.
 *		Free objects are linked together using in-place
 *		metadata.
 *	page->objects: maximum number of objects we can store in this
 *		zspage (class->zspage_order * PAGE_SIZE / class->size)
 *	page->lru: links together first pages of various zspages.
 *		Basically forming list of zspages in a fullness group.
 *	page->mapping: class index and fullness group of the zspage
 *
 * Usage of struct page flags:
 *	PG_private: identifies the first component page
 *	PG_private2: identifies the last component page
 */
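/*
 * Illustrative layout (an editor's sketch, not from the original source;
 * assumes PAGE_SIZE = 4096): a size class of 1536 bytes uses a 3-page
 * zspage holding 8 objects. Objects 0-2 start in page 0 (object 2 spills
 * its last 512 bytes into page 1), so page 1's ->index is 512, and
 * likewise page 2's ->index is 1024. Only the head page carries
 * PG_private, the freelist head, ->objects and the class/fullness
 * encoding.
 */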
#ifdef CONFIG_ZSMALLOC_DEBUG
#define DEBUG
#endif

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/vmalloc.h>

#include "zsmalloc.h"
#include "zsmalloc_int.h"
/*
 * A zspage's class index and fullness group
 * are encoded in its (first)page->mapping
 */
#define CLASS_IDX_BITS	28
#define FULLNESS_BITS	4
#define CLASS_IDX_MASK	((1 << CLASS_IDX_BITS) - 1)
#define FULLNESS_MASK	((1 << FULLNESS_BITS) - 1)

/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
static int is_first_page(struct page *page)
{
	return PagePrivate(page);
}

static int is_last_page(struct page *page)
{
	return PagePrivate2(page);
}
static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
				enum fullness_group *fullness)
{
	unsigned long m;
	BUG_ON(!is_first_page(page));

	m = (unsigned long)page->mapping;
	*fullness = m & FULLNESS_MASK;
	*class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
}
static void set_zspage_mapping(struct page *page, unsigned int class_idx,
				enum fullness_group fullness)
{
	unsigned long m;
	BUG_ON(!is_first_page(page));

	m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
			(fullness & FULLNESS_MASK);
	page->mapping = (struct address_space *)m;
}
static int get_size_class_index(int size)
{
	int idx = 0;

	if (likely(size > ZS_MIN_ALLOC_SIZE))
		idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
				ZS_SIZE_CLASS_DELTA);

	return idx;
}
static enum fullness_group get_fullness_group(struct page *page)
{
	int inuse, max_objects;
	enum fullness_group fg;
	BUG_ON(!is_first_page(page));

	inuse = page->inuse;
	max_objects = page->objects;

	if (inuse == 0)
		fg = ZS_EMPTY;
	else if (inuse == max_objects)
		fg = ZS_FULL;
	else if (inuse <= max_objects / fullness_threshold_frac)
		fg = ZS_ALMOST_EMPTY;
	else
		fg = ZS_ALMOST_FULL;

	return fg;
}
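/*
 * Example classification (illustrative; assumes fullness_threshold_frac
 * is 4, per zsmalloc_int.h): a zspage with 8 object slots is ZS_EMPTY at
 * inuse = 0, ZS_FULL at inuse = 8, ZS_ALMOST_EMPTY at 1 <= inuse <= 2,
 * and ZS_ALMOST_FULL otherwise.
 */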
static void insert_zspage(struct page *page, struct size_class *class,
				enum fullness_group fullness)
{
	struct page **head;

	BUG_ON(!is_first_page(page));

	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
		return;

	head = &class->fullness_list[fullness];
	if (*head)
		list_add_tail(&page->lru, &(*head)->lru);

	*head = page;
}
static void remove_zspage(struct page *page, struct size_class *class,
				enum fullness_group fullness)
{
	struct page **head;

	BUG_ON(!is_first_page(page));

	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
		return;

	head = &class->fullness_list[fullness];
	BUG_ON(!*head);
	if (list_empty(&(*head)->lru))
		*head = NULL;
	else if (*head == page)
		*head = (struct page *)list_entry((*head)->lru.next,
					struct page, lru);

	list_del_init(&page->lru);
}
static enum fullness_group fix_fullness_group(struct zs_pool *pool,
						struct page *page)
{
	int class_idx;
	struct size_class *class;
	enum fullness_group currfg, newfg;

	BUG_ON(!is_first_page(page));

	get_zspage_mapping(page, &class_idx, &currfg);
	newfg = get_fullness_group(page);
	if (newfg == currfg)
		goto out;

	class = &pool->size_class[class_idx];
	remove_zspage(page, class, currfg);
	insert_zspage(page, class, newfg);
	set_zspage_mapping(page, class_idx, newfg);

out:
	return newfg;
}
/*
 * We have to decide on how many pages to link together
 * to form a zspage for each size class. This is important
 * to reduce wastage due to unusable space left at end of
 * each zspage which is given as:
 *	wastage = Zp % size_class
 *	usage = Zp - wastage
 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
 *
 * For example, for size class of 3/8 * PAGE_SIZE, we should
 * link together 3 PAGE_SIZE sized pages to form a zspage
 * since then we can perfectly fit in 8 such objects.
 */
static int get_pages_per_zspage(int class_size)
{
	int i, max_usedpc = 0;
	/* zspage order which gives maximum used size per KB */
	int max_usedpc_order = 1;

	for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
		int zspage_size;
		int waste, usedpc;

		zspage_size = i * PAGE_SIZE;
		waste = zspage_size % class_size;
		usedpc = (zspage_size - waste) * 100 / zspage_size;

		if (usedpc > max_usedpc) {
			max_usedpc = usedpc;
			max_usedpc_order = i;
		}
	}

	return max_usedpc_order;
}
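/*
 * Worked example (illustrative, assuming PAGE_SIZE = 4096): for
 * class_size = 1536 (3/8 of a page), i = 1 wastes 1024 bytes (75% used)
 * and i = 2 wastes 512 (93% used), while i = 3 gives 12288 % 1536 == 0
 * (100% used), so a 3-page zspage holding 8 objects is chosen.
 */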
/*
 * A single 'zspage' is composed of many system pages which are
 * linked together using fields in struct page. This function finds
 * the first/head page, given any component page of a zspage.
 */
static struct page *get_first_page(struct page *page)
{
	if (is_first_page(page))
		return page;
	else
		return page->first_page;
}
static struct page *get_next_page(struct page *page)
{
	struct page *next;

	if (is_last_page(page))
		next = NULL;
	else if (is_first_page(page))
		next = (struct page *)page->private;
	else
		next = list_entry(page->lru.next, struct page, lru);

	return next;
}
/* Encode <page, obj_idx> as a single handle value */
static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
{
	unsigned long handle;

	if (!page) {
		BUG_ON(obj_idx);
		return NULL;
	}

	handle = page_to_pfn(page) << OBJ_INDEX_BITS;
	handle |= (obj_idx & OBJ_INDEX_MASK);

	return (void *)handle;
}
/* Decode <page, obj_idx> pair from the given object handle */
static void obj_handle_to_location(unsigned long handle, struct page **page,
				unsigned long *obj_idx)
{
	*page = pfn_to_page(handle >> OBJ_INDEX_BITS);
	*obj_idx = handle & OBJ_INDEX_MASK;
}
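/*
 * Illustrative round trip (not from the original source; assumes
 * OBJ_INDEX_BITS is wide enough for the index): for pfn = 0x1000 and
 * obj_idx = 5, the handle is (0x1000 << OBJ_INDEX_BITS) | 5; decoding
 * shifts the pfn back out and masks the low bits to recover <page, 5>.
 */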
static unsigned long obj_idx_to_offset(struct page *page,
				unsigned long obj_idx, int class_size)
{
	unsigned long off = 0;

	if (!is_first_page(page))
		off = page->index;

	return off + obj_idx * class_size;
}
static void reset_page(struct page *page)
{
	clear_bit(PG_private, &page->flags);
	clear_bit(PG_private_2, &page->flags);
	set_page_private(page, 0);
	page->mapping = NULL;
	page->freelist = NULL;
	reset_page_mapcount(page);
}
static void free_zspage(struct page *first_page)
{
	struct page *nextp, *tmp, *head_extra;

	BUG_ON(!is_first_page(first_page));
	BUG_ON(first_page->inuse);

	head_extra = (struct page *)page_private(first_page);

	reset_page(first_page);
	__free_page(first_page);

	/* zspage with only 1 system page */
	if (!head_extra)
		return;

	list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
		list_del(&nextp->lru);
		reset_page(nextp);
		__free_page(nextp);
	}
	reset_page(head_extra);
	__free_page(head_extra);
}
/* Initialize a newly allocated zspage */
static void init_zspage(struct page *first_page, struct size_class *class)
{
	unsigned long off = 0;
	struct page *page = first_page;

	BUG_ON(!is_first_page(first_page));
	while (page) {
		struct page *next_page;
		struct link_free *link;
		unsigned int i, objs_on_page;

		/*
		 * page->index stores offset of first object starting
		 * in the page. For the first page, this is always 0,
		 * so we use first_page->index (aka ->freelist) to store
		 * head of corresponding zspage's freelist.
		 */
		if (page != first_page)
			page->index = off;

		link = (struct link_free *)kmap_atomic(page) +
						off / sizeof(*link);
		objs_on_page = (PAGE_SIZE - off) / class->size;

		for (i = 1; i <= objs_on_page; i++) {
			off += class->size;
			if (off < PAGE_SIZE) {
				link->next = obj_location_to_handle(page, i);
				link += class->size / sizeof(*link);
			}
		}

		/*
		 * We now come to the last (full or partial) object on this
		 * page, which must point to the first object on the next
		 * page (if present)
		 */
		next_page = get_next_page(page);
		link->next = obj_location_to_handle(next_page, 0);
		kunmap_atomic(link);
		page = next_page;
		off = (off + class->size) % PAGE_SIZE;
	}
}
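/*
 * Illustrative freelist (an editor's sketch, assuming PAGE_SIZE = 4096
 * and a 1536-byte class on a 3-page zspage): page 0 links objects
 * 0 -> 1 -> 2, object 2 (which spans into page 1) points to
 * <page 1, idx 0>, and so on across pages, until the last object's
 * link is the NULL handle returned by obj_location_to_handle(NULL, 0).
 */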
/*
 * Allocate a zspage for the given size class
 */
static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
{
	int i, error;
	struct page *first_page = NULL, *uninitialized_var(prev_page);

	/*
	 * Allocate individual pages and link them together as:
	 * 1. first page->private = first sub-page
	 * 2. all sub-pages are linked together using page->lru
	 * 3. each sub-page is linked to the first page using page->first_page
	 *
	 * For each size class, First/Head pages are linked together using
	 * page->lru. Also, we set PG_private to identify the first page
	 * (i.e. no other sub-page has this flag set) and PG_private_2 to
	 * identify the last page.
	 */
	error = -ENOMEM;
	for (i = 0; i < class->pages_per_zspage; i++) {
		struct page *page;

		page = alloc_page(flags);
		if (!page)
			goto cleanup;

		INIT_LIST_HEAD(&page->lru);
		if (i == 0) {	/* first page */
			SetPagePrivate(page);
			set_page_private(page, 0);
			first_page = page;
			first_page->inuse = 0;
		}
		if (i == 1)
			first_page->private = (unsigned long)page;
		if (i >= 1)
			page->first_page = first_page;
		if (i >= 2)
			list_add(&page->lru, &prev_page->lru);
		if (i == class->pages_per_zspage - 1)	/* last page */
			SetPagePrivate2(page);
		prev_page = page;
	}

	init_zspage(first_page, class);

	first_page->freelist = obj_location_to_handle(first_page, 0);
	/* Maximum number of objects we can store in this zspage */
	first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;

	error = 0; /* Success */

cleanup:
	if (unlikely(error) && first_page) {
		free_zspage(first_page);
		first_page = NULL;
	}

	return first_page;
}
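/*
 * Illustrative result (not from the original source): for a 3-page
 * zspage built by the loop above, page 0 has PG_private set and its
 * ->private points to page 1; page 2 hangs off page 1's ->lru list;
 * both non-head pages point back via ->first_page; and PG_private_2
 * is set only on page 2.
 */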
static struct page *find_get_zspage(struct size_class *class)
{
	int i;
	struct page *page;

	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
		page = class->fullness_list[i];
		if (page)
			break;
	}

	return page;
}
static void zs_copy_map_object(char *buf, struct page *firstpage,
				int off, int size)
{
	struct page *pages[2];
	int sizes[2];
	void *addr;

	pages[0] = firstpage;
	pages[1] = get_next_page(firstpage);
	BUG_ON(!pages[1]);

	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy object to per-cpu buffer */
	addr = kmap_atomic(pages[0]);
	memcpy(buf, addr + off, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(buf + sizes[0], addr, sizes[1]);
	kunmap_atomic(addr);
}
static void zs_copy_unmap_object(char *buf, struct page *firstpage,
				int off, int size)
{
	struct page *pages[2];
	int sizes[2];
	void *addr;

	pages[0] = firstpage;
	pages[1] = get_next_page(firstpage);
	BUG_ON(!pages[1]);

	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy per-cpu buffer to object */
	addr = kmap_atomic(pages[0]);
	memcpy(addr + off, buf, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(addr, buf + sizes[0], sizes[1]);
	kunmap_atomic(addr);
}
static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
				void *pcpu)
{
	int cpu = (long)pcpu;
	struct mapping_area *area;

	switch (action) {
	case CPU_UP_PREPARE:
		area = &per_cpu(zs_map_area, cpu);
		/*
		 * Make sure we don't leak memory if a cpu UP notification
		 * and zs_init() race and both call zs_cpu_up() on the same cpu
		 */
		if (area->vm_buf)
			return 0;
		area->vm_buf = (char *)__get_free_page(GFP_KERNEL);
		if (!area->vm_buf)
			return -ENOMEM;
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		area = &per_cpu(zs_map_area, cpu);
		if (area->vm_buf)
			free_page((unsigned long)area->vm_buf);
		area->vm_buf = NULL;
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block zs_cpu_nb = {
	.notifier_call = zs_cpu_notifier
};
static void zs_exit(void)
{
	int cpu;

	for_each_online_cpu(cpu)
		zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
	unregister_cpu_notifier(&zs_cpu_nb);
}

static int zs_init(void)
{
	int cpu, ret;

	register_cpu_notifier(&zs_cpu_nb);
	for_each_online_cpu(cpu) {
		ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
		if (notifier_to_errno(ret))
			goto fail;
	}
	return 0;
fail:
	zs_exit();
	return notifier_to_errno(ret);
}
struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
{
	int i, ovhd_size;
	struct zs_pool *pool;

	if (!name)
		return NULL;

	ovhd_size = roundup(sizeof(*pool), PAGE_SIZE);
	pool = kzalloc(ovhd_size, GFP_KERNEL);
	if (!pool)
		return NULL;

	for (i = 0; i < ZS_SIZE_CLASSES; i++) {
		int size;
		struct size_class *class;

		size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
		if (size > ZS_MAX_ALLOC_SIZE)
			size = ZS_MAX_ALLOC_SIZE;

		class = &pool->size_class[i];
		class->size = size;
		class->index = i;
		spin_lock_init(&class->lock);
		class->pages_per_zspage = get_pages_per_zspage(size);
	}

	pool->flags = flags;
	pool->name = name;

	return pool;
}
EXPORT_SYMBOL_GPL(zs_create_pool);
void zs_destroy_pool(struct zs_pool *pool)
{
	int i;

	for (i = 0; i < ZS_SIZE_CLASSES; i++) {
		int fg;
		struct size_class *class = &pool->size_class[i];

		for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
			if (class->fullness_list[fg]) {
				pr_info("Freeing non-empty class with size %db, fullness group %d\n",
					class->size, fg);
			}
		}
	}
	kfree(pool);
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);
/**
 * zs_malloc - Allocate block of given size from pool.
 * @pool: pool to allocate from
 * @size: size of block to allocate
 *
 * On success, handle to the allocated object is returned,
 * otherwise 0.
 * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
 */
unsigned long zs_malloc(struct zs_pool *pool, size_t size)
{
	unsigned long obj;
	struct link_free *link;
	int class_idx;
	struct size_class *class;

	struct page *first_page, *m_page;
	unsigned long m_objidx, m_offset;

	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
		return 0;

	class_idx = get_size_class_index(size);
	class = &pool->size_class[class_idx];
	BUG_ON(class_idx != class->index);

	spin_lock(&class->lock);
	first_page = find_get_zspage(class);

	if (!first_page) {
		spin_unlock(&class->lock);
		first_page = alloc_zspage(class, pool->flags);
		if (unlikely(!first_page))
			return 0;

		set_zspage_mapping(first_page, class->index, ZS_EMPTY);
		spin_lock(&class->lock);
		class->pages_allocated += class->pages_per_zspage;
	}

	obj = (unsigned long)first_page->freelist;
	obj_handle_to_location(obj, &m_page, &m_objidx);
	m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);

	link = (struct link_free *)kmap_atomic(m_page) +
				m_offset / sizeof(*link);
	first_page->freelist = link->next;
	memset(link, POISON_INUSE, sizeof(*link));
	kunmap_atomic(link);

	first_page->inuse++;
	/* Now move the zspage to another fullness group, if required */
	fix_fullness_group(pool, first_page);
	spin_unlock(&class->lock);

	return obj;
}
EXPORT_SYMBOL_GPL(zs_malloc);
void zs_free(struct zs_pool *pool, unsigned long obj)
{
	struct link_free *link;
	struct page *first_page, *f_page;
	unsigned long f_objidx, f_offset;

	int class_idx;
	struct size_class *class;
	enum fullness_group fullness;

	if (unlikely(!obj))
		return;

	obj_handle_to_location(obj, &f_page, &f_objidx);
	first_page = get_first_page(f_page);

	get_zspage_mapping(first_page, &class_idx, &fullness);
	class = &pool->size_class[class_idx];
	f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);

	spin_lock(&class->lock);

	/* Insert this object in containing zspage's freelist */
	link = (struct link_free *)((unsigned char *)kmap_atomic(f_page)
							+ f_offset);
	link->next = first_page->freelist;
	kunmap_atomic(link);
	first_page->freelist = (void *)obj;

	first_page->inuse--;
	fullness = fix_fullness_group(pool, first_page);

	if (fullness == ZS_EMPTY)
		class->pages_allocated -= class->pages_per_zspage;

	spin_unlock(&class->lock);

	if (fullness == ZS_EMPTY)
		free_zspage(first_page);
}
EXPORT_SYMBOL_GPL(zs_free);
/**
 * zs_map_object - get address of allocated object from handle.
 * @pool: pool from which the object was allocated
 * @handle: handle returned from zs_malloc
 *
 * Before using an object allocated from zs_malloc, it must be mapped using
 * this function. When done with the object, it must be unmapped using
 * zs_unmap_object.
 *
 * Only one object can be mapped per cpu at a time. There is no protection
 * against nested mappings.
 *
 * This function returns with preemption and page faults disabled.
 */
void *zs_map_object(struct zs_pool *pool, unsigned long handle,
			enum zs_mapmode mm)
{
	struct page *page;
	unsigned long obj_idx, off;

	unsigned int class_idx;
	enum fullness_group fg;
	struct size_class *class;
	struct mapping_area *area;

	BUG_ON(!handle);

	obj_handle_to_location(handle, &page, &obj_idx);
	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
	class = &pool->size_class[class_idx];
	off = obj_idx_to_offset(page, obj_idx, class->size);

	area = &get_cpu_var(zs_map_area);
	if (off + class->size <= PAGE_SIZE) {
		/* this object is contained entirely within a page */
		area->vm_addr = kmap_atomic(page);
		return area->vm_addr + off;
	}

	/* disable page faults to match kmap_atomic() return conditions */
	pagefault_disable();
	/* record the mode so zs_unmap_object() can skip the copy-back */
	area->vm_mm = mm;
	if (mm != ZS_MM_WO)
		zs_copy_map_object(area->vm_buf, page, off, class->size);
	area->vm_addr = NULL;
	return area->vm_buf;
}
EXPORT_SYMBOL_GPL(zs_map_object);
void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
	struct page *page;
	unsigned long obj_idx, off;

	unsigned int class_idx;
	enum fullness_group fg;
	struct size_class *class;
	struct mapping_area *area;

	area = &__get_cpu_var(zs_map_area);
	/* single-page object fastpath */
	if (area->vm_addr) {
		kunmap_atomic(area->vm_addr);
		goto out;
	}

	/* no write fastpath */
	if (area->vm_mm == ZS_MM_RO)
		goto pfenable;

	BUG_ON(!handle);

	obj_handle_to_location(handle, &page, &obj_idx);
	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
	class = &pool->size_class[class_idx];
	off = obj_idx_to_offset(page, obj_idx, class->size);

	zs_copy_unmap_object(area->vm_buf, page, off, class->size);

pfenable:
	/* enable page faults to match kunmap_atomic() return conditions */
	pagefault_enable();
out:
	put_cpu_var(zs_map_area);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);
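/*
 * Example usage (an illustrative sketch by the editor, not part of the
 * original file; assumes the zs_mapmode values declared in zsmalloc.h):
 *
 *	struct zs_pool *pool = zs_create_pool("test", GFP_KERNEL);
 *	unsigned long handle;
 *
 *	if (!pool)
 *		return -ENOMEM;
 *	handle = zs_malloc(pool, 64);
 *	if (handle) {
 *		void *p = zs_map_object(pool, handle, ZS_MM_WO);
 *		memset(p, 0xab, 64);
 *		zs_unmap_object(pool, handle);
 *		zs_free(pool, handle);
 *	}
 *	zs_destroy_pool(pool);
 */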
u64 zs_get_total_size_bytes(struct zs_pool *pool)
{
	int i;
	u64 npages = 0;

	for (i = 0; i < ZS_SIZE_CLASSES; i++)
		npages += pool->size_class[i].pages_allocated;

	return npages << PAGE_SHIFT;
}
EXPORT_SYMBOL_GPL(zs_get_total_size_bytes);
module_init(zs_init);
module_exit(zs_exit);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");