Staging: virtual block device driver (ramzswap)
drivers/staging/ramzswap/ramzswap_drv.c
1/*
2 * Compressed RAM based swap device
3 *
4 * Copyright (C) 2008, 2009 Nitin Gupta
5 *
6 * This code is released using a dual license strategy: BSD/GPL
7 * You can choose the licence that better fits your requirements.
8 *
9 * Released under the terms of 3-clause BSD License
10 * Released under the terms of GNU General Public License Version 2.0
11 *
12 * Project home: http://compcache.googlecode.com
13 */
14
15#define KMSG_COMPONENT "ramzswap"
16#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
17
18#include <linux/module.h>
19#include <linux/kernel.h>
20#include <linux/bitops.h>
21#include <linux/blkdev.h>
22#include <linux/buffer_head.h>
23#include <linux/device.h>
24#include <linux/genhd.h>
25#include <linux/highmem.h>
26#include <linux/lzo.h>
27#include <linux/mutex.h>
28#include <linux/string.h>
29#include <linux/swap.h>
30#include <linux/swapops.h>
31#include <linux/vmalloc.h>
32#include <linux/version.h>
33
34#include "ramzswap_drv.h"
35
36/* Globals */
37static int ramzswap_major;
38static struct ramzswap *devices;
39
40/*
41 * Pages that compress to a size larger than this are
 42 * forwarded to the backing swap device (if present) or
 43 * stored uncompressed in memory otherwise.
44 */
45static unsigned int max_zpage_size;
46
47/* Module params (documentation at end) */
48static unsigned int num_devices;
49
50static int rzs_test_flag(struct ramzswap *rzs, u32 index,
51 enum rzs_pageflags flag)
52{
53 return rzs->table[index].flags & BIT(flag);
54}
55
56static void rzs_set_flag(struct ramzswap *rzs, u32 index,
57 enum rzs_pageflags flag)
58{
59 rzs->table[index].flags |= BIT(flag);
60}
61
62static void rzs_clear_flag(struct ramzswap *rzs, u32 index,
63 enum rzs_pageflags flag)
64{
65 rzs->table[index].flags &= ~BIT(flag);
66}
67
68static int page_zero_filled(void *ptr)
69{
70 unsigned int pos;
71 unsigned long *page;
72
73 page = (unsigned long *)ptr;
74
75 for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
76 if (page[pos])
77 return 0;
78 }
79
80 return 1;
81}
82
83/*
84 * memlimit cannot be greater than backing disk size.
85 */
86static void ramzswap_set_memlimit(struct ramzswap *rzs, size_t totalram_bytes)
87{
88 int memlimit_valid = 1;
89
90 if (!rzs->memlimit) {
91 pr_info("Memory limit not set.\n");
92 memlimit_valid = 0;
93 }
94
95 if (rzs->memlimit > rzs->disksize) {
96 pr_info("Memory limit cannot be greater than "
97 "disksize: limit=%zu, disksize=%zu\n",
98 rzs->memlimit, rzs->disksize);
99 memlimit_valid = 0;
100 }
101
102 if (!memlimit_valid) {
103 size_t mempart, disksize;
104 pr_info("Using default: smaller of (%u%% of RAM) and "
105 "(backing disk size).\n",
106 default_memlimit_perc_ram);
107 mempart = default_memlimit_perc_ram * (totalram_bytes / 100);
108 disksize = rzs->disksize;
109 rzs->memlimit = mempart > disksize ? disksize : mempart;
110 }
111
112 if (rzs->memlimit > totalram_bytes / 2) {
113 pr_info(
114 "Its not advisable setting limit more than half of "
115 "size of memory since we expect a 2:1 compression ratio. "
116 "Limit represents amount of *compressed* data we can keep "
117 "in memory!\n"
118 "\tMemory Size: %zu kB\n"
119 "\tLimit you selected: %zu kB\n"
120 "Continuing anyway ...\n",
121 totalram_bytes >> 10, rzs->memlimit >> 10
122 );
123 }
124
125 rzs->memlimit &= PAGE_MASK;
126 BUG_ON(!rzs->memlimit);
127}
128
129static void ramzswap_set_disksize(struct ramzswap *rzs, size_t totalram_bytes)
130{
131 if (!rzs->disksize) {
132 pr_info(
133 "disk size not provided. You can use disksize_kb module "
134 "param to specify size.\nUsing default: (%u%% of RAM).\n",
135 default_disksize_perc_ram
136 );
137 rzs->disksize = default_disksize_perc_ram *
138 (totalram_bytes / 100);
139 }
140
141 if (rzs->disksize > 2 * (totalram_bytes)) {
142 pr_info(
143 "There is little point creating a ramzswap of greater than "
144 "twice the size of memory since we expect a 2:1 compression "
145 "ratio. Note that ramzswap uses about 0.1%% of the size of "
146 "the swap device when not in use so a huge ramzswap is "
147 "wasteful.\n"
148 "\tMemory Size: %zu kB\n"
149 "\tSize you selected: %zu kB\n"
150 "Continuing anyway ...\n",
151 totalram_bytes >> 10, rzs->disksize >> 10
152 );
153 }
154
155 rzs->disksize &= PAGE_MASK;
156}
157
158/*
159 * The swap header (1st page of the swap device) contains information
 160 * to identify it as a swap partition. Prepare such a header
 161 * for the ramzswap device (ramzswap0) so that swapon can identify
 162 * it as a swap partition. In case a backing swap device is provided,
 163 * copy its swap header.
164 */
165static int setup_swap_header(struct ramzswap *rzs, union swap_header *s)
166{
167 int ret = 0;
168 struct page *page;
169 struct address_space *mapping;
170 union swap_header *backing_swap_header;
171
172 /*
173 * There is no backing swap device. Create a swap header
174 * that is acceptable to swapon.
175 */
176 if (!rzs->backing_swap) {
177 s->info.version = 1;
178 s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
179 s->info.nr_badpages = 0;
180 memcpy(s->magic.magic, "SWAPSPACE2", 10);
181 return 0;
182 }
183
184 /*
185 * We have a backing swap device. Copy its swap header
186 * to ramzswap device header. If this header contains
187 * invalid information (backing device not a swap
188 * partition, etc.), swapon will fail for ramzswap,
 189 * which is the correct behavior - we don't want to swap
 190 * over a filesystem partition!
191 */
192
193 /* Read the backing swap header (code from sys_swapon) */
194 mapping = rzs->swap_file->f_mapping;
195 if (!mapping->a_ops->readpage) {
196 ret = -EINVAL;
197 goto out;
198 }
199
200 page = read_mapping_page(mapping, 0, rzs->swap_file);
201 if (IS_ERR(page)) {
202 ret = PTR_ERR(page);
203 goto out;
204 }
205
206 backing_swap_header = kmap(page);
207 memcpy(s, backing_swap_header, sizeof(*s));
208 if (s->info.nr_badpages) {
209 pr_info("Cannot use backing swap with bad pages (%u)\n",
210 s->info.nr_badpages);
211 ret = -EINVAL;
212 }
213 /*
214 * ramzswap disksize equals number of usable pages in backing
215 * swap. Set last_page in swap header to match this disksize
216 * ('last_page' means 0-based index of last usable swap page).
217 */
218 s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
219 kunmap(page);
220
221out:
222 return ret;
223}
224
225static void ramzswap_flush_dcache_page(struct page *page)
226{
227#ifdef CONFIG_ARM
228 int flag = 0;
229 /*
230 * Ugly hack to make flush_dcache_page() work on ARM.
231 * page_mapping(page) == NULL after clearing this swap cache flag.
232 * Without clearing this flag, flush_dcache_page() will simply set
233 * "PG_dcache_dirty" bit and return.
234 */
235 if (PageSwapCache(page)) {
236 flag = 1;
237 ClearPageSwapCache(page);
238 }
239#endif
240 flush_dcache_page(page);
241#ifdef CONFIG_ARM
242 if (flag)
243 SetPageSwapCache(page);
244#endif
245}
246
247void ramzswap_ioctl_get_stats(struct ramzswap *rzs,
248 struct ramzswap_ioctl_stats *s)
249{
250 strncpy(s->backing_swap_name, rzs->backing_swap_name,
251 MAX_SWAP_NAME_LEN - 1);
252 s->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0';
253
254 s->disksize = rzs->disksize;
255 s->memlimit = rzs->memlimit;
256
257#if defined(CONFIG_RAMZSWAP_STATS)
258 {
259 struct ramzswap_stats *rs = &rzs->stats;
260 size_t succ_writes, mem_used;
261 unsigned int good_compress_perc = 0, no_compress_perc = 0;
262
263 mem_used = xv_get_total_size_bytes(rzs->mem_pool)
264 + (rs->pages_expand << PAGE_SHIFT);
265 succ_writes = rs->num_writes - rs->failed_writes;
266
267 if (succ_writes && rs->pages_stored) {
268 good_compress_perc = rs->good_compress * 100
269 / rs->pages_stored;
270 no_compress_perc = rs->pages_expand * 100
271 / rs->pages_stored;
272 }
273
274 s->num_reads = rs->num_reads;
275 s->num_writes = rs->num_writes;
276 s->failed_reads = rs->failed_reads;
277 s->failed_writes = rs->failed_writes;
278 s->invalid_io = rs->invalid_io;
279 s->pages_zero = rs->pages_zero;
280
281 s->good_compress_pct = good_compress_perc;
282 s->pages_expand_pct = no_compress_perc;
283
284 s->pages_stored = rs->pages_stored;
285 s->pages_used = mem_used >> PAGE_SHIFT;
286 s->orig_data_size = rs->pages_stored << PAGE_SHIFT;
287 s->compr_data_size = rs->compr_size;
288 s->mem_used_total = mem_used;
289
290 s->bdev_num_reads = rs->bdev_num_reads;
291 s->bdev_num_writes = rs->bdev_num_writes;
292 }
293#endif /* CONFIG_RAMZSWAP_STATS */
294}
295
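/*
 * Swap extent descriptors for a backing swap file are packed into whole
 * pages which are chained together through their struct page lru lists
 * (list head: rzs->backing_swap_extent_list). A new page is allocated
 * whenever the current one runs out of free slots.
 */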
296static int add_backing_swap_extent(struct ramzswap *rzs,
297 pgoff_t phy_pagenum,
298 pgoff_t num_pages)
299{
300 unsigned int idx;
301 struct list_head *head;
302 struct page *curr_page, *new_page;
303 unsigned int extents_per_page = PAGE_SIZE /
304 sizeof(struct ramzswap_backing_extent);
305
306 idx = rzs->num_extents % extents_per_page;
307 if (!idx) {
308 new_page = alloc_page(__GFP_ZERO);
309 if (!new_page)
310 return -ENOMEM;
311
312 if (rzs->num_extents) {
313 curr_page = virt_to_page(rzs->curr_extent);
314 head = &curr_page->lru;
315 } else {
316 head = &rzs->backing_swap_extent_list;
317 }
318
319 list_add(&new_page->lru, head);
320 rzs->curr_extent = page_address(new_page);
321 }
322
323 rzs->curr_extent->phy_pagenum = phy_pagenum;
324 rzs->curr_extent->num_pages = num_pages;
325
326 pr_debug("add_extent: idx=%u, phy_pgnum=%lu, num_pgs=%lu, "
327 "pg_last=%lu, curr_ext=%p\n", idx, phy_pagenum, num_pages,
328 phy_pagenum + num_pages - 1, rzs->curr_extent);
329
330 if (idx != extents_per_page - 1)
331 rzs->curr_extent++;
332
333 return 0;
334}
335
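/*
 * Walk the backing swap file with bmap() and record every PAGE_SIZE
 * aligned, physically contiguous run of blocks as one extent (similar
 * in spirit to setup_swap_extents() in mm/swapfile.c). Fails if the
 * file has holes.
 */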
336static int setup_backing_swap_extents(struct ramzswap *rzs,
337 struct inode *inode, unsigned long *num_pages)
338{
339 int ret = 0;
340 unsigned blkbits;
341 unsigned blocks_per_page;
342 pgoff_t contig_pages = 0, total_pages = 0;
343 pgoff_t pagenum = 0, prev_pagenum = 0;
344 sector_t probe_block = 0;
345 sector_t last_block;
346
347 blkbits = inode->i_blkbits;
348 blocks_per_page = PAGE_SIZE >> blkbits;
349
350 last_block = i_size_read(inode) >> blkbits;
351 while (probe_block + blocks_per_page <= last_block) {
352 unsigned block_in_page;
353 sector_t first_block;
354
355 first_block = bmap(inode, probe_block);
356 if (first_block == 0)
357 goto bad_bmap;
358
359 /* It must be PAGE_SIZE aligned on-disk */
360 if (first_block & (blocks_per_page - 1)) {
361 probe_block++;
362 goto probe_next;
363 }
364
365 /* All blocks within this page must be contiguous on disk */
366 for (block_in_page = 1; block_in_page < blocks_per_page;
367 block_in_page++) {
368 sector_t block;
369
370 block = bmap(inode, probe_block + block_in_page);
371 if (block == 0)
372 goto bad_bmap;
373 if (block != first_block + block_in_page) {
374 /* Discontiguity */
375 probe_block++;
376 goto probe_next;
377 }
378 }
379
380 /*
381 * We found a PAGE_SIZE length, PAGE_SIZE aligned
382 * run of blocks.
383 */
384 pagenum = first_block >> (PAGE_SHIFT - blkbits);
385
386 if (total_pages && (pagenum != prev_pagenum + 1)) {
387 ret = add_backing_swap_extent(rzs, prev_pagenum -
388 (contig_pages - 1), contig_pages);
389 if (ret < 0)
390 goto out;
391 rzs->num_extents++;
392 contig_pages = 0;
393 }
394 total_pages++;
395 contig_pages++;
396 prev_pagenum = pagenum;
397 probe_block += blocks_per_page;
398
399probe_next:
400 continue;
401 }
402
403 if (contig_pages) {
404 pr_debug("adding last extent: pagenum=%lu, "
405 "contig_pages=%lu\n", pagenum, contig_pages);
406 ret = add_backing_swap_extent(rzs,
407 prev_pagenum - (contig_pages - 1), contig_pages);
408 if (ret < 0)
409 goto out;
410 rzs->num_extents++;
411 }
412 if (!rzs->num_extents) {
413 pr_err("No swap extents found!\n");
414 ret = -EINVAL;
415 }
416
417 if (!ret) {
418 *num_pages = total_pages;
419 pr_info("Found %lu extents containing %luk\n",
420 rzs->num_extents, *num_pages << (PAGE_SHIFT - 10));
421 }
422 goto out;
423
424bad_bmap:
425 pr_err("Backing swapfile has holes\n");
426 ret = -EINVAL;
427out:
428 while (ret && !list_empty(&rzs->backing_swap_extent_list)) {
429 struct page *page;
430 struct list_head *entry = rzs->backing_swap_extent_list.next;
431 page = list_entry(entry, struct page, lru);
432 list_del(entry);
433 __free_page(page);
434 }
435 return ret;
436}
437
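/*
 * For each page of the rzs->table array, cache which backing extent
 * covers its first entry: page->mapping holds a pointer to that extent
 * and page->private holds the offset of that entry within the extent.
 * map_backing_swap_page() uses this to avoid walking the whole extent
 * list on every lookup.
 */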
438static void map_backing_swap_extents(struct ramzswap *rzs)
439{
440 struct ramzswap_backing_extent *se;
441 struct page *table_page, *se_page;
442 unsigned long num_pages, num_table_pages, entry;
443 unsigned long se_idx, span;
444 unsigned entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
445 unsigned extents_per_page = PAGE_SIZE / sizeof(*se);
446
447 /* num_extents is 0 (nothing to map) when backing swap is a block device */
448 if (!rzs->num_extents)
449 return;
450
451 se_page = list_entry(rzs->backing_swap_extent_list.next,
452 struct page, lru);
453 se = page_address(se_page);
454 span = se->num_pages;
455 num_pages = rzs->disksize >> PAGE_SHIFT;
456 num_table_pages = DIV_ROUND_UP(num_pages * sizeof(*rzs->table),
457 PAGE_SIZE);
458
459 entry = 0;
460 se_idx = 0;
461 while (num_table_pages--) {
462 table_page = vmalloc_to_page(&rzs->table[entry]);
463 while (span <= entry) {
464 se_idx++;
465 if (se_idx == rzs->num_extents)
466 BUG();
467
468 if (!(se_idx % extents_per_page)) {
469 se_page = list_entry(se_page->lru.next,
470 struct page, lru);
471 se = page_address(se_page);
472 } else
473 se++;
474
475 span += se->num_pages;
476 }
477 table_page->mapping = (struct address_space *)se;
478 table_page->private = se->num_pages - (span - entry);
479 pr_debug("map_table: entry=%lu, span=%lu, map=%p, priv=%lu\n",
480 entry, span, table_page->mapping, table_page->private);
481 entry += entries_per_page;
482 }
483}
484
485/*
486 * Check if value of backing_swap module param is sane.
487 * Claim this device and set ramzswap size equal to
488 * size of this block device.
489 */
490static int setup_backing_swap(struct ramzswap *rzs)
491{
492 int ret = 0;
493 size_t disksize;
494 unsigned long num_pages = 0;
495 struct inode *inode;
496 struct file *swap_file;
497 struct address_space *mapping;
498 struct block_device *bdev = NULL;
499
500 if (!rzs->backing_swap_name[0]) {
501 pr_debug("backing_swap param not given\n");
502 goto out;
503 }
504
505 pr_info("Using backing swap device: %s\n", rzs->backing_swap_name);
506
507 swap_file = filp_open(rzs->backing_swap_name,
508 O_RDWR | O_LARGEFILE, 0);
509 if (IS_ERR(swap_file)) {
510 pr_err("Error opening backing device: %s\n",
511 rzs->backing_swap_name);
512 ret = -EINVAL;
513 goto out;
514 }
515
516 mapping = swap_file->f_mapping;
517 inode = mapping->host;
518
519 if (S_ISBLK(inode->i_mode)) {
520 bdev = I_BDEV(inode);
521 ret = bd_claim(bdev, setup_backing_swap);
522 if (ret < 0) {
523 bdev = NULL;
524 goto bad_param;
525 }
526 disksize = i_size_read(inode);
527 } else if (S_ISREG(inode->i_mode)) {
528 bdev = inode->i_sb->s_bdev;
529 if (IS_SWAPFILE(inode)) {
530 ret = -EBUSY;
531 goto bad_param;
532 }
533 ret = setup_backing_swap_extents(rzs, inode, &num_pages);
534 if (ret < 0)
535 goto bad_param;
536 disksize = num_pages << PAGE_SHIFT;
537 } else {
538 goto bad_param;
539 }
540
541 rzs->swap_file = swap_file;
542 rzs->backing_swap = bdev;
543 rzs->disksize = disksize;
544 BUG_ON(!rzs->disksize);
545
546 return 0;
547
548bad_param:
549 if (bdev)
550 bd_release(bdev);
551 filp_close(swap_file, NULL);
552
553out:
554 rzs->backing_swap = NULL;
555 return ret;
556}
557
558/*
559 * Map logical page number 'pagenum' to physical page number
560 * on backing swap device. For block device, this is a nop.
561 */
562u32 map_backing_swap_page(struct ramzswap *rzs, u32 pagenum)
563{
564 u32 skip_pages, entries_per_page;
565 size_t delta, se_offset, skipped;
566 struct page *table_page, *se_page;
567 struct ramzswap_backing_extent *se;
568
569 if (!rzs->num_extents)
570 return pagenum;
571
572 entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
573
574 table_page = vmalloc_to_page(&rzs->table[pagenum]);
575 se = (struct ramzswap_backing_extent *)table_page->mapping;
576 se_page = virt_to_page(se);
577
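	/* index of this entry within its table page, i.e. pagenum % entries_per_page */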
578 skip_pages = pagenum - (pagenum / entries_per_page * entries_per_page);
579 se_offset = table_page->private + skip_pages;
580
581 if (se_offset < se->num_pages)
582 return se->phy_pagenum + se_offset;
583
584 skipped = se->num_pages - table_page->private;
585 do {
586 struct ramzswap_backing_extent *se_base;
587 u32 se_entries_per_page = PAGE_SIZE / sizeof(*se);
588
589 /* Get next swap extent */
590 se_base = (struct ramzswap_backing_extent *)
591 page_address(se_page);
592 if (se - se_base == se_entries_per_page - 1) {
593 se_page = list_entry(se_page->lru.next,
594 struct page, lru);
595 se = page_address(se_page);
596 } else {
597 se++;
598 }
599
600 skipped += se->num_pages;
601 } while (skipped < skip_pages);
602
603 delta = skipped - skip_pages;
604 se_offset = se->num_pages - delta;
605
606 return se->phy_pagenum + se_offset;
607}
608
609static void ramzswap_free_page(struct ramzswap *rzs, size_t index)
610{
611 u32 clen;
612 void *obj;
613
614 struct page *page = rzs->table[index].page;
615 u32 offset = rzs->table[index].offset;
616
617 if (unlikely(!page)) {
618 if (rzs_test_flag(rzs, index, RZS_ZERO)) {
619 rzs_clear_flag(rzs, index, RZS_ZERO);
620 stat_dec(rzs->stats.pages_zero);
621 }
622 return;
623 }
624
625 if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED))) {
626 clen = PAGE_SIZE;
627 __free_page(page);
628 rzs_clear_flag(rzs, index, RZS_UNCOMPRESSED);
629 stat_dec(rzs->stats.pages_expand);
630 goto out;
631 }
632
633 obj = kmap_atomic(page, KM_USER0) + offset;
634 clen = xv_get_object_size(obj) - sizeof(struct zobj_header);
635 kunmap_atomic(obj, KM_USER0);
636
637 xv_free(rzs->mem_pool, page, offset);
638 if (clen <= PAGE_SIZE / 2)
639 stat_dec(rzs->stats.good_compress);
640
641out:
642 rzs->stats.compr_size -= clen;
643 stat_dec(rzs->stats.pages_stored);
644
645 rzs->table[index].page = NULL;
646 rzs->table[index].offset = 0;
647}
648
649static int handle_zero_page(struct bio *bio)
650{
651 void *user_mem;
652 struct page *page = bio->bi_io_vec[0].bv_page;
653
654 user_mem = kmap_atomic(page, KM_USER0);
655 memset(user_mem, 0, PAGE_SIZE);
656 kunmap_atomic(user_mem, KM_USER0);
657
658 ramzswap_flush_dcache_page(page);
659
660 set_bit(BIO_UPTODATE, &bio->bi_flags);
661 bio_endio(bio, 0);
662 return 0;
663}
664
665static int handle_uncompressed_page(struct ramzswap *rzs, struct bio *bio)
666{
667 u32 index;
668 struct page *page;
669 unsigned char *user_mem, *cmem;
670
671 page = bio->bi_io_vec[0].bv_page;
672 index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
673
674 user_mem = kmap_atomic(page, KM_USER0);
675 cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
676 rzs->table[index].offset;
677
678 memcpy(user_mem, cmem, PAGE_SIZE);
679 kunmap_atomic(user_mem, KM_USER0);
680 kunmap_atomic(cmem, KM_USER1);
681
682 ramzswap_flush_dcache_page(page);
683
684 set_bit(BIO_UPTODATE, &bio->bi_flags);
685 bio_endio(bio, 0);
686 return 0;
687}
688
689
690/*
691 * Called when the requested page is not present in ramzswap.
 692 * It is either in the backing swap device (if present) or
 693 * this is an attempt to read before any previous write
 694 * to this location - this happens due to readahead when
 695 * the swap device is read from user space (e.g. during swapon).
696 */
697static int handle_ramzswap_fault(struct ramzswap *rzs, struct bio *bio)
698{
699 /*
700 * Always forward such requests to backing swap
701 * device (if present)
702 */
703 if (rzs->backing_swap) {
704 u32 pagenum;
705 stat_dec(rzs->stats.num_reads);
706 stat_inc(rzs->stats.bdev_num_reads);
707 bio->bi_bdev = rzs->backing_swap;
708
709 /*
710 * In case backing swap is a file, find the right offset within
711 * the file corresponding to logical position 'index'. For block
712 * device, this is a nop.
713 */
714 pagenum = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
715 bio->bi_sector = map_backing_swap_page(rzs, pagenum)
716 << SECTORS_PER_PAGE_SHIFT;
717 return 1;
718 }
719
720 /*
721 * This is an unlikely event when a backing
 722 * device is not present.
723 */
724 pr_debug("Read before write on swap device: "
725 "sector=%lu, size=%u, offset=%u\n",
726 (ulong)(bio->bi_sector), bio->bi_size,
727 bio->bi_io_vec[0].bv_offset);
728
729 /* Do nothing. Just return success */
730 set_bit(BIO_UPTODATE, &bio->bi_flags);
731 bio_endio(bio, 0);
732 return 0;
733}
734
735static int ramzswap_read(struct ramzswap *rzs, struct bio *bio)
736{
737 int ret;
738 u32 index;
739 size_t clen;
740 struct page *page;
741 struct zobj_header *zheader;
742 unsigned char *user_mem, *cmem;
743
744 stat_inc(rzs->stats.num_reads);
745
746 page = bio->bi_io_vec[0].bv_page;
747 index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
748
749 if (rzs_test_flag(rzs, index, RZS_ZERO))
750 return handle_zero_page(bio);
751
752 /* Requested page is not present in compressed area */
753 if (!rzs->table[index].page)
754 return handle_ramzswap_fault(rzs, bio);
755
756 /* Page is stored uncompressed since it's incompressible */
757 if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
758 return handle_uncompressed_page(rzs, bio);
759
760 user_mem = kmap_atomic(page, KM_USER0);
761 clen = PAGE_SIZE;
762
763 cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
764 rzs->table[index].offset;
765
766 ret = lzo1x_decompress_safe(
767 cmem + sizeof(*zheader),
768 xv_get_object_size(cmem) - sizeof(*zheader),
769 user_mem, &clen);
770
771 kunmap_atomic(user_mem, KM_USER0);
772 kunmap_atomic(cmem, KM_USER1);
773
774 /* should NEVER happen */
775 if (unlikely(ret != LZO_E_OK)) {
776 pr_err("Decompression failed! err=%d, page=%u\n",
777 ret, index);
778 stat_inc(rzs->stats.failed_reads);
779 goto out;
780 }
781
782 ramzswap_flush_dcache_page(page);
783
784 set_bit(BIO_UPTODATE, &bio->bi_flags);
785 bio_endio(bio, 0);
786 return 0;
787
788out:
789 bio_io_error(bio);
790 return 0;
791}
792
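/*
 * Store the page carried by the bio at the swap slot given by its sector.
 * Zero-filled pages are recorded with just a flag, compressible pages are
 * LZO-compressed into the xvmalloc pool, and incompressible pages are
 * either stored uncompressed or forwarded to the backing swap device
 * (forwarding is also used when the memlimit is exceeded).
 */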
793static int ramzswap_write(struct ramzswap *rzs, struct bio *bio)
794{
795 int ret, fwd_write_request = 0;
796 u32 offset, index;
797 size_t clen;
798 struct zobj_header *zheader;
799 struct page *page, *page_store;
800 unsigned char *user_mem, *cmem, *src;
801
802 stat_inc(rzs->stats.num_writes);
803
804 page = bio->bi_io_vec[0].bv_page;
805 index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
806
807 src = rzs->compress_buffer;
808
809 /*
810 * The system swaps to the same sector again when the stored page
 811 * is no longer referenced by any process. So, it is now safe
 812 * to free the memory that was allocated for this page.
813 */
814 if (rzs->table[index].page)
815 ramzswap_free_page(rzs, index);
816
817 /*
818 * No memory is allocated for zero-filled pages.
 819 * Simply clear the zero page flag.
820 */
821 if (rzs_test_flag(rzs, index, RZS_ZERO)) {
822 stat_dec(rzs->stats.pages_zero);
823 rzs_clear_flag(rzs, index, RZS_ZERO);
824 }
825
826 mutex_lock(&rzs->lock);
827
828 user_mem = kmap_atomic(page, KM_USER0);
829 if (page_zero_filled(user_mem)) {
830 kunmap_atomic(user_mem, KM_USER0);
831 mutex_unlock(&rzs->lock);
832 stat_inc(rzs->stats.pages_zero);
833 rzs_set_flag(rzs, index, RZS_ZERO);
834
835 set_bit(BIO_UPTODATE, &bio->bi_flags);
836 bio_endio(bio, 0);
837 return 0;
838 }
839
840 if (rzs->backing_swap &&
841 (rzs->stats.compr_size > rzs->memlimit - PAGE_SIZE)) {
842 kunmap_atomic(user_mem, KM_USER0);
843 mutex_unlock(&rzs->lock);
844 fwd_write_request = 1;
845 goto out;
846 }
847
848 ret = lzo1x_1_compress(user_mem, PAGE_SIZE, src, &clen,
849 rzs->compress_workmem);
850
851 kunmap_atomic(user_mem, KM_USER0);
852
853 if (unlikely(ret != LZO_E_OK)) {
854 mutex_unlock(&rzs->lock);
855 pr_err("Compression failed! err=%d\n", ret);
856 stat_inc(rzs->stats.failed_writes);
857 goto out;
858 }
859
860 /*
861 * Page is incompressible. Forward it to backing swap
862 * if present. Otherwise, store it as-is (uncompressed)
863 * since we do not want to return too many swap write
864 * errors, which would have the side effect of hanging the system.
865 */
866 if (unlikely(clen > max_zpage_size)) {
867 if (rzs->backing_swap) {
868 mutex_unlock(&rzs->lock);
869 fwd_write_request = 1;
870 goto out;
871 }
872
873 clen = PAGE_SIZE;
874 page_store = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
875 if (unlikely(!page_store)) {
876 mutex_unlock(&rzs->lock);
877 pr_info("Error allocating memory for incompressible "
878 "page: %u\n", index);
879 stat_inc(rzs->stats.failed_writes);
880 goto out;
881 }
882
883 offset = 0;
884 rzs_set_flag(rzs, index, RZS_UNCOMPRESSED);
885 stat_inc(rzs->stats.pages_expand);
886 rzs->table[index].page = page_store;
887 src = kmap_atomic(page, KM_USER0);
888 goto memstore;
889 }
890
891 if (xv_malloc(rzs->mem_pool, clen + sizeof(*zheader),
892 &rzs->table[index].page, &offset,
893 GFP_NOIO | __GFP_HIGHMEM)) {
894 mutex_unlock(&rzs->lock);
895 pr_info("Error allocating memory for compressed "
896 "page: %u, size=%zu\n", index, clen);
897 stat_inc(rzs->stats.failed_writes);
898 if (rzs->backing_swap)
899 fwd_write_request = 1;
900 goto out;
901 }
902
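	/*
	 * At this point 'src' points either at the compressed data in
	 * rzs->compress_buffer or, on the uncompressed path above, at the
	 * kmapped original page.
	 */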
903memstore:
904 rzs->table[index].offset = offset;
905
906 cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
907 rzs->table[index].offset;
908
909#if 0
910 /* Back-reference needed for memory defragmentation */
911 if (!rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)) {
912 zheader = (struct zobj_header *)cmem;
913 zheader->table_idx = index;
914 cmem += sizeof(*zheader);
915 }
916#endif
917
918 memcpy(cmem, src, clen);
919
920 kunmap_atomic(cmem, KM_USER1);
921 if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
922 kunmap_atomic(src, KM_USER0);
923
924 /* Update stats */
925 rzs->stats.compr_size += clen;
926 stat_inc(rzs->stats.pages_stored);
927 if (clen <= PAGE_SIZE / 2)
928 stat_inc(rzs->stats.good_compress);
929
930 mutex_unlock(&rzs->lock);
931
932 set_bit(BIO_UPTODATE, &bio->bi_flags);
933 bio_endio(bio, 0);
934 return 0;
935
936out:
937 if (fwd_write_request) {
938 stat_inc(rzs->stats.bdev_num_writes);
939 bio->bi_bdev = rzs->backing_swap;
940#if 0
941 /*
942 * TODO: We currently have linear mapping of ramzswap and
943 * backing swap sectors. This is not desired since we want
944 * to optimize writes to backing swap to minimize disk seeks
945 * or have effective wear leveling (for SSDs). Also, a
946 * non-linear mapping is required to implement compressed
947 * on-disk swapping.
948 */
949 bio->bi_sector = get_backing_swap_page()
950 << SECTORS_PER_PAGE_SHIFT;
951#endif
952 /*
953 * In case backing swap is a file, find the right offset within
954 * the file corresponding to logical position 'index'. For block
955 * device, this is a nop.
956 */
957 bio->bi_sector = map_backing_swap_page(rzs, index)
958 << SECTORS_PER_PAGE_SHIFT;
959 return 1;
960 }
961
962 bio_io_error(bio);
963 return 0;
964}
965
966
967/*
968 * Check if request is within bounds and page aligned.
969 */
970static inline int valid_swap_request(struct ramzswap *rzs, struct bio *bio)
971{
972 if (unlikely(
973 (bio->bi_sector >= (rzs->disksize >> SECTOR_SHIFT)) ||
974 (bio->bi_sector & (SECTORS_PER_PAGE - 1)) ||
975 (bio->bi_vcnt != 1) ||
976 (bio->bi_size != PAGE_SIZE) ||
977 (bio->bi_io_vec[0].bv_offset != 0))) {
978
979 return 0;
980 }
981
982 /* swap request is valid */
983 return 1;
984}
985
986/*
987 * Handler function for all ramzswap I/O requests.
988 */
989static int ramzswap_make_request(struct request_queue *queue, struct bio *bio)
990{
991 int ret = 0;
992 struct ramzswap *rzs = queue->queuedata;
993
994 if (unlikely(!rzs->init_done)) {
995 bio_io_error(bio);
996 return 0;
997 }
998
999 if (!valid_swap_request(rzs, bio)) {
1000 stat_inc(rzs->stats.invalid_io);
1001 bio_io_error(bio);
1002 return 0;
1003 }
1004
1005 switch (bio_data_dir(bio)) {
1006 case READ:
1007 ret = ramzswap_read(rzs, bio);
1008 break;
1009
1010 case WRITE:
1011 ret = ramzswap_write(rzs, bio);
1012 break;
1013 }
1014
1015 return ret;
1016}
1017
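/*
 * Free all per-device resources and return the device to the
 * uninitialized state. Called on RZSIO_RESET, on init failure, and at
 * module unload.
 */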
1018static void reset_device(struct ramzswap *rzs)
1019{
1020 int is_backing_blkdev = 0;
1021 size_t index, num_pages;
1022 unsigned entries_per_page;
1023 unsigned long num_table_pages, entry = 0;
1024
1025 if (rzs->backing_swap && !rzs->num_extents)
1026 is_backing_blkdev = 1;
1027
1028 num_pages = rzs->disksize >> PAGE_SHIFT;
1029
1030 /* Free various per-device buffers */
1031 kfree(rzs->compress_workmem);
1032 free_pages((unsigned long)rzs->compress_buffer, 1);
1033
1034 rzs->compress_workmem = NULL;
1035 rzs->compress_buffer = NULL;
1036
1037 /* Free all pages that are still in this ramzswap device */
1038 for (index = 0; index < num_pages; index++) {
1039 struct page *page;
1040 u16 offset;
1041
1042 page = rzs->table[index].page;
1043 offset = rzs->table[index].offset;
1044
1045 if (!page)
1046 continue;
1047
1048 if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
1049 __free_page(page);
1050 else
1051 xv_free(rzs->mem_pool, page, offset);
1052 }
1053
1054 entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
1055 num_table_pages = DIV_ROUND_UP(num_pages * sizeof(*rzs->table),
1056 PAGE_SIZE);
1057 /*
1058 * Set page->mapping to NULL for every table page.
1059 * Otherwise, we will hit bad_page() during free.
1060 */
1061 while (rzs->num_extents && num_table_pages--) {
1062 struct page *page;
1063 page = vmalloc_to_page(&rzs->table[entry]);
1064 page->mapping = NULL;
1065 entry += entries_per_page;
1066 }
1067 vfree(rzs->table);
1068 rzs->table = NULL;
1069
1070 xv_destroy_pool(rzs->mem_pool);
1071 rzs->mem_pool = NULL;
1072
1073 /* Free all swap extent pages */
1074 while (!list_empty(&rzs->backing_swap_extent_list)) {
1075 struct page *page;
1076 struct list_head *entry;
1077 entry = rzs->backing_swap_extent_list.next;
1078 page = list_entry(entry, struct page, lru);
1079 list_del(entry);
1080 __free_page(page);
1081 }
1082 INIT_LIST_HEAD(&rzs->backing_swap_extent_list);
1083 rzs->num_extents = 0;
1084
1085 /* Close backing swap device, if present */
1086 if (rzs->backing_swap) {
1087 if (is_backing_blkdev)
1088 bd_release(rzs->backing_swap);
1089 filp_close(rzs->swap_file, NULL);
1090 rzs->backing_swap = NULL;
1091 }
1092
1093 /* Reset stats */
1094 memset(&rzs->stats, 0, sizeof(rzs->stats));
1095
1096 rzs->disksize = 0;
1097 rzs->memlimit = 0;
1098
1099 /* Back to uninitialized state */
1100 rzs->init_done = 0;
1101}
1102
1103static int ramzswap_ioctl_init_device(struct ramzswap *rzs)
1104{
1105 int ret;
1106 size_t num_pages;
1107 struct page *page;
1108 union swap_header *swap_header;
1109
1110 if (rzs->init_done) {
1111 pr_info("Device already initialized!\n");
1112 return -EBUSY;
1113 }
1114
1115 ret = setup_backing_swap(rzs);
1116 if (ret)
1117 goto fail;
1118
1119 if (rzs->backing_swap)
1120 ramzswap_set_memlimit(rzs, totalram_pages << PAGE_SHIFT);
1121 else
1122 ramzswap_set_disksize(rzs, totalram_pages << PAGE_SHIFT);
1123
1124 rzs->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
1125 if (!rzs->compress_workmem) {
1126 pr_err("Error allocating compressor working memory!\n");
1127 ret = -ENOMEM;
1128 goto fail;
1129 }
1130
1131 rzs->compress_buffer = (void *)__get_free_pages(__GFP_ZERO, 1);
1132 if (!rzs->compress_buffer) {
1133 pr_err("Error allocating compressor buffer space\n");
1134 ret = -ENOMEM;
1135 goto fail;
1136 }
1137
1138 num_pages = rzs->disksize >> PAGE_SHIFT;
1139 rzs->table = vmalloc(num_pages * sizeof(*rzs->table));
1140 if (!rzs->table) {
1141 pr_err("Error allocating ramzswap address table\n");
1142 /* To prevent accessing table entries during cleanup */
1143 rzs->disksize = 0;
1144 ret = -ENOMEM;
1145 goto fail;
1146 }
1147 memset(rzs->table, 0, num_pages * sizeof(*rzs->table));
1148
1149 map_backing_swap_extents(rzs);
1150
1151 page = alloc_page(__GFP_ZERO);
1152 if (!page) {
1153 pr_err("Error allocating swap header page\n");
1154 ret = -ENOMEM;
1155 goto fail;
1156 }
1157 rzs->table[0].page = page;
1158 rzs_set_flag(rzs, 0, RZS_UNCOMPRESSED);
1159
1160 swap_header = kmap(page);
1161 ret = setup_swap_header(rzs, swap_header);
1162 kunmap(page);
1163 if (ret) {
1164 pr_err("Error setting swap header\n");
1165 goto fail;
1166 }
1167
1168 set_capacity(rzs->disk, rzs->disksize >> SECTOR_SHIFT);
1169
1170 /*
1171 * We have an identity mapping of sectors between ramzswap
 1172 * and the backing swap device. So, this queue flag
 1173 * should match the backing device.
1174 */
1175 if (!rzs->backing_swap ||
1176 blk_queue_nonrot(rzs->backing_swap->bd_disk->queue))
1177 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, rzs->disk->queue);
1178
1179 rzs->mem_pool = xv_create_pool();
1180 if (!rzs->mem_pool) {
1181 pr_err("Error creating memory pool\n");
1182 ret = -ENOMEM;
1183 goto fail;
1184 }
1185
1186 /*
1187 * Pages that compress to a size greater than this are forwarded
 1188 * to the physical swap disk (if a backing device is provided).
1189 * TODO: make this configurable
1190 */
1191 if (rzs->backing_swap)
1192 max_zpage_size = max_zpage_size_bdev;
1193 else
1194 max_zpage_size = max_zpage_size_nobdev;
1195 pr_debug("Max compressed page size: %u bytes\n", max_zpage_size);
1196
1197 rzs->init_done = 1;
1198
1199 pr_debug("Initialization done!\n");
1200 return 0;
1201
1202fail:
1203 reset_device(rzs);
1204
1205 pr_err("Initialization failed: err=%d\n", ret);
1206 return ret;
1207}
1208
1209static int ramzswap_ioctl_reset_device(struct ramzswap *rzs)
1210{
1211 if (rzs->init_done)
1212 reset_device(rzs);
1213
1214 return 0;
1215}
1216
1217static int ramzswap_ioctl(struct block_device *bdev, fmode_t mode,
1218 unsigned int cmd, unsigned long arg)
1219{
1220 int ret = 0;
1221 size_t disksize_kb, memlimit_kb;
1222
1223 struct ramzswap *rzs = bdev->bd_disk->private_data;
1224
1225 switch (cmd) {
1226 case RZSIO_SET_DISKSIZE_KB:
1227 if (rzs->init_done) {
1228 ret = -EBUSY;
1229 goto out;
1230 }
1231 if (copy_from_user(&disksize_kb, (void *)arg,
1232 _IOC_SIZE(cmd))) {
1233 ret = -EFAULT;
1234 goto out;
1235 }
1236 rzs->disksize = disksize_kb << 10;
1237 pr_info("Disk size set to %zu kB\n", disksize_kb);
1238 break;
1239
1240 case RZSIO_SET_MEMLIMIT_KB:
1241 if (rzs->init_done) {
1242 /* TODO: allow changing memlimit */
1243 ret = -EBUSY;
1244 goto out;
1245 }
1246 if (copy_from_user(&memlimit_kb, (void *)arg,
1247 _IOC_SIZE(cmd))) {
1248 ret = -EFAULT;
1249 goto out;
1250 }
1251 rzs->memlimit = memlimit_kb << 10;
1252 pr_info("Memory limit set to %zu kB\n", memlimit_kb);
1253 break;
1254
1255 case RZSIO_SET_BACKING_SWAP:
1256 if (rzs->init_done) {
1257 ret = -EBUSY;
1258 goto out;
1259 }
1260
1261 if (copy_from_user(&rzs->backing_swap_name, (void *)arg,
1262 _IOC_SIZE(cmd))) {
1263 ret = -EFAULT;
1264 goto out;
1265 }
1266 rzs->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0';
1267 pr_info("Backing swap set to %s\n", rzs->backing_swap_name);
1268 break;
1269
1270 case RZSIO_GET_STATS:
1271 {
1272 struct ramzswap_ioctl_stats *stats;
1273 if (!rzs->init_done) {
1274 ret = -ENOTTY;
1275 goto out;
1276 }
1277 stats = kzalloc(sizeof(*stats), GFP_KERNEL);
1278 if (!stats) {
1279 ret = -ENOMEM;
1280 goto out;
1281 }
1282 ramzswap_ioctl_get_stats(rzs, stats);
1283 if (copy_to_user((void *)arg, stats, sizeof(*stats))) {
1284 kfree(stats);
1285 ret = -EFAULT;
1286 goto out;
1287 }
1288 kfree(stats);
1289 break;
1290 }
1291 case RZSIO_INIT:
1292 ret = ramzswap_ioctl_init_device(rzs);
1293 break;
1294
1295 case RZSIO_RESET:
1296 /* Do not reset an active device! */
1297 if (bdev->bd_holders) {
1298 ret = -EBUSY;
1299 goto out;
1300 }
1301 ret = ramzswap_ioctl_reset_device(rzs);
1302 break;
1303
1304 default:
1305 pr_info("Invalid ioctl %u\n", cmd);
1306 ret = -ENOTTY;
1307 }
1308
1309out:
1310 return ret;
1311}
1312
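/*
 * Illustrative sketch (not part of this driver): a user-space tool would
 * typically drive the ioctl interface above roughly as follows, assuming
 * a /dev/ramzswap0 device node and the RZSIO_* commands handled in
 * ramzswap_ioctl():
 *
 *	int fd = open("/dev/ramzswap0", O_RDWR);
 *	size_t disksize_kb = 262144;	(256 MB)
 *	ioctl(fd, RZSIO_SET_DISKSIZE_KB, &disksize_kb);
 *	ioctl(fd, RZSIO_INIT, 0);
 *
 * followed by swapon on the device (the driver writes the swap header
 * itself, so no separate mkswap step is needed).
 */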
1313static struct block_device_operations ramzswap_devops = {
1314 .ioctl = ramzswap_ioctl,
1315 .owner = THIS_MODULE,
1316};
1317
1318static void create_device(struct ramzswap *rzs, int device_id)
1319{
1320 mutex_init(&rzs->lock);
1321 INIT_LIST_HEAD(&rzs->backing_swap_extent_list);
1322
1323 rzs->queue = blk_alloc_queue(GFP_KERNEL);
1324 if (!rzs->queue) {
1325 pr_err("Error allocating disk queue for device %d\n",
1326 device_id);
1327 return;
1328 }
1329
1330 blk_queue_make_request(rzs->queue, ramzswap_make_request);
1331 rzs->queue->queuedata = rzs;
1332
1333 /* gendisk structure */
1334 rzs->disk = alloc_disk(1);
1335 if (!rzs->disk) {
1336 blk_cleanup_queue(rzs->queue);
1337 pr_warning("Error allocating disk structure for device %d\n",
1338 device_id);
1339 return;
1340 }
1341
1342 rzs->disk->major = ramzswap_major;
1343 rzs->disk->first_minor = device_id;
1344 rzs->disk->fops = &ramzswap_devops;
1345 rzs->disk->queue = rzs->queue;
1346 rzs->disk->private_data = rzs;
1347 snprintf(rzs->disk->disk_name, 16, "ramzswap%d", device_id);
1348
1349 /*
1350 * Actual capacity is set using the RZSIO_SET_DISKSIZE_KB ioctl
 1351 * or made equal to the backing swap device size (if provided).
1352 */
1353 set_capacity(rzs->disk, 0);
1354 add_disk(rzs->disk);
1355
1356 rzs->init_done = 0;
1357}
1358
1359static void destroy_device(struct ramzswap *rzs)
1360{
1361 if (rzs->disk) {
1362 del_gendisk(rzs->disk);
1363 put_disk(rzs->disk);
1364 }
1365
1366 if (rzs->queue)
1367 blk_cleanup_queue(rzs->queue);
1368}
1369
1370static int __init ramzswap_init(void)
1371{
1372 int i, ret;
1373
1374 if (num_devices > max_num_devices) {
1375 pr_warning("Invalid value for num_devices: %u\n",
1376 num_devices);
1377 return -EINVAL;
1378 }
1379
1380 ramzswap_major = register_blkdev(0, "ramzswap");
1381 if (ramzswap_major <= 0) {
1382 pr_warning("Unable to get major number\n");
1383 return -EBUSY;
1384 }
1385
1386 if (!num_devices) {
1387 pr_info("num_devices not specified. Using default: 1\n");
1388 num_devices = 1;
1389 }
1390
1391 /* Allocate the device array and initialize each one */
1392 pr_info("Creating %u devices ...\n", num_devices);
1393 devices = kzalloc(num_devices * sizeof(struct ramzswap), GFP_KERNEL);
1394 if (!devices) {
1395 ret = -ENOMEM;
1396 goto out;
1397 }
1398
1399 for (i = 0; i < num_devices; i++)
1400 create_device(&devices[i], i);
1401
1402 return 0;
1403out:
1404 unregister_blkdev(ramzswap_major, "ramzswap");
1405 return ret;
1406}
1407
1408static void __exit ramzswap_exit(void)
1409{
1410 int i;
1411 struct ramzswap *rzs;
1412
1413 for (i = 0; i < num_devices; i++) {
1414 rzs = &devices[i];
1415
1416 destroy_device(rzs);
1417 if (rzs->init_done)
1418 reset_device(rzs);
1419 }
1420
1421 unregister_blkdev(ramzswap_major, "ramzswap");
1422
1423 kfree(devices);
1424 pr_debug("Cleanup done!\n");
1425}
1426
1427module_param(num_devices, uint, 0);
1428MODULE_PARM_DESC(num_devices, "Number of ramzswap devices");
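/*
 * Example (illustrative): loading the module with
 *	modprobe ramzswap num_devices=4
 * creates four devices, ramzswap0 through ramzswap3, each configured
 * independently through the ioctls above.
 */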
1429
1430module_init(ramzswap_init);
1431module_exit(ramzswap_exit);
1432
1433MODULE_LICENSE("Dual BSD/GPL");
1434MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
1435MODULE_DESCRIPTION("Compressed RAM Based Swap Device");