/*
 * Copyright (C) 2012 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-cache-policy.h"

#include <linux/hash.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "cache-policy-mq"

static struct kmem_cache *mq_entry_cache;

/*----------------------------------------------------------------*/
static unsigned next_power(unsigned n, unsigned min)
{
	return roundup_pow_of_two(max(n, min));
}
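/*
 * Worked example (illustrative, not from the original source):
 * next_power(9, 16) returns 16 and next_power(40, 16) returns 64, since the
 * larger of the two arguments is rounded up to the next power of two.
 * mq_create() below uses this to size the hash table's bucket array.
 */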
/*----------------------------------------------------------------*/

static unsigned long *alloc_bitset(unsigned nr_entries)
{
	size_t s = sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);

	return vzalloc(s);
}

static void free_bitset(unsigned long *bits)
{
	vfree(bits);
}

/*----------------------------------------------------------------*/
/*
 * Large, sequential ios are probably better left on the origin device since
 * spindles tend to have good bandwidth.
 *
 * The io_tracker tries to spot when the io is in one of these sequential
 * modes.
 *
 * Two thresholds to switch between random and sequential io mode default as
 * follows and can be adjusted via the constructor and message interfaces.
 */
#define RANDOM_THRESHOLD_DEFAULT 4
#define SEQUENTIAL_THRESHOLD_DEFAULT 512
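/*
 * Example (illustrative; assumes the usual dm-cache message interface):
 * the sequential threshold could be raised at runtime with something like
 *
 *     dmsetup message <cache-device> 0 sequential_threshold 1024
 *
 * which is routed to mq_set_config_value() below.
 */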
enum io_pattern {
	PATTERN_SEQUENTIAL,
	PATTERN_RANDOM
};

struct io_tracker {
	enum io_pattern pattern;

	unsigned nr_seq_samples;
	unsigned nr_rand_samples;
	unsigned thresholds[2];

	dm_oblock_t last_end_oblock;
};
static void iot_init(struct io_tracker *t,
		     int sequential_threshold, int random_threshold)
{
	t->pattern = PATTERN_RANDOM;
	t->nr_seq_samples = 0;
	t->nr_rand_samples = 0;
	t->last_end_oblock = 0;
	t->thresholds[PATTERN_RANDOM] = random_threshold;
	t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold;
}
static enum io_pattern iot_pattern(struct io_tracker *t)
{
	return t->pattern;
}
static void iot_update_stats(struct io_tracker *t, struct bio *bio)
{
	if (bio->bi_sector == from_oblock(t->last_end_oblock) + 1)
		t->nr_seq_samples++;
	else {
		/*
		 * Just one non-sequential IO is enough to reset the
		 * counters.
		 */
		if (t->nr_seq_samples) {
			t->nr_seq_samples = 0;
			t->nr_rand_samples = 0;
		}

		t->nr_rand_samples++;
	}

	t->last_end_oblock = to_oblock(bio->bi_sector + bio_sectors(bio) - 1);
}
static void iot_check_for_pattern_switch(struct io_tracker *t)
{
	switch (t->pattern) {
	case PATTERN_SEQUENTIAL:
		if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) {
			t->pattern = PATTERN_RANDOM;
			t->nr_seq_samples = t->nr_rand_samples = 0;
		}
		break;

	case PATTERN_RANDOM:
		if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) {
			t->pattern = PATTERN_SEQUENTIAL;
			t->nr_seq_samples = t->nr_rand_samples = 0;
		}
		break;
	}
}
static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
{
	iot_update_stats(t, bio);
	iot_check_for_pattern_switch(t);
}

/*----------------------------------------------------------------*/
/*
 * This queue is divided up into different levels, allowing us to push
 * entries to the back of any of the levels.  Think of it as a partially
 * sorted queue.
 */
#define NR_QUEUE_LEVELS 16u

struct queue {
	struct list_head qs[NR_QUEUE_LEVELS];
};
static void queue_init(struct queue *q)
{
	unsigned i;

	for (i = 0; i < NR_QUEUE_LEVELS; i++)
		INIT_LIST_HEAD(q->qs + i);
}
/*
 * Checks to see if the queue is empty.
 * FIXME: reduce cpu usage.
 */
static bool queue_empty(struct queue *q)
{
	unsigned i;

	for (i = 0; i < NR_QUEUE_LEVELS; i++)
		if (!list_empty(q->qs + i))
			return false;

	return true;
}
/*
 * Insert an entry to the back of the given level.
 */
static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
{
	list_add_tail(elt, q->qs + level);
}

static void queue_remove(struct list_head *elt)
{
	list_del(elt);
}
/*
 * Shifts all regions down one level.  This has no effect on the order of
 * the queue.
 */
static void queue_shift_down(struct queue *q)
{
	unsigned level;

	for (level = 1; level < NR_QUEUE_LEVELS; level++)
		list_splice_init(q->qs + level, q->qs + level - 1);
}
/*
 * Gives us the oldest entry of the lowest populated level.  If the first
 * level is emptied then we shift down one level.
 */
static struct list_head *queue_pop(struct queue *q)
{
	unsigned level;
	struct list_head *r;

	for (level = 0; level < NR_QUEUE_LEVELS; level++)
		if (!list_empty(q->qs + level)) {
			r = q->qs[level].next;
			list_del(r);

			/* have we just emptied the bottom level? */
			if (level == 0 && list_empty(q->qs))
				queue_shift_down(q);

			return r;
		}

	return NULL;
}
static struct list_head *list_pop(struct list_head *lh)
{
	struct list_head *r = lh->next;

	BUG_ON(!r);
	list_del_init(r);

	return r;
}

/*----------------------------------------------------------------*/
/*
 * Describes a cache entry.  Used in both the cache and the pre_cache.
 */
struct entry {
	struct hlist_node hlist;
	struct list_head list;
	dm_oblock_t oblock;
	dm_cblock_t cblock;	/* valid iff in_cache */

	/*
	 * FIXME: pack these better
	 */
	bool in_cache:1;
	bool dirty:1;
	unsigned hit_count;
	unsigned generation;
	unsigned tick;
};
struct mq_policy {
	struct dm_cache_policy policy;

	/* protects everything */
	struct mutex lock;
	dm_cblock_t cache_size;
	struct io_tracker tracker;

	/*
	 * We maintain three queues of entries.  The cache proper,
	 * consisting of a clean and dirty queue, contains the currently
	 * active mappings.  Whereas the pre_cache tracks blocks that
	 * are being hit frequently and potential candidates for promotion
	 * to the cache.
	 */
	struct queue pre_cache;
	struct queue cache_clean;
	struct queue cache_dirty;

	/*
	 * Keeps track of time, incremented by the core.  We use this to
	 * avoid attributing multiple hits within the same tick.
	 *
	 * Access to tick_protected should be done with the spin lock held.
	 * It's copied to tick at the start of the map function (within the
	 * mutex).
	 */
	spinlock_t tick_lock;
	unsigned tick_protected;
	unsigned tick;

	/*
	 * A count of the number of times the map function has been called
	 * and found an entry in the pre_cache or cache.  Currently used to
	 * calculate the generation.
	 */
	unsigned hit_count;

	/*
	 * A generation is a longish period that is used to trigger some
	 * book keeping effects.  eg, decrementing hit counts on entries.
	 * This is needed to allow the cache to evolve as io patterns
	 * change.
	 */
	unsigned generation;
	unsigned generation_period; /* in lookups (will probably change) */

	/*
	 * Entries in the pre_cache whose hit count passes the promotion
	 * threshold move to the cache proper.  Working out the correct
	 * value for the promotion_threshold is crucial to this policy.
	 */
	unsigned promote_threshold;

	/*
	 * We need cache_size entries for the cache, and choose to have
	 * cache_size entries for the pre_cache too.  One motivation for
	 * using the same size is to make the hit counts directly
	 * comparable between pre_cache and cache.
	 */
	unsigned nr_entries;
	unsigned nr_entries_allocated;
	struct list_head free;

	/*
	 * Cache blocks may be unallocated.  We store this info in a
	 * bitset.
	 */
	unsigned long *allocation_bitset;
	unsigned nr_cblocks_allocated;
	unsigned find_free_nr_words;
	unsigned find_free_last_word;

	/*
	 * The hash table allows us to quickly find an entry by origin
	 * block.  Both pre_cache and cache entries are in here.
	 */
	unsigned nr_buckets;
	dm_block_t hash_bits;
	struct hlist_head *table;
};

/*----------------------------------------------------------------*/
/* Free/alloc mq cache entry structures. */
static void concat_queue(struct list_head *lh, struct queue *q)
{
	unsigned level;

	for (level = 0; level < NR_QUEUE_LEVELS; level++)
		list_splice(q->qs + level, lh);
}
static void free_entries(struct mq_policy *mq)
{
	struct entry *e, *tmp;

	concat_queue(&mq->free, &mq->pre_cache);
	concat_queue(&mq->free, &mq->cache_clean);
	concat_queue(&mq->free, &mq->cache_dirty);

	list_for_each_entry_safe(e, tmp, &mq->free, list)
		kmem_cache_free(mq_entry_cache, e);
}
static int alloc_entries(struct mq_policy *mq, unsigned elts)
{
	unsigned u = mq->nr_entries;

	INIT_LIST_HEAD(&mq->free);
	mq->nr_entries_allocated = 0;

	while (u--) {
		struct entry *e = kmem_cache_zalloc(mq_entry_cache, GFP_KERNEL);

		if (!e) {
			free_entries(mq);
			return -ENOMEM;
		}

		list_add(&e->list, &mq->free);
	}

	return 0;
}

/*----------------------------------------------------------------*/
/*
 * Simple hash table implementation.  Should replace with the standard hash
 * table that's making its way upstream.
 */
static void hash_insert(struct mq_policy *mq, struct entry *e)
{
	unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits);

	hlist_add_head(&e->hlist, mq->table + h);
}

static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock)
{
	unsigned h = hash_64(from_oblock(oblock), mq->hash_bits);
	struct hlist_head *bucket = mq->table + h;
	struct entry *e;

	hlist_for_each_entry(e, bucket, hlist)
		if (e->oblock == oblock) {
			hlist_del(&e->hlist);
			hlist_add_head(&e->hlist, bucket);
			return e;
		}

	return NULL;
}

static void hash_remove(struct entry *e)
{
	hlist_del(&e->hlist);
}

/*----------------------------------------------------------------*/
/*
 * Allocates a new entry structure.  The memory is allocated in one lump,
 * so we just hand it out here.  Returns NULL if all entries have
 * already been allocated.  Cannot fail otherwise.
 */
static struct entry *alloc_entry(struct mq_policy *mq)
{
	struct entry *e;

	if (mq->nr_entries_allocated >= mq->nr_entries) {
		BUG_ON(!list_empty(&mq->free));
		return NULL;
	}

	e = list_entry(list_pop(&mq->free), struct entry, list);
	INIT_LIST_HEAD(&e->list);
	INIT_HLIST_NODE(&e->hlist);

	mq->nr_entries_allocated++;
	return e;
}

/*----------------------------------------------------------------*/
/*
 * Mark cache blocks allocated or not in the bitset.
 */
static void alloc_cblock(struct mq_policy *mq, dm_cblock_t cblock)
{
	BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size));
	BUG_ON(test_bit(from_cblock(cblock), mq->allocation_bitset));

	set_bit(from_cblock(cblock), mq->allocation_bitset);
	mq->nr_cblocks_allocated++;
}

static void free_cblock(struct mq_policy *mq, dm_cblock_t cblock)
{
	BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size));
	BUG_ON(!test_bit(from_cblock(cblock), mq->allocation_bitset));

	clear_bit(from_cblock(cblock), mq->allocation_bitset);
	mq->nr_cblocks_allocated--;
}

static bool any_free_cblocks(struct mq_policy *mq)
{
	return mq->nr_cblocks_allocated < from_cblock(mq->cache_size);
}

static bool any_clean_cblocks(struct mq_policy *mq)
{
	return !queue_empty(&mq->cache_clean);
}
/*
 * Fills result out with a cache block that isn't in use, or return
 * -ENOSPC.  This does _not_ mark the cblock as allocated, the caller is
 * responsible for that.
 */
static int __find_free_cblock(struct mq_policy *mq, unsigned begin, unsigned end,
			      dm_cblock_t *result, unsigned *last_word)
{
	int r = -ENOSPC;
	unsigned w;

	for (w = begin; w < end; w++) {
		/*
		 * ffz is undefined if no zero exists
		 */
		if (mq->allocation_bitset[w] != ~0UL) {
			*last_word = w;
			*result = to_cblock((w * BITS_PER_LONG) + ffz(mq->allocation_bitset[w]));
			if (from_cblock(*result) < from_cblock(mq->cache_size))
				r = 0;

			break;
		}
	}

	return r;
}

static int find_free_cblock(struct mq_policy *mq, dm_cblock_t *result)
{
	int r;

	if (!any_free_cblocks(mq))
		return -ENOSPC;

	r = __find_free_cblock(mq, mq->find_free_last_word, mq->find_free_nr_words, result, &mq->find_free_last_word);
	if (r == -ENOSPC && mq->find_free_last_word)
		r = __find_free_cblock(mq, 0, mq->find_free_last_word, result, &mq->find_free_last_word);

	return r;
}

/*----------------------------------------------------------------*/
/*
 * Now we get to the meat of the policy.  This section deals with deciding
 * when to add entries to the pre_cache and cache, and move between
 * them.
 */

/*
 * The queue level is based on the log2 of the hit count.
 */
static unsigned queue_level(struct entry *e)
{
	return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u);
}
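/*
 * For example, an entry with a hit count of 10 sits at level ilog2(10) = 3,
 * and anything with a hit count of 2^15 or more is clamped to the top
 * level (15).
 */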
/*
 * Inserts the entry into the pre_cache or the cache.  Ensures the cache
 * block is marked as allocated if necessary.  Inserts into the hash table.
 * Sets the tick which records when the entry was last moved about.
 */
static void push(struct mq_policy *mq, struct entry *e)
{
	e->tick = mq->tick;
	hash_insert(mq, e);

	if (e->in_cache) {
		alloc_cblock(mq, e->cblock);
		queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean,
			   queue_level(e), &e->list);
	} else
		queue_push(&mq->pre_cache, queue_level(e), &e->list);
}
/*
 * Removes an entry from pre_cache or cache.  Removes from the hash table.
 * Frees off the cache block if necessary.
 */
static void del(struct mq_policy *mq, struct entry *e)
{
	queue_remove(&e->list);
	hash_remove(e);
	if (e->in_cache)
		free_cblock(mq, e->cblock);
}
/*
 * Like del, except it removes the first entry in the queue (ie. the least
 * recently used).
 */
static struct entry *pop(struct mq_policy *mq, struct queue *q)
{
	struct entry *e;
	struct list_head *h = queue_pop(q);

	if (!h)
		return NULL;

	e = container_of(h, struct entry, list);
	hash_remove(e);
	if (e->in_cache)
		free_cblock(mq, e->cblock);

	return e;
}
/*
 * Has this entry already been updated?
 */
static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
{
	return mq->tick == e->tick;
}
/*
 * The promotion threshold is adjusted every generation.  As are the counts
 * of the entries.
 *
 * At the moment the threshold is taken by averaging the hit counts of some
 * of the entries in the cache (the first 20 entries across all levels in
 * ascending order, giving preference to the clean entries at each level).
 *
 * We can be much cleverer than this though.  For example, each promotion
 * could bump up the threshold helping to prevent churn.  Much more to do
 * here.
 */

#define MAX_TO_AVERAGE 20

static void check_generation(struct mq_policy *mq)
{
	unsigned total = 0, nr = 0, count = 0, level;
	struct list_head *head;
	struct entry *e;

	if ((mq->hit_count >= mq->generation_period) &&
	    (mq->nr_cblocks_allocated == from_cblock(mq->cache_size))) {

		mq->hit_count = 0;
		mq->generation++;

		for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
			head = mq->cache_clean.qs + level;
			list_for_each_entry(e, head, list) {
				nr++;
				total += e->hit_count;

				if (++count >= MAX_TO_AVERAGE)
					break;
			}

			head = mq->cache_dirty.qs + level;
			list_for_each_entry(e, head, list) {
				nr++;
				total += e->hit_count;

				if (++count >= MAX_TO_AVERAGE)
					break;
			}
		}

		mq->promote_threshold = nr ? total / nr : 1;
		if (mq->promote_threshold * nr < total)
			mq->promote_threshold++;
	}
}
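/*
 * Worked example: if the sampled entries number nr = 20 with a combined hit
 * count of total = 90, promote_threshold becomes 90 / 20 = 4 and is then
 * bumped to 5 because 4 * 20 < 90.
 */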
/*
 * Whenever we use an entry we bump up its hit counter, and push it to the
 * back of its current level.
 */
static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
{
	if (updated_this_tick(mq, e))
		return;

	e->hit_count++;
	mq->hit_count++;
	check_generation(mq);

	/* generation adjustment, to stop the counts increasing forever. */
	/* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */
	e->generation = mq->generation;

	del(mq, e);
	push(mq, e);
}
/*
 * Demote the least recently used entry from the cache to the pre_cache.
 * Returns the new cache entry to use, and the old origin block it was
 * mapped to.
 *
 * We drop the hit count on the demoted entry back to 1 to stop it bouncing
 * straight back into the cache if it's subsequently hit.  There are
 * various options here, and more experimentation would be good:
 *
 * - just forget about the demoted entry completely (ie. don't insert it
 *   into the pre_cache).
 * - divide the hit count rather than setting it to some hard coded value.
 * - set the hit count to a hard coded value other than 1, eg, is it better
 *   if it goes in at level 2?
 */
static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock, dm_cblock_t *cblock)
{
	struct entry *demoted = pop(mq, &mq->cache_clean);

	if (!demoted)
		/*
		 * We could get a block from mq->cache_dirty, but that
		 * would add extra latency to the triggering bio as it
		 * waits for the writeback.  Better to not promote this
		 * time and hope there's a clean block next time this block
		 * is hit.
		 */
		return -ENOSPC;

	*cblock = demoted->cblock;
	*oblock = demoted->oblock;
	demoted->in_cache = false;
	demoted->dirty = false;
	demoted->hit_count = 1;
	push(mq, demoted);

	return 0;
}
/*
 * We modify the basic promotion_threshold depending on the specific io.
 *
 * If the origin block has been discarded then there's no cost to copy it
 * to the cache.
 *
 * We bias towards reads, since they can be demoted at no cost if they
 * haven't been dirtied.
 */
#define DISCARDED_PROMOTE_THRESHOLD 1
#define READ_PROMOTE_THRESHOLD 4
#define WRITE_PROMOTE_THRESHOLD 8

static unsigned adjusted_promote_threshold(struct mq_policy *mq,
					   bool discarded_oblock, int data_dir)
{
	if (data_dir == READ)
		return mq->promote_threshold + READ_PROMOTE_THRESHOLD;

	if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
		/*
		 * We don't need to do any copying at all, so give this a
		 * very low threshold.
		 */
		return DISCARDED_PROMOTE_THRESHOLD;
	}

	return mq->promote_threshold + WRITE_PROMOTE_THRESHOLD;
}
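/*
 * For example, with promote_threshold at 4 a read needs a hit count of at
 * least 8 to be promoted and a write needs 12, whereas a discarded origin
 * block needs only a single hit when a free or clean cache block is
 * available.
 */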
static bool should_promote(struct mq_policy *mq, struct entry *e,
			   bool discarded_oblock, int data_dir)
{
	return e->hit_count >=
		adjusted_promote_threshold(mq, discarded_oblock, data_dir);
}
static int cache_entry_found(struct mq_policy *mq,
			     struct entry *e,
			     struct policy_result *result)
{
	requeue_and_update_tick(mq, e);

	if (e->in_cache) {
		result->op = POLICY_HIT;
		result->cblock = e->cblock;
	}

	return 0;
}
/*
 * Moves an entry from the pre_cache to the cache.  The main work is
 * finding which cache block to use.
 */
static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
			      struct policy_result *result)
{
	int r;
	dm_cblock_t cblock;

	if (find_free_cblock(mq, &cblock) == -ENOSPC) {
		result->op = POLICY_REPLACE;
		r = demote_cblock(mq, &result->old_oblock, &cblock);
		if (r) {
			result->op = POLICY_MISS;
			return 0;
		}

	} else
		result->op = POLICY_NEW;

	result->cblock = e->cblock = cblock;

	del(mq, e);
	e->in_cache = true;
	push(mq, e);

	return 0;
}
static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
				 bool can_migrate, bool discarded_oblock,
				 int data_dir, struct policy_result *result)
{
	int r = 0;
	bool updated = updated_this_tick(mq, e);

	requeue_and_update_tick(mq, e);

	if ((!discarded_oblock && updated) ||
	    !should_promote(mq, e, discarded_oblock, data_dir))
		result->op = POLICY_MISS;
	else if (!can_migrate)
		r = -EWOULDBLOCK;
	else
		r = pre_cache_to_cache(mq, e, result);

	return r;
}
static void insert_entry_in_pre_cache(struct mq_policy *mq,
				      struct entry *e, dm_oblock_t oblock)
{
	e->in_cache = false;
	e->dirty = false;
	e->oblock = oblock;
	e->hit_count = 1;
	e->generation = mq->generation;
	push(mq, e);
}

static void insert_in_pre_cache(struct mq_policy *mq,
				dm_oblock_t oblock)
{
	struct entry *e = alloc_entry(mq);

	if (!e)
		/*
		 * There's no spare entry structure, so we grab the least
		 * used one from the pre_cache.
		 */
		e = pop(mq, &mq->pre_cache);

	if (!e) {
		DMWARN("couldn't pop from pre cache");
		return;
	}

	insert_entry_in_pre_cache(mq, e, oblock);
}
static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
			    struct policy_result *result)
{
	int r;
	struct entry *e;
	dm_cblock_t cblock;

	if (find_free_cblock(mq, &cblock) == -ENOSPC) {
		r = demote_cblock(mq, &result->old_oblock, &cblock);
		if (r) {
			result->op = POLICY_MISS;
			insert_in_pre_cache(mq, oblock);
			return;
		}

		/*
		 * This will always succeed, since we've just demoted.
		 */
		e = pop(mq, &mq->pre_cache);
		result->op = POLICY_REPLACE;

	} else {
		e = pop(mq, &mq->pre_cache);
		if (!e) {
			result->op = POLICY_MISS;
			return;
		}

		result->op = POLICY_NEW;
	}

	e->oblock = oblock;
	e->cblock = cblock;
	e->in_cache = true;
	e->dirty = false;
	e->hit_count = 1;
	e->generation = mq->generation;
	push(mq, e);

	result->cblock = e->cblock;
}
static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
			  bool can_migrate, bool discarded_oblock,
			  int data_dir, struct policy_result *result)
{
	if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) == 1) {
		if (can_migrate)
			insert_in_cache(mq, oblock, result);
		else
			return -EWOULDBLOCK;
	} else {
		insert_in_pre_cache(mq, oblock);
		result->op = POLICY_MISS;
	}

	return 0;
}
/*
 * Looks the oblock up in the hash table, then decides whether to put in
 * pre_cache, or cache etc.
 */
static int map(struct mq_policy *mq, dm_oblock_t oblock,
	       bool can_migrate, bool discarded_oblock,
	       int data_dir, struct policy_result *result)
{
	int r = 0;
	struct entry *e = hash_lookup(mq, oblock);

	if (e && e->in_cache)
		r = cache_entry_found(mq, e, result);
	else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
		result->op = POLICY_MISS;
	else if (e)
		r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
					  data_dir, result);
	else
		r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
				   data_dir, result);

	if (r == -EWOULDBLOCK)
		result->op = POLICY_MISS;

	return r;
}

/*----------------------------------------------------------------*/
/*
 * Public interface, via the policy struct.  See dm-cache-policy.h for a
 * description of these.
 */

static struct mq_policy *to_mq_policy(struct dm_cache_policy *p)
{
	return container_of(p, struct mq_policy, policy);
}
static void mq_destroy(struct dm_cache_policy *p)
{
	struct mq_policy *mq = to_mq_policy(p);

	free_bitset(mq->allocation_bitset);
	kfree(mq->table);
	free_entries(mq);
	kfree(mq);
}
static void copy_tick(struct mq_policy *mq)
{
	unsigned long flags;

	spin_lock_irqsave(&mq->tick_lock, flags);
	mq->tick = mq->tick_protected;
	spin_unlock_irqrestore(&mq->tick_lock, flags);
}
static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
		  bool can_block, bool can_migrate, bool discarded_oblock,
		  struct bio *bio, struct policy_result *result)
{
	int r;
	struct mq_policy *mq = to_mq_policy(p);

	result->op = POLICY_MISS;

	if (can_block)
		mutex_lock(&mq->lock);
	else if (!mutex_trylock(&mq->lock))
		return -EWOULDBLOCK;

	copy_tick(mq);

	iot_examine_bio(&mq->tracker, bio);
	r = map(mq, oblock, can_migrate, discarded_oblock,
		bio_data_dir(bio), result);

	mutex_unlock(&mq->lock);

	return r;
}
static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
{
	int r;
	struct mq_policy *mq = to_mq_policy(p);
	struct entry *e;

	if (!mutex_trylock(&mq->lock))
		return -EWOULDBLOCK;

	e = hash_lookup(mq, oblock);
	if (e && e->in_cache) {
		*cblock = e->cblock;
		r = 0;
	} else
		r = -ENOENT;

	mutex_unlock(&mq->lock);

	return r;
}
/*
 * FIXME: __mq_set_clear_dirty can block due to mutex.
 * Ideally a policy should not block in functions called
 * from the map() function.  Explore using RCU.
 */
static void __mq_set_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock, bool set)
{
	struct mq_policy *mq = to_mq_policy(p);
	struct entry *e;

	mutex_lock(&mq->lock);
	e = hash_lookup(mq, oblock);
	if (!e)
		DMWARN("__mq_set_clear_dirty called for a block that isn't in the cache");
	else {
		BUG_ON(!e->in_cache);

		del(mq, e);
		e->dirty = set;
		push(mq, e);
	}
	mutex_unlock(&mq->lock);
}
static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
{
	__mq_set_clear_dirty(p, oblock, true);
}

static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
{
	__mq_set_clear_dirty(p, oblock, false);
}
static int mq_load_mapping(struct dm_cache_policy *p,
			   dm_oblock_t oblock, dm_cblock_t cblock,
			   uint32_t hint, bool hint_valid)
{
	struct mq_policy *mq = to_mq_policy(p);
	struct entry *e;

	e = alloc_entry(mq);
	if (!e)
		return -ENOMEM;

	e->cblock = cblock;
	e->oblock = oblock;
	e->in_cache = true;
	e->dirty = false;	/* this gets corrected in a minute */
	e->hit_count = hint_valid ? hint : 1;
	e->generation = mq->generation;
	push(mq, e);

	return 0;
}
static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
			    void *context)
{
	struct mq_policy *mq = to_mq_policy(p);
	int r = 0;
	struct entry *e;
	unsigned level;

	mutex_lock(&mq->lock);

	for (level = 0; level < NR_QUEUE_LEVELS; level++)
		list_for_each_entry(e, &mq->cache_clean.qs[level], list) {
			r = fn(context, e->cblock, e->oblock, e->hit_count);
			if (r)
				goto out;
		}

	for (level = 0; level < NR_QUEUE_LEVELS; level++)
		list_for_each_entry(e, &mq->cache_dirty.qs[level], list) {
			r = fn(context, e->cblock, e->oblock, e->hit_count);
			if (r)
				goto out;
		}

out:
	mutex_unlock(&mq->lock);

	return r;
}
static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
{
	struct mq_policy *mq = to_mq_policy(p);
	struct entry *e;

	mutex_lock(&mq->lock);

	e = hash_lookup(mq, oblock);

	BUG_ON(!e || !e->in_cache);

	del(mq, e);
	e->in_cache = false;
	e->dirty = false;
	push(mq, e);

	mutex_unlock(&mq->lock);
}
static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
			       dm_cblock_t *cblock)
{
	struct entry *e = pop(mq, &mq->cache_dirty);

	if (!e)
		return -ENODATA;

	*oblock = e->oblock;
	*cblock = e->cblock;
	e->dirty = false;
	push(mq, e);

	return 0;
}
static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
			     dm_cblock_t *cblock)
{
	int r;
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	r = __mq_writeback_work(mq, oblock, cblock);
	mutex_unlock(&mq->lock);

	return r;
}
static void force_mapping(struct mq_policy *mq,
			  dm_oblock_t current_oblock, dm_oblock_t new_oblock)
{
	struct entry *e = hash_lookup(mq, current_oblock);

	BUG_ON(!e || !e->in_cache);

	del(mq, e);
	e->oblock = new_oblock;
	e->dirty = true;
	push(mq, e);
}
static void mq_force_mapping(struct dm_cache_policy *p,
			     dm_oblock_t current_oblock, dm_oblock_t new_oblock)
{
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	force_mapping(mq, current_oblock, new_oblock);
	mutex_unlock(&mq->lock);
}
static dm_cblock_t mq_residency(struct dm_cache_policy *p)
{
	dm_cblock_t r;
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	r = to_cblock(mq->nr_cblocks_allocated);
	mutex_unlock(&mq->lock);

	return r;
}
static void mq_tick(struct dm_cache_policy *p)
{
	struct mq_policy *mq = to_mq_policy(p);
	unsigned long flags;

	spin_lock_irqsave(&mq->tick_lock, flags);
	mq->tick_protected++;
	spin_unlock_irqrestore(&mq->tick_lock, flags);
}
static int mq_set_config_value(struct dm_cache_policy *p,
			       const char *key, const char *value)
{
	struct mq_policy *mq = to_mq_policy(p);
	enum io_pattern pattern;
	unsigned long tmp;

	if (!strcasecmp(key, "random_threshold"))
		pattern = PATTERN_RANDOM;
	else if (!strcasecmp(key, "sequential_threshold"))
		pattern = PATTERN_SEQUENTIAL;
	else
		return -EINVAL;

	if (kstrtoul(value, 10, &tmp))
		return -EINVAL;

	mq->tracker.thresholds[pattern] = tmp;

	return 0;
}
static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
{
	ssize_t sz = 0;
	struct mq_policy *mq = to_mq_policy(p);

	DMEMIT("4 random_threshold %u sequential_threshold %u",
	       mq->tracker.thresholds[PATTERN_RANDOM],
	       mq->tracker.thresholds[PATTERN_SEQUENTIAL]);

	return 0;
}
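/*
 * With the default thresholds this emits, for example:
 *
 *     4 random_threshold 4 sequential_threshold 512
 *
 * ie. a count of the following fields, then two key/value pairs.
 */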
/* Init the policy plugin interface function pointers. */
static void init_policy_functions(struct mq_policy *mq)
{
	mq->policy.destroy = mq_destroy;
	mq->policy.map = mq_map;
	mq->policy.lookup = mq_lookup;
	mq->policy.set_dirty = mq_set_dirty;
	mq->policy.clear_dirty = mq_clear_dirty;
	mq->policy.load_mapping = mq_load_mapping;
	mq->policy.walk_mappings = mq_walk_mappings;
	mq->policy.remove_mapping = mq_remove_mapping;
	mq->policy.writeback_work = mq_writeback_work;
	mq->policy.force_mapping = mq_force_mapping;
	mq->policy.residency = mq_residency;
	mq->policy.tick = mq_tick;
	mq->policy.emit_config_values = mq_emit_config_values;
	mq->policy.set_config_value = mq_set_config_value;
}
static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
					 sector_t origin_size,
					 sector_t cache_block_size)
{
	int r;
	struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);

	if (!mq)
		return NULL;

	init_policy_functions(mq);
	iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT);

	mq->cache_size = cache_size;
	mq->tick_protected = 0;
	mq->tick = 0;
	mq->hit_count = 0;
	mq->generation = 0;
	mq->promote_threshold = 0;
	mutex_init(&mq->lock);
	spin_lock_init(&mq->tick_lock);
	mq->find_free_nr_words = dm_div_up(from_cblock(mq->cache_size), BITS_PER_LONG);
	mq->find_free_last_word = 0;

	queue_init(&mq->pre_cache);
	queue_init(&mq->cache_clean);
	queue_init(&mq->cache_dirty);

	mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);

	mq->nr_entries = 2 * from_cblock(cache_size);
	r = alloc_entries(mq, mq->nr_entries);
	if (r)
		goto bad_cache_alloc;

	mq->nr_entries_allocated = 0;
	mq->nr_cblocks_allocated = 0;

	mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
	mq->hash_bits = ffs(mq->nr_buckets) - 1;
	mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL);
	if (!mq->table)
		goto bad_alloc_table;

	mq->allocation_bitset = alloc_bitset(from_cblock(cache_size));
	if (!mq->allocation_bitset)
		goto bad_alloc_bitset;

	return &mq->policy;

bad_alloc_bitset:
	kfree(mq->table);
bad_alloc_table:
	free_entries(mq);
bad_cache_alloc:
	kfree(mq);

	return NULL;
}

/*----------------------------------------------------------------*/
static struct dm_cache_policy_type mq_policy_type = {
	.name = "mq",
	.version = {1, 0, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = mq_create
};

static struct dm_cache_policy_type default_policy_type = {
	.name = "default",
	.version = {1, 0, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = mq_create
};
static int __init mq_init(void)
{
	int r;

	mq_entry_cache = kmem_cache_create("dm_mq_policy_cache_entry",
					   sizeof(struct entry),
					   __alignof__(struct entry),
					   0, NULL);
	if (!mq_entry_cache)
		goto bad;

	r = dm_cache_policy_register(&mq_policy_type);
	if (r) {
		DMERR("register failed %d", r);
		goto bad_register_mq;
	}

	r = dm_cache_policy_register(&default_policy_type);
	if (!r) {
		DMINFO("version %u.%u.%u loaded",
		       mq_policy_type.version[0],
		       mq_policy_type.version[1],
		       mq_policy_type.version[2]);
		return 0;
	}

	DMERR("register failed (as default) %d", r);

	dm_cache_policy_unregister(&mq_policy_type);

bad_register_mq:
	kmem_cache_destroy(mq_entry_cache);
bad:
	return -ENOMEM;
}
static void __exit mq_exit(void)
{
	dm_cache_policy_unregister(&mq_policy_type);
	dm_cache_policy_unregister(&default_policy_type);

	kmem_cache_destroy(mq_entry_cache);
}

module_init(mq_init);
module_exit(mq_exit);
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("mq cache policy");

MODULE_ALIAS("dm-cache-default");