block/cfq-iosched.c

   1 /*
   2  *  CFQ, or complete fairness queueing, disk scheduler.
   3  *
   4  *  Based on ideas from a previously unfinished io
   5  *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
   6  *
   7  *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
   8  */
   9 #include <linux/module.h>
  10 #include <linux/blkdev.h>
  11 #include <linux/elevator.h>
  12 #include <linux/rbtree.h>
  13 #include <linux/ioprio.h>
  14 #include <linux/blktrace_api.h>
  15
  16 /*
  17  * tunables
  18  */
  19 /* max queue in one round of service */
  20 static const int cfq_quantum = 4;
  21 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
  22 /* maximum backwards seek, in KiB */
  23 static const int cfq_back_max = 16 * 1024;
  24 /* penalty of a backwards seek */
  25 static const int cfq_back_penalty = 2;
  26 static const int cfq_slice_sync = HZ / 10;
  27 static int cfq_slice_async = HZ / 25;
  28 static const int cfq_slice_async_rq = 2;
  29 static int cfq_slice_idle = HZ / 125;
  30 static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
  31 static const int cfq_hist_divisor = 4;
  32
  33 /*
  34  * offset from end of service tree
  35  */
  36 #define CFQ_IDLE_DELAY          (HZ / 5)
  37
  38 /*
  39  * below this threshold, we consider thinktime immediate
  40  */
  41 #define CFQ_MIN_TT              (2)
  42
  43 /*
  44  * Allow merged cfqqs to perform this amount of seeky I/O before
  45  * deciding to break the queues up again.
  46  */
  47 #define CFQQ_COOP_TOUT          (HZ)
  48
  49 #define CFQ_SLICE_SCALE         (5)
  50 #define CFQ_HW_QUEUE_MIN        (5)
  51
  52 #define RQ_CIC(rq)              \
  53         ((struct cfq_io_context *) (rq)->elevator_private)
  54 #define RQ_CFQQ(rq)             (struct cfq_queue *) ((rq)->elevator_private2)
  55
  56 static struct kmem_cache *cfq_pool;
  57 static struct kmem_cache *cfq_ioc_pool;
  58
  59 static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
  60 static struct completion *ioc_gone;
  61 static DEFINE_SPINLOCK(ioc_gone_lock);
  62
  63 #define CFQ_PRIO_LISTS          IOPRIO_BE_NR
  64 #define cfq_class_idle(cfqq)    ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
  65 #define cfq_class_rt(cfqq)      ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
  66
  67 #define sample_valid(samples)   ((samples) > 80)
  68
  69 /*
  70  * Most of our rbtree usage is for sorting with min extraction, so
  71  * if we cache the leftmost node we don't have to walk down the tree
  72  * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
  73  * move this into the elevator for the rq sorting as well.
  74  */
  75 struct cfq_rb_root {
  76         struct rb_root rb;
  77         struct rb_node *left;
  78         unsigned count;
  79 };
  80 #define CFQ_RB_ROOT     (struct cfq_rb_root) { RB_ROOT, NULL, 0, }
  81
  82 /*
  83  * Per process-grouping structure
  84  */
  85 struct cfq_queue {
  86         /* reference count */
  87         atomic_t ref;
  88         /* various state flags, see below */
  89         unsigned int flags;
  90         /* parent cfq_data */
  91         struct cfq_data *cfqd;
  92         /* service_tree member */
  93         struct rb_node rb_node;
  94         /* service_tree key */
  95         unsigned long rb_key;
  96         /* prio tree member */
  97         struct rb_node p_node;
  98         /* prio tree root we belong to, if any */
  99         struct rb_root *p_root;
 100         /* sorted list of pending requests */
 101         struct rb_root sort_list;
 102         /* if fifo isn't expired, next request to serve */
 103         struct request *next_rq;
 104         /* requests queued in sort_list */
 105         int queued[2];
 106         /* currently allocated requests */
 107         int allocated[2];
 108         /* fifo list of requests in sort_list */
 109         struct list_head fifo;
 110
 111         unsigned long slice_end;
 112         long slice_resid;
 113         unsigned int slice_dispatch;
 114
 115         /* pending metadata requests */
 116         int meta_pending;
 117         /* number of requests that are on the dispatch list or inside driver */
 118         int dispatched;
 119
 120         /* io prio of this group */
 121         unsigned short ioprio, org_ioprio;
 122         unsigned short ioprio_class, org_ioprio_class;
 123
 124         unsigned int seek_samples;
 125         u64 seek_total;
 126         sector_t seek_mean;
 127         sector_t last_request_pos;
 128         unsigned long seeky_start;
 129
 130         pid_t pid;
 131
 132         struct cfq_rb_root *service_tree;
 133         struct cfq_queue *new_cfqq;
 134 };
 135
 136 /*
 137  * First index in the service_trees.
 138  * IDLE is handled separately, so it has negative index
 139  */
 140 enum wl_prio_t {
 141         IDLE_WORKLOAD = -1,
 142         BE_WORKLOAD = 0,
 143         RT_WORKLOAD = 1
 144 };
 145
 146 /*
 147  * Second index in the service_trees.
 148  */
 149 enum wl_type_t {
 150         ASYNC_WORKLOAD = 0,
 151         SYNC_NOIDLE_WORKLOAD = 1,
 152         SYNC_WORKLOAD = 2
 153 };
 154
 155
 156 /*
 157  * Per block device queue structure
 158  */
 159 struct cfq_data {
 160         struct request_queue *queue;
 161
 162         /*
 163          * rr lists of queues with requests, onle rr for each priority class.
 164          * Counts are embedded in the cfq_rb_root
 165          */
 166         struct cfq_rb_root service_trees[2][3];
 167         struct cfq_rb_root service_tree_idle;
 168         /*
 169          * The priority currently being served
 170          */
 171         enum wl_prio_t serving_prio;
 172         enum wl_type_t serving_type;
 173         unsigned long workload_expires;
 174
 175         /*
 176          * Each priority tree is sorted by next_request position.  These
 177          * trees are used when determining if two or more queues are
 178          * interleaving requests (see cfq_close_cooperator).
 179          */
 180         struct rb_root prio_trees[CFQ_PRIO_LISTS];
 181
 182         unsigned int busy_queues;
 183         unsigned int busy_queues_avg[2];
 184
 185         int rq_in_driver[2];
 186         int sync_flight;
 187
 188         /*
 189          * queue-depth detection
 190          */
 191         int rq_queued;
 192         int hw_tag;
 193         int hw_tag_samples;
 194         int rq_in_driver_peak;
 195
 196         /*
 197          * idle window management
 198          */
 199         struct timer_list idle_slice_timer;
 200         struct work_struct unplug_work;
 201
 202         struct cfq_queue *active_queue;
 203         struct cfq_io_context *active_cic;
 204
 205         /*
 206          * async queue for each priority case
 207          */
 208         struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
 209         struct cfq_queue *async_idle_cfqq;
 210
 211         sector_t last_position;
 212
 213         /*
 214          * tunables, see top of file
 215          */
 216         unsigned int cfq_quantum;
 217         unsigned int cfq_fifo_expire[2];
 218         unsigned int cfq_back_penalty;
 219         unsigned int cfq_back_max;
 220         unsigned int cfq_slice[2];
 221         unsigned int cfq_slice_async_rq;
 222         unsigned int cfq_slice_idle;
 223         unsigned int cfq_latency;
 224
 225         struct list_head cic_list;
 226
 227         /*
 228          * Fallback dummy cfqq for extreme OOM conditions
 229          */
 230         struct cfq_queue oom_cfqq;
 231
 232         unsigned long last_end_sync_rq;
 233 };
 234
 235 static struct cfq_rb_root *service_tree_for(enum wl_prio_t prio,
 236                                             enum wl_type_t type,
 237                                             struct cfq_data *cfqd)
 238 {
 239         if (prio == IDLE_WORKLOAD)
 240                 return &cfqd->service_tree_idle;
 241
 242         return &cfqd->service_trees[prio][type];
 243 }
 244
 245 enum cfqq_state_flags {
 246         CFQ_CFQQ_FLAG_on_rr = 0,        /* on round-robin busy list */
 247         CFQ_CFQQ_FLAG_wait_request,     /* waiting for a request */
 248         CFQ_CFQQ_FLAG_must_dispatch,    /* must be allowed a dispatch */
 249         CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
 250         CFQ_CFQQ_FLAG_fifo_expire,      /* FIFO checked in this slice */
 251         CFQ_CFQQ_FLAG_idle_window,      /* slice idling enabled */
 252         CFQ_CFQQ_FLAG_prio_changed,     /* task priority has changed */
 253         CFQ_CFQQ_FLAG_slice_new,        /* no requests dispatched in slice */
 254         CFQ_CFQQ_FLAG_sync,             /* synchronous queue */
 255         CFQ_CFQQ_FLAG_coop,             /* cfqq is shared */
 256         CFQ_CFQQ_FLAG_coop_preempt,     /* coop preempt */
 257 };
 258
 259 #define CFQ_CFQQ_FNS(name)                                              \
 260 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)         \
 261 {                                                                       \
 262         (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);                   \
 263 }                                                                       \
 264 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)        \
 265 {                                                                       \
 266         (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);                  \
 267 }                                                                       \
 268 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)         \
 269 {                                                                       \
 270         return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;      \
 271 }
 272
 273 CFQ_CFQQ_FNS(on_rr);
 274 CFQ_CFQQ_FNS(wait_request);
 275 CFQ_CFQQ_FNS(must_dispatch);
 276 CFQ_CFQQ_FNS(must_alloc_slice);
 277 CFQ_CFQQ_FNS(fifo_expire);
 278 CFQ_CFQQ_FNS(idle_window);
 279 CFQ_CFQQ_FNS(prio_changed);
 280 CFQ_CFQQ_FNS(slice_new);
 281 CFQ_CFQQ_FNS(sync);
 282 CFQ_CFQQ_FNS(coop);
 283 CFQ_CFQQ_FNS(coop_preempt);
 284 #undef CFQ_CFQQ_FNS
 285
 286 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)  \
 287         blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
 288 #define cfq_log(cfqd, fmt, args...)     \
 289         blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
 290
 291 static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
 292 {
 293         if (cfq_class_idle(cfqq))
 294                 return IDLE_WORKLOAD;
 295         if (cfq_class_rt(cfqq))
 296                 return RT_WORKLOAD;
 297         return BE_WORKLOAD;
 298 }
 299
 300
 301 static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
 302 {
 303         if (!cfq_cfqq_sync(cfqq))
 304                 return ASYNC_WORKLOAD;
 305         if (!cfq_cfqq_idle_window(cfqq))
 306                 return SYNC_NOIDLE_WORKLOAD;
 307         return SYNC_WORKLOAD;
 308 }
 309
 310 static inline int cfq_busy_queues_wl(enum wl_prio_t wl, struct cfq_data *cfqd)
 311 {
 312         if (wl == IDLE_WORKLOAD)
 313                 return cfqd->service_tree_idle.count;
 314
 315         return cfqd->service_trees[wl][ASYNC_WORKLOAD].count
 316                 + cfqd->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
 317                 + cfqd->service_trees[wl][SYNC_WORKLOAD].count;
 318 }
 319
 320 static void cfq_dispatch_insert(struct request_queue *, struct request *);
 321 static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
 322                                        struct io_context *, gfp_t);
 323 static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
 324                                                 struct io_context *);
 325
 326 static inline int rq_in_driver(struct cfq_data *cfqd)
 327 {
 328         return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1];
 329 }
 330
 331 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
 332                                             bool is_sync)
 333 {
 334         return cic->cfqq[is_sync];
 335 }
 336
 337 static inline void cic_set_cfqq(struct cfq_io_context *cic,
 338                                 struct cfq_queue *cfqq, bool is_sync)
 339 {
 340         cic->cfqq[is_sync] = cfqq;
 341 }
 342
 343 /*
 344  * We regard a request as SYNC, if it's either a read or has the SYNC bit
 345  * set (in which case it could also be direct WRITE).
 346  */
 347 static inline bool cfq_bio_sync(struct bio *bio)
 348 {
 349         return bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO);
 350 }
 351
 352 /*
 353  * scheduler run of queue, if there are requests pending and no one in the
 354  * driver that will restart queueing
 355  */
 356 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
 357 {
 358         if (cfqd->busy_queues) {
 359                 cfq_log(cfqd, "schedule dispatch");
 360                 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
 361         }
 362 }
 363
 364 static int cfq_queue_empty(struct request_queue *q)
 365 {
 366         struct cfq_data *cfqd = q->elevator->elevator_data;
 367
 368         return !cfqd->busy_queues;
 369 }
 370
 371 /*
 372  * Scale schedule slice based on io priority. Use the sync time slice only
 373  * if a queue is marked sync and has sync io queued. A sync queue with async
 374  * io only, should not get full sync slice length.
 375  */
 376 static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
 377                                  unsigned short prio)
 378 {
 379         const int base_slice = cfqd->cfq_slice[sync];
 380
 381         WARN_ON(prio >= IOPRIO_BE_NR);
 382
 383         return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
 384 }
 385
 386 static inline int
 387 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 388 {
 389         return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
 390 }
 391
 392 /*
 393  * get averaged number of queues of RT/BE priority.
 394  * average is updated, with a formula that gives more weight to higher numbers,
 395  * to quickly follows sudden increases and decrease slowly
 396  */
 397
 398 static inline unsigned cfq_get_avg_queues(struct cfq_data *cfqd, bool rt)
 399 {
 400         unsigned min_q, max_q;
 401         unsigned mult  = cfq_hist_divisor - 1;
 402         unsigned round = cfq_hist_divisor / 2;
 403         unsigned busy = cfq_busy_queues_wl(rt, cfqd);
 404
 405         min_q = min(cfqd->busy_queues_avg[rt], busy);
 406         max_q = max(cfqd->busy_queues_avg[rt], busy);
 407         cfqd->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
 408                 cfq_hist_divisor;
 409         return cfqd->busy_queues_avg[rt];
 410 }
 411
 412 static inline void
 413 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 414 {
 415         unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
 416         if (cfqd->cfq_latency) {
 417                 /* interested queues (we consider only the ones with the same
 418                  * priority class) */
 419                 unsigned iq = cfq_get_avg_queues(cfqd, cfq_class_rt(cfqq));
 420                 unsigned sync_slice = cfqd->cfq_slice[1];
 421                 unsigned expect_latency = sync_slice * iq;
 422                 if (expect_latency > cfq_target_latency) {
 423                         unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
 424                         /* scale low_slice according to IO priority
 425                          * and sync vs async */
 426                         unsigned low_slice =
 427                                 min(slice, base_low_slice * slice / sync_slice);
 428                         /* the adapted slice value is scaled to fit all iqs
 429                          * into the target latency */
 430                         slice = max(slice * cfq_target_latency / expect_latency,
 431                                     low_slice);
 432                 }
 433         }
 434         cfqq->slice_end = jiffies + slice;
 435         cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
 436 }
 437
 438 /*
 439  * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
 440  * isn't valid until the first request from the dispatch is activated
 441  * and the slice time set.
 442  */
 443 static inline bool cfq_slice_used(struct cfq_queue *cfqq)
 444 {
 445         if (cfq_cfqq_slice_new(cfqq))
 446                 return 0;
 447         if (time_before(jiffies, cfqq->slice_end))
 448                 return 0;
 449
 450         return 1;
 451 }
 452
 453 /*
 454  * Lifted from AS - choose which of rq1 and rq2 that is best served now.
 455  * We choose the request that is closest to the head right now. Distance
 456  * behind the head is penalized and only allowed to a certain extent.
 457  */
 458 static struct request *
 459 cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
 460 {
 461         sector_t last, s1, s2, d1 = 0, d2 = 0;
 462         unsigned long back_max;
 463 #define CFQ_RQ1_WRAP    0x01 /* request 1 wraps */
 464 #define CFQ_RQ2_WRAP    0x02 /* request 2 wraps */
 465         unsigned wrap = 0; /* bit mask: requests behind the disk head? */
 466
 467         if (rq1 == NULL || rq1 == rq2)
 468                 return rq2;
 469         if (rq2 == NULL)
 470                 return rq1;
 471
 472         if (rq_is_sync(rq1) && !rq_is_sync(rq2))
 473                 return rq1;
 474         else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
 475                 return rq2;
 476         if (rq_is_meta(rq1) && !rq_is_meta(rq2))
 477                 return rq1;
 478         else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
 479                 return rq2;
 480
 481         s1 = blk_rq_pos(rq1);
 482         s2 = blk_rq_pos(rq2);
 483
 484         last = cfqd->last_position;
 485
 486         /*
 487          * by definition, 1KiB is 2 sectors
 488          */
 489         back_max = cfqd->cfq_back_max * 2;
 490
 491         /*
 492          * Strict one way elevator _except_ in the case where we allow
 493          * short backward seeks which are biased as twice the cost of a
 494          * similar forward seek.
 495          */
 496         if (s1 >= last)
 497                 d1 = s1 - last;
 498         else if (s1 + back_max >= last)
 499                 d1 = (last - s1) * cfqd->cfq_back_penalty;
 500         else
 501                 wrap |= CFQ_RQ1_WRAP;
 502
 503         if (s2 >= last)
 504                 d2 = s2 - last;
 505         else if (s2 + back_max >= last)
 506                 d2 = (last - s2) * cfqd->cfq_back_penalty;
 507         else
 508                 wrap |= CFQ_RQ2_WRAP;
 509
 510         /* Found required data */
 511
 512         /*
 513          * By doing switch() on the bit mask "wrap" we avoid having to
 514          * check two variables for all permutations: --> faster!
 515          */
 516         switch (wrap) {
 517         case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
 518                 if (d1 < d2)
 519                         return rq1;
 520                 else if (d2 < d1)
 521                         return rq2;
 522                 else {
 523                         if (s1 >= s2)
 524                                 return rq1;
 525                         else
 526                                 return rq2;
 527                 }
 528
 529         case CFQ_RQ2_WRAP:
 530                 return rq1;
 531         case CFQ_RQ1_WRAP:
 532                 return rq2;
 533         case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
 534         default:
 535                 /*
 536                  * Since both rqs are wrapped,
 537                  * start with the one that's further behind head
 538                  * (--> only *one* back seek required),
 539                  * since back seek takes more time than forward.
 540                  */
 541                 if (s1 <= s2)
 542                         return rq1;
 543                 else
 544                         return rq2;
 545         }
 546 }
 547
 548 /*
 549  * The below is leftmost cache rbtree addon
 550  */
 551 static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
 552 {
 553         if (!root->left)
 554                 root->left = rb_first(&root->rb);
 555
 556         if (root->left)
 557                 return rb_entry(root->left, struct cfq_queue, rb_node);
 558
 559         return NULL;
 560 }
 561
 562 static void rb_erase_init(struct rb_node *n, struct rb_root *root)
 563 {
 564         rb_erase(n, root);
 565         RB_CLEAR_NODE(n);
 566 }
 567
 568 static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
 569 {
 570         if (root->left == n)
 571                 root->left = NULL;
 572         rb_erase_init(n, &root->rb);
 573         --root->count;
 574 }
 575
 576 /*
 577  * would be nice to take fifo expire time into account as well
 578  */
 579 static struct request *
 580 cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 581                   struct request *last)
 582 {
 583         struct rb_node *rbnext = rb_next(&last->rb_node);
 584         struct rb_node *rbprev = rb_prev(&last->rb_node);
 585         struct request *next = NULL, *prev = NULL;
 586
 587         BUG_ON(RB_EMPTY_NODE(&last->rb_node));
 588
 589         if (rbprev)
 590                 prev = rb_entry_rq(rbprev);
 591
 592         if (rbnext)
 593                 next = rb_entry_rq(rbnext);
 594         else {
 595                 rbnext = rb_first(&cfqq->sort_list);
 596                 if (rbnext && rbnext != &last->rb_node)
 597                         next = rb_entry_rq(rbnext);
 598         }
 599
 600         return cfq_choose_req(cfqd, next, prev);
 601 }
 602
 603 static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
 604                                       struct cfq_queue *cfqq)
 605 {
 606         /*
 607          * just an approximation, should be ok.
 608          */
 609         return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) -
 610                        cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
 611 }
 612
 613 /*
 614  * The cfqd->service_trees holds all pending cfq_queue's that have
 615  * requests waiting to be processed. It is sorted in the order that
 616  * we will service the queues.
 617  */
 618 static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 619                                  bool add_front)
 620 {
 621         struct rb_node **p, *parent;
 622         struct cfq_queue *__cfqq;
 623         unsigned long rb_key;
 624         struct cfq_rb_root *service_tree;
 625         int left;
 626
 627         service_tree = service_tree_for(cfqq_prio(cfqq), cfqq_type(cfqq), cfqd);
 628         if (cfq_class_idle(cfqq)) {
 629                 rb_key = CFQ_IDLE_DELAY;
 630                 parent = rb_last(&service_tree->rb);
 631                 if (parent && parent != &cfqq->rb_node) {
 632                         __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
 633                         rb_key += __cfqq->rb_key;
 634                 } else
 635                         rb_key += jiffies;
 636         } else if (!add_front) {
 637                 /*
 638                  * Get our rb key offset. Subtract any residual slice
 639                  * value carried from last service. A negative resid
 640                  * count indicates slice overrun, and this should position
 641                  * the next service time further away in the tree.
 642                  */
 643                 rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
 644                 rb_key -= cfqq->slice_resid;
 645                 cfqq->slice_resid = 0;
 646         } else {
 647                 rb_key = -HZ;
 648                 __cfqq = cfq_rb_first(service_tree);
 649                 rb_key += __cfqq ? __cfqq->rb_key : jiffies;
 650         }
 651
 652         if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
 653                 /*
 654                  * same position, nothing more to do
 655                  */
 656                 if (rb_key == cfqq->rb_key &&
 657                     cfqq->service_tree == service_tree)
 658                         return;
 659
 660                 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
 661                 cfqq->service_tree = NULL;
 662         }
 663
 664         left = 1;
 665         parent = NULL;
 666         cfqq->service_tree = service_tree;
 667         p = &service_tree->rb.rb_node;
 668         while (*p) {
 669                 struct rb_node **n;
 670
 671                 parent = *p;
 672                 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
 673
 674                 /*
 675                  * sort by key, that represents service time.
 676                  */
 677                 if (time_before(rb_key, __cfqq->rb_key))
 678                         n = &(*p)->rb_left;
 679                 else {
 680                         n = &(*p)->rb_right;
 681                         left = 0;
 682                 }
 683
 684                 p = n;
 685         }
 686
 687         if (left)
 688                 service_tree->left = &cfqq->rb_node;
 689
 690         cfqq->rb_key = rb_key;
 691         rb_link_node(&cfqq->rb_node, parent, p);
 692         rb_insert_color(&cfqq->rb_node, &service_tree->rb);
 693         service_tree->count++;
 694 }
 695
 696 static struct cfq_queue *
 697 cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
 698                      sector_t sector, struct rb_node **ret_parent,
 699                      struct rb_node ***rb_link)
 700 {
 701         struct rb_node **p, *parent;
 702         struct cfq_queue *cfqq = NULL;
 703
 704         parent = NULL;
 705         p = &root->rb_node;
 706         while (*p) {
 707                 struct rb_node **n;
 708
 709                 parent = *p;
 710                 cfqq = rb_entry(parent, struct cfq_queue, p_node);
 711
 712                 /*
 713                  * Sort strictly based on sector.  Smallest to the left,
 714                  * largest to the right.
 715                  */
 716                 if (sector > blk_rq_pos(cfqq->next_rq))
 717                         n = &(*p)->rb_right;
 718                 else if (sector < blk_rq_pos(cfqq->next_rq))
 719                         n = &(*p)->rb_left;
 720                 else
 721                         break;
 722                 p = n;
 723                 cfqq = NULL;
 724         }
 725
 726         *ret_parent = parent;
 727         if (rb_link)
 728                 *rb_link = p;
 729         return cfqq;
 730 }
 731
 732 static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 733 {
 734         struct rb_node **p, *parent;
 735         struct cfq_queue *__cfqq;
 736
 737         if (cfqq->p_root) {
 738                 rb_erase(&cfqq->p_node, cfqq->p_root);
 739                 cfqq->p_root = NULL;
 740         }
 741
 742         if (cfq_class_idle(cfqq))
 743                 return;
 744         if (!cfqq->next_rq)
 745                 return;
 746
 747         cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
 748         __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
 749                                       blk_rq_pos(cfqq->next_rq), &parent, &p);
 750         if (!__cfqq) {
 751                 rb_link_node(&cfqq->p_node, parent, p);
 752                 rb_insert_color(&cfqq->p_node, cfqq->p_root);
 753         } else
 754                 cfqq->p_root = NULL;
 755 }
 756
 757 /*
 758  * Update cfqq's position in the service tree.
 759  */
 760 static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 761 {
 762         /*
 763          * Resorting requires the cfqq to be on the RR list already.
 764          */
 765         if (cfq_cfqq_on_rr(cfqq)) {
 766                 cfq_service_tree_add(cfqd, cfqq, 0);
 767                 cfq_prio_tree_add(cfqd, cfqq);
 768         }
 769 }
 770
 771 /*
 772  * add to busy list of queues for service, trying to be fair in ordering
 773  * the pending list according to last request service
 774  */
 775 static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 776 {
 777         cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
 778         BUG_ON(cfq_cfqq_on_rr(cfqq));
 779         cfq_mark_cfqq_on_rr(cfqq);
 780         cfqd->busy_queues++;
 781
 782         cfq_resort_rr_list(cfqd, cfqq);
 783 }
 784
 785 /*
 786  * Called when the cfqq no longer has requests pending, remove it from
 787  * the service tree.
 788  */
 789 static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 790 {
 791         cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
 792         BUG_ON(!cfq_cfqq_on_rr(cfqq));
 793         cfq_clear_cfqq_on_rr(cfqq);
 794
 795         if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
 796                 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
 797                 cfqq->service_tree = NULL;
 798         }
 799         if (cfqq->p_root) {
 800                 rb_erase(&cfqq->p_node, cfqq->p_root);
 801                 cfqq->p_root = NULL;
 802         }
 803
 804         BUG_ON(!cfqd->busy_queues);
 805         cfqd->busy_queues--;
 806 }
 807
 808 /*
 809  * rb tree support functions
 810  */
 811 static void cfq_del_rq_rb(struct request *rq)
 812 {
 813         struct cfq_queue *cfqq = RQ_CFQQ(rq);
 814         struct cfq_data *cfqd = cfqq->cfqd;
 815         const int sync = rq_is_sync(rq);
 816
 817         BUG_ON(!cfqq->queued[sync]);
 818         cfqq->queued[sync]--;
 819
 820         elv_rb_del(&cfqq->sort_list, rq);
 821
 822         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
 823                 cfq_del_cfqq_rr(cfqd, cfqq);
 824 }
 825
 826 static void cfq_add_rq_rb(struct request *rq)
 827 {
 828         struct cfq_queue *cfqq = RQ_CFQQ(rq);
 829         struct cfq_data *cfqd = cfqq->cfqd;
 830         struct request *__alias, *prev;
 831
 832         cfqq->queued[rq_is_sync(rq)]++;
 833
 834         /*
 835          * looks a little odd, but the first insert might return an alias.
 836          * if that happens, put the alias on the dispatch list
 837          */
 838         while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
 839                 cfq_dispatch_insert(cfqd->queue, __alias);
 840
 841         if (!cfq_cfqq_on_rr(cfqq))
 842                 cfq_add_cfqq_rr(cfqd, cfqq);
 843
 844         /*
 845          * check if this request is a better next-serve candidate
 846          */
 847         prev = cfqq->next_rq;
 848         cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);
 849
 850         /*
 851          * adjust priority tree position, if ->next_rq changes
 852          */
 853         if (prev != cfqq->next_rq)
 854                 cfq_prio_tree_add(cfqd, cfqq);
 855
 856         BUG_ON(!cfqq->next_rq);
 857 }
 858
 859 static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
 860 {
 861         elv_rb_del(&cfqq->sort_list, rq);
 862         cfqq->queued[rq_is_sync(rq)]--;
 863         cfq_add_rq_rb(rq);
 864 }
 865
 866 static struct request *
 867 cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
 868 {
 869         struct task_struct *tsk = current;
 870         struct cfq_io_context *cic;
 871         struct cfq_queue *cfqq;
 872
 873         cic = cfq_cic_lookup(cfqd, tsk->io_context);
 874         if (!cic)
 875                 return NULL;
 876
 877         cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
 878         if (cfqq) {
 879                 sector_t sector = bio->bi_sector + bio_sectors(bio);
 880
 881                 return elv_rb_find(&cfqq->sort_list, sector);
 882         }
 883
 884         return NULL;
 885 }
 886
 887 static void cfq_activate_request(struct request_queue *q, struct request *rq)
 888 {
 889         struct cfq_data *cfqd = q->elevator->elevator_data;
 890
 891         cfqd->rq_in_driver[rq_is_sync(rq)]++;
 892         cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
 893                                                 rq_in_driver(cfqd));
 894
 895         cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
 896 }
 897
 898 static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
 899 {
 900         struct cfq_data *cfqd = q->elevator->elevator_data;
 901         const int sync = rq_is_sync(rq);
 902
 903         WARN_ON(!cfqd->rq_in_driver[sync]);
 904         cfqd->rq_in_driver[sync]--;
 905         cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
 906                                                 rq_in_driver(cfqd));
 907 }
 908
 909 static void cfq_remove_request(struct request *rq)
 910 {
 911         struct cfq_queue *cfqq = RQ_CFQQ(rq);
 912
 913         if (cfqq->next_rq == rq)
 914                 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
 915
 916         list_del_init(&rq->queuelist);
 917         cfq_del_rq_rb(rq);
 918
 919         cfqq->cfqd->rq_queued--;
 920         if (rq_is_meta(rq)) {
 921                 WARN_ON(!cfqq->meta_pending);
 922                 cfqq->meta_pending--;
 923         }
 924 }
 925
 926 static int cfq_merge(struct request_queue *q, struct request **req,
 927                      struct bio *bio)
 928 {
 929         struct cfq_data *cfqd = q->elevator->elevator_data;
 930         struct request *__rq;
 931
 932         __rq = cfq_find_rq_fmerge(cfqd, bio);
 933         if (__rq && elv_rq_merge_ok(__rq, bio)) {
 934                 *req = __rq;
 935                 return ELEVATOR_FRONT_MERGE;
 936         }
 937
 938         return ELEVATOR_NO_MERGE;
 939 }
 940
 941 static void cfq_merged_request(struct request_queue *q, struct request *req,
 942                                int type)
 943 {
 944         if (type == ELEVATOR_FRONT_MERGE) {
 945                 struct cfq_queue *cfqq = RQ_CFQQ(req);
 946
 947                 cfq_reposition_rq_rb(cfqq, req);
 948         }
 949 }
 950
 951 static void
 952 cfq_merged_requests(struct request_queue *q, struct request *rq,
 953                     struct request *next)
 954 {
 955         /*
 956          * reposition in fifo if next is older than rq
 957          */
 958         if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
 959             time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
 960                 list_move(&rq->queuelist, &next->queuelist);
 961                 rq_set_fifo_time(rq, rq_fifo_time(next));
 962         }
 963
 964         cfq_remove_request(next);
 965 }
 966
 967 static int cfq_allow_merge(struct request_queue *q, struct request *rq,
 968                            struct bio *bio)
 969 {
 970         struct cfq_data *cfqd = q->elevator->elevator_data;
 971         struct cfq_io_context *cic;
 972         struct cfq_queue *cfqq;
 973
 974         /*
 975          * Disallow merge of a sync bio into an async request.
 976          */
 977         if (cfq_bio_sync(bio) && !rq_is_sync(rq))
 978                 return false;
 979
 980         /*
 981          * Lookup the cfqq that this bio will be queued with. Allow
 982          * merge only if rq is queued there.
 983          */
 984         cic = cfq_cic_lookup(cfqd, current->io_context);
 985         if (!cic)
 986                 return false;
 987
 988         cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
 989         return cfqq == RQ_CFQQ(rq);
 990 }
 991
 992 static void __cfq_set_active_queue(struct cfq_data *cfqd,
 993                                    struct cfq_queue *cfqq)
 994 {
 995         if (cfqq) {
 996                 cfq_log_cfqq(cfqd, cfqq, "set_active");
 997                 cfqq->slice_end = 0;
 998                 cfqq->slice_dispatch = 0;
 999
1000                 cfq_clear_cfqq_wait_request(cfqq);
1001                 cfq_clear_cfqq_must_dispatch(cfqq);
1002                 cfq_clear_cfqq_must_alloc_slice(cfqq);
1003                 cfq_clear_cfqq_fifo_expire(cfqq);
1004                 cfq_mark_cfqq_slice_new(cfqq);
1005
1006                 del_timer(&cfqd->idle_slice_timer);
1007         }
1008
1009         cfqd->active_queue = cfqq;
1010 }
1011
1012 /*
1013  * current cfqq expired its slice (or was too idle), select new one
1014  */
1015 static void
1016 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1017                     bool timed_out)
1018 {
1019         cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
1020
1021         if (cfq_cfqq_wait_request(cfqq))
1022                 del_timer(&cfqd->idle_slice_timer);
1023
1024         cfq_clear_cfqq_wait_request(cfqq);
1025
1026         /*
1027          * store what was left of this slice, if the queue idled/timed out
1028          */
1029         if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
1030                 cfqq->slice_resid = cfqq->slice_end - jiffies;
1031                 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
1032         }
1033
1034         cfq_resort_rr_list(cfqd, cfqq);
1035
1036         if (cfqq == cfqd->active_queue)
1037                 cfqd->active_queue = NULL;
1038
1039         if (cfqd->active_cic) {
1040                 put_io_context(cfqd->active_cic->ioc);
1041                 cfqd->active_cic = NULL;
1042         }
1043 }
1044
1045 static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
1046 {
1047         struct cfq_queue *cfqq = cfqd->active_queue;
1048
1049         if (cfqq)
1050                 __cfq_slice_expired(cfqd, cfqq, timed_out);
1051 }
1052
1053 /*
1054  * Get next queue for service. Unless we have a queue preemption,
1055  * we'll simply select the first cfqq in the service tree.
1056  */
1057 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
1058 {
1059         struct cfq_rb_root *service_tree =
1060                 service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd);
1061
1062         if (RB_EMPTY_ROOT(&service_tree->rb))
1063                 return NULL;
1064         return cfq_rb_first(service_tree);
1065 }
1066
/*
 * Make a queue active and return it.  When the caller passes no queue, the
 * next one from the service tree is used, and its coop flag is cleared
 * unless it was marked coop_preempt.  The coop_preempt flag itself is
 * always cleared on the queue that ends up selected.
 */
static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
                                              struct cfq_queue *cfqq)
{
        struct cfq_queue *next = cfqq;

        if (!next) {
                next = cfq_get_next_queue(cfqd);

                if (next && !cfq_cfqq_coop_preempt(next))
                        cfq_clear_cfqq_coop(next);
        }

        if (next)
                cfq_clear_cfqq_coop_preempt(next);

        __cfq_set_active_queue(cfqd, next);
        return next;
}
1086
1087 static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
1088                                           struct request *rq)
1089 {
1090         if (blk_rq_pos(rq) >= cfqd->last_position)
1091                 return blk_rq_pos(rq) - cfqd->last_position;
1092         else
1093                 return cfqd->last_position - blk_rq_pos(rq);
1094 }
1095
/*
 * Mean seek distance above which a queue counts as seeky.  It is compared
 * against ->seek_mean and used as the fallback distance in cfq_rq_close().
 * The expansion is parenthesized so the macro behaves as a single value in
 * any expression (e.g. as a divisor or multiplicand).
 */
#define CFQQ_SEEK_THR           (8 * 1024)
#define CFQQ_SEEKY(cfqq)        ((cfqq)->seek_mean > CFQQ_SEEK_THR)
1098
1099 static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1100                                struct request *rq)
1101 {
1102         sector_t sdist = cfqq->seek_mean;
1103
1104         if (!sample_valid(cfqq->seek_samples))
1105                 sdist = CFQQ_SEEK_THR;
1106
1107         return cfq_dist_from_last(cfqd, rq) <= sdist;
1108 }
1109
1110 static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1111                                     struct cfq_queue *cur_cfqq)
1112 {
1113         struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
1114         struct rb_node *parent, *node;
1115         struct cfq_queue *__cfqq;
1116         sector_t sector = cfqd->last_position;
1117
1118         if (RB_EMPTY_ROOT(root))
1119                 return NULL;
1120
1121         /*
1122          * First, if we find a request starting at the end of the last
1123          * request, choose it.
1124          */
1125         __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
1126         if (__cfqq)
1127                 return __cfqq;
1128
1129         /*
1130          * If the exact sector wasn't found, the parent of the NULL leaf
1131          * will contain the closest sector.
1132          */
1133         __cfqq = rb_entry(parent, struct cfq_queue, p_node);
1134         if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1135                 return __cfqq;
1136
1137         if (blk_rq_pos(__cfqq->next_rq) < sector)
1138                 node = rb_next(&__cfqq->p_node);
1139         else
1140                 node = rb_prev(&__cfqq->p_node);
1141         if (!node)
1142                 return NULL;
1143
1144         __cfqq = rb_entry(node, struct cfq_queue, p_node);
1145         if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1146                 return __cfqq;
1147
1148         return NULL;
1149 }
1150
1151 /*
1152  * cfqd - obvious
1153  * cur_cfqq - passed in so that we don't decide that the current queue is
1154  *            closely cooperating with itself.
1155  *
1156  * So, basically we're assuming that that cur_cfqq has dispatched at least
1157  * one request, and that cfqd->last_position reflects a position on the disk
1158  * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
1159  * assumption.
1160  */
1161 static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
1162                                               struct cfq_queue *cur_cfqq)
1163 {
1164         struct cfq_queue *cfqq;
1165
1166         if (!cfq_cfqq_sync(cur_cfqq))
1167                 return NULL;
1168         if (CFQQ_SEEKY(cur_cfqq))
1169                 return NULL;
1170
1171         /*
1172          * We should notice if some of the queues are cooperating, eg
1173          * working closely on the same area of the disk. In that case,
1174          * we can group them together and don't waste time idling.
1175          */
1176         cfqq = cfqq_close(cfqd, cur_cfqq);
1177         if (!cfqq)
1178                 return NULL;
1179
1180         /*
1181          * It only makes sense to merge sync queues.
1182          */
1183         if (!cfq_cfqq_sync(cfqq))
1184                 return NULL;
1185         if (CFQQ_SEEKY(cfqq))
1186                 return NULL;
1187
1188         /*
1189          * Do not merge queues of different priority classes
1190          */
1191         if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
1192                 return NULL;
1193
1194         return cfqq;
1195 }
1196
1197 /*
1198  * Determine whether we should enforce idle window for this queue.
1199  */
1200
1201 static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1202 {
1203         enum wl_prio_t prio = cfqq_prio(cfqq);
1204         struct cfq_rb_root *service_tree = cfqq->service_tree;
1205
1206         /* We never do for idle class queues. */
1207         if (prio == IDLE_WORKLOAD)
1208                 return false;
1209
1210         /* We do for queues that were marked with idle window flag. */
1211         if (cfq_cfqq_idle_window(cfqq))
1212                 return true;
1213
1214         /*
1215          * Otherwise, we do only if they are the last ones
1216          * in their service tree.
1217          */
1218         if (!service_tree)
1219                 service_tree = service_tree_for(prio, cfqq_type(cfqq), cfqd);
1220
1221         if (service_tree->count == 0)
1222                 return true;
1223
1224         return (service_tree->count == 1 && cfq_rb_first(service_tree) == cfqq);
1225 }
1226
1227 static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1228 {
1229         struct cfq_queue *cfqq = cfqd->active_queue;
1230         struct cfq_io_context *cic;
1231         unsigned long sl;
1232
1233         /*
1234          * SSD device without seek penalty, disable idling. But only do so
1235          * for devices that support queuing, otherwise we still have a problem
1236          * with sync vs async workloads.
1237          */
1238         if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
1239                 return;
1240
1241         WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
1242         WARN_ON(cfq_cfqq_slice_new(cfqq));
1243
1244         /*
1245          * idle is disabled, either manually or by past process history
1246          */
1247         if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq))
1248                 return;
1249
1250         /*
1251          * still requests with the driver, don't idle
1252          */
1253         if (rq_in_driver(cfqd))
1254                 return;
1255
1256         /*
1257          * task has exited, don't wait
1258          */
1259         cic = cfqd->active_cic;
1260         if (!cic || !atomic_read(&cic->ioc->nr_tasks))
1261                 return;
1262
1263         /*
1264          * If our average think time is larger than the remaining time
1265          * slice, then don't idle. This avoids overrunning the allotted
1266          * time slice.
1267          */
1268         if (sample_valid(cic->ttime_samples) &&
1269             (cfqq->slice_end - jiffies < cic->ttime_mean))
1270                 return;
1271
1272         cfq_mark_cfqq_wait_request(cfqq);
1273
1274         sl = cfqd->cfq_slice_idle;
1275         /* are we servicing noidle tree, and there are more queues?
1276          * non-rotational or NCQ: no idle
1277          * non-NCQ rotational : very small idle, to allow
1278          *     fair distribution of slice time for a process doing back-to-back
1279          *     seeks.
1280          */
1281         if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
1282             service_tree_for(cfqd->serving_prio, SYNC_NOIDLE_WORKLOAD, cfqd)
1283                 ->count > 0) {
1284                 if (blk_queue_nonrot(cfqd->queue) || cfqd->hw_tag)
1285                         return;
1286                 sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
1287         }
1288
1289         mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
1290         cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
1291 }
1292
1293 /*
1294  * Move request from internal lists to the request queue dispatch list.
1295  */
1296 static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
1297 {
1298         struct cfq_data *cfqd = q->elevator->elevator_data;
1299         struct cfq_queue *cfqq = RQ_CFQQ(rq);
1300
1301         cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
1302
1303         cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
1304         cfq_remove_request(rq);
1305         cfqq->dispatched++;
1306         elv_dispatch_sort(q, rq);
1307
1308         if (cfq_cfqq_sync(cfqq))
1309                 cfqd->sync_flight++;
1310 }
1311
1312 /*
1313  * return expired entry, or NULL to just start from scratch in rbtree
1314  */
1315 static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
1316 {
1317         struct request *rq = NULL;
1318
1319         if (cfq_cfqq_fifo_expire(cfqq))
1320                 return NULL;
1321
1322         cfq_mark_cfqq_fifo_expire(cfqq);
1323
1324         if (list_empty(&cfqq->fifo))
1325                 return NULL;
1326
1327         rq = rq_entry_fifo(cfqq->fifo.next);
1328         if (time_before(jiffies, rq_fifo_time(rq)))
1329                 rq = NULL;
1330
1331         cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
1332         return rq;
1333 }
1334
1335 static inline int
1336 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1337 {
1338         const int base_rq = cfqd->cfq_slice_async_rq;
1339
1340         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
1341
1342         return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
1343 }
1344
1345 /*
1346  * Must be called with the queue_lock held.
1347  */
1348 static int cfqq_process_refs(struct cfq_queue *cfqq)
1349 {
1350         int process_refs, io_refs;
1351
1352         io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
1353         process_refs = atomic_read(&cfqq->ref) - io_refs;
1354         BUG_ON(process_refs < 0);
1355         return process_refs;
1356 }
1357
1358 static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
1359 {
1360         int process_refs, new_process_refs;
1361         struct cfq_queue *__cfqq;
1362
1363         /* Avoid a circular list and skip interim queue merges */
1364         while ((__cfqq = new_cfqq->new_cfqq)) {
1365                 if (__cfqq == cfqq)
1366                         return;
1367                 new_cfqq = __cfqq;
1368         }
1369
1370         process_refs = cfqq_process_refs(cfqq);
1371         /*
1372          * If the process for the cfqq has gone away, there is no
1373          * sense in merging the queues.
1374          */
1375         if (process_refs == 0)
1376                 return;
1377
1378         /*
1379          * Merge in the direction of the lesser amount of work.
1380          */
1381         new_process_refs = cfqq_process_refs(new_cfqq);
1382         if (new_process_refs >= process_refs) {
1383                 cfqq->new_cfqq = new_cfqq;
1384                 atomic_add(process_refs, &new_cfqq->ref);
1385         } else {
1386                 new_cfqq->new_cfqq = cfqq;
1387                 atomic_add(new_process_refs, &cfqq->ref);
1388         }
1389 }
1390
1391 static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, enum wl_prio_t prio,
1392                                     bool prio_changed)
1393 {
1394         struct cfq_queue *queue;
1395         int i;
1396         bool key_valid = false;
1397         unsigned long lowest_key = 0;
1398         enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
1399
1400         if (prio_changed) {
1401                 /*
1402                  * When priorities switched, we prefer starting
1403                  * from SYNC_NOIDLE (first choice), or just SYNC
1404                  * over ASYNC
1405                  */
1406                 if (service_tree_for(prio, cur_best, cfqd)->count)
1407                         return cur_best;
1408                 cur_best = SYNC_WORKLOAD;
1409                 if (service_tree_for(prio, cur_best, cfqd)->count)
1410                         return cur_best;
1411
1412                 return ASYNC_WORKLOAD;
1413         }
1414
1415         for (i = 0; i < 3; ++i) {
1416                 /* otherwise, select the one with lowest rb_key */
1417                 queue = cfq_rb_first(service_tree_for(prio, i, cfqd));
1418                 if (queue &&
1419                     (!key_valid || time_before(queue->rb_key, lowest_key))) {
1420                         lowest_key = queue->rb_key;
1421                         cur_best = i;
1422                         key_valid = true;
1423                 }
1424         }
1425
1426         return cur_best;
1427 }
1428
1429 static void choose_service_tree(struct cfq_data *cfqd)
1430 {
1431         enum wl_prio_t previous_prio = cfqd->serving_prio;
1432         bool prio_changed;
1433         unsigned slice;
1434         unsigned count;
1435
1436         /* Choose next priority. RT > BE > IDLE */
1437         if (cfq_busy_queues_wl(RT_WORKLOAD, cfqd))
1438                 cfqd->serving_prio = RT_WORKLOAD;
1439         else if (cfq_busy_queues_wl(BE_WORKLOAD, cfqd))
1440                 cfqd->serving_prio = BE_WORKLOAD;
1441         else {
1442                 cfqd->serving_prio = IDLE_WORKLOAD;
1443                 cfqd->workload_expires = jiffies + 1;
1444                 return;
1445         }
1446
1447         /*
1448          * For RT and BE, we have to choose also the type
1449          * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
1450          * expiration time
1451          */
1452         prio_changed = (cfqd->serving_prio != previous_prio);
1453         count = service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd)
1454                 ->count;
1455
1456         /*
1457          * If priority didn't change, check workload expiration,
1458          * and that we still have other queues ready
1459          */
1460         if (!prio_changed && count &&
1461             !time_after(jiffies, cfqd->workload_expires))
1462                 return;
1463
1464         /* otherwise select new workload type */
1465         cfqd->serving_type =
1466                 cfq_choose_wl(cfqd, cfqd->serving_prio, prio_changed);
1467         count = service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd)
1468                 ->count;
1469
1470         /*
1471          * the workload slice is computed as a fraction of target latency
1472          * proportional to the number of queues in that workload, over
1473          * all the queues in the same priority class
1474          */
1475         slice = cfq_target_latency * count /
1476                 max_t(unsigned, cfqd->busy_queues_avg[cfqd->serving_prio],
1477                       cfq_busy_queues_wl(cfqd->serving_prio, cfqd));
1478
1479         if (cfqd->serving_type == ASYNC_WORKLOAD)
1480                 /* async workload slice is scaled down according to
1481                  * the sync/async slice ratio. */
1482                 slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
1483         else
1484                 /* sync workload slice is at least 2 * cfq_slice_idle */
1485                 slice = max(slice, 2 * cfqd->cfq_slice_idle);
1486
1487         slice = max_t(unsigned, slice, CFQ_MIN_TT);
1488         cfqd->workload_expires = jiffies + slice;
1489 }
1490
1491 /*
1492  * Select a queue for service. If we have a current active queue,
1493  * check whether to continue servicing it, or retrieve and set a new one.
1494  */
1495 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1496 {
1497         struct cfq_queue *cfqq, *new_cfqq = NULL;
1498
1499         cfqq = cfqd->active_queue;
1500         if (!cfqq)
1501                 goto new_queue;
1502
1503         /*
1504          * The active queue has run out of time, expire it and select new.
1505          */
1506         if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
1507                 goto expire;
1508
1509         /*
1510          * The active queue has requests and isn't expired, allow it to
1511          * dispatch.
1512          */
1513         if (!RB_EMPTY_ROOT(&cfqq->sort_list))
1514                 goto keep_queue;
1515
1516         /*
1517          * If another queue has a request waiting within our mean seek
1518          * distance, let it run.  The expire code will check for close
1519          * cooperators and put the close queue at the front of the service
1520          * tree.  If possible, merge the expiring queue with the new cfqq.
1521          */
1522         new_cfqq = cfq_close_cooperator(cfqd, cfqq);
1523         if (new_cfqq) {
1524                 if (!cfqq->new_cfqq)
1525                         cfq_setup_merge(cfqq, new_cfqq);
1526                 goto expire;
1527         }
1528
1529         /*
1530          * No requests pending. If the active queue still has requests in
1531          * flight or is idling for a new request, allow either of these
1532          * conditions to happen (or time out) before selecting a new queue.
1533          */
1534         if (timer_pending(&cfqd->idle_slice_timer) ||
1535             (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) {
1536                 cfqq = NULL;
1537                 goto keep_queue;
1538         }
1539
1540 expire:
1541         cfq_slice_expired(cfqd, 0);
1542 new_queue:
1543         /*
1544          * Current queue expired. Check if we have to switch to a new
1545          * service tree
1546          */
1547         if (!new_cfqq)
1548                 choose_service_tree(cfqd);
1549
1550         cfqq = cfq_set_active_queue(cfqd, new_cfqq);
1551 keep_queue:
1552         return cfqq;
1553 }
1554
1555 static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
1556 {
1557         int dispatched = 0;
1558
1559         while (cfqq->next_rq) {
1560                 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
1561                 dispatched++;
1562         }
1563
1564         BUG_ON(!list_empty(&cfqq->fifo));
1565         return dispatched;
1566 }
1567
1568 /*
1569  * Drain our current requests. Used for barriers and when switching
1570  * io schedulers on-the-fly.
1571  */
1572 static int cfq_forced_dispatch(struct cfq_data *cfqd)
1573 {
1574         struct cfq_queue *cfqq;
1575         int dispatched = 0;
1576         int i, j;
1577         for (i = 0; i < 2; ++i)
1578                 for (j = 0; j < 3; ++j)
1579                         while ((cfqq = cfq_rb_first(&cfqd->service_trees[i][j]))
1580                                 != NULL)
1581                                 dispatched += __cfq_forced_dispatch_cfqq(cfqq);
1582
1583         while ((cfqq = cfq_rb_first(&cfqd->service_tree_idle)) != NULL)
1584                 dispatched += __cfq_forced_dispatch_cfqq(cfqq);
1585
1586         cfq_slice_expired(cfqd, 0);
1587
1588         BUG_ON(cfqd->busy_queues);
1589
1590         cfq_log(cfqd, "forced_dispatch=%d", dispatched);
1591         return dispatched;
1592 }
1593
1594 static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1595 {
1596         unsigned int max_dispatch;
1597
1598         /*
1599          * Drain async requests before we start sync IO
1600          */
1601         if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC])
1602                 return false;
1603
1604         /*
1605          * If this is an async queue and we have sync IO in flight, let it wait
1606          */
1607         if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq))
1608                 return false;
1609
1610         max_dispatch = cfqd->cfq_quantum;
1611         if (cfq_class_idle(cfqq))
1612                 max_dispatch = 1;
1613
1614         /*
1615          * Does this cfqq already have too much IO in flight?
1616          */
1617         if (cfqq->dispatched >= max_dispatch) {
1618                 /*
1619                  * idle queue must always only have a single IO in flight
1620                  */
1621                 if (cfq_class_idle(cfqq))
1622                         return false;
1623
1624                 /*
1625                  * We have other queues, don't allow more IO from this one
1626                  */
1627                 if (cfqd->busy_queues > 1)
1628                         return false;
1629
1630                 /*
1631                  * Sole queue user, allow bigger slice
1632                  */
1633                 max_dispatch *= 4;
1634         }
1635
1636         /*
1637          * Async queues must wait a bit before being allowed dispatch.
1638          * We also ramp up the dispatch depth gradually for async IO,
1639          * based on the last sync IO we serviced
1640          */
1641         if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
1642                 unsigned long last_sync = jiffies - cfqd->last_end_sync_rq;
1643                 unsigned int depth;
1644
1645                 depth = last_sync / cfqd->cfq_slice[1];
1646                 if (!depth && !cfqq->dispatched)
1647                         depth = 1;
1648                 if (depth < max_dispatch)
1649                         max_dispatch = depth;
1650         }
1651
1652         /*
1653          * If we're below the current max, allow a dispatch
1654          */
1655         return cfqq->dispatched < max_dispatch;
1656 }
1657
1658 /*
1659  * Dispatch a request from cfqq, moving them to the request queue
1660  * dispatch list.
1661  */
1662 static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1663 {
1664         struct request *rq;
1665
1666         BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
1667
1668         if (!cfq_may_dispatch(cfqd, cfqq))
1669                 return false;
1670
1671         /*
1672          * follow expired path, else get first next available
1673          */
1674         rq = cfq_check_fifo(cfqq);
1675         if (!rq)
1676                 rq = cfqq->next_rq;
1677
1678         /*
1679          * insert request into driver dispatch list
1680          */
1681         cfq_dispatch_insert(cfqd->queue, rq);
1682
1683         if (!cfqd->active_cic) {
1684                 struct cfq_io_context *cic = RQ_CIC(rq);
1685
1686                 atomic_long_inc(&cic->ioc->refcount);
1687                 cfqd->active_cic = cic;
1688         }
1689
1690         return true;
1691 }
1692
1693 /*
1694  * Find the cfqq that we need to service and move a request from that to the
1695  * dispatch list
1696  */
1697 static int cfq_dispatch_requests(struct request_queue *q, int force)
1698 {
1699         struct cfq_data *cfqd = q->elevator->elevator_data;
1700         struct cfq_queue *cfqq;
1701
1702         if (!cfqd->busy_queues)
1703                 return 0;
1704
1705         if (unlikely(force))
1706                 return cfq_forced_dispatch(cfqd);
1707
1708         cfqq = cfq_select_queue(cfqd);
1709         if (!cfqq)
1710                 return 0;
1711
1712         /*
1713          * Dispatch a request from this cfqq, if it is allowed
1714          */
1715         if (!cfq_dispatch_request(cfqd, cfqq))
1716                 return 0;
1717
1718         cfqq->slice_dispatch++;
1719         cfq_clear_cfqq_must_dispatch(cfqq);
1720
1721         /*
1722          * expire an async queue immediately if it has used up its slice. idle
1723          * queue always expire after 1 dispatch round.
1724          */
1725         if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
1726             cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
1727             cfq_class_idle(cfqq))) {
1728                 cfqq->slice_end = jiffies + 1;
1729                 cfq_slice_expired(cfqd, 0);
1730         }
1731
1732         cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
1733         return 1;
1734 }
1735
1736 /*
1737  * task holds one reference to the queue, dropped when task exits. each rq
1738  * in-flight on this queue also holds a reference, dropped when rq is freed.
1739  *
1740  * queue lock must be held here.
1741  */
1742 static void cfq_put_queue(struct cfq_queue *cfqq)
1743 {
1744         struct cfq_data *cfqd = cfqq->cfqd;
1745
1746         BUG_ON(atomic_read(&cfqq->ref) <= 0);
1747
1748         if (!atomic_dec_and_test(&cfqq->ref))
1749                 return;
1750
1751         cfq_log_cfqq(cfqd, cfqq, "put_queue");
1752         BUG_ON(rb_first(&cfqq->sort_list));
1753         BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
1754         BUG_ON(cfq_cfqq_on_rr(cfqq));
1755
1756         if (unlikely(cfqd->active_queue == cfqq)) {
1757                 __cfq_slice_expired(cfqd, cfqq, 0);
1758                 cfq_schedule_dispatch(cfqd);
1759         }
1760
1761         kmem_cache_free(cfq_pool, cfqq);
1762 }
1763
1764 /*
1765  * Must always be called with the rcu_read_lock() held
1766  */
1767 static void
1768 __call_for_each_cic(struct io_context *ioc,
1769                     void (*func)(struct io_context *, struct cfq_io_context *))
1770 {
1771         struct cfq_io_context *cic;
1772         struct hlist_node *n;
1773
1774         hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
1775                 func(ioc, cic);
1776 }
1777
/*
 * Run func on each cic attached to this ioc, taking the RCU read lock
 * around the walk.
 */
static void
call_for_each_cic(struct io_context *ioc,
                  void (*func)(struct io_context *, struct cfq_io_context *))
{
        rcu_read_lock();

        __call_for_each_cic(ioc, func);

        rcu_read_unlock();
}
1789
1790 static void cfq_cic_free_rcu(struct rcu_head *head)
1791 {
1792         struct cfq_io_context *cic;
1793
1794         cic = container_of(head, struct cfq_io_context, rcu_head);
1795
1796         kmem_cache_free(cfq_ioc_pool, cic);
1797         elv_ioc_count_dec(cfq_ioc_count);
1798
1799         if (ioc_gone) {
1800                 /*
1801                  * CFQ scheduler is exiting, grab exit lock and check
1802                  * the pending io context count. If it hits zero,
1803                  * complete ioc_gone and set it back to NULL
1804                  */
1805                 spin_lock(&ioc_gone_lock);
1806                 if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
1807                         complete(ioc_gone);
1808                         ioc_gone = NULL;
1809                 }
1810                 spin_unlock(&ioc_gone_lock);
1811         }
1812 }
1813
1814 static void cfq_cic_free(struct cfq_io_context *cic)
1815 {
1816         call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
1817 }
1818
1819 static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
1820 {
1821         unsigned long flags;
1822
1823         BUG_ON(!cic->dead_key);
1824
1825         spin_lock_irqsave(&ioc->lock, flags);
1826         radix_tree_delete(&ioc->radix_root, cic->dead_key);
1827         hlist_del_rcu(&cic->cic_list);
1828         spin_unlock_irqrestore(&ioc->lock, flags);
1829
1830         cfq_cic_free(cic);
1831 }
1832
1833 /*
1834  * Must be called with rcu_read_lock() held or preemption otherwise disabled.
1835  * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
1836  * and ->trim() which is called with the task lock held
1837  */
1838 static void cfq_free_io_context(struct io_context *ioc)
1839 {
1840         /*
1841          * ioc->refcount is zero here, or we are called from elv_unregister(),
1842          * so no more cic's are allowed to be linked into this ioc.  So it
1843          * should be ok to iterate over the known list, we will see all cic's
1844          * since no new ones are added.
1845          */
1846         __call_for_each_cic(ioc, cic_free_func);
1847 }
1848
1849 static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1850 {
1851         struct cfq_queue *__cfqq, *next;
1852
1853         if (unlikely(cfqq == cfqd->active_queue)) {
1854                 __cfq_slice_expired(cfqd, cfqq, 0);
1855                 cfq_schedule_dispatch(cfqd);
1856         }
1857
1858         /*
1859          * If this queue was scheduled to merge with another queue, be
1860          * sure to drop the reference taken on that queue (and others in
1861          * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
1862          */
1863         __cfqq = cfqq->new_cfqq;
1864         while (__cfqq) {
1865                 if (__cfqq == cfqq) {
1866                         WARN(1, "cfqq->new_cfqq loop detected\n");
1867                         break;
1868                 }
1869                 next = __cfqq->new_cfqq;
1870                 cfq_put_queue(__cfqq);
1871                 __cfqq = next;
1872         }
1873
1874         cfq_put_queue(cfqq);
1875 }
1876
1877 static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
1878                                          struct cfq_io_context *cic)
1879 {
1880         struct io_context *ioc = cic->ioc;
1881
1882         list_del_init(&cic->queue_list);
1883
1884         /*
1885          * Make sure key == NULL is seen for dead queues
1886          */
1887         smp_wmb();
1888         cic->dead_key = (unsigned long) cic->key;
1889         cic->key = NULL;
1890
1891         if (ioc->ioc_data == cic)
1892                 rcu_assign_pointer(ioc->ioc_data, NULL);
1893
1894         if (cic->cfqq[BLK_RW_ASYNC]) {
1895                 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
1896                 cic->cfqq[BLK_RW_ASYNC] = NULL;
1897         }
1898
1899         if (cic->cfqq[BLK_RW_SYNC]) {
1900                 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
1901                 cic->cfqq[BLK_RW_SYNC] = NULL;
1902         }
1903 }
1904
1905 static void cfq_exit_single_io_context(struct io_context *ioc,
1906                                        struct cfq_io_context *cic)
1907 {
1908         struct cfq_data *cfqd = cic->key;
1909
1910         if (cfqd) {
1911                 struct request_queue *q = cfqd->queue;
1912                 unsigned long flags;
1913
1914                 spin_lock_irqsave(q->queue_lock, flags);
1915
1916                 /*
1917                  * Ensure we get a fresh copy of the ->key to prevent
1918                  * race between exiting task and queue
1919                  */
1920                 smp_read_barrier_depends();
1921                 if (cic->key)
1922                         __cfq_exit_single_io_context(cfqd, cic);
1923
1924                 spin_unlock_irqrestore(q->queue_lock, flags);
1925         }
1926 }
1927
1928 /*
1929  * The process that ioc belongs to has exited, we need to clean up
1930  * and put the internal structures we have that belongs to that process.
1931  */
1932 static void cfq_exit_io_context(struct io_context *ioc)
1933 {
1934         call_for_each_cic(ioc, cfq_exit_single_io_context);
1935 }
1936
1937 static struct cfq_io_context *
1938 cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1939 {
1940         struct cfq_io_context *cic;
1941
1942         cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
1943                                                         cfqd->queue->node);
1944         if (cic) {
1945                 cic->last_end_request = jiffies;
1946                 INIT_LIST_HEAD(&cic->queue_list);
1947                 INIT_HLIST_NODE(&cic->cic_list);
1948                 cic->dtor = cfq_free_io_context;
1949                 cic->exit = cfq_exit_io_context;
1950                 elv_ioc_count_inc(cfq_ioc_count);
1951         }
1952
1953         return cic;
1954 }
1955
1956 static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
1957 {
1958         struct task_struct *tsk = current;
1959         int ioprio_class;
1960
1961         if (!cfq_cfqq_prio_changed(cfqq))
1962                 return;
1963
1964         ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
1965         switch (ioprio_class) {
1966         default:
1967                 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
1968         case IOPRIO_CLASS_NONE:
1969                 /*
1970                  * no prio set, inherit CPU scheduling settings
1971                  */
1972                 cfqq->ioprio = task_nice_ioprio(tsk);
1973                 cfqq->ioprio_class = task_nice_ioclass(tsk);
1974                 break;
1975         case IOPRIO_CLASS_RT:
1976                 cfqq->ioprio = task_ioprio(ioc);
1977                 cfqq->ioprio_class = IOPRIO_CLASS_RT;
1978                 break;
1979         case IOPRIO_CLASS_BE:
1980                 cfqq->ioprio = task_ioprio(ioc);
1981                 cfqq->ioprio_class = IOPRIO_CLASS_BE;
1982                 break;
1983         case IOPRIO_CLASS_IDLE:
1984                 cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
1985                 cfqq->ioprio = 7;
1986                 cfq_clear_cfqq_idle_window(cfqq);
1987                 break;
1988         }
1989
1990         /*
1991          * keep track of original prio settings in case we have to temporarily
1992          * elevate the priority of this queue
1993          */
1994         cfqq->org_ioprio = cfqq->ioprio;
1995         cfqq->org_ioprio_class = cfqq->ioprio_class;
1996         cfq_clear_cfqq_prio_changed(cfqq);
1997 }
1998
1999 static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
2000 {
2001         struct cfq_data *cfqd = cic->key;
2002         struct cfq_queue *cfqq;
2003         unsigned long flags;
2004
2005         if (unlikely(!cfqd))
2006                 return;
2007
2008         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2009
2010         cfqq = cic->cfqq[BLK_RW_ASYNC];
2011         if (cfqq) {
2012                 struct cfq_queue *new_cfqq;
2013                 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc,
2014                                                 GFP_ATOMIC);
2015                 if (new_cfqq) {
2016                         cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
2017                         cfq_put_queue(cfqq);
2018                 }
2019         }
2020
2021         cfqq = cic->cfqq[BLK_RW_SYNC];
2022         if (cfqq)
2023                 cfq_mark_cfqq_prio_changed(cfqq);
2024
2025         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2026 }
2027
2028 static void cfq_ioc_set_ioprio(struct io_context *ioc)
2029 {
2030         call_for_each_cic(ioc, changed_ioprio);
2031         ioc->ioprio_changed = 0;
2032 }
2033
2034 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2035                           pid_t pid, bool is_sync)
2036 {
2037         RB_CLEAR_NODE(&cfqq->rb_node);
2038         RB_CLEAR_NODE(&cfqq->p_node);
2039         INIT_LIST_HEAD(&cfqq->fifo);
2040
2041         atomic_set(&cfqq->ref, 0);
2042         cfqq->cfqd = cfqd;
2043
2044         cfq_mark_cfqq_prio_changed(cfqq);
2045
2046         if (is_sync) {
2047                 if (!cfq_class_idle(cfqq))
2048                         cfq_mark_cfqq_idle_window(cfqq);
2049                 cfq_mark_cfqq_sync(cfqq);
2050         }
2051         cfqq->pid = pid;
2052 }
2053
2054 static struct cfq_queue *
2055 cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
2056                      struct io_context *ioc, gfp_t gfp_mask)
2057 {
2058         struct cfq_queue *cfqq, *new_cfqq = NULL;
2059         struct cfq_io_context *cic;
2060
2061 retry:
2062         cic = cfq_cic_lookup(cfqd, ioc);
2063         /* cic always exists here */
2064         cfqq = cic_to_cfqq(cic, is_sync);
2065
2066         /*
2067          * Always try a new alloc if we fell back to the OOM cfqq
2068          * originally, since it should just be a temporary situation.
2069          */
2070         if (!cfqq || cfqq == &cfqd->oom_cfqq) {
2071                 cfqq = NULL;
2072                 if (new_cfqq) {
2073                         cfqq = new_cfqq;
2074                         new_cfqq = NULL;
2075                 } else if (gfp_mask & __GFP_WAIT) {
2076                         spin_unlock_irq(cfqd->queue->queue_lock);
2077                         new_cfqq = kmem_cache_alloc_node(cfq_pool,
2078                                         gfp_mask | __GFP_ZERO,
2079                                         cfqd->queue->node);
2080                         spin_lock_irq(cfqd->queue->queue_lock);
2081                         if (new_cfqq)
2082                                 goto retry;
2083                 } else {
2084                         cfqq = kmem_cache_alloc_node(cfq_pool,
2085                                         gfp_mask | __GFP_ZERO,
2086                                         cfqd->queue->node);
2087                 }
2088
2089                 if (cfqq) {
2090                         cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
2091                         cfq_init_prio_data(cfqq, ioc);
2092                         cfq_log_cfqq(cfqd, cfqq, "alloced");
2093                 } else
2094                         cfqq = &cfqd->oom_cfqq;
2095         }
2096
2097         if (new_cfqq)
2098                 kmem_cache_free(cfq_pool, new_cfqq);
2099
2100         return cfqq;
2101 }
2102
2103 static struct cfq_queue **
2104 cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
2105 {
2106         switch (ioprio_class) {
2107         case IOPRIO_CLASS_RT:
2108                 return &cfqd->async_cfqq[0][ioprio];
2109         case IOPRIO_CLASS_BE:
2110                 return &cfqd->async_cfqq[1][ioprio];
2111         case IOPRIO_CLASS_IDLE:
2112                 return &cfqd->async_idle_cfqq;
2113         default:
2114                 BUG();
2115         }
2116 }
2117
2118 static struct cfq_queue *
2119 cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
2120               gfp_t gfp_mask)
2121 {
2122         const int ioprio = task_ioprio(ioc);
2123         const int ioprio_class = task_ioprio_class(ioc);
2124         struct cfq_queue **async_cfqq = NULL;
2125         struct cfq_queue *cfqq = NULL;
2126
2127         if (!is_sync) {
2128                 async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
2129                 cfqq = *async_cfqq;
2130         }
2131
2132         if (!cfqq)
2133                 cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);
2134
2135         /*
2136          * pin the queue now that it's allocated, scheduler exit will prune it
2137          */
2138         if (!is_sync && !(*async_cfqq)) {
2139                 atomic_inc(&cfqq->ref);
2140                 *async_cfqq = cfqq;
2141         }
2142
2143         atomic_inc(&cfqq->ref);
2144         return cfqq;
2145 }
2146
2147 /*
2148  * We drop cfq io contexts lazily, so we may find a dead one.
2149  */
2150 static void
2151 cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
2152                   struct cfq_io_context *cic)
2153 {
2154         unsigned long flags;
2155
2156         WARN_ON(!list_empty(&cic->queue_list));
2157
2158         spin_lock_irqsave(&ioc->lock, flags);
2159
2160         BUG_ON(ioc->ioc_data == cic);
2161
2162         radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd);
2163         hlist_del_rcu(&cic->cic_list);
2164         spin_unlock_irqrestore(&ioc->lock, flags);
2165
2166         cfq_cic_free(cic);
2167 }
2168
2169 static struct cfq_io_context *
2170 cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
2171 {
2172         struct cfq_io_context *cic;
2173         unsigned long flags;
2174         void *k;
2175
2176         if (unlikely(!ioc))
2177                 return NULL;
2178
2179         rcu_read_lock();
2180
2181         /*
2182          * we maintain a last-hit cache, to avoid browsing over the tree
2183          */
2184         cic = rcu_dereference(ioc->ioc_data);
2185         if (cic && cic->key == cfqd) {
2186                 rcu_read_unlock();
2187                 return cic;
2188         }
2189
2190         do {
2191                 cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd);
2192                 rcu_read_unlock();
2193                 if (!cic)
2194                         break;
2195                 /* ->key must be copied to avoid race with cfq_exit_queue() */
2196                 k = cic->key;
2197                 if (unlikely(!k)) {
2198                         cfq_drop_dead_cic(cfqd, ioc, cic);
2199                         rcu_read_lock();
2200                         continue;
2201                 }
2202
2203                 spin_lock_irqsave(&ioc->lock, flags);
2204                 rcu_assign_pointer(ioc->ioc_data, cic);
2205                 spin_unlock_irqrestore(&ioc->lock, flags);
2206                 break;
2207         } while (1);
2208
2209         return cic;
2210 }
2211
2212 /*
2213  * Add cic into ioc, using cfqd as the search key. This enables us to lookup
2214  * the process specific cfq io context when entered from the block layer.
2215  * Also adds the cic to a per-cfqd list, used when this queue is removed.
2216  */
2217 static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
2218                         struct cfq_io_context *cic, gfp_t gfp_mask)
2219 {
2220         unsigned long flags;
2221         int ret;
2222
2223         ret = radix_tree_preload(gfp_mask);
2224         if (!ret) {
2225                 cic->ioc = ioc;
2226                 cic->key = cfqd;
2227
2228                 spin_lock_irqsave(&ioc->lock, flags);
2229                 ret = radix_tree_insert(&ioc->radix_root,
2230                                                 (unsigned long) cfqd, cic);
2231                 if (!ret)
2232                         hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
2233                 spin_unlock_irqrestore(&ioc->lock, flags);
2234
2235                 radix_tree_preload_end();
2236
2237                 if (!ret) {
2238                         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2239                         list_add(&cic->queue_list, &cfqd->cic_list);
2240                         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2241                 }
2242         }
2243
2244         if (ret)
2245                 printk(KERN_ERR "cfq: cic link failed!\n");
2246
2247         return ret;
2248 }
2249
2250 /*
2251  * Setup general io context and cfq io context. There can be several cfq
2252  * io contexts per general io context, if this process is doing io to more
2253  * than one device managed by cfq.
2254  */
2255 static struct cfq_io_context *
2256 cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
2257 {
2258         struct io_context *ioc = NULL;
2259         struct cfq_io_context *cic;
2260
2261         might_sleep_if(gfp_mask & __GFP_WAIT);
2262
2263         ioc = get_io_context(gfp_mask, cfqd->queue->node);
2264         if (!ioc)
2265                 return NULL;
2266
2267         cic = cfq_cic_lookup(cfqd, ioc);
2268         if (cic)
2269                 goto out;
2270
2271         cic = cfq_alloc_io_context(cfqd, gfp_mask);
2272         if (cic == NULL)
2273                 goto err;
2274
2275         if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
2276                 goto err_free;
2277
2278 out:
2279         smp_read_barrier_depends();
2280         if (unlikely(ioc->ioprio_changed))
2281                 cfq_ioc_set_ioprio(ioc);
2282
2283         return cic;
2284 err_free:
2285         cfq_cic_free(cic);
2286 err:
2287         put_io_context(ioc);
2288         return NULL;
2289 }
2290
2291 static void
2292 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
2293 {
2294         unsigned long elapsed = jiffies - cic->last_end_request;
2295         unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
2296
2297         cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
2298         cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
2299         cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
2300 }
2301
2302 static void
2303 cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2304                        struct request *rq)
2305 {
2306         sector_t sdist;
2307         u64 total;
2308
2309         if (!cfqq->last_request_pos)
2310                 sdist = 0;
2311         else if (cfqq->last_request_pos < blk_rq_pos(rq))
2312                 sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
2313         else
2314                 sdist = cfqq->last_request_pos - blk_rq_pos(rq);
2315
2316         /*
2317          * Don't allow the seek distance to get too large from the
2318          * odd fragment, pagein, etc
2319          */
2320         if (cfqq->seek_samples <= 60) /* second&third seek */
2321                 sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*1024);
2322         else
2323                 sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*64);
2324
2325         cfqq->seek_samples = (7*cfqq->seek_samples + 256) / 8;
2326         cfqq->seek_total = (7*cfqq->seek_total + (u64)256*sdist) / 8;
2327         total = cfqq->seek_total + (cfqq->seek_samples/2);
2328         do_div(total, cfqq->seek_samples);
2329         cfqq->seek_mean = (sector_t)total;
2330
2331         /*
2332          * If this cfqq is shared between multiple processes, check to
2333          * make sure that those processes are still issuing I/Os within
2334          * the mean seek distance.  If not, it may be time to break the
2335          * queues apart again.
2336          */
2337         if (cfq_cfqq_coop(cfqq)) {
2338                 if (CFQQ_SEEKY(cfqq) && !cfqq->seeky_start)
2339                         cfqq->seeky_start = jiffies;
2340                 else if (!CFQQ_SEEKY(cfqq))
2341                         cfqq->seeky_start = 0;
2342         }
2343 }
2344
2345 /*
2346  * Disable idle window if the process thinks too long or seeks so much that
2347  * it doesn't matter
2348  */
2349 static void
2350 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2351                        struct cfq_io_context *cic)
2352 {
2353         int old_idle, enable_idle;
2354
2355         /*
2356          * Don't idle for async or idle io prio class
2357          */
2358         if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
2359                 return;
2360
2361         enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
2362
2363         if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
2364             (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq)))
2365                 enable_idle = 0;
2366         else if (sample_valid(cic->ttime_samples)) {
2367                 if (cic->ttime_mean > cfqd->cfq_slice_idle)
2368                         enable_idle = 0;
2369                 else
2370                         enable_idle = 1;
2371         }
2372
2373         if (old_idle != enable_idle) {
2374                 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
2375                 if (enable_idle)
2376                         cfq_mark_cfqq_idle_window(cfqq);
2377                 else
2378                         cfq_clear_cfqq_idle_window(cfqq);
2379         }
2380 }
2381
2382 /*
2383  * Check if new_cfqq should preempt the currently active queue. Return 0 for
2384  * no or if we aren't sure, a 1 will cause a preempt.
2385  */
2386 static bool
2387 cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
2388                    struct request *rq)
2389 {
2390         struct cfq_queue *cfqq;
2391
2392         cfqq = cfqd->active_queue;
2393         if (!cfqq)
2394                 return false;
2395
2396         if (cfq_slice_used(cfqq))
2397                 return true;
2398
2399         if (cfq_class_idle(new_cfqq))
2400                 return false;
2401
2402         if (cfq_class_idle(cfqq))
2403                 return true;
2404
2405         if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD
2406             && new_cfqq->service_tree == cfqq->service_tree)
2407                 return true;
2408
2409         /*
2410          * if the new request is sync, but the currently running queue is
2411          * not, let the sync request have priority.
2412          */
2413         if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
2414                 return true;
2415
2416         /*
2417          * So both queues are sync. Let the new request get disk time if
2418          * it's a metadata request and the current queue is doing regular IO.
2419          */
2420         if (rq_is_meta(rq) && !cfqq->meta_pending)
2421                 return true;
2422
2423         /*
2424          * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
2425          */
2426         if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
2427                 return true;
2428
2429         if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
2430                 return false;
2431
2432         /*
2433          * if this request is as-good as one we would expect from the
2434          * current cfqq, let it preempt
2435          */
2436         if (cfq_rq_close(cfqd, cfqq, rq))
2437         if (cfq_rq_close(cfqd, cfqq, rq) && (!cfq_cfqq_coop(new_cfqq) ||
2438             cfqd->busy_queues == 1)) {
2439                 /*
2440                  * Mark new queue coop_preempt, so its coop flag will not be
2441                  * cleared when new queue gets scheduled at the very first time
2442                  */
2443                 cfq_mark_cfqq_coop_preempt(new_cfqq);
2444                 cfq_mark_cfqq_coop(new_cfqq);
2445                 return true;
2446         }
2447
2448         return false;
2449 }
2450
2451 /*
2452  * cfqq preempts the active queue. if we allowed preempt with no slice left,
2453  * let it have half of its nominal slice.
2454  */
2455 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2456 {
2457         cfq_log_cfqq(cfqd, cfqq, "preempt");
2458         cfq_slice_expired(cfqd, 1);
2459
2460         /*
2461          * Put the new queue at the front of the of the current list,
2462          * so we know that it will be selected next.
2463          */
2464         BUG_ON(!cfq_cfqq_on_rr(cfqq));
2465
2466         cfq_service_tree_add(cfqd, cfqq, 1);
2467
2468         cfqq->slice_end = 0;
2469         cfq_mark_cfqq_slice_new(cfqq);
2470 }
2471
2472 /*
2473  * Called when a new fs request (rq) is added (to cfqq). Check if there's
2474  * something we should do about it
2475  */
2476 static void
2477 cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2478                 struct request *rq)
2479 {
2480         struct cfq_io_context *cic = RQ_CIC(rq);
2481
2482         cfqd->rq_queued++;
2483         if (rq_is_meta(rq))
2484                 cfqq->meta_pending++;
2485
2486         cfq_update_io_thinktime(cfqd, cic);
2487         cfq_update_io_seektime(cfqd, cfqq, rq);
2488         cfq_update_idle_window(cfqd, cfqq, cic);
2489
2490         cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
2491
2492         if (cfqq == cfqd->active_queue) {
2493                 /*
2494                  * Remember that we saw a request from this process, but
2495                  * don't start queuing just yet. Otherwise we risk seeing lots
2496                  * of tiny requests, because we disrupt the normal plugging
2497                  * and merging. If the request is already larger than a single
2498                  * page, let it rip immediately. For that case we assume that
2499                  * merging is already done. Ditto for a busy system that
2500                  * has other work pending, don't risk delaying until the
2501                  * idle timer unplug to continue working.
2502                  */
2503                 if (cfq_cfqq_wait_request(cfqq)) {
2504                         if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
2505                             cfqd->busy_queues > 1) {
2506                                 del_timer(&cfqd->idle_slice_timer);
2507                         __blk_run_queue(cfqd->queue);
2508                         }
2509                         cfq_mark_cfqq_must_dispatch(cfqq);
2510                 }
2511         } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
2512                 /*
2513                  * not the active queue - expire current slice if it is
2514                  * idle and has expired it's mean thinktime or this new queue
2515                  * has some old slice time left and is of higher priority or
2516                  * this new queue is RT and the current one is BE
2517                  */
2518                 cfq_preempt_queue(cfqd, cfqq);
2519                 __blk_run_queue(cfqd->queue);
2520         }
2521 }
2522
2523 static void cfq_insert_request(struct request_queue *q, struct request *rq)
2524 {
2525         struct cfq_data *cfqd = q->elevator->elevator_data;
2526         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2527
2528         cfq_log_cfqq(cfqd, cfqq, "insert_request");
2529         cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
2530
2531         rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
2532         list_add_tail(&rq->queuelist, &cfqq->fifo);
2533         cfq_add_rq_rb(rq);
2534
2535         cfq_rq_enqueued(cfqd, cfqq, rq);
2536 }
2537
2538 /*
2539  * Update hw_tag based on peak queue depth over 50 samples under
2540  * sufficient load.
2541  */
2542 static void cfq_update_hw_tag(struct cfq_data *cfqd)
2543 {
2544         struct cfq_queue *cfqq = cfqd->active_queue;
2545
2546         if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak)
2547                 cfqd->rq_in_driver_peak = rq_in_driver(cfqd);
2548
2549         if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
2550             rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN)
2551                 return;
2552
2553         /*
2554          * If active queue hasn't enough requests and can idle, cfq might not
2555          * dispatch sufficient requests to hardware. Don't zero hw_tag in this
2556          * case
2557          */
2558         if (cfqq && cfq_cfqq_idle_window(cfqq) &&
2559             cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
2560             CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) < CFQ_HW_QUEUE_MIN)
2561                 return;
2562
2563         if (cfqd->hw_tag_samples++ < 50)
2564                 return;
2565
2566         if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN)
2567                 cfqd->hw_tag = 1;
2568         else
2569                 cfqd->hw_tag = 0;
2570
2571         cfqd->hw_tag_samples = 0;
2572         cfqd->rq_in_driver_peak = 0;
2573 }
2574
2575 static void cfq_completed_request(struct request_queue *q, struct request *rq)
2576 {
2577         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2578         struct cfq_data *cfqd = cfqq->cfqd;
2579         const int sync = rq_is_sync(rq);
2580         unsigned long now;
2581
2582         now = jiffies;
2583         cfq_log_cfqq(cfqd, cfqq, "complete");
2584
2585         cfq_update_hw_tag(cfqd);
2586
2587         WARN_ON(!cfqd->rq_in_driver[sync]);
2588         WARN_ON(!cfqq->dispatched);
2589         cfqd->rq_in_driver[sync]--;
2590         cfqq->dispatched--;
2591
2592         if (cfq_cfqq_sync(cfqq))
2593                 cfqd->sync_flight--;
2594
2595         if (sync) {
2596                 RQ_CIC(rq)->last_end_request = now;
2597                 cfqd->last_end_sync_rq = now;
2598         }
2599
2600         /*
2601          * If this is the active queue, check if it needs to be expired,
2602          * or if we want to idle in case it has no pending requests.
2603          */
2604         if (cfqd->active_queue == cfqq) {
2605                 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
2606
2607                 if (cfq_cfqq_slice_new(cfqq)) {
2608                         cfq_set_prio_slice(cfqd, cfqq);
2609                         cfq_clear_cfqq_slice_new(cfqq);
2610                 }
2611                 /*
2612                  * If there are no requests waiting in this queue, and
2613                  * there are other queues ready to issue requests, AND
2614                  * those other queues are issuing requests within our
2615                  * mean seek distance, give them a chance to run instead
2616                  * of idling.
2617                  */
2618                 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
2619                         cfq_slice_expired(cfqd, 1);
2620                 else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq) &&
2621                          sync && !rq_noidle(rq))
2622                         cfq_arm_slice_timer(cfqd);
2623         }
2624
2625         if (!rq_in_driver(cfqd))
2626                 cfq_schedule_dispatch(cfqd);
2627 }
2628
2629 /*
2630  * we temporarily boost lower priority queues if they are holding fs exclusive
2631  * resources. they are boosted to normal prio (CLASS_BE/4)
2632  */
2633 static void cfq_prio_boost(struct cfq_queue *cfqq)
2634 {
2635         if (has_fs_excl()) {
2636                 /*
2637                  * boost idle prio on transactions that would lock out other
2638                  * users of the filesystem
2639                  */
2640                 if (cfq_class_idle(cfqq))
2641                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
2642                 if (cfqq->ioprio > IOPRIO_NORM)
2643                         cfqq->ioprio = IOPRIO_NORM;
2644         } else {
2645                 /*
2646                  * unboost the queue (if needed)
2647                  */
2648                 cfqq->ioprio_class = cfqq->org_ioprio_class;
2649                 cfqq->ioprio = cfqq->org_ioprio;
2650         }
2651 }
2652
2653 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
2654 {
2655         if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
2656                 cfq_mark_cfqq_must_alloc_slice(cfqq);
2657                 return ELV_MQUEUE_MUST;
2658         }
2659
2660         return ELV_MQUEUE_MAY;
2661 }
2662
2663 static int cfq_may_queue(struct request_queue *q, int rw)
2664 {
2665         struct cfq_data *cfqd = q->elevator->elevator_data;
2666         struct task_struct *tsk = current;
2667         struct cfq_io_context *cic;
2668         struct cfq_queue *cfqq;
2669
2670         /*
2671          * don't force setup of a queue from here, as a call to may_queue
2672          * does not necessarily imply that a request actually will be queued.
2673          * so just lookup a possibly existing queue, or return 'may queue'
2674          * if that fails
2675          */
2676         cic = cfq_cic_lookup(cfqd, tsk->io_context);
2677         if (!cic)
2678                 return ELV_MQUEUE_MAY;
2679
2680         cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
2681         if (cfqq) {
2682                 cfq_init_prio_data(cfqq, cic->ioc);
2683                 cfq_prio_boost(cfqq);
2684
2685                 return __cfq_may_queue(cfqq);
2686         }
2687
2688         return ELV_MQUEUE_MAY;
2689 }
2690
2691 /*
2692  * queue lock held here
2693  */
2694 static void cfq_put_request(struct request *rq)
2695 {
2696         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2697
2698         if (cfqq) {
2699                 const int rw = rq_data_dir(rq);
2700
2701                 BUG_ON(!cfqq->allocated[rw]);
2702                 cfqq->allocated[rw]--;
2703
2704                 put_io_context(RQ_CIC(rq)->ioc);
2705
2706                 rq->elevator_private = NULL;
2707                 rq->elevator_private2 = NULL;
2708
2709                 cfq_put_queue(cfqq);
2710         }
2711 }
2712
2713 static struct cfq_queue *
2714 cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
2715                 struct cfq_queue *cfqq)
2716 {
2717         cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
2718         cic_set_cfqq(cic, cfqq->new_cfqq, 1);
2719         cfq_mark_cfqq_coop(cfqq->new_cfqq);
2720         cfq_put_queue(cfqq);
2721         return cic_to_cfqq(cic, 1);
2722 }
2723
2724 static int should_split_cfqq(struct cfq_queue *cfqq)
2725 {
2726         if (cfqq->seeky_start &&
2727             time_after(jiffies, cfqq->seeky_start + CFQQ_COOP_TOUT))
2728                 return 1;
2729         return 0;
2730 }
2731
2732 /*
2733  * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
2734  * was the last process referring to said cfqq.
2735  */
2736 static struct cfq_queue *
2737 split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
2738 {
2739         if (cfqq_process_refs(cfqq) == 1) {
2740                 cfqq->seeky_start = 0;
2741                 cfqq->pid = current->pid;
2742                 cfq_clear_cfqq_coop(cfqq);
2743                 return cfqq;
2744         }
2745
2746         cic_set_cfqq(cic, NULL, 1);
2747         cfq_put_queue(cfqq);
2748         return NULL;
2749 }
2750 /*
2751  * Allocate cfq data structures associated with this request.
2752  */
2753 static int
2754 cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
2755 {
2756         struct cfq_data *cfqd = q->elevator->elevator_data;
2757         struct cfq_io_context *cic;
2758         const int rw = rq_data_dir(rq);
2759         const bool is_sync = rq_is_sync(rq);
2760         struct cfq_queue *cfqq;
2761         unsigned long flags;
2762
2763         might_sleep_if(gfp_mask & __GFP_WAIT);
2764
2765         cic = cfq_get_io_context(cfqd, gfp_mask);
2766
2767         spin_lock_irqsave(q->queue_lock, flags);
2768
2769         if (!cic)
2770                 goto queue_fail;
2771
2772 new_queue:
2773         cfqq = cic_to_cfqq(cic, is_sync);
2774         if (!cfqq || cfqq == &cfqd->oom_cfqq) {
2775                 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
2776                 cic_set_cfqq(cic, cfqq, is_sync);
2777         } else {
2778                 /*
2779                  * If the queue was seeky for too long, break it apart.
2780                  */
2781                 if (cfq_cfqq_coop(cfqq) && should_split_cfqq(cfqq)) {
2782                         cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
2783                         cfqq = split_cfqq(cic, cfqq);
2784                         if (!cfqq)
2785                                 goto new_queue;
2786                 }
2787
2788                 /*
2789                  * Check to see if this queue is scheduled to merge with
2790                  * another, closely cooperating queue.  The merging of
2791                  * queues happens here as it must be done in process context.
2792                  * The reference on new_cfqq was taken in merge_cfqqs.
2793                  */
2794                 if (cfqq->new_cfqq)
2795                         cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
2796         }
2797
2798         cfqq->allocated[rw]++;
2799         atomic_inc(&cfqq->ref);
2800
2801         spin_unlock_irqrestore(q->queue_lock, flags);
2802
2803         rq->elevator_private = cic;
2804         rq->elevator_private2 = cfqq;
2805         return 0;
2806
2807 queue_fail:
2808         if (cic)
2809                 put_io_context(cic->ioc);
2810
2811         cfq_schedule_dispatch(cfqd);
2812         spin_unlock_irqrestore(q->queue_lock, flags);
2813         cfq_log(cfqd, "set_request fail");
2814         return 1;
2815 }
2816
2817 static void cfq_kick_queue(struct work_struct *work)
2818 {
2819         struct cfq_data *cfqd =
2820                 container_of(work, struct cfq_data, unplug_work);
2821         struct request_queue *q = cfqd->queue;
2822
2823         spin_lock_irq(q->queue_lock);
2824         __blk_run_queue(cfqd->queue);
2825         spin_unlock_irq(q->queue_lock);
2826 }
2827
2828 /*
2829  * Timer running if the active_queue is currently idling inside its time slice
2830  */
2831 static void cfq_idle_slice_timer(unsigned long data)
2832 {
2833         struct cfq_data *cfqd = (struct cfq_data *) data;
2834         struct cfq_queue *cfqq;
2835         unsigned long flags;
2836         int timed_out = 1;
2837
2838         cfq_log(cfqd, "idle timer fired");
2839
2840         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2841
2842         cfqq = cfqd->active_queue;
2843         if (cfqq) {
2844                 timed_out = 0;
2845
2846                 /*
2847                  * We saw a request before the queue expired, let it through
2848                  */
2849                 if (cfq_cfqq_must_dispatch(cfqq))
2850                         goto out_kick;
2851
2852                 /*
2853                  * expired
2854                  */
2855                 if (cfq_slice_used(cfqq))
2856                         goto expire;
2857
2858                 /*
2859                  * only expire and reinvoke request handler, if there are
2860                  * other queues with pending requests
2861                  */
2862                 if (!cfqd->busy_queues)
2863                         goto out_cont;
2864
2865                 /*
2866                  * not expired and it has a request pending, let it dispatch
2867                  */
2868                 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
2869                         goto out_kick;
2870         }
2871 expire:
2872         cfq_slice_expired(cfqd, timed_out);
2873 out_kick:
2874         cfq_schedule_dispatch(cfqd);
2875 out_cont:
2876         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2877 }
2878
2879 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
2880 {
2881         del_timer_sync(&cfqd->idle_slice_timer);
2882         cancel_work_sync(&cfqd->unplug_work);
2883 }
2884
2885 static void cfq_put_async_queues(struct cfq_data *cfqd)
2886 {
2887         int i;
2888
2889         for (i = 0; i < IOPRIO_BE_NR; i++) {
2890                 if (cfqd->async_cfqq[0][i])
2891                         cfq_put_queue(cfqd->async_cfqq[0][i]);
2892                 if (cfqd->async_cfqq[1][i])
2893                         cfq_put_queue(cfqd->async_cfqq[1][i]);
2894         }
2895
2896         if (cfqd->async_idle_cfqq)
2897                 cfq_put_queue(cfqd->async_idle_cfqq);
2898 }
2899
2900 static void cfq_exit_queue(struct elevator_queue *e)
2901 {
2902         struct cfq_data *cfqd = e->elevator_data;
2903         struct request_queue *q = cfqd->queue;
2904
2905         cfq_shutdown_timer_wq(cfqd);
2906
2907         spin_lock_irq(q->queue_lock);
2908
2909         if (cfqd->active_queue)
2910                 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
2911
2912         while (!list_empty(&cfqd->cic_list)) {
2913                 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
2914                                                         struct cfq_io_context,
2915                                                         queue_list);
2916
2917                 __cfq_exit_single_io_context(cfqd, cic);
2918         }
2919
2920         cfq_put_async_queues(cfqd);
2921
2922         spin_unlock_irq(q->queue_lock);
2923
2924         cfq_shutdown_timer_wq(cfqd);
2925
2926         kfree(cfqd);
2927 }
2928
2929 static void *cfq_init_queue(struct request_queue *q)
2930 {
2931         struct cfq_data *cfqd;
2932         int i, j;
2933
2934         cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
2935         if (!cfqd)
2936                 return NULL;
2937
2938         for (i = 0; i < 2; ++i)
2939                 for (j = 0; j < 3; ++j)
2940                         cfqd->service_trees[i][j] = CFQ_RB_ROOT;
2941         cfqd->service_tree_idle = CFQ_RB_ROOT;
2942
2943         /*
2944          * Not strictly needed (since RB_ROOT just clears the node and we
2945          * zeroed cfqd on alloc), but better be safe in case someone decides
2946          * to add magic to the rb code
2947          */
2948         for (i = 0; i < CFQ_PRIO_LISTS; i++)
2949                 cfqd->prio_trees[i] = RB_ROOT;
2950
2951         /*
2952          * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
2953          * Grab a permanent reference to it, so that the normal code flow
2954          * will not attempt to free it.
2955          */
2956         cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
2957         atomic_inc(&cfqd->oom_cfqq.ref);
2958
2959         INIT_LIST_HEAD(&cfqd->cic_list);
2960
2961         cfqd->queue = q;
2962
2963         init_timer(&cfqd->idle_slice_timer);
2964         cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
2965         cfqd->idle_slice_timer.data = (unsigned long) cfqd;
2966
2967         INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
2968
2969         cfqd->cfq_quantum = cfq_quantum;
2970         cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
2971         cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
2972         cfqd->cfq_back_max = cfq_back_max;
2973         cfqd->cfq_back_penalty = cfq_back_penalty;
2974         cfqd->cfq_slice[0] = cfq_slice_async;
2975         cfqd->cfq_slice[1] = cfq_slice_sync;
2976         cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
2977         cfqd->cfq_slice_idle = cfq_slice_idle;
2978         cfqd->cfq_latency = 1;
2979         cfqd->hw_tag = 1;
2980         cfqd->last_end_sync_rq = jiffies;
2981         return cfqd;
2982 }
2983
2984 static void cfq_slab_kill(void)
2985 {
2986         /*
2987          * Caller already ensured that pending RCU callbacks are completed,
2988          * so we should have no busy allocations at this point.
2989          */
2990         if (cfq_pool)
2991                 kmem_cache_destroy(cfq_pool);
2992         if (cfq_ioc_pool)
2993                 kmem_cache_destroy(cfq_ioc_pool);
2994 }
2995
2996 static int __init cfq_slab_setup(void)
2997 {
2998         cfq_pool = KMEM_CACHE(cfq_queue, 0);
2999         if (!cfq_pool)
3000                 goto fail;
3001
3002         cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
3003         if (!cfq_ioc_pool)
3004                 goto fail;
3005
3006         return 0;
3007 fail:
3008         cfq_slab_kill();
3009         return -ENOMEM;
3010 }
3011
3012 /*
3013  * sysfs parts below -->
3014  */
/*
 * Format an unsigned tunable as a decimal number followed by a newline.
 *
 * @var:  value to print
 * @page: output buffer supplied by the caller
 *
 * Returns the number of characters written, as reported by sprintf().
 *
 * The value is unsigned, so it is printed with %u: with %d anything above
 * INT_MAX would be shown as a negative number.
 */
static ssize_t
cfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%u\n", var);
}
3020
3021 static ssize_t
3022 cfq_var_store(unsigned int *var, const char *page, size_t count)
3023 {
3024         char *p = (char *) page;
3025
3026         *var = simple_strtoul(p, &p, 10);
3027         return count;
3028 }
3029
3030 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
3031 static ssize_t __FUNC(struct elevator_queue *e, char *page)             \
3032 {                                                                       \
3033         struct cfq_data *cfqd = e->elevator_data;                       \
3034         unsigned int __data = __VAR;                                    \
3035         if (__CONV)                                                     \
3036                 __data = jiffies_to_msecs(__data);                      \
3037         return cfq_var_show(__data, (page));                            \
3038 }
3039 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
3040 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
3041 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
3042 SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
3043 SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
3044 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
3045 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
3046 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
3047 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
3048 SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
3049 #undef SHOW_FUNCTION
3050
3051 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
3052 static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
3053 {                                                                       \
3054         struct cfq_data *cfqd = e->elevator_data;                       \
3055         unsigned int __data;                                            \
3056         int ret = cfq_var_store(&__data, (page), count);                \
3057         if (__data < (MIN))                                             \
3058                 __data = (MIN);                                         \
3059         else if (__data > (MAX))                                        \
3060                 __data = (MAX);                                         \
3061         if (__CONV)                                                     \
3062                 *(__PTR) = msecs_to_jiffies(__data);                    \
3063         else                                                            \
3064                 *(__PTR) = __data;                                      \
3065         return ret;                                                     \
3066 }
3067 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
3068 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
3069                 UINT_MAX, 1);
3070 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
3071                 UINT_MAX, 1);
3072 STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
3073 STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
3074                 UINT_MAX, 0);
3075 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
3076 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
3077 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
3078 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
3079                 UINT_MAX, 0);
3080 STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
3081 #undef STORE_FUNCTION
3082
3083 #define CFQ_ATTR(name) \
3084         __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
3085
3086 static struct elv_fs_entry cfq_attrs[] = {
3087         CFQ_ATTR(quantum),
3088         CFQ_ATTR(fifo_expire_sync),
3089         CFQ_ATTR(fifo_expire_async),
3090         CFQ_ATTR(back_seek_max),
3091         CFQ_ATTR(back_seek_penalty),
3092         CFQ_ATTR(slice_sync),
3093         CFQ_ATTR(slice_async),
3094         CFQ_ATTR(slice_async_rq),
3095         CFQ_ATTR(slice_idle),
3096         CFQ_ATTR(low_latency),
3097         __ATTR_NULL
3098 };
3099
3100 static struct elevator_type iosched_cfq = {
3101         .ops = {
3102                 .elevator_merge_fn =            cfq_merge,
3103                 .elevator_merged_fn =           cfq_merged_request,
3104                 .elevator_merge_req_fn =        cfq_merged_requests,
3105                 .elevator_allow_merge_fn =      cfq_allow_merge,
3106                 .elevator_dispatch_fn =         cfq_dispatch_requests,
3107                 .elevator_add_req_fn =          cfq_insert_request,
3108                 .elevator_activate_req_fn =     cfq_activate_request,
3109                 .elevator_deactivate_req_fn =   cfq_deactivate_request,
3110                 .elevator_queue_empty_fn =      cfq_queue_empty,
3111                 .elevator_completed_req_fn =    cfq_completed_request,
3112                 .elevator_former_req_fn =       elv_rb_former_request,
3113                 .elevator_latter_req_fn =       elv_rb_latter_request,
3114                 .elevator_set_req_fn =          cfq_set_request,
3115                 .elevator_put_req_fn =          cfq_put_request,
3116                 .elevator_may_queue_fn =        cfq_may_queue,
3117                 .elevator_init_fn =             cfq_init_queue,
3118                 .elevator_exit_fn =             cfq_exit_queue,
3119                 .trim =                         cfq_free_io_context,
3120         },
3121         .elevator_attrs =       cfq_attrs,
3122         .elevator_name =        "cfq",
3123         .elevator_owner =       THIS_MODULE,
3124 };
3125
3126 static int __init cfq_init(void)
3127 {
3128         /*
3129          * could be 0 on HZ < 1000 setups
3130          */
3131         if (!cfq_slice_async)
3132                 cfq_slice_async = 1;
3133         if (!cfq_slice_idle)
3134                 cfq_slice_idle = 1;
3135
3136         if (cfq_slab_setup())
3137                 return -ENOMEM;
3138
3139         elv_register(&iosched_cfq);
3140
3141         return 0;
3142 }
3143
3144 static void __exit cfq_exit(void)
3145 {
3146         DECLARE_COMPLETION_ONSTACK(all_gone);
3147         elv_unregister(&iosched_cfq);
3148         ioc_gone = &all_gone;
3149         /* ioc_gone's update must be visible before reading ioc_count */
3150         smp_wmb();
3151
3152         /*
3153          * this also protects us from entering cfq_slab_kill() with
3154          * pending RCU callbacks
3155          */
3156         if (elv_ioc_count_read(cfq_ioc_count))
3157                 wait_for_completion(&all_gone);
3158         cfq_slab_kill();
3159 }
3160
3161 module_init(cfq_init);
3162 module_exit(cfq_exit);
3163
3164 MODULE_AUTHOR("Jens Axboe");
3165 MODULE_LICENSE("GPL");
3166 MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");