mm/readahead.c

   1 /*
   2  * mm/readahead.c - address_space-level file readahead.
   3  *
   4  * Copyright (C) 2002, Linus Torvalds
   5  *
   6  * 09Apr2002    akpm@zip.com.au
   7  *              Initial version.
   8  */
   9
  10 #include <linux/kernel.h>
  11 #include <linux/fs.h>
  12 #include <linux/mm.h>
  13 #include <linux/module.h>
  14 #include <linux/blkdev.h>
  15 #include <linux/backing-dev.h>
  16 #include <linux/task_io_accounting_ops.h>
  17 #include <linux/pagevec.h>
  18
  19 void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
  20 {
  21 }
  22 EXPORT_SYMBOL(default_unplug_io_fn);
  23
  24 /*
  25  * Convienent macros for min/max read-ahead pages.
  26  * Note that MAX_RA_PAGES is rounded down, while MIN_RA_PAGES is rounded up.
  27  * The latter is necessary for systems with large page size(i.e. 64k).
  28  */
  29 #define MAX_RA_PAGES    (VM_MAX_READAHEAD*1024 / PAGE_CACHE_SIZE)
  30 #define MIN_RA_PAGES    DIV_ROUND_UP(VM_MIN_READAHEAD*1024, PAGE_CACHE_SIZE)
  31
  32 struct backing_dev_info default_backing_dev_info = {
  33         .ra_pages       = MAX_RA_PAGES,
  34         .state          = 0,
  35         .capabilities   = BDI_CAP_MAP_COPY,
  36         .unplug_io_fn   = default_unplug_io_fn,
  37 };
  38 EXPORT_SYMBOL_GPL(default_backing_dev_info);
  39
  40 /*
  41  * Initialise a struct file's readahead state.  Assumes that the caller has
  42  * memset *ra to zero.
  43  */
  44 void
  45 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
  46 {
  47         ra->ra_pages = mapping->backing_dev_info->ra_pages;
  48         ra->prev_index = -1;
  49 }
  50 EXPORT_SYMBOL_GPL(file_ra_state_init);
  51
  52 #define list_to_page(head) (list_entry((head)->prev, struct page, lru))
  53
  54 /**
  55  * read_cache_pages - populate an address space with some pages & start reads against them
  56  * @mapping: the address_space
  57  * @pages: The address of a list_head which contains the target pages.  These
  58  *   pages have their ->index populated and are otherwise uninitialised.
  59  * @filler: callback routine for filling a single page.
  60  * @data: private data for the callback routine.
  61  *
  62  * Hides the details of the LRU cache etc from the filesystems.
  63  */
  64 int read_cache_pages(struct address_space *mapping, struct list_head *pages,
  65                         int (*filler)(void *, struct page *), void *data)
  66 {
  67         struct page *page;
  68         struct pagevec lru_pvec;
  69         int ret = 0;
  70
  71         pagevec_init(&lru_pvec, 0);
  72
  73         while (!list_empty(pages)) {
  74                 page = list_to_page(pages);
  75                 list_del(&page->lru);
  76                 if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
  77                         page_cache_release(page);
  78                         continue;
  79                 }
  80                 ret = filler(data, page);
  81                 if (!pagevec_add(&lru_pvec, page))
  82                         __pagevec_lru_add(&lru_pvec);
  83                 if (ret) {
  84                         put_pages_list(pages);
  85                         break;
  86                 }
  87                 task_io_account_read(PAGE_CACHE_SIZE);
  88         }
  89         pagevec_lru_add(&lru_pvec);
  90         return ret;
  91 }
  92
  93 EXPORT_SYMBOL(read_cache_pages);
  94
  95 static int read_pages(struct address_space *mapping, struct file *filp,
  96                 struct list_head *pages, unsigned nr_pages)
  97 {
  98         unsigned page_idx;
  99         struct pagevec lru_pvec;
 100         int ret;
 101
 102         if (mapping->a_ops->readpages) {
 103                 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
 104                 /* Clean up the remaining pages */
 105                 put_pages_list(pages);
 106                 goto out;
 107         }
 108
 109         pagevec_init(&lru_pvec, 0);
 110         for (page_idx = 0; page_idx < nr_pages; page_idx++) {
 111                 struct page *page = list_to_page(pages);
 112                 list_del(&page->lru);
 113                 if (!add_to_page_cache(page, mapping,
 114                                         page->index, GFP_KERNEL)) {
 115                         mapping->a_ops->readpage(filp, page);
 116                         if (!pagevec_add(&lru_pvec, page))
 117                                 __pagevec_lru_add(&lru_pvec);
 118                 } else
 119                         page_cache_release(page);
 120         }
 121         pagevec_lru_add(&lru_pvec);
 122         ret = 0;
 123 out:
 124         return ret;
 125 }
 126
 127 /*
 128  * do_page_cache_readahead actually reads a chunk of disk.  It allocates all
 129  * the pages first, then submits them all for I/O. This avoids the very bad
 130  * behaviour which would occur if page allocations are causing VM writeback.
 131  * We really don't want to intermingle reads and writes like that.
 132  *
 133  * Returns the number of pages requested, or the maximum amount of I/O allowed.
 134  *
 135  * do_page_cache_readahead() returns -1 if it encountered request queue
 136  * congestion.
 137  */
 138 static int
 139 __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 140                         pgoff_t offset, unsigned long nr_to_read,
 141                         unsigned long lookahead_size)
 142 {
 143         struct inode *inode = mapping->host;
 144         struct page *page;
 145         unsigned long end_index;        /* The last page we want to read */
 146         LIST_HEAD(page_pool);
 147         int page_idx;
 148         int ret = 0;
 149         loff_t isize = i_size_read(inode);
 150
 151         if (isize == 0)
 152                 goto out;
 153
 154         end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
 155
 156         /*
 157          * Preallocate as many pages as we will need.
 158          */
 159         read_lock_irq(&mapping->tree_lock);
 160         for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
 161                 pgoff_t page_offset = offset + page_idx;
 162
 163                 if (page_offset > end_index)
 164                         break;
 165
 166                 page = radix_tree_lookup(&mapping->page_tree, page_offset);
 167                 if (page)
 168                         continue;
 169
 170                 read_unlock_irq(&mapping->tree_lock);
 171                 page = page_cache_alloc_cold(mapping);
 172                 read_lock_irq(&mapping->tree_lock);
 173                 if (!page)
 174                         break;
 175                 page->index = page_offset;
 176                 list_add(&page->lru, &page_pool);
 177                 if (page_idx == nr_to_read - lookahead_size)
 178                         SetPageReadahead(page);
 179                 ret++;
 180         }
 181         read_unlock_irq(&mapping->tree_lock);
 182
 183         /*
 184          * Now start the IO.  We ignore I/O errors - if the page is not
 185          * uptodate then the caller will launch readpage again, and
 186          * will then handle the error.
 187          */
 188         if (ret)
 189                 read_pages(mapping, filp, &page_pool, ret);
 190         BUG_ON(!list_empty(&page_pool));
 191 out:
 192         return ret;
 193 }
 194
 195 /*
 196  * Chunk the readahead into 2 megabyte units, so that we don't pin too much
 197  * memory at once.
 198  */
 199 int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 200                 pgoff_t offset, unsigned long nr_to_read)
 201 {
 202         int ret = 0;
 203
 204         if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
 205                 return -EINVAL;
 206
 207         while (nr_to_read) {
 208                 int err;
 209
 210                 unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;
 211
 212                 if (this_chunk > nr_to_read)
 213                         this_chunk = nr_to_read;
 214                 err = __do_page_cache_readahead(mapping, filp,
 215                                                 offset, this_chunk, 0);
 216                 if (err < 0) {
 217                         ret = err;
 218                         break;
 219                 }
 220                 ret += err;
 221                 offset += this_chunk;
 222                 nr_to_read -= this_chunk;
 223         }
 224         return ret;
 225 }
 226
 227 /*
 228  * This version skips the IO if the queue is read-congested, and will tell the
 229  * block layer to abandon the readahead if request allocation would block.
 230  *
 231  * force_page_cache_readahead() will ignore queue congestion and will block on
 232  * request queues.
 233  */
 234 int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 235                         pgoff_t offset, unsigned long nr_to_read)
 236 {
 237         if (bdi_read_congested(mapping->backing_dev_info))
 238                 return -1;
 239
 240         return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
 241 }
 242
 243 /*
 244  * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
 245  * sensible upper limit.
 246  */
 247 unsigned long max_sane_readahead(unsigned long nr)
 248 {
 249         return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE)
 250                 + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
 251 }
 252
 253 /*
 254  * Submit IO for the read-ahead request in file_ra_state.
 255  */
 256 unsigned long ra_submit(struct file_ra_state *ra,
 257                        struct address_space *mapping, struct file *filp)
 258 {
 259         unsigned long ra_size;
 260         unsigned long la_size;
 261         int actual;
 262
 263         ra_size = ra_readahead_size(ra);
 264         la_size = ra_lookahead_size(ra);
 265         actual = __do_page_cache_readahead(mapping, filp,
 266                                         ra->ra_index, ra_size, la_size);
 267
 268         return actual;
 269 }
 270 EXPORT_SYMBOL_GPL(ra_submit);
 271
 272 /*
 273  * Set the initial window size, round to next power of 2 and square
 274  * for small size, x 4 for medium, and x 2 for large
 275  * for 128k (32 page) max ra
 276  * 1-8 page = 32k initial, > 8 page = 128k initial
 277  */
 278 static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
 279 {
 280         unsigned long newsize = roundup_pow_of_two(size);
 281
 282         if (newsize <= max / 32)
 283                 newsize = newsize * 4;
 284         else if (newsize <= max / 4)
 285                 newsize = newsize * 2;
 286         else
 287                 newsize = max;
 288
 289         return newsize;
 290 }
 291
 292 /*
 293  *  Get the previous window size, ramp it up, and
 294  *  return it as the new window size.
 295  */
 296 static unsigned long get_next_ra_size(struct file_ra_state *ra,
 297                                                 unsigned long max)
 298 {
 299         unsigned long cur = ra->readahead_index - ra->ra_index;
 300         unsigned long newsize;
 301
 302         if (cur < max / 16)
 303                 newsize = 4 * cur;
 304         else
 305                 newsize = 2 * cur;
 306
 307         return min(newsize, max);
 308 }
 309
 310 /*
 311  * On-demand readahead design.
 312  *
 313  * The fields in struct file_ra_state represent the most-recently-executed
 314  * readahead attempt:
 315  *
 316  *                    |-------- last readahead window -------->|
 317  *       |-- application walking here -->|
 318  * ======#============|==================#=====================|
 319  *       ^la_index    ^ra_index          ^lookahead_index      ^readahead_index
 320  *
 321  * [ra_index, readahead_index) represents the last readahead window.
 322  *
 323  * [la_index, lookahead_index] is where the application would be walking(in
 324  * the common case of cache-cold sequential reads): the last window was
 325  * established when the application was at la_index, and the next window will
 326  * be bring in when the application reaches lookahead_index.
 327  *
 328  * To overlap application thinking time and disk I/O time, we do
 329  * `readahead pipelining': Do not wait until the application consumed all
 330  * readahead pages and stalled on the missing page at readahead_index;
 331  * Instead, submit an asynchronous readahead I/O as early as the application
 332  * reads on the page at lookahead_index. Normally lookahead_index will be
 333  * equal to ra_index, for maximum pipelining.
 334  *
 335  * In interleaved sequential reads, concurrent streams on the same fd can
 336  * be invalidating each other's readahead state. So we flag the new readahead
 337  * page at lookahead_index with PG_readahead, and use it as readahead
 338  * indicator. The flag won't be set on already cached pages, to avoid the
 339  * readahead-for-nothing fuss, saving pointless page cache lookups.
 340  *
 341  * prev_index tracks the last visited page in the _previous_ read request.
 342  * It should be maintained by the caller, and will be used for detecting
 343  * small random reads. Note that the readahead algorithm checks loosely
 344  * for sequential patterns. Hence interleaved reads might be served as
 345  * sequential ones.
 346  *
 347  * There is a special-case: if the first page which the application tries to
 348  * read happens to be the first page of the file, it is assumed that a linear
 349  * read is about to happen and the window is immediately set to the initial size
 350  * based on I/O request size and the max_readahead.
 351  *
 352  * The code ramps up the readahead size aggressively at first, but slow down as
 353  * it approaches max_readhead.
 354  */
 355
 356 /*
 357  * A minimal readahead algorithm for trivial sequential/random reads.
 358  */
 359 static unsigned long
 360 ondemand_readahead(struct address_space *mapping,
 361                    struct file_ra_state *ra, struct file *filp,
 362                    struct page *page, pgoff_t offset,
 363                    unsigned long req_size)
 364 {
 365         unsigned long max;      /* max readahead pages */
 366         pgoff_t ra_index;       /* readahead index */
 367         unsigned long ra_size;  /* readahead size */
 368         unsigned long la_size;  /* lookahead size */
 369         int sequential;
 370
 371         max = ra->ra_pages;
 372         sequential = (offset - ra->prev_index <= 1UL) || (req_size > max);
 373
 374         /*
 375          * Lookahead/readahead hit, assume sequential access.
 376          * Ramp up sizes, and push forward the readahead window.
 377          */
 378         if (offset && (offset == ra->lookahead_index ||
 379                         offset == ra->readahead_index)) {
 380                 ra_index = ra->readahead_index;
 381                 ra_size = get_next_ra_size(ra, max);
 382                 la_size = ra_size;
 383                 goto fill_ra;
 384         }
 385
 386         /*
 387          * Standalone, small read.
 388          * Read as is, and do not pollute the readahead state.
 389          */
 390         if (!page && !sequential) {
 391                 return __do_page_cache_readahead(mapping, filp,
 392                                                 offset, req_size, 0);
 393         }
 394
 395         /*
 396          * It may be one of
 397          *      - first read on start of file
 398          *      - sequential cache miss
 399          *      - oversize random read
 400          * Start readahead for it.
 401          */
 402         ra_index = offset;
 403         ra_size = get_init_ra_size(req_size, max);
 404         la_size = ra_size > req_size ? ra_size - req_size : ra_size;
 405
 406         /*
 407          * Hit on a lookahead page without valid readahead state.
 408          * E.g. interleaved reads.
 409          * Not knowing its readahead pos/size, bet on the minimal possible one.
 410          */
 411         if (page) {
 412                 ra_index++;
 413                 ra_size = min(4 * ra_size, max);
 414         }
 415
 416 fill_ra:
 417         ra_set_index(ra, offset, ra_index);
 418         ra_set_size(ra, ra_size, la_size);
 419
 420         return ra_submit(ra, mapping, filp);
 421 }
 422
 423 /**
 424  * page_cache_readahead_ondemand - generic file readahead
 425  * @mapping: address_space which holds the pagecache and I/O vectors
 426  * @ra: file_ra_state which holds the readahead state
 427  * @filp: passed on to ->readpage() and ->readpages()
 428  * @page: the page at @offset, or NULL if non-present
 429  * @offset: start offset into @mapping, in PAGE_CACHE_SIZE units
 430  * @req_size: hint: total size of the read which the caller is performing in
 431  *            PAGE_CACHE_SIZE units
 432  *
 433  * page_cache_readahead_ondemand() is the entry point of readahead logic.
 434  * This function should be called when it is time to perform readahead:
 435  * 1) @page == NULL
 436  *    A cache miss happened, time for synchronous readahead.
 437  * 2) @page != NULL && PageReadahead(@page)
 438  *    A look-ahead hit occured, time for asynchronous readahead.
 439  */
 440 unsigned long
 441 page_cache_readahead_ondemand(struct address_space *mapping,
 442                                 struct file_ra_state *ra, struct file *filp,
 443                                 struct page *page, pgoff_t offset,
 444                                 unsigned long req_size)
 445 {
 446         /* no read-ahead */
 447         if (!ra->ra_pages)
 448                 return 0;
 449
 450         if (page) {
 451                 /*
 452                  * It can be PG_reclaim.
 453                  */
 454                 if (PageWriteback(page))
 455                         return 0;
 456
 457                 ClearPageReadahead(page);
 458
 459                 /*
 460                  * Defer asynchronous read-ahead on IO congestion.
 461                  */
 462                 if (bdi_read_congested(mapping->backing_dev_info))
 463                         return 0;
 464         }
 465
 466         /* do read-ahead */
 467         return ondemand_readahead(mapping, ra, filp, page,
 468                                         offset, req_size);
 469 }
 470 EXPORT_SYMBOL_GPL(page_cache_readahead_ondemand);