mm/migrate.c

   1 /*
   2  * Memory Migration functionality - linux/mm/migrate.c
   3  *
   4  * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
   5  *
   6  * Page migration was first developed in the context of the memory hotplug
   7  * project. The main authors of the migration code are:
   8  *
   9  * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
  10  * Hirokazu Takahashi <taka@valinux.co.jp>
  11  * Dave Hansen <haveblue@us.ibm.com>
  12  * Christoph Lameter <clameter@sgi.com>
  13  */
  14
  15 #include <linux/migrate.h>
  16 #include <linux/module.h>
  17 #include <linux/swap.h>
  18 #include <linux/swapops.h>
  19 #include <linux/pagemap.h>
  20 #include <linux/buffer_head.h>
  21 #include <linux/mm_inline.h>
  22 #include <linux/pagevec.h>
  23 #include <linux/rmap.h>
  24 #include <linux/topology.h>
  25 #include <linux/cpu.h>
  26 #include <linux/cpuset.h>
  27 #include <linux/writeback.h>
  28
  29 #include "internal.h"
  30
  31 /* The maximum number of pages to take off the LRU for migration */
  32 #define MIGRATE_CHUNK_SIZE 256
  33
  34 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
  35
  36 /*
  37  * Isolate one page from the LRU lists. If successful put it onto
  38  * the indicated list with elevated page count.
  39  *
  40  * Result:
  41  *  -EBUSY: page not on LRU list
  42  *  0: page removed from LRU list and added to the specified list.
  43  */
  44 int isolate_lru_page(struct page *page, struct list_head *pagelist)
  45 {
  46         int ret = -EBUSY;
  47
  48         if (PageLRU(page)) {
  49                 struct zone *zone = page_zone(page);
  50
  51                 spin_lock_irq(&zone->lru_lock);
  52                 if (PageLRU(page)) {
  53                         ret = 0;
  54                         get_page(page);
  55                         ClearPageLRU(page);
  56                         if (PageActive(page))
  57                                 del_page_from_active_list(zone, page);
  58                         else
  59                                 del_page_from_inactive_list(zone, page);
  60                         list_add_tail(&page->lru, pagelist);
  61                 }
  62                 spin_unlock_irq(&zone->lru_lock);
  63         }
  64         return ret;
  65 }
  66
  67 /*
  68  * migrate_prep() needs to be called after we have compiled the list of pages
  69  * to be migrated using isolate_lru_page() but before we begin a series of calls
  70  * to migrate_pages().
  71  */
  72 int migrate_prep(void)
  73 {
  74         /*
  75          * Clear the LRU lists so pages can be isolated.
  76          * Note that pages may be moved off the LRU after we have
  77          * drained them. Those pages will fail to migrate like other
  78          * pages that may be busy.
  79          */
  80         lru_add_drain_all();
  81
  82         return 0;
  83 }
  84
  85 static inline void move_to_lru(struct page *page)
  86 {
  87         if (PageActive(page)) {
  88                 /*
  89                  * lru_cache_add_active checks that
  90                  * the PG_active bit is off.
  91                  */
  92                 ClearPageActive(page);
  93                 lru_cache_add_active(page);
  94         } else {
  95                 lru_cache_add(page);
  96         }
  97         put_page(page);
  98 }
  99
 100 /*
 101  * Add isolated pages on the list back to the LRU.
 102  *
 103  * returns the number of pages put back.
 104  */
 105 int putback_lru_pages(struct list_head *l)
 106 {
 107         struct page *page;
 108         struct page *page2;
 109         int count = 0;
 110
 111         list_for_each_entry_safe(page, page2, l, lru) {
 112                 list_del(&page->lru);
 113                 move_to_lru(page);
 114                 count++;
 115         }
 116         return count;
 117 }
 118
 119 static inline int is_swap_pte(pte_t pte)
 120 {
 121         return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
 122 }
 123
 124 /*
 125  * Restore a potential migration pte to a working pte entry
 126  */
 127 static void remove_migration_pte(struct vm_area_struct *vma,
 128                 struct page *old, struct page *new)
 129 {
 130         struct mm_struct *mm = vma->vm_mm;
 131         swp_entry_t entry;
 132         pgd_t *pgd;
 133         pud_t *pud;
 134         pmd_t *pmd;
 135         pte_t *ptep, pte;
 136         spinlock_t *ptl;
 137         unsigned long addr = page_address_in_vma(new, vma);
 138
 139         if (addr == -EFAULT)
 140                 return;
 141
 142         pgd = pgd_offset(mm, addr);
 143         if (!pgd_present(*pgd))
 144                 return;
 145
 146         pud = pud_offset(pgd, addr);
 147         if (!pud_present(*pud))
 148                 return;
 149
 150         pmd = pmd_offset(pud, addr);
 151         if (!pmd_present(*pmd))
 152                 return;
 153
 154         ptep = pte_offset_map(pmd, addr);
 155
 156         if (!is_swap_pte(*ptep)) {
 157                 pte_unmap(ptep);
 158                 return;
 159         }
 160
 161         ptl = pte_lockptr(mm, pmd);
 162         spin_lock(ptl);
 163         pte = *ptep;
 164         if (!is_swap_pte(pte))
 165                 goto out;
 166
 167         entry = pte_to_swp_entry(pte);
 168
 169         if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
 170                 goto out;
 171
 172         get_page(new);
 173         pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
 174         if (is_write_migration_entry(entry))
 175                 pte = pte_mkwrite(pte);
 176         set_pte_at(mm, addr, ptep, pte);
 177
 178         if (PageAnon(new))
 179                 page_add_anon_rmap(new, vma, addr);
 180         else
 181                 page_add_file_rmap(new);
 182
 183         /* No need to invalidate - it was non-present before */
 184         update_mmu_cache(vma, addr, pte);
 185         lazy_mmu_prot_update(pte);
 186
 187 out:
 188         pte_unmap_unlock(ptep, ptl);
 189 }
 190
 191 /*
 192  * Note that remove_file_migration_ptes will only work on regular mappings,
 193  * Nonlinear mappings do not use migration entries.
 194  */
 195 static void remove_file_migration_ptes(struct page *old, struct page *new)
 196 {
 197         struct vm_area_struct *vma;
 198         struct address_space *mapping = page_mapping(new);
 199         struct prio_tree_iter iter;
 200         pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 201
 202         if (!mapping)
 203                 return;
 204
 205         spin_lock(&mapping->i_mmap_lock);
 206
 207         vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
 208                 remove_migration_pte(vma, old, new);
 209
 210         spin_unlock(&mapping->i_mmap_lock);
 211 }
 212
 213 /*
 214  * Must hold mmap_sem lock on at least one of the vmas containing
 215  * the page so that the anon_vma cannot vanish.
 216  */
 217 static void remove_anon_migration_ptes(struct page *old, struct page *new)
 218 {
 219         struct anon_vma *anon_vma;
 220         struct vm_area_struct *vma;
 221         unsigned long mapping;
 222
 223         mapping = (unsigned long)new->mapping;
 224
 225         if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
 226                 return;
 227
 228         /*
 229          * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
 230          */
 231         anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
 232         spin_lock(&anon_vma->lock);
 233
 234         list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
 235                 remove_migration_pte(vma, old, new);
 236
 237         spin_unlock(&anon_vma->lock);
 238 }
 239
 240 /*
 241  * Get rid of all migration entries and replace them by
 242  * references to the indicated page.
 243  */
 244 static void remove_migration_ptes(struct page *old, struct page *new)
 245 {
 246         if (PageAnon(new))
 247                 remove_anon_migration_ptes(old, new);
 248         else
 249                 remove_file_migration_ptes(old, new);
 250 }
 251
 252 /*
 253  * Something used the pte of a page under migration. We need to
 254  * get to the page and wait until migration is finished.
 255  * When we return from this function the fault will be retried.
 256  *
 257  * This function is called from do_swap_page().
 258  */
 259 void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 260                                 unsigned long address)
 261 {
 262         pte_t *ptep, pte;
 263         spinlock_t *ptl;
 264         swp_entry_t entry;
 265         struct page *page;
 266
 267         ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 268         pte = *ptep;
 269         if (!is_swap_pte(pte))
 270                 goto out;
 271
 272         entry = pte_to_swp_entry(pte);
 273         if (!is_migration_entry(entry))
 274                 goto out;
 275
 276         page = migration_entry_to_page(entry);
 277
 278         get_page(page);
 279         pte_unmap_unlock(ptep, ptl);
 280         wait_on_page_locked(page);
 281         put_page(page);
 282         return;
 283 out:
 284         pte_unmap_unlock(ptep, ptl);
 285 }
 286
 287 /*
 288  * Replace the page in the mapping.
 289  *
 290  * The number of remaining references must be:
 291  * 1 for anonymous pages without a mapping
 292  * 2 for pages with a mapping
 293  * 3 for pages with a mapping and PagePrivate set.
 294  */
 295 static int migrate_page_move_mapping(struct address_space *mapping,
 296                 struct page *newpage, struct page *page)
 297 {
 298         struct page **radix_pointer;
 299
 300         if (!mapping) {
 301                 /* Anonymous page */
 302                 if (page_count(page) != 1)
 303                         return -EAGAIN;
 304                 return 0;
 305         }
 306
 307         write_lock_irq(&mapping->tree_lock);
 308
 309         radix_pointer = (struct page **)radix_tree_lookup_slot(
 310                                                 &mapping->page_tree,
 311                                                 page_index(page));
 312
 313         if (page_count(page) != 2 + !!PagePrivate(page) ||
 314                         *radix_pointer != page) {
 315                 write_unlock_irq(&mapping->tree_lock);
 316                 return -EAGAIN;
 317         }
 318
 319         /*
 320          * Now we know that no one else is looking at the page.
 321          */
 322         get_page(newpage);
 323 #ifdef CONFIG_SWAP
 324         if (PageSwapCache(page)) {
 325                 SetPageSwapCache(newpage);
 326                 set_page_private(newpage, page_private(page));
 327         }
 328 #endif
 329
 330         *radix_pointer = newpage;
 331         __put_page(page);
 332         write_unlock_irq(&mapping->tree_lock);
 333
 334         return 0;
 335 }
 336
 337 /*
 338  * Copy the page to its new location
 339  */
 340 static void migrate_page_copy(struct page *newpage, struct page *page)
 341 {
 342         copy_highpage(newpage, page);
 343
 344         if (PageError(page))
 345                 SetPageError(newpage);
 346         if (PageReferenced(page))
 347                 SetPageReferenced(newpage);
 348         if (PageUptodate(page))
 349                 SetPageUptodate(newpage);
 350         if (PageActive(page))
 351                 SetPageActive(newpage);
 352         if (PageChecked(page))
 353                 SetPageChecked(newpage);
 354         if (PageMappedToDisk(page))
 355                 SetPageMappedToDisk(newpage);
 356
 357         if (PageDirty(page)) {
 358                 clear_page_dirty_for_io(page);
 359                 set_page_dirty(newpage);
 360         }
 361
 362 #ifdef CONFIG_SWAP
 363         ClearPageSwapCache(page);
 364 #endif
 365         ClearPageActive(page);
 366         ClearPagePrivate(page);
 367         set_page_private(page, 0);
 368         page->mapping = NULL;
 369
 370         /*
 371          * If any waiters have accumulated on the new page then
 372          * wake them up.
 373          */
 374         if (PageWriteback(newpage))
 375                 end_page_writeback(newpage);
 376 }
 377
 378 /************************************************************
 379  *                    Migration functions
 380  ***********************************************************/
 381
 382 /* Always fail migration. Used for mappings that are not movable */
 383 int fail_migrate_page(struct address_space *mapping,
 384                         struct page *newpage, struct page *page)
 385 {
 386         return -EIO;
 387 }
 388 EXPORT_SYMBOL(fail_migrate_page);
 389
 390 /*
 391  * Common logic to directly migrate a single page suitable for
 392  * pages that do not use PagePrivate.
 393  *
 394  * Pages are locked upon entry and exit.
 395  */
 396 int migrate_page(struct address_space *mapping,
 397                 struct page *newpage, struct page *page)
 398 {
 399         int rc;
 400
 401         BUG_ON(PageWriteback(page));    /* Writeback must be complete */
 402
 403         rc = migrate_page_move_mapping(mapping, newpage, page);
 404
 405         if (rc)
 406                 return rc;
 407
 408         migrate_page_copy(newpage, page);
 409         return 0;
 410 }
 411 EXPORT_SYMBOL(migrate_page);
 412
 413 /*
 414  * Migration function for pages with buffers. This function can only be used
 415  * if the underlying filesystem guarantees that no other references to "page"
 416  * exist.
 417  */
 418 int buffer_migrate_page(struct address_space *mapping,
 419                 struct page *newpage, struct page *page)
 420 {
 421         struct buffer_head *bh, *head;
 422         int rc;
 423
 424         if (!page_has_buffers(page))
 425                 return migrate_page(mapping, newpage, page);
 426
 427         head = page_buffers(page);
 428
 429         rc = migrate_page_move_mapping(mapping, newpage, page);
 430
 431         if (rc)
 432                 return rc;
 433
 434         bh = head;
 435         do {
 436                 get_bh(bh);
 437                 lock_buffer(bh);
 438                 bh = bh->b_this_page;
 439
 440         } while (bh != head);
 441
 442         ClearPagePrivate(page);
 443         set_page_private(newpage, page_private(page));
 444         set_page_private(page, 0);
 445         put_page(page);
 446         get_page(newpage);
 447
 448         bh = head;
 449         do {
 450                 set_bh_page(bh, newpage, bh_offset(bh));
 451                 bh = bh->b_this_page;
 452
 453         } while (bh != head);
 454
 455         SetPagePrivate(newpage);
 456
 457         migrate_page_copy(newpage, page);
 458
 459         bh = head;
 460         do {
 461                 unlock_buffer(bh);
 462                 put_bh(bh);
 463                 bh = bh->b_this_page;
 464
 465         } while (bh != head);
 466
 467         return 0;
 468 }
 469 EXPORT_SYMBOL(buffer_migrate_page);
 470
 471 /*
 472  * Writeback a page to clean the dirty state
 473  */
 474 static int writeout(struct address_space *mapping, struct page *page)
 475 {
 476         struct writeback_control wbc = {
 477                 .sync_mode = WB_SYNC_NONE,
 478                 .nr_to_write = 1,
 479                 .range_start = 0,
 480                 .range_end = LLONG_MAX,
 481                 .nonblocking = 1,
 482                 .for_reclaim = 1
 483         };
 484         int rc;
 485
 486         if (!mapping->a_ops->writepage)
 487                 /* No write method for the address space */
 488                 return -EINVAL;
 489
 490         if (!clear_page_dirty_for_io(page))
 491                 /* Someone else already triggered a write */
 492                 return -EAGAIN;
 493
 494         /*
 495          * A dirty page may imply that the underlying filesystem has
 496          * the page on some queue. So the page must be clean for
 497          * migration. Writeout may mean we loose the lock and the
 498          * page state is no longer what we checked for earlier.
 499          * At this point we know that the migration attempt cannot
 500          * be successful.
 501          */
 502         remove_migration_ptes(page, page);
 503
 504         rc = mapping->a_ops->writepage(page, &wbc);
 505         if (rc < 0)
 506                 /* I/O Error writing */
 507                 return -EIO;
 508
 509         if (rc != AOP_WRITEPAGE_ACTIVATE)
 510                 /* unlocked. Relock */
 511                 lock_page(page);
 512
 513         return -EAGAIN;
 514 }
 515
 516 /*
 517  * Default handling if a filesystem does not provide a migration function.
 518  */
 519 static int fallback_migrate_page(struct address_space *mapping,
 520         struct page *newpage, struct page *page)
 521 {
 522         if (PageDirty(page))
 523                 return writeout(mapping, page);
 524
 525         /*
 526          * Buffers may be managed in a filesystem specific way.
 527          * We must have no buffers or drop them.
 528          */
 529         if (page_has_buffers(page) &&
 530             !try_to_release_page(page, GFP_KERNEL))
 531                 return -EAGAIN;
 532
 533         return migrate_page(mapping, newpage, page);
 534 }
 535
 536 /*
 537  * Move a page to a newly allocated page
 538  * The page is locked and all ptes have been successfully removed.
 539  *
 540  * The new page will have replaced the old page if this function
 541  * is successful.
 542  */
 543 static int move_to_new_page(struct page *newpage, struct page *page)
 544 {
 545         struct address_space *mapping;
 546         int rc;
 547
 548         /*
 549          * Block others from accessing the page when we get around to
 550          * establishing additional references. We are the only one
 551          * holding a reference to the new page at this point.
 552          */
 553         if (TestSetPageLocked(newpage))
 554                 BUG();
 555
 556         /* Prepare mapping for the new page.*/
 557         newpage->index = page->index;
 558         newpage->mapping = page->mapping;
 559
 560         mapping = page_mapping(page);
 561         if (!mapping)
 562                 rc = migrate_page(mapping, newpage, page);
 563         else if (mapping->a_ops->migratepage)
 564                 /*
 565                  * Most pages have a mapping and most filesystems
 566                  * should provide a migration function. Anonymous
 567                  * pages are part of swap space which also has its
 568                  * own migration function. This is the most common
 569                  * path for page migration.
 570                  */
 571                 rc = mapping->a_ops->migratepage(mapping,
 572                                                 newpage, page);
 573         else
 574                 rc = fallback_migrate_page(mapping, newpage, page);
 575
 576         if (!rc)
 577                 remove_migration_ptes(page, newpage);
 578         else
 579                 newpage->mapping = NULL;
 580
 581         unlock_page(newpage);
 582
 583         return rc;
 584 }
 585
 586 /*
 587  * Obtain the lock on page, remove all ptes and migrate the page
 588  * to the newly allocated page in newpage.
 589  */
 590 static int unmap_and_move(struct page *newpage, struct page *page, int force)
 591 {
 592         int rc = 0;
 593
 594         if (page_count(page) == 1)
 595                 /* page was freed from under us. So we are done. */
 596                 goto ret;
 597
 598         rc = -EAGAIN;
 599         if (TestSetPageLocked(page)) {
 600                 if (!force)
 601                         goto ret;
 602                 lock_page(page);
 603         }
 604
 605         if (PageWriteback(page)) {
 606                 if (!force)
 607                         goto unlock;
 608                 wait_on_page_writeback(page);
 609         }
 610
 611         /*
 612          * Establish migration ptes or remove ptes
 613          */
 614         if (try_to_unmap(page, 1) != SWAP_FAIL) {
 615                 if (!page_mapped(page))
 616                         rc = move_to_new_page(newpage, page);
 617         } else
 618                 /* A vma has VM_LOCKED set -> permanent failure */
 619                 rc = -EPERM;
 620
 621         if (rc)
 622                 remove_migration_ptes(page, page);
 623 unlock:
 624         unlock_page(page);
 625 ret:
 626         if (rc != -EAGAIN) {
 627                 /*
 628                  * A page that has been migrated has all references
 629                  * removed and will be freed. A page that has not been
 630                  * migrated will have kepts its references and be
 631                  * restored.
 632                  */
 633                 list_del(&page->lru);
 634                 move_to_lru(page);
 635
 636                 list_del(&newpage->lru);
 637                 move_to_lru(newpage);
 638         }
 639         return rc;
 640 }
 641
 642 /*
 643  * migrate_pages
 644  *
 645  * Two lists are passed to this function. The first list
 646  * contains the pages isolated from the LRU to be migrated.
 647  * The second list contains new pages that the isolated pages
 648  * can be moved to.
 649  *
 650  * The function returns after 10 attempts or if no pages
 651  * are movable anymore because to has become empty
 652  * or no retryable pages exist anymore. All pages will be
 653  * retruned to the LRU or freed.
 654  *
 655  * Return: Number of pages not migrated.
 656  */
 657 int migrate_pages(struct list_head *from, struct list_head *to)
 658 {
 659         int retry = 1;
 660         int nr_failed = 0;
 661         int pass = 0;
 662         struct page *page;
 663         struct page *page2;
 664         int swapwrite = current->flags & PF_SWAPWRITE;
 665         int rc;
 666
 667         if (!swapwrite)
 668                 current->flags |= PF_SWAPWRITE;
 669
 670         for(pass = 0; pass < 10 && retry; pass++) {
 671                 retry = 0;
 672
 673                 list_for_each_entry_safe(page, page2, from, lru) {
 674
 675                         if (list_empty(to))
 676                                 break;
 677
 678                         cond_resched();
 679
 680                         rc = unmap_and_move(lru_to_page(to), page, pass > 2);
 681
 682                         switch(rc) {
 683                         case -EAGAIN:
 684                                 retry++;
 685                                 break;
 686                         case 0:
 687                                 break;
 688                         default:
 689                                 /* Permanent failure */
 690                                 nr_failed++;
 691                                 break;
 692                         }
 693                 }
 694         }
 695
 696         if (!swapwrite)
 697                 current->flags &= ~PF_SWAPWRITE;
 698
 699         putback_lru_pages(from);
 700         return nr_failed + retry;
 701 }
 702
 703 /*
 704  * Migrate the list 'pagelist' of pages to a certain destination.
 705  *
 706  * Specify destination with either non-NULL vma or dest_node >= 0
 707  * Return the number of pages not migrated or error code
 708  */
 709 int migrate_pages_to(struct list_head *pagelist,
 710                         struct vm_area_struct *vma, int dest)
 711 {
 712         LIST_HEAD(newlist);
 713         int err = 0;
 714         unsigned long offset = 0;
 715         int nr_pages;
 716         int nr_failed = 0;
 717         struct page *page;
 718         struct list_head *p;
 719
 720 redo:
 721         nr_pages = 0;
 722         list_for_each(p, pagelist) {
 723                 if (vma) {
 724                         /*
 725                          * The address passed to alloc_page_vma is used to
 726                          * generate the proper interleave behavior. We fake
 727                          * the address here by an increasing offset in order
 728                          * to get the proper distribution of pages.
 729                          *
 730                          * No decision has been made as to which page
 731                          * a certain old page is moved to so we cannot
 732                          * specify the correct address.
 733                          */
 734                         page = alloc_page_vma(GFP_HIGHUSER, vma,
 735                                         offset + vma->vm_start);
 736                         offset += PAGE_SIZE;
 737                 }
 738                 else
 739                         page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
 740
 741                 if (!page) {
 742                         err = -ENOMEM;
 743                         goto out;
 744                 }
 745                 list_add_tail(&page->lru, &newlist);
 746                 nr_pages++;
 747                 if (nr_pages > MIGRATE_CHUNK_SIZE)
 748                         break;
 749         }
 750         err = migrate_pages(pagelist, &newlist);
 751
 752         if (err >= 0) {
 753                 nr_failed += err;
 754                 if (list_empty(&newlist) && !list_empty(pagelist))
 755                         goto redo;
 756         }
 757 out:
 758
 759         /* Calculate number of leftover pages */
 760         list_for_each(p, pagelist)
 761                 nr_failed++;
 762         return nr_failed;
 763 }