mm/migrate.c

   1 /*
   2  * Memory Migration functionality - linux/mm/migrate.c
   3  *
   4  * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
   5  *
   6  * Page migration was first developed in the context of the memory hotplug
   7  * project. The main authors of the migration code are:
   8  *
   9  * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
  10  * Hirokazu Takahashi <taka@valinux.co.jp>
  11  * Dave Hansen <haveblue@us.ibm.com>
  12  * Christoph Lameter <clameter@sgi.com>
  13  */
  14
  15 #include <linux/migrate.h>
  16 #include <linux/module.h>
  17 #include <linux/swap.h>
  18 #include <linux/swapops.h>
  19 #include <linux/pagemap.h>
  20 #include <linux/buffer_head.h>
  21 #include <linux/mm_inline.h>
  22 #include <linux/pagevec.h>
  23 #include <linux/rmap.h>
  24 #include <linux/topology.h>
  25 #include <linux/cpu.h>
  26 #include <linux/cpuset.h>
  27
  28 #include "internal.h"
  29
  30 /* The maximum number of pages to take off the LRU for migration */
  31 #define MIGRATE_CHUNK_SIZE 256
  32
  33 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
  34
  35 /*
  36  * Isolate one page from the LRU lists. If successful put it onto
  37  * the indicated list with elevated page count.
  38  *
  39  * Result:
  40  *  -EBUSY: page not on LRU list
  41  *  0: page removed from LRU list and added to the specified list.
  42  */
  43 int isolate_lru_page(struct page *page, struct list_head *pagelist)
  44 {
  45         int ret = -EBUSY;
  46
  47         if (PageLRU(page)) {
  48                 struct zone *zone = page_zone(page);
  49
  50                 spin_lock_irq(&zone->lru_lock);
  51                 if (PageLRU(page)) {
  52                         ret = 0;
  53                         get_page(page);
  54                         ClearPageLRU(page);
  55                         if (PageActive(page))
  56                                 del_page_from_active_list(zone, page);
  57                         else
  58                                 del_page_from_inactive_list(zone, page);
  59                         list_add_tail(&page->lru, pagelist);
  60                 }
  61                 spin_unlock_irq(&zone->lru_lock);
  62         }
  63         return ret;
  64 }
  65
  66 /*
  67  * migrate_prep() needs to be called after we have compiled the list of pages
  68  * to be migrated using isolate_lru_page() but before we begin a series of calls
  69  * to migrate_pages().
  70  */
  71 int migrate_prep(void)
  72 {
  73         /* Must have swap device for migration */
  74         if (nr_swap_pages <= 0)
  75                 return -ENODEV;
  76
  77         /*
  78          * Clear the LRU lists so pages can be isolated.
  79          * Note that pages may be moved off the LRU after we have
  80          * drained them. Those pages will fail to migrate like other
  81          * pages that may be busy.
  82          */
  83         lru_add_drain_all();
  84
  85         return 0;
  86 }
  87
  88 static inline void move_to_lru(struct page *page)
  89 {
  90         list_del(&page->lru);
  91         if (PageActive(page)) {
  92                 /*
  93                  * lru_cache_add_active checks that
  94                  * the PG_active bit is off.
  95                  */
  96                 ClearPageActive(page);
  97                 lru_cache_add_active(page);
  98         } else {
  99                 lru_cache_add(page);
 100         }
 101         put_page(page);
 102 }
 103
 104 /*
 105  * Add isolated pages on the list back to the LRU.
 106  *
 107  * returns the number of pages put back.
 108  */
 109 int putback_lru_pages(struct list_head *l)
 110 {
 111         struct page *page;
 112         struct page *page2;
 113         int count = 0;
 114
 115         list_for_each_entry_safe(page, page2, l, lru) {
 116                 move_to_lru(page);
 117                 count++;
 118         }
 119         return count;
 120 }
 121
 122 static inline int is_swap_pte(pte_t pte)
 123 {
 124         return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
 125 }
 126
 127 /*
 128  * Restore a potential migration pte to a working pte entry
 129  */
 130 static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
 131                 struct page *old, struct page *new)
 132 {
 133         struct mm_struct *mm = vma->vm_mm;
 134         swp_entry_t entry;
 135         pgd_t *pgd;
 136         pud_t *pud;
 137         pmd_t *pmd;
 138         pte_t *ptep, pte;
 139         spinlock_t *ptl;
 140
 141         pgd = pgd_offset(mm, addr);
 142         if (!pgd_present(*pgd))
 143                 return;
 144
 145         pud = pud_offset(pgd, addr);
 146         if (!pud_present(*pud))
 147                 return;
 148
 149         pmd = pmd_offset(pud, addr);
 150         if (!pmd_present(*pmd))
 151                 return;
 152
 153         ptep = pte_offset_map(pmd, addr);
 154
 155         if (!is_swap_pte(*ptep)) {
 156                 pte_unmap(ptep);
 157                 return;
 158         }
 159
 160         ptl = pte_lockptr(mm, pmd);
 161         spin_lock(ptl);
 162         pte = *ptep;
 163         if (!is_swap_pte(pte))
 164                 goto out;
 165
 166         entry = pte_to_swp_entry(pte);
 167
 168         if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
 169                 goto out;
 170
 171         inc_mm_counter(mm, anon_rss);
 172         get_page(new);
 173         pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
 174         if (is_write_migration_entry(entry))
 175                 pte = pte_mkwrite(pte);
 176         set_pte_at(mm, addr, ptep, pte);
 177         page_add_anon_rmap(new, vma, addr);
 178 out:
 179         pte_unmap_unlock(ptep, ptl);
 180 }
 181
 182 /*
 183  * Get rid of all migration entries and replace them by
 184  * references to the indicated page.
 185  *
 186  * Must hold mmap_sem lock on at least one of the vmas containing
 187  * the page so that the anon_vma cannot vanish.
 188  */
 189 static void remove_migration_ptes(struct page *old, struct page *new)
 190 {
 191         struct anon_vma *anon_vma;
 192         struct vm_area_struct *vma;
 193         unsigned long mapping;
 194
 195         mapping = (unsigned long)new->mapping;
 196
 197         if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
 198                 return;
 199
 200         /*
 201          * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
 202          */
 203         anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
 204         spin_lock(&anon_vma->lock);
 205
 206         list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
 207                 remove_migration_pte(vma, page_address_in_vma(new, vma),
 208                                         old, new);
 209
 210         spin_unlock(&anon_vma->lock);
 211 }
 212
 213 /*
 214  * Something used the pte of a page under migration. We need to
 215  * get to the page and wait until migration is finished.
 216  * When we return from this function the fault will be retried.
 217  *
 218  * This function is called from do_swap_page().
 219  */
 220 void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 221                                 unsigned long address)
 222 {
 223         pte_t *ptep, pte;
 224         spinlock_t *ptl;
 225         swp_entry_t entry;
 226         struct page *page;
 227
 228         ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 229         pte = *ptep;
 230         if (!is_swap_pte(pte))
 231                 goto out;
 232
 233         entry = pte_to_swp_entry(pte);
 234         if (!is_migration_entry(entry))
 235                 goto out;
 236
 237         page = migration_entry_to_page(entry);
 238
 239         get_page(page);
 240         pte_unmap_unlock(ptep, ptl);
 241         wait_on_page_locked(page);
 242         put_page(page);
 243         return;
 244 out:
 245         pte_unmap_unlock(ptep, ptl);
 246 }
 247
 248 /*
 249  * swapout a single page
 250  * page is locked upon entry, unlocked on exit
 251  */
 252 static int swap_page(struct page *page)
 253 {
 254         struct address_space *mapping = page_mapping(page);
 255
 256         if (page_mapped(page) && mapping)
 257                 if (try_to_unmap(page, 1) != SWAP_SUCCESS)
 258                         goto unlock_retry;
 259
 260         if (PageDirty(page)) {
 261                 /* Page is dirty, try to write it out here */
 262                 switch(pageout(page, mapping)) {
 263                 case PAGE_KEEP:
 264                 case PAGE_ACTIVATE:
 265                         goto unlock_retry;
 266
 267                 case PAGE_SUCCESS:
 268                         goto retry;
 269
 270                 case PAGE_CLEAN:
 271                         ; /* try to free the page below */
 272                 }
 273         }
 274
 275         if (PagePrivate(page)) {
 276                 if (!try_to_release_page(page, GFP_KERNEL) ||
 277                     (!mapping && page_count(page) == 1))
 278                         goto unlock_retry;
 279         }
 280
 281         if (remove_mapping(mapping, page)) {
 282                 /* Success */
 283                 unlock_page(page);
 284                 return 0;
 285         }
 286
 287 unlock_retry:
 288         unlock_page(page);
 289
 290 retry:
 291         return -EAGAIN;
 292 }
 293
 294 /*
 295  * Replace the page in the mapping.
 296  *
 297  * The number of remaining references must be:
 298  * 1 for anonymous pages without a mapping
 299  * 2 for pages with a mapping
 300  * 3 for pages with a mapping and PagePrivate set.
 301  */
 302 static int migrate_page_move_mapping(struct address_space *mapping,
 303                 struct page *newpage, struct page *page)
 304 {
 305         struct page **radix_pointer;
 306
 307         write_lock_irq(&mapping->tree_lock);
 308
 309         radix_pointer = (struct page **)radix_tree_lookup_slot(
 310                                                 &mapping->page_tree,
 311                                                 page_index(page));
 312
 313         if (!page_mapping(page) ||
 314                         page_count(page) != 2 + !!PagePrivate(page) ||
 315                         *radix_pointer != page) {
 316                 write_unlock_irq(&mapping->tree_lock);
 317                 return -EAGAIN;
 318         }
 319
 320         /*
 321          * Now we know that no one else is looking at the page.
 322          */
 323         get_page(newpage);
 324         if (PageSwapCache(page)) {
 325                 SetPageSwapCache(newpage);
 326                 set_page_private(newpage, page_private(page));
 327         }
 328
 329         *radix_pointer = newpage;
 330         __put_page(page);
 331         write_unlock_irq(&mapping->tree_lock);
 332
 333         return 0;
 334 }
 335
 336 /*
 337  * Copy the page to its new location
 338  */
 339 static void migrate_page_copy(struct page *newpage, struct page *page)
 340 {
 341         copy_highpage(newpage, page);
 342
 343         if (PageError(page))
 344                 SetPageError(newpage);
 345         if (PageReferenced(page))
 346                 SetPageReferenced(newpage);
 347         if (PageUptodate(page))
 348                 SetPageUptodate(newpage);
 349         if (PageActive(page))
 350                 SetPageActive(newpage);
 351         if (PageChecked(page))
 352                 SetPageChecked(newpage);
 353         if (PageMappedToDisk(page))
 354                 SetPageMappedToDisk(newpage);
 355
 356         if (PageDirty(page)) {
 357                 clear_page_dirty_for_io(page);
 358                 set_page_dirty(newpage);
 359         }
 360
 361         ClearPageSwapCache(page);
 362         ClearPageActive(page);
 363         ClearPagePrivate(page);
 364         set_page_private(page, 0);
 365         page->mapping = NULL;
 366
 367         /*
 368          * If any waiters have accumulated on the new page then
 369          * wake them up.
 370          */
 371         if (PageWriteback(newpage))
 372                 end_page_writeback(newpage);
 373 }
 374
 375 /************************************************************
 376  *                    Migration functions
 377  ***********************************************************/
 378
 379 /* Always fail migration. Used for mappings that are not movable */
 380 int fail_migrate_page(struct address_space *mapping,
 381                         struct page *newpage, struct page *page)
 382 {
 383         return -EIO;
 384 }
 385 EXPORT_SYMBOL(fail_migrate_page);
 386
 387 /*
 388  * Common logic to directly migrate a single page suitable for
 389  * pages that do not use PagePrivate.
 390  *
 391  * Pages are locked upon entry and exit.
 392  */
 393 int migrate_page(struct address_space *mapping,
 394                 struct page *newpage, struct page *page)
 395 {
 396         int rc;
 397
 398         BUG_ON(PageWriteback(page));    /* Writeback must be complete */
 399
 400         rc = migrate_page_move_mapping(mapping, newpage, page);
 401
 402         if (rc)
 403                 return rc;
 404
 405         migrate_page_copy(newpage, page);
 406
 407         /*
 408          * Remove auxiliary swap entries and replace
 409          * them with real ptes.
 410          *
 411          * Note that a real pte entry will allow processes that are not
 412          * waiting on the page lock to use the new page via the page tables
 413          * before the new page is unlocked.
 414          */
 415         remove_from_swap(newpage);
 416         return 0;
 417 }
 418 EXPORT_SYMBOL(migrate_page);
 419
 420 /*
 421  * Migration function for pages with buffers. This function can only be used
 422  * if the underlying filesystem guarantees that no other references to "page"
 423  * exist.
 424  */
 425 int buffer_migrate_page(struct address_space *mapping,
 426                 struct page *newpage, struct page *page)
 427 {
 428         struct buffer_head *bh, *head;
 429         int rc;
 430
 431         if (!page_has_buffers(page))
 432                 return migrate_page(mapping, newpage, page);
 433
 434         head = page_buffers(page);
 435
 436         rc = migrate_page_move_mapping(mapping, newpage, page);
 437
 438         if (rc)
 439                 return rc;
 440
 441         bh = head;
 442         do {
 443                 get_bh(bh);
 444                 lock_buffer(bh);
 445                 bh = bh->b_this_page;
 446
 447         } while (bh != head);
 448
 449         ClearPagePrivate(page);
 450         set_page_private(newpage, page_private(page));
 451         set_page_private(page, 0);
 452         put_page(page);
 453         get_page(newpage);
 454
 455         bh = head;
 456         do {
 457                 set_bh_page(bh, newpage, bh_offset(bh));
 458                 bh = bh->b_this_page;
 459
 460         } while (bh != head);
 461
 462         SetPagePrivate(newpage);
 463
 464         migrate_page_copy(newpage, page);
 465
 466         bh = head;
 467         do {
 468                 unlock_buffer(bh);
 469                 put_bh(bh);
 470                 bh = bh->b_this_page;
 471
 472         } while (bh != head);
 473
 474         return 0;
 475 }
 476 EXPORT_SYMBOL(buffer_migrate_page);
 477
 478 static int fallback_migrate_page(struct address_space *mapping,
 479         struct page *newpage, struct page *page)
 480 {
 481         /*
 482          * Default handling if a filesystem does not provide
 483          * a migration function. We can only migrate clean
 484          * pages so try to write out any dirty pages first.
 485          */
 486         if (PageDirty(page)) {
 487                 switch (pageout(page, mapping)) {
 488                 case PAGE_KEEP:
 489                 case PAGE_ACTIVATE:
 490                         return -EAGAIN;
 491
 492                 case PAGE_SUCCESS:
 493                         /* Relock since we lost the lock */
 494                         lock_page(page);
 495                         /* Must retry since page state may have changed */
 496                         return -EAGAIN;
 497
 498                 case PAGE_CLEAN:
 499                         ; /* try to migrate the page below */
 500                 }
 501         }
 502
 503         /*
 504          * Buffers may be managed in a filesystem specific way.
 505          * We must have no buffers or drop them.
 506          */
 507         if (page_has_buffers(page) &&
 508             !try_to_release_page(page, GFP_KERNEL))
 509                 return -EAGAIN;
 510
 511         return migrate_page(mapping, newpage, page);
 512 }
 513
 514 /*
 515  * migrate_pages
 516  *
 517  * Two lists are passed to this function. The first list
 518  * contains the pages isolated from the LRU to be migrated.
 519  * The second list contains new pages that the pages isolated
 520  * can be moved to. If the second list is NULL then all
 521  * pages are swapped out.
 522  *
 523  * The function returns after 10 attempts or if no pages
 524  * are movable anymore because to has become empty
 525  * or no retryable pages exist anymore.
 526  *
 527  * Return: Number of pages not migrated when "to" ran empty.
 528  */
 529 int migrate_pages(struct list_head *from, struct list_head *to,
 530                   struct list_head *moved, struct list_head *failed)
 531 {
 532         int retry;
 533         int nr_failed = 0;
 534         int pass = 0;
 535         struct page *page;
 536         struct page *page2;
 537         int swapwrite = current->flags & PF_SWAPWRITE;
 538         int rc;
 539
 540         if (!swapwrite)
 541                 current->flags |= PF_SWAPWRITE;
 542
 543 redo:
 544         retry = 0;
 545
 546         list_for_each_entry_safe(page, page2, from, lru) {
 547                 struct page *newpage = NULL;
 548                 struct address_space *mapping;
 549
 550                 cond_resched();
 551
 552                 rc = 0;
 553                 if (page_count(page) == 1)
 554                         /* page was freed from under us. So we are done. */
 555                         goto next;
 556
 557                 if (to && list_empty(to))
 558                         break;
 559
 560                 /*
 561                  * Skip locked pages during the first two passes to give the
 562                  * functions holding the lock time to release the page. Later we
 563                  * use lock_page() to have a higher chance of acquiring the
 564                  * lock.
 565                  */
 566                 rc = -EAGAIN;
 567                 if (pass > 2)
 568                         lock_page(page);
 569                 else
 570                         if (TestSetPageLocked(page))
 571                                 goto next;
 572
 573                 /*
 574                  * Only wait on writeback if we have already done a pass where
 575                  * we we may have triggered writeouts for lots of pages.
 576                  */
 577                 if (pass > 0) {
 578                         wait_on_page_writeback(page);
 579                 } else {
 580                         if (PageWriteback(page))
 581                                 goto unlock_page;
 582                 }
 583
 584                 /*
 585                  * Anonymous pages must have swap cache references otherwise
 586                  * the information contained in the page maps cannot be
 587                  * preserved.
 588                  */
 589                 if (PageAnon(page) && !PageSwapCache(page)) {
 590                         if (!add_to_swap(page, GFP_KERNEL)) {
 591                                 rc = -ENOMEM;
 592                                 goto unlock_page;
 593                         }
 594                 }
 595
 596                 if (!to) {
 597                         rc = swap_page(page);
 598                         goto next;
 599                 }
 600
 601                 /*
 602                  * Establish swap ptes for anonymous pages or destroy pte
 603                  * maps for files.
 604                  *
 605                  * In order to reestablish file backed mappings the fault handlers
 606                  * will take the radix tree_lock which may then be used to stop
 607                  * processses from accessing this page until the new page is ready.
 608                  *
 609                  * A process accessing via a swap pte (an anonymous page) will take a
 610                  * page_lock on the old page which will block the process until the
 611                  * migration attempt is complete. At that time the PageSwapCache bit
 612                  * will be examined. If the page was migrated then the PageSwapCache
 613                  * bit will be clear and the operation to retrieve the page will be
 614                  * retried which will find the new page in the radix tree. Then a new
 615                  * direct mapping may be generated based on the radix tree contents.
 616                  *
 617                  * If the page was not migrated then the PageSwapCache bit
 618                  * is still set and the operation may continue.
 619                  */
 620                 rc = -EPERM;
 621                 if (try_to_unmap(page, 1) == SWAP_FAIL)
 622                         /* A vma has VM_LOCKED set -> permanent failure */
 623                         goto unlock_page;
 624
 625                 rc = -EAGAIN;
 626                 if (page_mapped(page))
 627                         goto unlock_page;
 628
 629                 newpage = lru_to_page(to);
 630                 lock_page(newpage);
 631                 /* Prepare mapping for the new page.*/
 632                 newpage->index = page->index;
 633                 newpage->mapping = page->mapping;
 634
 635                 /*
 636                  * Pages are properly locked and writeback is complete.
 637                  * Try to migrate the page.
 638                  */
 639                 mapping = page_mapping(page);
 640                 if (!mapping)
 641                         goto unlock_both;
 642
 643                 if (mapping->a_ops->migratepage)
 644                         /*
 645                          * Most pages have a mapping and most filesystems
 646                          * should provide a migration function. Anonymous
 647                          * pages are part of swap space which also has its
 648                          * own migration function. This is the most common
 649                          * path for page migration.
 650                          */
 651                         rc = mapping->a_ops->migratepage(mapping,
 652                                                         newpage, page);
 653                 else
 654                         rc = fallback_migrate_page(mapping, newpage, page);
 655
 656 unlock_both:
 657                 unlock_page(newpage);
 658
 659 unlock_page:
 660                 unlock_page(page);
 661
 662 next:
 663                 if (rc) {
 664                         if (newpage)
 665                                 newpage->mapping = NULL;
 666
 667                         if (rc == -EAGAIN)
 668                                 retry++;
 669                         else {
 670                                 /* Permanent failure */
 671                                 list_move(&page->lru, failed);
 672                                 nr_failed++;
 673                         }
 674                 } else {
 675                         if (newpage) {
 676                                 /* Successful migration. Return page to LRU */
 677                                 move_to_lru(newpage);
 678                         }
 679                         list_move(&page->lru, moved);
 680                 }
 681         }
 682         if (retry && pass++ < 10)
 683                 goto redo;
 684
 685         if (!swapwrite)
 686                 current->flags &= ~PF_SWAPWRITE;
 687
 688         return nr_failed + retry;
 689 }
 690
 691 /*
 692  * Migrate the list 'pagelist' of pages to a certain destination.
 693  *
 694  * Specify destination with either non-NULL vma or dest_node >= 0
 695  * Return the number of pages not migrated or error code
 696  */
 697 int migrate_pages_to(struct list_head *pagelist,
 698                         struct vm_area_struct *vma, int dest)
 699 {
 700         LIST_HEAD(newlist);
 701         LIST_HEAD(moved);
 702         LIST_HEAD(failed);
 703         int err = 0;
 704         unsigned long offset = 0;
 705         int nr_pages;
 706         struct page *page;
 707         struct list_head *p;
 708
 709 redo:
 710         nr_pages = 0;
 711         list_for_each(p, pagelist) {
 712                 if (vma) {
 713                         /*
 714                          * The address passed to alloc_page_vma is used to
 715                          * generate the proper interleave behavior. We fake
 716                          * the address here by an increasing offset in order
 717                          * to get the proper distribution of pages.
 718                          *
 719                          * No decision has been made as to which page
 720                          * a certain old page is moved to so we cannot
 721                          * specify the correct address.
 722                          */
 723                         page = alloc_page_vma(GFP_HIGHUSER, vma,
 724                                         offset + vma->vm_start);
 725                         offset += PAGE_SIZE;
 726                 }
 727                 else
 728                         page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
 729
 730                 if (!page) {
 731                         err = -ENOMEM;
 732                         goto out;
 733                 }
 734                 list_add_tail(&page->lru, &newlist);
 735                 nr_pages++;
 736                 if (nr_pages > MIGRATE_CHUNK_SIZE)
 737                         break;
 738         }
 739         err = migrate_pages(pagelist, &newlist, &moved, &failed);
 740
 741         putback_lru_pages(&moved);      /* Call release pages instead ?? */
 742
 743         if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
 744                 goto redo;
 745 out:
 746         /* Return leftover allocated pages */
 747         while (!list_empty(&newlist)) {
 748                 page = list_entry(newlist.next, struct page, lru);
 749                 list_del(&page->lru);
 750                 __free_page(page);
 751         }
 752         list_splice(&failed, pagelist);
 753         if (err < 0)
 754                 return err;
 755
 756         /* Calculate number of leftover pages */
 757         nr_pages = 0;
 758         list_for_each(p, pagelist)
 759                 nr_pages++;
 760         return nr_pages;
 761 }