X-Git-Url: http://drtracing.org/?a=blobdiff_plain;f=mm%2Fmigrate.c;h=d3a1810a4c9fe01bd32e46c763ccaea86fdbfcef;hb=aaa994b3;hp=d444229f2599245e1cb0d904e7cdfed11db88b7a;hpb=1d73135e55c47ca909c1fbd68f45623b16dc0211;p=deliverable%2Flinux.git diff --git a/mm/migrate.c b/mm/migrate.c index d444229f2599..d3a1810a4c9f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -23,7 +24,7 @@ #include #include #include -#include +#include #include "internal.h" @@ -70,10 +71,6 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist) */ int migrate_prep(void) { - /* Must have swap device for migration */ - if (nr_swap_pages <= 0) - return -ENODEV; - /* * Clear the LRU lists so pages can be isolated. * Note that pages may be moved off the LRU after we have @@ -87,7 +84,6 @@ int migrate_prep(void) static inline void move_to_lru(struct page *page) { - list_del(&page->lru); if (PageActive(page)) { /* * lru_cache_add_active checks that @@ -113,113 +109,200 @@ int putback_lru_pages(struct list_head *l) int count = 0; list_for_each_entry_safe(page, page2, l, lru) { + list_del(&page->lru); move_to_lru(page); count++; } return count; } -/* - * Non migratable page - */ -int fail_migrate_page(struct page *newpage, struct page *page) +static inline int is_swap_pte(pte_t pte) { - return -EIO; + return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); } -EXPORT_SYMBOL(fail_migrate_page); /* - * swapout a single page - * page is locked upon entry, unlocked on exit + * Restore a potential migration pte to a working pte entry */ -static int swap_page(struct page *page) +static void remove_migration_pte(struct vm_area_struct *vma, + struct page *old, struct page *new) { - struct address_space *mapping = page_mapping(page); + struct mm_struct *mm = vma->vm_mm; + swp_entry_t entry; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + unsigned long addr = page_address_in_vma(new, vma); + + if (addr == -EFAULT) + return; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + return; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return; + + ptep = pte_offset_map(pmd, addr); + + if (!is_swap_pte(*ptep)) { + pte_unmap(ptep); + return; + } - if (page_mapped(page) && mapping) - if (try_to_unmap(page, 1) != SWAP_SUCCESS) - goto unlock_retry; + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + pte = *ptep; + if (!is_swap_pte(pte)) + goto out; - if (PageDirty(page)) { - /* Page is dirty, try to write it out here */ - switch(pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_retry; + entry = pte_to_swp_entry(pte); - case PAGE_SUCCESS: - goto retry; + if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) + goto out; - case PAGE_CLEAN: - ; /* try to free the page below */ - } - } + get_page(new); + pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); + if (is_write_migration_entry(entry)) + pte = pte_mkwrite(pte); + set_pte_at(mm, addr, ptep, pte); - if (PagePrivate(page)) { - if (!try_to_release_page(page, GFP_KERNEL) || - (!mapping && page_count(page) == 1)) - goto unlock_retry; - } + if (PageAnon(new)) + page_add_anon_rmap(new, vma, addr); + else + page_add_file_rmap(new); - if (remove_mapping(mapping, page)) { - /* Success */ - unlock_page(page); - return 0; - } + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, addr, pte); + lazy_mmu_prot_update(pte); -unlock_retry: - unlock_page(page); +out: + pte_unmap_unlock(ptep, ptl); +} -retry: - return -EAGAIN; +/* + * Note that remove_file_migration_ptes will only work on regular mappings, + * Nonlinear mappings do not use migration entries. + */ +static void remove_file_migration_ptes(struct page *old, struct page *new) +{ + struct vm_area_struct *vma; + struct address_space *mapping = page_mapping(new); + struct prio_tree_iter iter; + pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + + if (!mapping) + return; + + spin_lock(&mapping->i_mmap_lock); + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) + remove_migration_pte(vma, old, new); + + spin_unlock(&mapping->i_mmap_lock); } /* - * Remove references for a page and establish the new page with the correct - * basic settings to be able to stop accesses to the page. + * Must hold mmap_sem lock on at least one of the vmas containing + * the page so that the anon_vma cannot vanish. */ -int migrate_page_remove_references(struct page *newpage, - struct page *page, int nr_refs) +static void remove_anon_migration_ptes(struct page *old, struct page *new) { - struct address_space *mapping = page_mapping(page); - struct page **radix_pointer; + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + unsigned long mapping; - /* - * Avoid doing any of the following work if the page count - * indicates that the page is in use or truncate has removed - * the page. - */ - if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) - return -EAGAIN; + mapping = (unsigned long)new->mapping; - /* - * Establish swap ptes for anonymous pages or destroy pte - * maps for files. - * - * In order to reestablish file backed mappings the fault handlers - * will take the radix tree_lock which may then be used to stop - * processses from accessing this page until the new page is ready. - * - * A process accessing via a swap pte (an anonymous page) will take a - * page_lock on the old page which will block the process until the - * migration attempt is complete. At that time the PageSwapCache bit - * will be examined. If the page was migrated then the PageSwapCache - * bit will be clear and the operation to retrieve the page will be - * retried which will find the new page in the radix tree. Then a new - * direct mapping may be generated based on the radix tree contents. - * - * If the page was not migrated then the PageSwapCache bit - * is still set and the operation may continue. - */ - if (try_to_unmap(page, 1) == SWAP_FAIL) - /* A vma has VM_LOCKED set -> permanent failure */ - return -EPERM; + if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) + return; /* - * Give up if we were unable to remove all mappings. + * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. */ - if (page_mapcount(page)) - return -EAGAIN; + anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); + spin_lock(&anon_vma->lock); + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) + remove_migration_pte(vma, old, new); + + spin_unlock(&anon_vma->lock); +} + +/* + * Get rid of all migration entries and replace them by + * references to the indicated page. + */ +static void remove_migration_ptes(struct page *old, struct page *new) +{ + if (PageAnon(new)) + remove_anon_migration_ptes(old, new); + else + remove_file_migration_ptes(old, new); +} + +/* + * Something used the pte of a page under migration. We need to + * get to the page and wait until migration is finished. + * When we return from this function the fault will be retried. + * + * This function is called from do_swap_page(). + */ +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + pte_t *ptep, pte; + spinlock_t *ptl; + swp_entry_t entry; + struct page *page; + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + pte = *ptep; + if (!is_swap_pte(pte)) + goto out; + + entry = pte_to_swp_entry(pte); + if (!is_migration_entry(entry)) + goto out; + + page = migration_entry_to_page(entry); + + get_page(page); + pte_unmap_unlock(ptep, ptl); + wait_on_page_locked(page); + put_page(page); + return; +out: + pte_unmap_unlock(ptep, ptl); +} + +/* + * Replace the page in the mapping. + * + * The number of remaining references must be: + * 1 for anonymous pages without a mapping + * 2 for pages with a mapping + * 3 for pages with a mapping and PagePrivate set. + */ +static int migrate_page_move_mapping(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + struct page **radix_pointer; + + if (!mapping) { + /* Anonymous page */ + if (page_count(page) != 1) + return -EAGAIN; + return 0; + } write_lock_irq(&mapping->tree_lock); @@ -227,7 +310,7 @@ int migrate_page_remove_references(struct page *newpage, &mapping->page_tree, page_index(page)); - if (!page_mapping(page) || page_count(page) != nr_refs || + if (page_count(page) != 2 + !!PagePrivate(page) || *radix_pointer != page) { write_unlock_irq(&mapping->tree_lock); return -EAGAIN; @@ -235,19 +318,14 @@ int migrate_page_remove_references(struct page *newpage, /* * Now we know that no one else is looking at the page. - * - * Certain minimal information about a page must be available - * in order for other subsystems to properly handle the page if they - * find it through the radix tree update before we are finished - * copying the page. */ get_page(newpage); - newpage->index = page->index; - newpage->mapping = page->mapping; +#ifdef CONFIG_SWAP if (PageSwapCache(page)) { SetPageSwapCache(newpage); set_page_private(newpage, page_private(page)); } +#endif *radix_pointer = newpage; __put_page(page); @@ -255,12 +333,11 @@ int migrate_page_remove_references(struct page *newpage, return 0; } -EXPORT_SYMBOL(migrate_page_remove_references); /* * Copy the page to its new location */ -void migrate_page_copy(struct page *newpage, struct page *page) +static void migrate_page_copy(struct page *newpage, struct page *page) { copy_highpage(newpage, page); @@ -282,7 +359,9 @@ void migrate_page_copy(struct page *newpage, struct page *page) set_page_dirty(newpage); } +#ifdef CONFIG_SWAP ClearPageSwapCache(page); +#endif ClearPageActive(page); ClearPagePrivate(page); set_page_private(page, 0); @@ -295,7 +374,18 @@ void migrate_page_copy(struct page *newpage, struct page *page) if (PageWriteback(newpage)) end_page_writeback(newpage); } -EXPORT_SYMBOL(migrate_page_copy); + +/************************************************************ + * Migration functions + ***********************************************************/ + +/* Always fail migration. Used for mappings that are not movable */ +int fail_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + return -EIO; +} +EXPORT_SYMBOL(fail_migrate_page); /* * Common logic to directly migrate a single page suitable for @@ -303,240 +393,40 @@ EXPORT_SYMBOL(migrate_page_copy); * * Pages are locked upon entry and exit. */ -int migrate_page(struct page *newpage, struct page *page) +int migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) { int rc; BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_remove_references(newpage, page, 2); + rc = migrate_page_move_mapping(mapping, newpage, page); if (rc) return rc; migrate_page_copy(newpage, page); - - /* - * Remove auxiliary swap entries and replace - * them with real ptes. - * - * Note that a real pte entry will allow processes that are not - * waiting on the page lock to use the new page via the page tables - * before the new page is unlocked. - */ - remove_from_swap(newpage); return 0; } EXPORT_SYMBOL(migrate_page); -/* - * migrate_pages - * - * Two lists are passed to this function. The first list - * contains the pages isolated from the LRU to be migrated. - * The second list contains new pages that the pages isolated - * can be moved to. If the second list is NULL then all - * pages are swapped out. - * - * The function returns after 10 attempts or if no pages - * are movable anymore because to has become empty - * or no retryable pages exist anymore. - * - * Return: Number of pages not migrated when "to" ran empty. - */ -int migrate_pages(struct list_head *from, struct list_head *to, - struct list_head *moved, struct list_head *failed) -{ - int retry; - int nr_failed = 0; - int pass = 0; - struct page *page; - struct page *page2; - int swapwrite = current->flags & PF_SWAPWRITE; - int rc; - - if (!swapwrite) - current->flags |= PF_SWAPWRITE; - -redo: - retry = 0; - - list_for_each_entry_safe(page, page2, from, lru) { - struct page *newpage = NULL; - struct address_space *mapping; - - cond_resched(); - - rc = 0; - if (page_count(page) == 1) - /* page was freed from under us. So we are done. */ - goto next; - - if (to && list_empty(to)) - break; - - /* - * Skip locked pages during the first two passes to give the - * functions holding the lock time to release the page. Later we - * use lock_page() to have a higher chance of acquiring the - * lock. - */ - rc = -EAGAIN; - if (pass > 2) - lock_page(page); - else - if (TestSetPageLocked(page)) - goto next; - - /* - * Only wait on writeback if we have already done a pass where - * we we may have triggered writeouts for lots of pages. - */ - if (pass > 0) { - wait_on_page_writeback(page); - } else { - if (PageWriteback(page)) - goto unlock_page; - } - - /* - * Anonymous pages must have swap cache references otherwise - * the information contained in the page maps cannot be - * preserved. - */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!add_to_swap(page, GFP_KERNEL)) { - rc = -ENOMEM; - goto unlock_page; - } - } - - if (!to) { - rc = swap_page(page); - goto next; - } - - newpage = lru_to_page(to); - lock_page(newpage); - - /* - * Pages are properly locked and writeback is complete. - * Try to migrate the page. - */ - mapping = page_mapping(page); - if (!mapping) - goto unlock_both; - - if (mapping->a_ops->migratepage) { - /* - * Most pages have a mapping and most filesystems - * should provide a migration function. Anonymous - * pages are part of swap space which also has its - * own migration function. This is the most common - * path for page migration. - */ - rc = mapping->a_ops->migratepage(newpage, page); - goto unlock_both; - } - - /* - * Default handling if a filesystem does not provide - * a migration function. We can only migrate clean - * pages so try to write out any dirty pages first. - */ - if (PageDirty(page)) { - switch (pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_both; - - case PAGE_SUCCESS: - unlock_page(newpage); - goto next; - - case PAGE_CLEAN: - ; /* try to migrate the page below */ - } - } - - /* - * Buffers are managed in a filesystem specific way. - * We must have no buffers or drop them. - */ - if (!page_has_buffers(page) || - try_to_release_page(page, GFP_KERNEL)) { - rc = migrate_page(newpage, page); - goto unlock_both; - } - - /* - * On early passes with mapped pages simply - * retry. There may be a lock held for some - * buffers that may go away. Later - * swap them out. - */ - if (pass > 4) { - /* - * Persistently unable to drop buffers..... As a - * measure of last resort we fall back to - * swap_page(). - */ - unlock_page(newpage); - newpage = NULL; - rc = swap_page(page); - goto next; - } - -unlock_both: - unlock_page(newpage); - -unlock_page: - unlock_page(page); - -next: - if (rc == -EAGAIN) { - retry++; - } else if (rc) { - /* Permanent failure */ - list_move(&page->lru, failed); - nr_failed++; - } else { - if (newpage) { - /* Successful migration. Return page to LRU */ - move_to_lru(newpage); - } - list_move(&page->lru, moved); - } - } - if (retry && pass++ < 10) - goto redo; - - if (!swapwrite) - current->flags &= ~PF_SWAPWRITE; - - return nr_failed + retry; -} - /* * Migration function for pages with buffers. This function can only be used * if the underlying filesystem guarantees that no other references to "page" * exist. */ -int buffer_migrate_page(struct page *newpage, struct page *page) +int buffer_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) { - struct address_space *mapping = page->mapping; struct buffer_head *bh, *head; int rc; - if (!mapping) - return -EAGAIN; - if (!page_has_buffers(page)) - return migrate_page(newpage, page); + return migrate_page(mapping, newpage, page); head = page_buffers(page); - rc = migrate_page_remove_references(newpage, page, 3); + rc = migrate_page_move_mapping(mapping, newpage, page); if (rc) return rc; @@ -578,6 +468,238 @@ int buffer_migrate_page(struct page *newpage, struct page *page) } EXPORT_SYMBOL(buffer_migrate_page); +/* + * Writeback a page to clean the dirty state + */ +static int writeout(struct address_space *mapping, struct page *page) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = 1, + .range_start = 0, + .range_end = LLONG_MAX, + .nonblocking = 1, + .for_reclaim = 1 + }; + int rc; + + if (!mapping->a_ops->writepage) + /* No write method for the address space */ + return -EINVAL; + + if (!clear_page_dirty_for_io(page)) + /* Someone else already triggered a write */ + return -EAGAIN; + + /* + * A dirty page may imply that the underlying filesystem has + * the page on some queue. So the page must be clean for + * migration. Writeout may mean we loose the lock and the + * page state is no longer what we checked for earlier. + * At this point we know that the migration attempt cannot + * be successful. + */ + remove_migration_ptes(page, page); + + rc = mapping->a_ops->writepage(page, &wbc); + if (rc < 0) + /* I/O Error writing */ + return -EIO; + + if (rc != AOP_WRITEPAGE_ACTIVATE) + /* unlocked. Relock */ + lock_page(page); + + return -EAGAIN; +} + +/* + * Default handling if a filesystem does not provide a migration function. + */ +static int fallback_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + if (PageDirty(page)) + return writeout(mapping, page); + + /* + * Buffers may be managed in a filesystem specific way. + * We must have no buffers or drop them. + */ + if (page_has_buffers(page) && + !try_to_release_page(page, GFP_KERNEL)) + return -EAGAIN; + + return migrate_page(mapping, newpage, page); +} + +/* + * Move a page to a newly allocated page + * The page is locked and all ptes have been successfully removed. + * + * The new page will have replaced the old page if this function + * is successful. + */ +static int move_to_new_page(struct page *newpage, struct page *page) +{ + struct address_space *mapping; + int rc; + + /* + * Block others from accessing the page when we get around to + * establishing additional references. We are the only one + * holding a reference to the new page at this point. + */ + if (TestSetPageLocked(newpage)) + BUG(); + + /* Prepare mapping for the new page.*/ + newpage->index = page->index; + newpage->mapping = page->mapping; + + mapping = page_mapping(page); + if (!mapping) + rc = migrate_page(mapping, newpage, page); + else if (mapping->a_ops->migratepage) + /* + * Most pages have a mapping and most filesystems + * should provide a migration function. Anonymous + * pages are part of swap space which also has its + * own migration function. This is the most common + * path for page migration. + */ + rc = mapping->a_ops->migratepage(mapping, + newpage, page); + else + rc = fallback_migrate_page(mapping, newpage, page); + + if (!rc) + remove_migration_ptes(page, newpage); + else + newpage->mapping = NULL; + + unlock_page(newpage); + + return rc; +} + +/* + * Obtain the lock on page, remove all ptes and migrate the page + * to the newly allocated page in newpage. + */ +static int unmap_and_move(struct page *newpage, struct page *page, int force) +{ + int rc = 0; + + if (page_count(page) == 1) + /* page was freed from under us. So we are done. */ + goto ret; + + rc = -EAGAIN; + if (TestSetPageLocked(page)) { + if (!force) + goto ret; + lock_page(page); + } + + if (PageWriteback(page)) { + if (!force) + goto unlock; + wait_on_page_writeback(page); + } + + /* + * Establish migration ptes or remove ptes + */ + if (try_to_unmap(page, 1) != SWAP_FAIL) { + if (!page_mapped(page)) + rc = move_to_new_page(newpage, page); + } else + /* A vma has VM_LOCKED set -> permanent failure */ + rc = -EPERM; + + if (rc) + remove_migration_ptes(page, page); +unlock: + unlock_page(page); +ret: + if (rc != -EAGAIN) { + /* + * A page that has been migrated has all references + * removed and will be freed. A page that has not been + * migrated will have kepts its references and be + * restored. + */ + list_del(&page->lru); + move_to_lru(page); + + list_del(&newpage->lru); + move_to_lru(newpage); + } + return rc; +} + +/* + * migrate_pages + * + * Two lists are passed to this function. The first list + * contains the pages isolated from the LRU to be migrated. + * The second list contains new pages that the isolated pages + * can be moved to. + * + * The function returns after 10 attempts or if no pages + * are movable anymore because to has become empty + * or no retryable pages exist anymore. All pages will be + * retruned to the LRU or freed. + * + * Return: Number of pages not migrated. + */ +int migrate_pages(struct list_head *from, struct list_head *to) +{ + int retry = 1; + int nr_failed = 0; + int pass = 0; + struct page *page; + struct page *page2; + int swapwrite = current->flags & PF_SWAPWRITE; + int rc; + + if (!swapwrite) + current->flags |= PF_SWAPWRITE; + + for(pass = 0; pass < 10 && retry; pass++) { + retry = 0; + + list_for_each_entry_safe(page, page2, from, lru) { + + if (list_empty(to)) + break; + + cond_resched(); + + rc = unmap_and_move(lru_to_page(to), page, pass > 2); + + switch(rc) { + case -EAGAIN: + retry++; + break; + case 0: + break; + default: + /* Permanent failure */ + nr_failed++; + break; + } + } + } + + if (!swapwrite) + current->flags &= ~PF_SWAPWRITE; + + putback_lru_pages(from); + return nr_failed + retry; +} + /* * Migrate the list 'pagelist' of pages to a certain destination. * @@ -588,11 +710,10 @@ int migrate_pages_to(struct list_head *pagelist, struct vm_area_struct *vma, int dest) { LIST_HEAD(newlist); - LIST_HEAD(moved); - LIST_HEAD(failed); int err = 0; unsigned long offset = 0; int nr_pages; + int nr_failed = 0; struct page *page; struct list_head *p; @@ -626,26 +747,17 @@ redo: if (nr_pages > MIGRATE_CHUNK_SIZE) break; } - err = migrate_pages(pagelist, &newlist, &moved, &failed); + err = migrate_pages(pagelist, &newlist); - putback_lru_pages(&moved); /* Call release pages instead ?? */ - - if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) - goto redo; -out: - /* Return leftover allocated pages */ - while (!list_empty(&newlist)) { - page = list_entry(newlist.next, struct page, lru); - list_del(&page->lru); - __free_page(page); + if (err >= 0) { + nr_failed += err; + if (list_empty(&newlist) && !list_empty(pagelist)) + goto redo; } - list_splice(&failed, pagelist); - if (err < 0) - return err; +out: /* Calculate number of leftover pages */ - nr_pages = 0; list_for_each(p, pagelist) - nr_pages++; - return nr_pages; + nr_failed++; + return nr_failed; }