/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */
#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/hugetlb.h>
#include <trace/events/kvm.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_mmio.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>

#include "trace.h"

extern char  __hyp_idmap_text_start[], __hyp_idmap_text_end[];

static pgd_t *boot_hyp_pgd;
static pgd_t *hyp_pgd;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static void *init_bounce_page;
static unsigned long hyp_idmap_start;
static unsigned long hyp_idmap_end;
static phys_addr_t hyp_idmap_vector;

#define pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))

#define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
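
/*
 * pgd_order sizes the Hyp PGD allocations to hold PTRS_PER_PGD entries.
 * kvm_pmd_huge() is true for any pmd that maps a whole PMD_SIZE block:
 * either a hugetlbfs section (pmd_huge) or a transparent huge page
 * (pmd_trans_huge). Both cases are treated the same by the walkers below.
 */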

static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
{
	/*
	 * This function also gets called when dealing with HYP page
	 * tables. As HYP doesn't have an associated struct kvm (and
	 * the HYP page tables are fairly static), we don't do
	 * anything there.
	 */
	if (kvm)
		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
}

static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				  int min, int max)
{
	void *page;

	BUG_ON(max > KVM_NR_MEM_OBJS);
	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < max) {
		page = (void *)__get_free_page(PGALLOC_GFP);
		if (!page)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = page;
	}
	return 0;
}

static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)
		free_page((unsigned long)mc->objects[--mc->nobjs]);
}

static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	BUG_ON(!mc || !mc->nobjs);
	p = mc->objects[--mc->nobjs];
	return p;
}

static bool page_empty(void *ptr)
{
	struct page *ptr_page = virt_to_page(ptr);
	return page_count(ptr_page) == 1;
}
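
/*
 * Page-table pages are reference counted: every entry installed in a
 * table takes a reference on the page holding that table (see the
 * get_page() calls in the mapping paths below), on top of the single
 * reference from the allocation itself. A count of 1 therefore means
 * the table holds no entries and may be freed by the clear_*_entry
 * helpers.
 */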

static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
{
	if (pud_huge(*pud)) {
		pud_clear(pud);
		kvm_tlb_flush_vmid_ipa(kvm, addr);
	} else {
		pmd_t *pmd_table = pmd_offset(pud, 0);
		pud_clear(pud);
		kvm_tlb_flush_vmid_ipa(kvm, addr);
		pmd_free(NULL, pmd_table);
	}
	put_page(virt_to_page(pud));
}

static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
{
	if (kvm_pmd_huge(*pmd)) {
		pmd_clear(pmd);
		kvm_tlb_flush_vmid_ipa(kvm, addr);
	} else {
		pte_t *pte_table = pte_offset_kernel(pmd, 0);
		pmd_clear(pmd);
		kvm_tlb_flush_vmid_ipa(kvm, addr);
		pte_free_kernel(NULL, pte_table);
	}
	put_page(virt_to_page(pmd));
}

static void clear_pte_entry(struct kvm *kvm, pte_t *pte, phys_addr_t addr)
{
	if (pte_present(*pte)) {
		kvm_set_pte(pte, __pte(0));
		put_page(virt_to_page(pte));
		kvm_tlb_flush_vmid_ipa(kvm, addr);
	}
}

static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
			unsigned long long start, u64 size)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	unsigned long long addr = start, end = start + size;
	u64 next;

	while (addr < end) {
		pgd = pgdp + pgd_index(addr);
		pud = pud_offset(pgd, addr);
		pte = NULL;
		if (pud_none(*pud)) {
			addr = kvm_pud_addr_end(addr, end);
			continue;
		}

		if (pud_huge(*pud)) {
			/*
			 * If we are dealing with a huge pud, just clear it and
			 * move on.
			 */
			clear_pud_entry(kvm, pud, addr);
			addr = kvm_pud_addr_end(addr, end);
			continue;
		}

		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			addr = kvm_pmd_addr_end(addr, end);
			continue;
		}

		if (!kvm_pmd_huge(*pmd)) {
			pte = pte_offset_kernel(pmd, addr);
			clear_pte_entry(kvm, pte, addr);
			next = addr + PAGE_SIZE;
		}

		/*
		 * If the pmd entry is to be cleared, walk back up the ladder
		 */
		if (kvm_pmd_huge(*pmd) || (pte && page_empty(pte))) {
			clear_pmd_entry(kvm, pmd, addr);
			next = kvm_pmd_addr_end(addr, end);
			if (page_empty(pmd) && !page_empty(pud)) {
				clear_pud_entry(kvm, pud, addr);
				next = kvm_pud_addr_end(addr, end);
			}
		}

		addr = next;
	}
}
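
/*
 * unmap_range() serves both the stage-2 tables (kvm != NULL, so TLB
 * invalidation is done by IPA) and the Hyp tables (kvm == NULL): it
 * clears the entries covering [start, start + size) and, via
 * page_empty(), frees any intermediate table page left without entries.
 */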

static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
			      phys_addr_t addr, phys_addr_t end)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		if (!pte_none(*pte)) {
			hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT);
			kvm_flush_dcache_to_poc((void*)hva, PAGE_SIZE);
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
}

static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
			      phys_addr_t addr, phys_addr_t end)
{
	pmd_t *pmd;
	phys_addr_t next;

	pmd = pmd_offset(pud, addr);
	do {
		next = kvm_pmd_addr_end(addr, end);
		if (!pmd_none(*pmd)) {
			if (kvm_pmd_huge(*pmd)) {
				hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT);
				kvm_flush_dcache_to_poc((void*)hva, PMD_SIZE);
			} else {
				stage2_flush_ptes(kvm, pmd, addr, next);
			}
		}
	} while (pmd++, addr = next, addr != end);
}

static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
			      phys_addr_t addr, phys_addr_t end)
{
	pud_t *pud;
	phys_addr_t next;

	pud = pud_offset(pgd, addr);
	do {
		next = kvm_pud_addr_end(addr, end);
		if (!pud_none(*pud)) {
			if (pud_huge(*pud)) {
				hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT);
				kvm_flush_dcache_to_poc((void*)hva, PUD_SIZE);
			} else {
				stage2_flush_pmds(kvm, pud, addr, next);
			}
		}
	} while (pud++, addr = next, addr != end);
}

static void stage2_flush_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
	phys_addr_t next;
	pgd_t *pgd;

	pgd = kvm->arch.pgd + pgd_index(addr);
	do {
		next = kvm_pgd_addr_end(addr, end);
		stage2_flush_puds(kvm, pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}

/**
 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 * @kvm: The struct kvm pointer
 *
 * Go through the stage 2 page tables and invalidate any cache lines
 * backing memory already mapped to the VM.
 */
void stage2_flush_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots)
		stage2_flush_memslot(kvm, memslot);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

/**
 * free_boot_hyp_pgd - free HYP boot page tables
 *
 * Free the HYP boot page tables. The bounce page is also freed.
 */
void free_boot_hyp_pgd(void)
{
	mutex_lock(&kvm_hyp_pgd_mutex);

	if (boot_hyp_pgd) {
		unmap_range(NULL, boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
		unmap_range(NULL, boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
		free_pages((unsigned long)boot_hyp_pgd, pgd_order);
		boot_hyp_pgd = NULL;
	}

	if (hyp_pgd)
		unmap_range(NULL, hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);

	free_page((unsigned long)init_bounce_page);
	init_bounce_page = NULL;

	mutex_unlock(&kvm_hyp_pgd_mutex);
}

/**
 * free_hyp_pgds - free Hyp-mode page tables
 *
 * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
 * therefore contains either mappings in the kernel memory area (above
 * PAGE_OFFSET), or device mappings in the vmalloc range (from
 * VMALLOC_START to VMALLOC_END).
 *
 * boot_hyp_pgd should only map two pages for the init code.
 */
void free_hyp_pgds(void)
{
	unsigned long addr;

	mutex_lock(&kvm_hyp_pgd_mutex);

	if (hyp_pgd) {
		for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
			unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
		for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
			unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);

		free_pages((unsigned long)hyp_pgd, pgd_order);
		hyp_pgd = NULL;
	}

	mutex_unlock(&kvm_hyp_pgd_mutex);
}

static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
				    unsigned long end, unsigned long pfn,
				    pgprot_t prot)
{
	pte_t *pte;
	unsigned long addr;

	addr = start;
	do {
		pte = pte_offset_kernel(pmd, addr);
		kvm_set_pte(pte, pfn_pte(pfn, prot));
		get_page(virt_to_page(pte));
		kvm_flush_dcache_to_poc(pte, sizeof(*pte));
		pfn++;
	} while (addr += PAGE_SIZE, addr != end);
}

static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
				   unsigned long end, unsigned long pfn,
				   pgprot_t prot)
{
	pmd_t *pmd;
	pte_t *pte;
	unsigned long addr, next;

	addr = start;
	do {
		pmd = pmd_offset(pud, addr);

		BUG_ON(pmd_sect(*pmd));

		if (pmd_none(*pmd)) {
			pte = pte_alloc_one_kernel(NULL, addr);
			if (!pte) {
				kvm_err("Cannot allocate Hyp pte\n");
				return -ENOMEM;
			}
			pmd_populate_kernel(NULL, pmd, pte);
			get_page(virt_to_page(pmd));
			kvm_flush_dcache_to_poc(pmd, sizeof(*pmd));
		}

		next = pmd_addr_end(addr, end);

		create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);

	return 0;
}

static int __create_hyp_mappings(pgd_t *pgdp,
				 unsigned long start, unsigned long end,
				 unsigned long pfn, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	unsigned long addr, next;
	int err = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);
	addr = start & PAGE_MASK;
	end = PAGE_ALIGN(end);
	do {
		pgd = pgdp + pgd_index(addr);
		pud = pud_offset(pgd, addr);

		if (pud_none_or_clear_bad(pud)) {
			pmd = pmd_alloc_one(NULL, addr);
			if (!pmd) {
				kvm_err("Cannot allocate Hyp pmd\n");
				err = -ENOMEM;
				goto out;
			}
			pud_populate(NULL, pud, pmd);
			get_page(virt_to_page(pud));
			kvm_flush_dcache_to_poc(pud, sizeof(*pud));
		}

		next = pgd_addr_end(addr, end);
		err = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
		if (err)
			goto out;
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);
out:
	mutex_unlock(&kvm_hyp_pgd_mutex);
	return err;
}

static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (!is_vmalloc_addr(kaddr)) {
		BUG_ON(!virt_addr_valid(kaddr));
		return __pa(kaddr);
	} else {
		return page_to_phys(vmalloc_to_page(kaddr)) +
		       offset_in_page(kaddr);
	}
}
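
/*
 * Kernel addresses in the linear map translate directly to physical
 * addresses, while vmalloc addresses are only contiguous virtually, so
 * each page must be looked up with vmalloc_to_page() and the in-page
 * offset added back.
 */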

/**
 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 * @from:	The virtual kernel start address of the range
 * @to:		The virtual kernel end address of the range (exclusive)
 *
 * The same virtual address as the kernel virtual address is also used
 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 * physical pages.
 */
int create_hyp_mappings(void *from, void *to)
{
	phys_addr_t phys_addr;
	unsigned long virt_addr;
	unsigned long start = KERN_TO_HYP((unsigned long)from);
	unsigned long end = KERN_TO_HYP((unsigned long)to);
	int err;

	start = start & PAGE_MASK;
	end = PAGE_ALIGN(end);

	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
		err = __create_hyp_mappings(hyp_pgd, virt_addr,
					    virt_addr + PAGE_SIZE,
					    __phys_to_pfn(phys_addr),
					    PAGE_HYP);
		if (err)
			return err;
	}

	return 0;
}

/**
 * create_hyp_io_mappings - duplicate a kernel IO mapping into Hyp mode
 * @from:	The kernel start VA of the range
 * @to:		The kernel end VA of the range (exclusive)
 * @phys_addr:	The physical start address which gets mapped
 *
 * The resulting HYP VA is the same as the kernel VA, modulo
 * HYP_PAGE_OFFSET.
 */
int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
{
	unsigned long start = KERN_TO_HYP((unsigned long)from);
	unsigned long end = KERN_TO_HYP((unsigned long)to);

	/* Check for a valid kernel IO mapping */
	if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))
		return -EINVAL;

	return __create_hyp_mappings(hyp_pgd, start, end,
				     __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
}

/**
 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
 * @kvm:	The KVM struct pointer for the VM.
 *
 * Allocates the 1st level table only of size defined by S2_PGD_ORDER (can
 * support either full 40-bit input addresses or limited to 32-bit input
 * addresses). Clears the allocated pages.
 *
 * Note we don't need locking here as this is only called when the VM is
 * created, which can only be done once.
 */
int kvm_alloc_stage2_pgd(struct kvm *kvm)
{
	pgd_t *pgd;

	if (kvm->arch.pgd != NULL) {
		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, S2_PGD_ORDER);
	if (!pgd)
		return -ENOMEM;

	memset(pgd, 0, PTRS_PER_S2_PGD * sizeof(pgd_t));
	kvm->arch.pgd = pgd;

	return 0;
}

/**
 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 * @kvm:   The VM pointer
 * @start: The intermediate physical base address of the range to unmap
 * @size:  The size of the area to unmap
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 */
static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
{
	unmap_range(kvm, kvm->arch.pgd, start, size);
}

/**
 * kvm_free_stage2_pgd - free all stage-2 tables
 * @kvm:	The KVM struct pointer for the VM.
 *
 * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
 * underlying level-2 and level-3 tables before freeing the actual level-1
 * table and setting the struct pointer to NULL.
 *
 * Note we don't need locking here as this is only called when the VM is
 * destroyed, which can only be done once.
 */
void kvm_free_stage2_pgd(struct kvm *kvm)
{
	if (kvm->arch.pgd == NULL)
		return;

	unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
	free_pages((unsigned long)kvm->arch.pgd, S2_PGD_ORDER);
	kvm->arch.pgd = NULL;
}

static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			     phys_addr_t addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	pgd = kvm->arch.pgd + pgd_index(addr);
	pud = pud_offset(pgd, addr);
	if (pud_none(*pud)) {
		if (!cache)
			return NULL;
		pmd = mmu_memory_cache_alloc(cache);
		pud_populate(NULL, pud, pmd);
		get_page(virt_to_page(pud));
	}

	return pmd_offset(pud, addr);
}
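
/*
 * When stage2_get_pmd() has to allocate a missing intermediate table it
 * takes the page from the pre-filled kvm_mmu_memory_cache rather than
 * from the page allocator: the callers run under kvm->mmu_lock (a
 * spinlock) and must not sleep, so the cache is topped up beforehand
 * with mmu_topup_memory_cache(). A NULL cache means the caller is not
 * prepared to allocate, and NULL is returned instead.
 */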

static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
{
	pmd_t *pmd, old_pmd;

	pmd = stage2_get_pmd(kvm, cache, addr);
	VM_BUG_ON(!pmd);

	/*
	 * Mapping in huge pages should only happen through a fault.  If a
	 * page is merged into a transparent huge page, the individual
	 * subpages of that huge page should be unmapped through MMU
	 * notifiers before we get here.
	 *
	 * Merging of CompoundPages is not supported; they should instead be
	 * split first, unmapped, merged, and mapped back in on-demand.
	 */
	VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));

	old_pmd = *pmd;
	kvm_set_pmd(pmd, *new_pmd);
	if (pmd_present(old_pmd))
		kvm_tlb_flush_vmid_ipa(kvm, addr);
	else
		get_page(virt_to_page(pmd));

	return 0;
}

static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
{
	pmd_t *pmd;
	pte_t *pte, old_pte;

	/* Create stage-2 page table mapping - Level 1 */
	pmd = stage2_get_pmd(kvm, cache, addr);
	if (!pmd) {
		/*
		 * Ignore calls from kvm_set_spte_hva for unallocated
		 * address ranges.
		 */
		return 0;
	}

	/* Create stage-2 page mappings - Level 2 */
	if (pmd_none(*pmd)) {
		if (!cache)
			return 0; /* ignore calls from kvm_set_spte_hva */
		pte = mmu_memory_cache_alloc(cache);
		pmd_populate_kernel(NULL, pmd, pte);
		get_page(virt_to_page(pmd));
	}

	pte = pte_offset_kernel(pmd, addr);

	if (iomap && pte_present(*pte))
		return -EFAULT;

	/* Create 2nd stage page table mapping - Level 3 */
	old_pte = *pte;
	kvm_set_pte(pte, *new_pte);
	if (pte_present(old_pte))
		kvm_tlb_flush_vmid_ipa(kvm, addr);
	else
		get_page(virt_to_page(pte));

	return 0;
}
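
/*
 * The iomap argument is set by kvm_phys_addr_ioremap() below: device
 * ranges are expected to be mapped exactly once, so finding a pte
 * already present is reported as -EFAULT rather than silently
 * overwriting the existing mapping.
 */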

/**
 * kvm_phys_addr_ioremap - map a device range to guest IPA
 *
 * @kvm:	The KVM pointer
 * @guest_ipa:	The IPA at which to insert the mapping
 * @pa:		The physical address of the device
 * @size:	The size of the mapping
 */
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
			  phys_addr_t pa, unsigned long size)
{
	phys_addr_t addr, end;
	int ret = 0;
	unsigned long pfn;
	struct kvm_mmu_memory_cache cache = { 0, };

	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
	pfn = __phys_to_pfn(pa);

	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
		pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);

		ret = mmu_topup_memory_cache(&cache, 2, 2);
		if (ret)
			goto out;
		spin_lock(&kvm->mmu_lock);
		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
		spin_unlock(&kvm->mmu_lock);
		if (ret)
			goto out;

		pfn++;
	}

out:
	mmu_free_memory_cache(&cache);
	return ret;
}

static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
{
	pfn_t pfn = *pfnp;
	gfn_t gfn = *ipap >> PAGE_SHIFT;

	if (PageTransCompound(pfn_to_page(pfn))) {
		unsigned long mask;
		/*
		 * The address we faulted on is backed by a transparent huge
		 * page.  However, because we map the compound huge page and
		 * not the individual tail page, we need to transfer the
		 * refcount to the head page.  We have to be careful that the
		 * THP doesn't start to split while we are adjusting the
		 * refcounts.
		 *
		 * We are sure this doesn't happen, because mmu_notifier_retry
		 * was successful and we are holding the mmu_lock, so if this
		 * THP is trying to split, it will be blocked in the mmu
		 * notifier before touching any of the pages, specifically
		 * before being able to call __split_huge_page_refcount().
		 *
		 * We can therefore safely transfer the refcount from PG_tail
		 * to PG_head and switch the pfn from a tail page to the head
		 * page accordingly.
		 */
		mask = PTRS_PER_PMD - 1;
		VM_BUG_ON((gfn & mask) != (pfn & mask));
		if (pfn & mask) {
			*ipap &= PMD_MASK;
			kvm_release_pfn_clean(pfn);
			pfn &= ~mask;
			kvm_get_pfn(pfn);
			*pfnp = pfn;
		}

		return true;
	}

	return false;
}
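
/*
 * Example (assuming 4K pages, so PTRS_PER_PMD == 512 and mask == 0x1ff):
 * a fault at IPA 0x40123000 (gfn 0x40123) backed by a THP tail page at
 * pfn 0x89123 is rounded down to the 2MB boundary, giving IPA 0x40000000
 * and head pfn 0x89000, which stage2_set_pmd_huge() then maps as a
 * single block.
 */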

static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  struct kvm_memory_slot *memslot,
			  unsigned long fault_status)
{
	int ret;
	bool write_fault, writable, hugetlb = false, force_pte = false;
	unsigned long mmu_seq;
	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
	unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
	struct vm_area_struct *vma;
	pfn_t pfn;

	write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
	if (fault_status == FSC_PERM && !write_fault) {
		kvm_err("Unexpected L2 read permission error\n");
		return -EFAULT;
	}

	/* Let's check if we will get back a huge page backed by hugetlbfs */
	down_read(&current->mm->mmap_sem);
	vma = find_vma_intersection(current->mm, hva, hva + 1);
	if (is_vm_hugetlb_page(vma)) {
		hugetlb = true;
		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
	} else {
		/*
		 * Pages belonging to memslots that don't have the same
		 * alignment for userspace and IPA cannot be mapped using
		 * block descriptors even if the pages belong to a THP for
		 * the process, because the stage-2 block descriptor will
		 * cover more than a single THP and we lose atomicity for
		 * unmapping, updates, and splits of the THP or other pages
		 * in the stage-2 block range.
		 */
		if ((memslot->userspace_addr & ~PMD_MASK) !=
		    ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK))
			force_pte = true;
	}
	up_read(&current->mm->mmap_sem);

	/* We need minimum second+third level pages */
	ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS);
	if (ret)
		return ret;

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	/*
	 * Ensure the read of mmu_notifier_seq happens before we call
	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
	 * the page we just got a reference to getting unmapped before we have
	 * a chance to grab the mmu_lock, which ensures that if the page gets
	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
	 */
	smp_rmb();

	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
	if (is_error_pfn(pfn))
		return -EFAULT;

	spin_lock(&kvm->mmu_lock);
	if (mmu_notifier_retry(kvm, mmu_seq))
		goto out_unlock;
	if (!hugetlb && !force_pte)
		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);

	if (hugetlb) {
		pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2);
		new_pmd = pmd_mkhuge(new_pmd);
		if (writable) {
			kvm_set_s2pmd_writable(&new_pmd);
			kvm_set_pfn_dirty(pfn);
		}
		coherent_cache_guest_page(vcpu, hva & PMD_MASK, PMD_SIZE);
		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
	} else {
		pte_t new_pte = pfn_pte(pfn, PAGE_S2);
		if (writable) {
			kvm_set_s2pte_writable(&new_pte);
			kvm_set_pfn_dirty(pfn);
		}
		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE);
		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
	}

out_unlock:
	spin_unlock(&kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return ret;
}
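
/*
 * user_mem_abort() thus resolves a stage-2 fault on memslot-backed RAM in
 * one of two ways: a PMD-sized block mapping when the backing is a
 * hugetlbfs page or a suitably aligned THP, or a single small-page pte
 * otherwise. In both cases the backing page is cleaned to the point of
 * coherency before it is mapped, so that a guest still running with its
 * caches disabled observes the data it expects.
 */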

/**
 * kvm_handle_guest_abort - handles all 2nd stage aborts
 * @vcpu:	the VCPU pointer
 * @run:	the kvm_run structure
 *
 * Any abort that gets to the host is almost guaranteed to be caused by a
 * missing second stage translation table entry, which can mean either that
 * the guest simply needs more memory and we must allocate an appropriate
 * page, or that the guest tried to access I/O memory, which is emulated by
 * user space. The distinction is based on the IPA causing the fault and
 * whether this memory region has been registered as standard RAM by user
 * space.
 */
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
	unsigned long fault_status;
	phys_addr_t fault_ipa;
	struct kvm_memory_slot *memslot;
	bool is_iabt;
	gfn_t gfn;
	int ret, idx;

	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);

	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
			      kvm_vcpu_get_hfar(vcpu), fault_ipa);

	/* Check the stage-2 fault is trans. fault or write fault */
	fault_status = kvm_vcpu_trap_get_fault(vcpu);
	if (fault_status != FSC_FAULT && fault_status != FSC_PERM) {
		kvm_err("Unsupported fault status: EC=%#x DFCS=%#lx\n",
			kvm_vcpu_trap_get_class(vcpu), fault_status);
		return -EFAULT;
	}

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	gfn = fault_ipa >> PAGE_SHIFT;
	if (!kvm_is_visible_gfn(vcpu->kvm, gfn)) {
		if (is_iabt) {
			/* Prefetch Abort on I/O address */
			kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
			ret = 1;
			goto out_unlock;
		}

		if (fault_status != FSC_FAULT) {
			kvm_err("Unsupported fault status on io memory: %#lx\n",
				fault_status);
			ret = -EFAULT;
			goto out_unlock;
		}

		/*
		 * The IPA is reported as [MAX:12], so we need to
		 * complement it with the bottom 12 bits from the
		 * faulting VA. This is always 12 bits, irrespective
		 * of the page size.
		 */
		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
		ret = io_mem_abort(vcpu, run, fault_ipa);
		goto out_unlock;
	}

	memslot = gfn_to_memslot(vcpu->kvm, gfn);

	ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status);
	if (ret == 0)
		ret = 1;
out_unlock:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
}

static void handle_hva_to_gpa(struct kvm *kvm,
			      unsigned long start,
			      unsigned long end,
			      void (*handler)(struct kvm *kvm,
					      gpa_t gpa, void *data),
			      void *data)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;

	slots = kvm_memslots(kvm);

	/* we only care about the pages that the guest sees */
	kvm_for_each_memslot(memslot, slots) {
		unsigned long hva_start, hva_end;
		gfn_t gfn, gfn_end;

		hva_start = max(start, memslot->userspace_addr);
		hva_end = min(end, memslot->userspace_addr +
					(memslot->npages << PAGE_SHIFT));
		if (hva_start >= hva_end)
			continue;

		/*
		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
		 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
		 */
		gfn = hva_to_gfn_memslot(hva_start, memslot);
		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

		for (; gfn < gfn_end; ++gfn) {
			gpa_t gpa = gfn << PAGE_SHIFT;
			handler(kvm, gpa, data);
		}
	}
}
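
/*
 * handle_hva_to_gpa() is the bridge between the MMU notifier callbacks
 * below, which operate on host virtual addresses, and the stage-2
 * tables, which are indexed by guest physical address: the hva range is
 * clamped to each memslot and the handler is invoked once per guest
 * page that the range touches.
 */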

static void kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
{
	unmap_stage2_range(kvm, gpa, PAGE_SIZE);
}

int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
	unsigned long end = hva + PAGE_SIZE;

	if (!kvm->arch.pgd)
		return 0;

	trace_kvm_unmap_hva(hva);
	handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL);
	return 0;
}

int kvm_unmap_hva_range(struct kvm *kvm,
			unsigned long start, unsigned long end)
{
	if (!kvm->arch.pgd)
		return 0;

	trace_kvm_unmap_hva_range(start, end);
	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
	return 0;
}

static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
{
	pte_t *pte = (pte_t *)data;

	stage2_set_pte(kvm, NULL, gpa, pte, false);
}

void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	unsigned long end = hva + PAGE_SIZE;
	pte_t stage2_pte;

	if (!kvm->arch.pgd)
		return;

	trace_kvm_set_spte_hva(hva);
	stage2_pte = pfn_pte(pte_pfn(pte), PAGE_S2);
	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
}

void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
}

phys_addr_t kvm_mmu_get_httbr(void)
{
	return virt_to_phys(hyp_pgd);
}

phys_addr_t kvm_mmu_get_boot_httbr(void)
{
	return virt_to_phys(boot_hyp_pgd);
}

phys_addr_t kvm_get_idmap_vector(void)
{
	return hyp_idmap_vector;
}

int kvm_mmu_init(void)
{
	int err;

	hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
	hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
	hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);

	if ((hyp_idmap_start ^ hyp_idmap_end) & PAGE_MASK) {
		/*
		 * Our init code is crossing a page boundary. Allocate
		 * a bounce page, copy the code over and use that.
		 */
		size_t len = __hyp_idmap_text_end - __hyp_idmap_text_start;
		phys_addr_t phys_base;

		init_bounce_page = (void *)__get_free_page(GFP_KERNEL);
		if (!init_bounce_page) {
			kvm_err("Couldn't allocate HYP init bounce page\n");
			err = -ENOMEM;
			goto out;
		}

		memcpy(init_bounce_page, __hyp_idmap_text_start, len);
		/*
		 * Warning: the code we just copied to the bounce page
		 * must be flushed to the point of coherency.
		 * Otherwise, the data may be sitting in L2, and HYP
		 * mode won't be able to observe it as it runs with
		 * caches off at that point.
		 */
		kvm_flush_dcache_to_poc(init_bounce_page, len);

		phys_base = kvm_virt_to_phys(init_bounce_page);
		hyp_idmap_vector += phys_base - hyp_idmap_start;
		hyp_idmap_start = phys_base;
		hyp_idmap_end = phys_base + len;

		kvm_info("Using HYP init bounce page @%lx\n",
			 (unsigned long)phys_base);
	}

	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, pgd_order);
	boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, pgd_order);

	if (!hyp_pgd || !boot_hyp_pgd) {
		kvm_err("Hyp mode PGD not allocated\n");
		err = -ENOMEM;
		goto out;
	}

	/* Create the idmap in the boot page tables */
	err = __create_hyp_mappings(boot_hyp_pgd,
				    hyp_idmap_start, hyp_idmap_end,
				    __phys_to_pfn(hyp_idmap_start),
				    PAGE_HYP);
	if (err) {
		kvm_err("Failed to idmap %lx-%lx\n",
			hyp_idmap_start, hyp_idmap_end);
		goto out;
	}

	/* Map the very same page at the trampoline VA */
	err = __create_hyp_mappings(boot_hyp_pgd,
				    TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
				    __phys_to_pfn(hyp_idmap_start),
				    PAGE_HYP);
	if (err) {
		kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n",
			TRAMPOLINE_VA);
		goto out;
	}

	/* Map the same page again into the runtime page tables */
	err = __create_hyp_mappings(hyp_pgd,
				    TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
				    __phys_to_pfn(hyp_idmap_start),
				    PAGE_HYP);
	if (err) {
		kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n",
			TRAMPOLINE_VA);
		goto out;
	}

	return 0;
out:
	free_hyp_pgds();
	return err;
}