/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 */
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/sysdev.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm-generic/bitops/le.h>

#include "coalesced_mmio.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
/*
 * Ordering of locks:
 *
 *        kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

DEFINE_SPINLOCK(kvm_lock);
LIST_HEAD(vm_list);

static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count = 0;
static atomic_t hardware_enable_failed;

struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);

static __read_mostly struct preempt_ops kvm_preempt_ops;

struct dentry *kvm_debugfs_dir;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
                           unsigned long arg);
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static bool kvm_rebooting;

static bool largepages_enabled = true;
inline int kvm_is_mmio_pfn(pfn_t pfn)
{
        if (pfn_valid(pfn)) {
                struct page *page = compound_head(pfn_to_page(pfn));
                return PageReserved(page);
        }

        return true;
}
/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
        int cpu;

        mutex_lock(&vcpu->mutex);
        cpu = get_cpu();
        preempt_notifier_register(&vcpu->preempt_notifier);
        kvm_arch_vcpu_load(vcpu, cpu);
        put_cpu();
}

void vcpu_put(struct kvm_vcpu *vcpu)
{
        preempt_disable();
        kvm_arch_vcpu_put(vcpu);
        preempt_notifier_unregister(&vcpu->preempt_notifier);
        preempt_enable();
        mutex_unlock(&vcpu->mutex);
}
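/*
 * Illustrative pairing (a sketch, not a new API): callers must bracket
 * any work that touches vcpu state with these two helpers, e.g.
 *
 *        vcpu_load(vcpu);
 *        ... operate on vcpu registers / arch state ...
 *        vcpu_put(vcpu);
 */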
static void ack_flush(void *_completed)
{
}

static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
        int i, cpu, me;
        cpumask_var_t cpus;
        bool called = true;
        struct kvm_vcpu *vcpu;

        zalloc_cpumask_var(&cpus, GFP_ATOMIC);

        spin_lock(&kvm->requests_lock);
        me = smp_processor_id();
        kvm_for_each_vcpu(i, vcpu, kvm) {
                if (test_and_set_bit(req, &vcpu->requests))
                        continue;
                cpu = vcpu->cpu;
                if (cpus != NULL && cpu != -1 && cpu != me)
                        cpumask_set_cpu(cpu, cpus);
        }
        if (unlikely(cpus == NULL))
                smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
        else if (!cpumask_empty(cpus))
                smp_call_function_many(cpus, ack_flush, NULL, 1);
        else
                called = false;
        spin_unlock(&kvm->requests_lock);
        free_cpumask_var(cpus);
        return called;
}
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
        if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
                ++kvm->stat.remote_tlb_flush;
}

void kvm_reload_remote_mmus(struct kvm *kvm)
{
        make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
}
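/*
 * Consumer-side sketch (an assumption: this mirrors the arch vcpu run
 * loop, which is not part of this file): each vcpu polls its request
 * bitmap before re-entering the guest, e.g.
 *
 *        if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
 *                flush_this_vcpu_tlb(vcpu);   // hypothetical arch hook
 */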
int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
        struct page *page;
        int r;

        mutex_init(&vcpu->mutex);
        vcpu->cpu = -1;
        vcpu->kvm = kvm;
        vcpu->vcpu_id = id;
        init_waitqueue_head(&vcpu->wq);

        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
        if (!page) {
                r = -ENOMEM;
                goto fail;
        }
        vcpu->run = page_address(page);

        r = kvm_arch_vcpu_init(vcpu);
        if (r < 0)
                goto fail_free_run;
        return 0;

fail_free_run:
        free_page((unsigned long)vcpu->run);
fail:
        return r;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_init);

void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
        kvm_arch_vcpu_uninit(vcpu);
        free_page((unsigned long)vcpu->run);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
        return container_of(mn, struct kvm, mmu_notifier);
}

static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
                                             struct mm_struct *mm,
                                             unsigned long address)
{
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
        int need_tlb_flush;

        /*
         * When ->invalidate_page runs, the linux pte has been zapped
         * already but the page is still allocated until
         * ->invalidate_page returns. So if we increase the sequence
         * here the kvm page fault will notice if the spte can't be
         * established because the page is going to be freed. If
         * instead the kvm page fault establishes the spte before
         * ->invalidate_page runs, kvm_unmap_hva will release it
         * before returning.
         *
         * The sequence increase only needs to be seen at spin_unlock
         * time, and not at spin_lock time.
         *
         * Increasing the sequence after the spin_unlock would be
         * unsafe because the kvm page fault could then establish the
         * pte after kvm_unmap_hva returned, without noticing the page
         * is going to be freed.
         */
        spin_lock(&kvm->mmu_lock);
        kvm->mmu_notifier_seq++;
        need_tlb_flush = kvm_unmap_hva(kvm, address);
        spin_unlock(&kvm->mmu_lock);

        /* we have to flush the tlb before the pages can be freed */
        if (need_tlb_flush)
                kvm_flush_remote_tlbs(kvm);
}
static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
                                        struct mm_struct *mm,
                                        unsigned long address,
                                        pte_t pte)
{
        struct kvm *kvm = mmu_notifier_to_kvm(mn);

        spin_lock(&kvm->mmu_lock);
        kvm->mmu_notifier_seq++;
        kvm_set_spte_hva(kvm, address, pte);
        spin_unlock(&kvm->mmu_lock);
}
static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
                                                    struct mm_struct *mm,
                                                    unsigned long start,
                                                    unsigned long end)
{
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
        int need_tlb_flush = 0;

        spin_lock(&kvm->mmu_lock);
        /*
         * The count increase must become visible at unlock time as no
         * spte can be established without taking the mmu_lock and
         * count is also read inside the mmu_lock critical section.
         */
        kvm->mmu_notifier_count++;
        for (; start < end; start += PAGE_SIZE)
                need_tlb_flush |= kvm_unmap_hva(kvm, start);
        spin_unlock(&kvm->mmu_lock);

        /* we have to flush the tlb before the pages can be freed */
        if (need_tlb_flush)
                kvm_flush_remote_tlbs(kvm);
}
static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
                                                  struct mm_struct *mm,
                                                  unsigned long start,
                                                  unsigned long end)
{
        struct kvm *kvm = mmu_notifier_to_kvm(mn);

        spin_lock(&kvm->mmu_lock);
        /*
         * This sequence increase will notify the kvm page fault that
         * the page that is going to be mapped in the spte could have
         * been freed.
         */
        kvm->mmu_notifier_seq++;
        /*
         * The above sequence increase must be visible before the
         * below count decrease, but both values are read by the kvm
         * page fault under the mmu_lock spinlock, so we don't need an
         * smp_wmb() here in between the two.
         */
        kvm->mmu_notifier_count--;
        spin_unlock(&kvm->mmu_lock);

        BUG_ON(kvm->mmu_notifier_count < 0);
}
static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
                                              struct mm_struct *mm,
                                              unsigned long address)
{
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
        int young;

        spin_lock(&kvm->mmu_lock);
        young = kvm_age_hva(kvm, address);
        spin_unlock(&kvm->mmu_lock);

        if (young)
                kvm_flush_remote_tlbs(kvm);

        return young;
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
                                     struct mm_struct *mm)
{
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
        kvm_arch_flush_shadow(kvm);
}
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
        .invalidate_page        = kvm_mmu_notifier_invalidate_page,
        .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
        .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
        .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
        .change_pte             = kvm_mmu_notifier_change_pte,
        .release                = kvm_mmu_notifier_release,
};

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
        kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
        return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}

#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
        return 0;
}

#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
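/*
 * Page-fault-side sketch of the seq/count protocol above (an assumption:
 * this mirrors what arch fault handlers are expected to do; the names
 * here are illustrative, not functions defined in this file):
 *
 *        seq = kvm->mmu_notifier_seq;
 *        smp_rmb();
 *        pfn = lookup_pfn(hva);                // may sleep, mmu_lock not held
 *        spin_lock(&kvm->mmu_lock);
 *        if (kvm->mmu_notifier_count || kvm->mmu_notifier_seq != seq)
 *                goto retry;   // an invalidation raced with us; drop pfn
 *        ... install the spte ...
 *        spin_unlock(&kvm->mmu_lock);
 */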
static struct kvm *kvm_create_vm(void)
{
        int r = 0;
        struct kvm *kvm = kvm_arch_create_vm();
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
        struct page *page;
#endif

        if (IS_ERR(kvm))
                goto out;

        r = hardware_enable_all();
        if (r)
                goto out_err_nodisable;

#ifdef CONFIG_HAVE_KVM_IRQCHIP
        INIT_HLIST_HEAD(&kvm->mask_notifier_list);
        INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
        if (!page) {
                r = -ENOMEM;
                goto out_err;
        }
        kvm->coalesced_mmio_ring =
                        (struct kvm_coalesced_mmio_ring *)page_address(page);
#endif

        r = kvm_init_mmu_notifier(kvm);
        if (r) {
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
                put_page(page);
#endif
                goto out_err;
        }

        kvm->mm = current->mm;
        atomic_inc(&kvm->mm->mm_count);
        spin_lock_init(&kvm->mmu_lock);
        spin_lock_init(&kvm->requests_lock);
        kvm_io_bus_init(&kvm->pio_bus);
        kvm_eventfd_init(kvm);
        mutex_init(&kvm->lock);
        mutex_init(&kvm->irq_lock);
        kvm_io_bus_init(&kvm->mmio_bus);
        init_rwsem(&kvm->slots_lock);
        atomic_set(&kvm->users_count, 1);
        spin_lock(&kvm_lock);
        list_add(&kvm->vm_list, &vm_list);
        spin_unlock(&kvm_lock);
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
        kvm_coalesced_mmio_init(kvm);
#endif
out:
        return kvm;

out_err:
        hardware_disable_all();
out_err_nodisable:
        kfree(kvm);
        return ERR_PTR(r);
}
/*
 * Free any memory in @free but not in @dont.
 */
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
                                  struct kvm_memory_slot *dont)
{
        int i;

        if (!dont || free->rmap != dont->rmap)
                vfree(free->rmap);

        if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
                vfree(free->dirty_bitmap);

        for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
                if (!dont || free->lpage_info[i] != dont->lpage_info[i]) {
                        vfree(free->lpage_info[i]);
                        free->lpage_info[i] = NULL;
                }
        }

        free->npages = 0;
        free->dirty_bitmap = NULL;
        free->rmap = NULL;
}
void kvm_free_physmem(struct kvm *kvm)
{
        int i;

        for (i = 0; i < kvm->nmemslots; ++i)
                kvm_free_physmem_slot(&kvm->memslots[i], NULL);
}
static void kvm_destroy_vm(struct kvm *kvm)
{
        struct mm_struct *mm = kvm->mm;

        kvm_arch_sync_events(kvm);
        spin_lock(&kvm_lock);
        list_del(&kvm->vm_list);
        spin_unlock(&kvm_lock);
        kvm_free_irq_routing(kvm);
        kvm_io_bus_destroy(&kvm->pio_bus);
        kvm_io_bus_destroy(&kvm->mmio_bus);
        kvm_coalesced_mmio_free(kvm);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
        mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
        kvm_arch_flush_shadow(kvm);
#endif
        kvm_arch_destroy_vm(kvm);
        hardware_disable_all();
        mmdrop(mm);
}
void kvm_get_kvm(struct kvm *kvm)
{
        atomic_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

void kvm_put_kvm(struct kvm *kvm)
{
        if (atomic_dec_and_test(&kvm->users_count))
                kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);

static int kvm_vm_release(struct inode *inode, struct file *filp)
{
        struct kvm *kvm = filp->private_data;

        kvm_irqfd_release(kvm);

        kvm_put_kvm(kvm);
        return 0;
}
/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding mmap_sem for write.
 */
int __kvm_set_memory_region(struct kvm *kvm,
                            struct kvm_userspace_memory_region *mem,
                            int user_alloc)
{
        int r;
        gfn_t base_gfn;
        unsigned long npages;
        unsigned long i;
        struct kvm_memory_slot *memslot;
        struct kvm_memory_slot old, new;

        r = -EINVAL;
        /* General sanity checks */
        if (mem->memory_size & (PAGE_SIZE - 1))
                goto out;
        if (mem->guest_phys_addr & (PAGE_SIZE - 1))
                goto out;
        if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1)))
                goto out;
        if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
                goto out;
        if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
                goto out;

        memslot = &kvm->memslots[mem->slot];
        base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
        npages = mem->memory_size >> PAGE_SHIFT;
        if (!npages)
                mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;

        new = old = *memslot;

        new.base_gfn = base_gfn;
        new.npages = npages;
        new.flags = mem->flags;

        /* Disallow changing a memory slot's size. */
        r = -EINVAL;
        if (npages && old.npages && npages != old.npages)
                goto out_free;

        /* Check for overlaps */
        r = -EEXIST;
        for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
                struct kvm_memory_slot *s = &kvm->memslots[i];

                if (s == memslot || !s->npages)
                        continue;
                if (!((base_gfn + npages <= s->base_gfn) ||
                      (base_gfn >= s->base_gfn + s->npages)))
                        goto out_free;
        }

        /* Free page dirty bitmap if unneeded */
        if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
                new.dirty_bitmap = NULL;

        r = -ENOMEM;

        /* Allocate if a slot is being created */
#ifndef CONFIG_S390
        if (npages && !new.rmap) {
                new.rmap = vmalloc(npages * sizeof(struct page *));

                if (!new.rmap)
                        goto out_free;

                memset(new.rmap, 0, npages * sizeof(*new.rmap));

                new.user_alloc = user_alloc;
                /*
                 * hva_to_rmmap() serializes with the mmu_lock and to be
                 * safe it has to ignore memslots with !user_alloc &&
                 * !userspace_addr.
                 */
                if (user_alloc)
                        new.userspace_addr = mem->userspace_addr;
                else
                        new.userspace_addr = 0;
        }
        if (!npages)
                goto skip_lpage;
        for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
                unsigned long ugfn;
                unsigned long j;
                int lpages;
                int level = i + 2;

                /* Avoid unused variable warning if no large pages */
                (void)level;

                if (new.lpage_info[i])
                        continue;

                lpages = 1 + (base_gfn + npages - 1) /
                             KVM_PAGES_PER_HPAGE(level);
                lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level);

                new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i]));

                if (!new.lpage_info[i])
                        goto out_free;

                memset(new.lpage_info[i], 0,
                       lpages * sizeof(*new.lpage_info[i]));

                if (base_gfn % KVM_PAGES_PER_HPAGE(level))
                        new.lpage_info[i][0].write_count = 1;
                if ((base_gfn + npages) % KVM_PAGES_PER_HPAGE(level))
                        new.lpage_info[i][lpages - 1].write_count = 1;
                ugfn = new.userspace_addr >> PAGE_SHIFT;
                /*
                 * If the gfn and userspace address are not aligned wrt each
                 * other, or if explicitly asked to, disable large page
                 * support for this slot
                 */
                if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
                    !largepages_enabled)
                        for (j = 0; j < lpages; ++j)
                                new.lpage_info[i][j].write_count = 1;
        }

skip_lpage:
        /* Allocate page dirty bitmap if needed */
        if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
                unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;

                new.dirty_bitmap = vmalloc(dirty_bytes);
                if (!new.dirty_bitmap)
                        goto out_free;
                memset(new.dirty_bitmap, 0, dirty_bytes);
                if (old.npages)
                        kvm_arch_flush_shadow(kvm);
        }
#else  /* not defined CONFIG_S390 */
        new.user_alloc = user_alloc;
        if (user_alloc)
                new.userspace_addr = mem->userspace_addr;
#endif /* not defined CONFIG_S390 */

        if (!npages)
                kvm_arch_flush_shadow(kvm);
        spin_lock(&kvm->mmu_lock);
        if (mem->slot >= kvm->nmemslots)
                kvm->nmemslots = mem->slot + 1;

        *memslot = new;
        spin_unlock(&kvm->mmu_lock);

        r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
        if (r) {
                spin_lock(&kvm->mmu_lock);
                *memslot = old;
                spin_unlock(&kvm->mmu_lock);
                goto out_free;
        }

        kvm_free_physmem_slot(&old, npages ? &new : NULL);
        /* Slot deletion case: we have to update the current slot */
        spin_lock(&kvm->mmu_lock);
        if (!npages)
                *memslot = old;
        spin_unlock(&kvm->mmu_lock);

        /* map the pages in iommu page table */
        r = kvm_iommu_map_pages(kvm, base_gfn, npages);
        if (r)
                goto out;

        return 0;

out_free:
        kvm_free_physmem_slot(&new, &old);
out:
        return r;
}
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
int kvm_set_memory_region(struct kvm *kvm,
                          struct kvm_userspace_memory_region *mem,
                          int user_alloc)
{
        int r;

        down_write(&kvm->slots_lock);
        r = __kvm_set_memory_region(kvm, mem, user_alloc);
        up_write(&kvm->slots_lock);
        return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);
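/*
 * Caller-side sketch (illustrative; the values are assumptions, not part
 * of this file): registering 16 MB of user memory as guest RAM at guest
 * physical address 0 looks like
 *
 *        struct kvm_userspace_memory_region mem = {
 *                .slot            = 0,
 *                .guest_phys_addr = 0,
 *                .memory_size     = 16 << 20,      // page-aligned
 *                .userspace_addr  = (__u64)hva,    // page-aligned user buffer
 *        };
 *        r = kvm_set_memory_region(kvm, &mem, 1); // 1 == user_alloc
 */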
int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
                                   struct kvm_userspace_memory_region *mem,
                                   int user_alloc)
{
        if (mem->slot >= KVM_MEMORY_SLOTS)
                return -EINVAL;
        return kvm_set_memory_region(kvm, mem, user_alloc);
}
int kvm_get_dirty_log(struct kvm *kvm,
                      struct kvm_dirty_log *log, int *is_dirty)
{
        struct kvm_memory_slot *memslot;
        int r, i;
        int n;
        unsigned long any = 0;

        r = -EINVAL;
        if (log->slot >= KVM_MEMORY_SLOTS)
                goto out;

        memslot = &kvm->memslots[log->slot];
        r = -ENOENT;
        if (!memslot->dirty_bitmap)
                goto out;

        n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;

        for (i = 0; !any && i < n/sizeof(long); ++i)
                any = memslot->dirty_bitmap[i];

        r = -EFAULT;
        if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
                goto out;

        if (any)
                *is_dirty = 1;

        r = 0;
out:
        return r;
}
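/*
 * Sizing note (a worked example, not code from this file): the bitmap
 * holds one bit per page, rounded up to a whole number of longs.  For a
 * 1 GiB slot (262144 4K pages) on a 64-bit host:
 *
 *        n = ALIGN(262144, 64) / 8 = 32768 bytes copied to userspace.
 */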
void kvm_disable_largepages(void)
{
        largepages_enabled = false;
}
EXPORT_SYMBOL_GPL(kvm_disable_largepages);

int is_error_page(struct page *page)
{
        return page == bad_page;
}
EXPORT_SYMBOL_GPL(is_error_page);

int is_error_pfn(pfn_t pfn)
{
        return pfn == bad_pfn;
}
EXPORT_SYMBOL_GPL(is_error_pfn);

static inline unsigned long bad_hva(void)
{
        return PAGE_OFFSET;
}

int kvm_is_error_hva(unsigned long addr)
{
        return addr == bad_hva();
}
EXPORT_SYMBOL_GPL(kvm_is_error_hva);
struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
{
        int i;

        for (i = 0; i < kvm->nmemslots; ++i) {
                struct kvm_memory_slot *memslot = &kvm->memslots[i];

                if (gfn >= memslot->base_gfn
                    && gfn < memslot->base_gfn + memslot->npages)
                        return memslot;
        }
        return NULL;
}
EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased);

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
        gfn = unalias_gfn(kvm, gfn);
        return gfn_to_memslot_unaliased(kvm, gfn);
}
int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
        int i;

        gfn = unalias_gfn(kvm, gfn);
        for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
                struct kvm_memory_slot *memslot = &kvm->memslots[i];

                if (gfn >= memslot->base_gfn
                    && gfn < memslot->base_gfn + memslot->npages)
                        return 1;
        }
        return 0;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
        struct kvm_memory_slot *slot;

        gfn = unalias_gfn(kvm, gfn);
        slot = gfn_to_memslot_unaliased(kvm, gfn);
        if (!slot)
                return bad_hva();
        return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);
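/*
 * Translation chain sketch (illustrative): a guest physical address is
 * resolved in steps using the helpers above and below,
 *
 *        gfn_t gfn = gpa >> PAGE_SHIFT;             // gpa -> page frame
 *        unsigned long hva = gfn_to_hva(kvm, gfn);  // memslot lookup
 *        pfn_t pfn = gfn_to_pfn(kvm, gfn);          // pin the backing page
 */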
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{
        struct page *page[1];
        unsigned long addr;
        int npages;
        pfn_t pfn;

        might_sleep();

        addr = gfn_to_hva(kvm, gfn);
        if (kvm_is_error_hva(addr)) {
                get_page(bad_page);
                return page_to_pfn(bad_page);
        }

        npages = get_user_pages_fast(addr, 1, 1, page);

        if (unlikely(npages != 1)) {
                struct vm_area_struct *vma;

                down_read(&current->mm->mmap_sem);
                vma = find_vma(current->mm, addr);

                if (vma == NULL || addr < vma->vm_start ||
                    !(vma->vm_flags & VM_PFNMAP)) {
                        up_read(&current->mm->mmap_sem);
                        get_page(bad_page);
                        return page_to_pfn(bad_page);
                }

                pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
                up_read(&current->mm->mmap_sem);
                BUG_ON(!kvm_is_mmio_pfn(pfn));
        } else
                pfn = page_to_pfn(page[0]);

        return pfn;
}
EXPORT_SYMBOL_GPL(gfn_to_pfn);
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
        pfn_t pfn;

        pfn = gfn_to_pfn(kvm, gfn);
        if (!kvm_is_mmio_pfn(pfn))
                return pfn_to_page(pfn);

        WARN_ON(kvm_is_mmio_pfn(pfn));

        get_page(bad_page);
        return bad_page;
}
EXPORT_SYMBOL_GPL(gfn_to_page);
void kvm_release_page_clean(struct page *page)
{
        kvm_release_pfn_clean(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);

void kvm_release_pfn_clean(pfn_t pfn)
{
        if (!kvm_is_mmio_pfn(pfn))
                put_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);

void kvm_release_page_dirty(struct page *page)
{
        kvm_release_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);

void kvm_release_pfn_dirty(pfn_t pfn)
{
        kvm_set_pfn_dirty(pfn);
        kvm_release_pfn_clean(pfn);
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);

void kvm_set_page_dirty(struct page *page)
{
        kvm_set_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_set_page_dirty);

void kvm_set_pfn_dirty(pfn_t pfn)
{
        if (!kvm_is_mmio_pfn(pfn)) {
                struct page *page = pfn_to_page(pfn);
                if (!PageReserved(page))
                        SetPageDirty(page);
        }
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);

void kvm_set_pfn_accessed(pfn_t pfn)
{
        if (!kvm_is_mmio_pfn(pfn))
                mark_page_accessed(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);

void kvm_get_pfn(pfn_t pfn)
{
        if (!kvm_is_mmio_pfn(pfn))
                get_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_get_pfn);
static int next_segment(unsigned long len, int offset)
{
        if (len > PAGE_SIZE - offset)
                return PAGE_SIZE - offset;
        else
                return len;
}

int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
                        int len)
{
        int r;
        unsigned long addr;

        addr = gfn_to_hva(kvm, gfn);
        if (kvm_is_error_hva(addr))
                return -EFAULT;
        r = copy_from_user(data, (void __user *)addr + offset, len);
        if (r)
                return -EFAULT;
        return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page);
int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
{
        gfn_t gfn = gpa >> PAGE_SHIFT;
        int seg;
        int offset = offset_in_page(gpa);
        int ret;

        while ((seg = next_segment(len, offset)) != 0) {
                ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
                if (ret < 0)
                        return ret;
                offset = 0;
                len -= seg;
                data += seg;
                ++gfn;
        }
        return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest);
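/*
 * Usage sketch (illustrative; `desc` and its gpa are assumptions): reading
 * a guest structure that may straddle a page boundary is a single call,
 * with the loop above splitting it into per-page copies:
 *
 *        struct hypothetical_desc desc;
 *        if (kvm_read_guest(kvm, gpa, &desc, sizeof(desc)) < 0)
 *                return -EFAULT;
 */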
int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
                          unsigned long len)
{
        int r;
        unsigned long addr;
        gfn_t gfn = gpa >> PAGE_SHIFT;
        int offset = offset_in_page(gpa);

        addr = gfn_to_hva(kvm, gfn);
        if (kvm_is_error_hva(addr))
                return -EFAULT;
        pagefault_disable();
        r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
        pagefault_enable();
        if (r)
                return -EFAULT;
        return 0;
}
EXPORT_SYMBOL(kvm_read_guest_atomic);
int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
                         int offset, int len)
{
        int r;
        unsigned long addr;

        addr = gfn_to_hva(kvm, gfn);
        if (kvm_is_error_hva(addr))
                return -EFAULT;
        r = copy_to_user((void __user *)addr + offset, data, len);
        if (r)
                return -EFAULT;
        mark_page_dirty(kvm, gfn);
        return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_page);
int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
                    unsigned long len)
{
        gfn_t gfn = gpa >> PAGE_SHIFT;
        int seg;
        int offset = offset_in_page(gpa);
        int ret;

        while ((seg = next_segment(len, offset)) != 0) {
                ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
                if (ret < 0)
                        return ret;
                offset = 0;
                len -= seg;
                data += seg;
                ++gfn;
        }
        return 0;
}
int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
{
        return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_clear_guest_page);

int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
{
        gfn_t gfn = gpa >> PAGE_SHIFT;
        int seg;
        int offset = offset_in_page(gpa);
        int ret;

        while ((seg = next_segment(len, offset)) != 0) {
                ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
                if (ret < 0)
                        return ret;
                offset = 0;
                len -= seg;
                ++gfn;
        }
        return 0;
}
EXPORT_SYMBOL_GPL(kvm_clear_guest);
void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
        struct kvm_memory_slot *memslot;

        gfn = unalias_gfn(kvm, gfn);
        memslot = gfn_to_memslot_unaliased(kvm, gfn);
        if (memslot && memslot->dirty_bitmap) {
                unsigned long rel_gfn = gfn - memslot->base_gfn;

                /* avoid RMW */
                if (!generic_test_le_bit(rel_gfn, memslot->dirty_bitmap))
                        generic___set_le_bit(rel_gfn, memslot->dirty_bitmap);
        }
}
/*
 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
 */
void kvm_vcpu_block(struct kvm_vcpu *vcpu)
{
        DEFINE_WAIT(wait);

        for (;;) {
                prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);

                if (kvm_arch_vcpu_runnable(vcpu)) {
                        set_bit(KVM_REQ_UNHALT, &vcpu->requests);
                        break;
                }
                if (kvm_cpu_has_pending_timer(vcpu))
                        break;
                if (signal_pending(current))
                        break;

                schedule();
        }

        finish_wait(&vcpu->wq, &wait);
}
void kvm_resched(struct kvm_vcpu *vcpu)
{
        if (!need_resched())
                return;
        cond_resched();
}
EXPORT_SYMBOL_GPL(kvm_resched);

void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu)
{
        ktime_t expires;
        DEFINE_WAIT(wait);

        prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);

        /* Sleep for 100 us, and hope lock-holder got scheduled */
        expires = ktime_add_ns(ktime_get(), 100000UL);
        schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);

        finish_wait(&vcpu->wq, &wait);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct kvm_vcpu *vcpu = vma->vm_file->private_data;
        struct page *page;

        if (vmf->pgoff == 0)
                page = virt_to_page(vcpu->run);
#ifdef CONFIG_X86
        else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
                page = virt_to_page(vcpu->arch.pio_data);
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
        else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
                page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
#endif
        else
                return VM_FAULT_SIGBUS;
        get_page(page);
        vmf->page = page;
        return 0;
}

static const struct vm_operations_struct kvm_vcpu_vm_ops = {
        .fault = kvm_vcpu_fault,
};
static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
        vma->vm_ops = &kvm_vcpu_vm_ops;
        return 0;
}

static int kvm_vcpu_release(struct inode *inode, struct file *filp)
{
        struct kvm_vcpu *vcpu = filp->private_data;

        kvm_put_kvm(vcpu->kvm);
        return 0;
}

static struct file_operations kvm_vcpu_fops = {
        .release        = kvm_vcpu_release,
        .unlocked_ioctl = kvm_vcpu_ioctl,
        .compat_ioctl   = kvm_vcpu_ioctl,
        .mmap           = kvm_vcpu_mmap,
};

/*
 * Allocates an inode for the vcpu.
 */
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
{
        return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
}
/*
 * Creates some virtual cpus.  Good luck creating more than one.
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
{
        int r;
        struct kvm_vcpu *vcpu, *v;

        vcpu = kvm_arch_vcpu_create(kvm, id);
        if (IS_ERR(vcpu))
                return PTR_ERR(vcpu);

        preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);

        r = kvm_arch_vcpu_setup(vcpu);
        if (r)
                return r;

        mutex_lock(&kvm->lock);
        if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
                r = -EINVAL;
                goto vcpu_destroy;
        }

        kvm_for_each_vcpu(r, v, kvm)
                if (v->vcpu_id == id) {
                        r = -EEXIST;
                        goto vcpu_destroy;
                }

        BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);

        /* Now it's all set up, let userspace reach it */
        kvm_get_kvm(kvm);
        r = create_vcpu_fd(vcpu);
        if (r < 0) {
                kvm_put_kvm(kvm);
                goto vcpu_destroy;
        }

        kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
        smp_wmb();
        atomic_inc(&kvm->online_vcpus);

#ifdef CONFIG_KVM_APIC_ARCHITECTURE
        if (kvm->bsp_vcpu_id == id)
                kvm->bsp_vcpu = vcpu;
#endif
        mutex_unlock(&kvm->lock);
        return r;

vcpu_destroy:
        mutex_unlock(&kvm->lock);
        kvm_arch_vcpu_destroy(vcpu);
        return r;
}
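/*
 * Lifetime note (summarizing the ordering above): kvm_get_kvm() is taken
 * before create_vcpu_fd() exposes the vcpu to userspace, and
 * kvm_vcpu_release() drops that reference, so the VM cannot be destroyed
 * while any vcpu file descriptor remains open.
 */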
static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
{
        if (sigset) {
                sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
                vcpu->sigset_active = 1;
                vcpu->sigset = *sigset;
        } else
                vcpu->sigset_active = 0;
        return 0;
}
static long kvm_vcpu_ioctl(struct file *filp,
                           unsigned int ioctl, unsigned long arg)
{
        struct kvm_vcpu *vcpu = filp->private_data;
        void __user *argp = (void __user *)arg;
        int r;
        struct kvm_fpu *fpu = NULL;
        struct kvm_sregs *kvm_sregs = NULL;

        if (vcpu->kvm->mm != current->mm)
                return -EIO;
        switch (ioctl) {
        case KVM_RUN:
                r = -EINVAL;
                if (arg)
                        goto out;
                r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
                break;
        case KVM_GET_REGS: {
                struct kvm_regs *kvm_regs;

                r = -ENOMEM;
                kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
                if (!kvm_regs)
                        goto out;
                r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
                if (r)
                        goto out_free1;
                r = -EFAULT;
                if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
                        goto out_free1;
                r = 0;
out_free1:
                kfree(kvm_regs);
                break;
        }
        case KVM_SET_REGS: {
                struct kvm_regs *kvm_regs;

                r = -ENOMEM;
                kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
                if (!kvm_regs)
                        goto out;
                r = -EFAULT;
                if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
                        goto out_free2;
                r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
                if (r)
                        goto out_free2;
                r = 0;
out_free2:
                kfree(kvm_regs);
                break;
        }
: {
1310 kvm_sregs
= kzalloc(sizeof(struct kvm_sregs
), GFP_KERNEL
);
1314 r
= kvm_arch_vcpu_ioctl_get_sregs(vcpu
, kvm_sregs
);
1318 if (copy_to_user(argp
, kvm_sregs
, sizeof(struct kvm_sregs
)))
1323 case KVM_SET_SREGS
: {
1324 kvm_sregs
= kmalloc(sizeof(struct kvm_sregs
), GFP_KERNEL
);
1329 if (copy_from_user(kvm_sregs
, argp
, sizeof(struct kvm_sregs
)))
1331 r
= kvm_arch_vcpu_ioctl_set_sregs(vcpu
, kvm_sregs
);
1337 case KVM_GET_MP_STATE
: {
1338 struct kvm_mp_state mp_state
;
1340 r
= kvm_arch_vcpu_ioctl_get_mpstate(vcpu
, &mp_state
);
1344 if (copy_to_user(argp
, &mp_state
, sizeof mp_state
))
1349 case KVM_SET_MP_STATE
: {
1350 struct kvm_mp_state mp_state
;
1353 if (copy_from_user(&mp_state
, argp
, sizeof mp_state
))
1355 r
= kvm_arch_vcpu_ioctl_set_mpstate(vcpu
, &mp_state
);
1361 case KVM_TRANSLATE
: {
1362 struct kvm_translation tr
;
1365 if (copy_from_user(&tr
, argp
, sizeof tr
))
1367 r
= kvm_arch_vcpu_ioctl_translate(vcpu
, &tr
);
1371 if (copy_to_user(argp
, &tr
, sizeof tr
))
1376 case KVM_SET_GUEST_DEBUG
: {
1377 struct kvm_guest_debug dbg
;
1380 if (copy_from_user(&dbg
, argp
, sizeof dbg
))
1382 r
= kvm_arch_vcpu_ioctl_set_guest_debug(vcpu
, &dbg
);
1388 case KVM_SET_SIGNAL_MASK
: {
1389 struct kvm_signal_mask __user
*sigmask_arg
= argp
;
1390 struct kvm_signal_mask kvm_sigmask
;
1391 sigset_t sigset
, *p
;
1396 if (copy_from_user(&kvm_sigmask
, argp
,
1397 sizeof kvm_sigmask
))
1400 if (kvm_sigmask
.len
!= sizeof sigset
)
1403 if (copy_from_user(&sigset
, sigmask_arg
->sigset
,
1408 r
= kvm_vcpu_ioctl_set_sigmask(vcpu
, &sigset
);
1412 fpu
= kzalloc(sizeof(struct kvm_fpu
), GFP_KERNEL
);
1416 r
= kvm_arch_vcpu_ioctl_get_fpu(vcpu
, fpu
);
1420 if (copy_to_user(argp
, fpu
, sizeof(struct kvm_fpu
)))
1426 fpu
= kmalloc(sizeof(struct kvm_fpu
), GFP_KERNEL
);
1431 if (copy_from_user(fpu
, argp
, sizeof(struct kvm_fpu
)))
1433 r
= kvm_arch_vcpu_ioctl_set_fpu(vcpu
, fpu
);
1440 r
= kvm_arch_vcpu_ioctl(filp
, ioctl
, arg
);
static long kvm_vm_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg)
{
        struct kvm *kvm = filp->private_data;
        void __user *argp = (void __user *)arg;
        int r;

        if (kvm->mm != current->mm)
                return -EIO;
        switch (ioctl) {
        case KVM_CREATE_VCPU:
                r = kvm_vm_ioctl_create_vcpu(kvm, arg);
                if (r < 0)
                        goto out;
                break;
        case KVM_SET_USER_MEMORY_REGION: {
                struct kvm_userspace_memory_region kvm_userspace_mem;

                r = -EFAULT;
                if (copy_from_user(&kvm_userspace_mem, argp,
                                   sizeof kvm_userspace_mem))
                        goto out;

                r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
                if (r)
                        goto out;
                break;
        }
        case KVM_GET_DIRTY_LOG: {
                struct kvm_dirty_log log;

                r = -EFAULT;
                if (copy_from_user(&log, argp, sizeof log))
                        goto out;
                r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
                if (r)
                        goto out;
                break;
        }
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
        case KVM_REGISTER_COALESCED_MMIO: {
                struct kvm_coalesced_mmio_zone zone;

                r = -EFAULT;
                if (copy_from_user(&zone, argp, sizeof zone))
                        goto out;
                r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
                if (r)
                        goto out;
                r = 0;
                break;
        }
        case KVM_UNREGISTER_COALESCED_MMIO: {
                struct kvm_coalesced_mmio_zone zone;

                r = -EFAULT;
                if (copy_from_user(&zone, argp, sizeof zone))
                        goto out;
                r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
                if (r)
                        goto out;
                r = 0;
                break;
        }
#endif
        case KVM_IRQFD: {
                struct kvm_irqfd data;

                r = -EFAULT;
                if (copy_from_user(&data, argp, sizeof data))
                        goto out;
                r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
                break;
        }
        case KVM_IOEVENTFD: {
                struct kvm_ioeventfd data;

                r = -EFAULT;
                if (copy_from_user(&data, argp, sizeof data))
                        goto out;
                r = kvm_ioeventfd(kvm, &data);
                break;
        }
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
        case KVM_SET_BOOT_CPU_ID:
                r = 0;
                mutex_lock(&kvm->lock);
                if (atomic_read(&kvm->online_vcpus) != 0)
                        r = -EBUSY;
                else
                        kvm->bsp_vcpu_id = arg;
                mutex_unlock(&kvm->lock);
                break;
#endif
        default:
                r = kvm_arch_vm_ioctl(filp, ioctl, arg);
                if (r == -ENOTTY)
                        r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
        }
out:
        return r;
}
#ifdef CONFIG_COMPAT
struct compat_kvm_dirty_log {
        __u32 slot;
        __u32 padding1;
        union {
                compat_uptr_t dirty_bitmap; /* one bit per page */
                __u64 padding2;
        };
};
static long kvm_vm_compat_ioctl(struct file *filp,
                                unsigned int ioctl, unsigned long arg)
{
        struct kvm *kvm = filp->private_data;
        int r;

        if (kvm->mm != current->mm)
                return -EIO;
        switch (ioctl) {
        case KVM_GET_DIRTY_LOG: {
                struct compat_kvm_dirty_log compat_log;
                struct kvm_dirty_log log;

                r = -EFAULT;
                if (copy_from_user(&compat_log, (void __user *)arg,
                                   sizeof(compat_log)))
                        goto out;
                log.slot         = compat_log.slot;
                log.padding1     = compat_log.padding1;
                log.padding2     = compat_log.padding2;
                log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);

                r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
                if (r)
                        goto out;
                break;
        }
        default:
                r = kvm_vm_ioctl(filp, ioctl, arg);
        }

out:
        return r;
}
#endif
static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct page *page[1];
        unsigned long addr;
        int npages;
        gfn_t gfn = vmf->pgoff;
        struct kvm *kvm = vma->vm_file->private_data;

        addr = gfn_to_hva(kvm, gfn);
        if (kvm_is_error_hva(addr))
                return VM_FAULT_SIGBUS;

        npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
                                NULL);
        if (unlikely(npages != 1))
                return VM_FAULT_SIGBUS;

        vmf->page = page[0];
        return 0;
}
static const struct vm_operations_struct kvm_vm_vm_ops = {
        .fault = kvm_vm_fault,
};

static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
{
        vma->vm_ops = &kvm_vm_vm_ops;
        return 0;
}
static struct file_operations kvm_vm_fops = {
        .release        = kvm_vm_release,
        .unlocked_ioctl = kvm_vm_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = kvm_vm_compat_ioctl,
#endif
        .mmap           = kvm_vm_mmap,
};

static int kvm_dev_ioctl_create_vm(void)
{
        int fd;
        struct kvm *kvm;

        kvm = kvm_create_vm();
        if (IS_ERR(kvm))
                return PTR_ERR(kvm);
        fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
        if (fd < 0)
                kvm_put_kvm(kvm);

        return fd;
}
static long kvm_dev_ioctl_check_extension_generic(long arg)
{
        switch (arg) {
        case KVM_CAP_USER_MEMORY:
        case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
        case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
        case KVM_CAP_SET_BOOT_CPU_ID:
#endif
        case KVM_CAP_INTERNAL_ERROR_DATA:
                return 1;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
        case KVM_CAP_IRQ_ROUTING:
                return KVM_MAX_IRQ_ROUTES;
#endif
        default:
                break;
        }
        return kvm_dev_ioctl_check_extension(arg);
}
static long kvm_dev_ioctl(struct file *filp,
                          unsigned int ioctl, unsigned long arg)
{
        long r = -EINVAL;

        switch (ioctl) {
        case KVM_GET_API_VERSION:
                r = -EINVAL;
                if (arg)
                        goto out;
                r = KVM_API_VERSION;
                break;
        case KVM_CREATE_VM:
                r = -EINVAL;
                if (arg)
                        goto out;
                r = kvm_dev_ioctl_create_vm();
                break;
        case KVM_CHECK_EXTENSION:
                r = kvm_dev_ioctl_check_extension_generic(arg);
                break;
        case KVM_GET_VCPU_MMAP_SIZE:
                r = -EINVAL;
                if (arg)
                        goto out;
                r = PAGE_SIZE;     /* struct kvm_run */
#ifdef CONFIG_X86
                r += PAGE_SIZE;    /* pio data page */
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
                r += PAGE_SIZE;    /* coalesced mmio ring page */
#endif
                break;
        case KVM_TRACE_ENABLE:
        case KVM_TRACE_PAUSE:
        case KVM_TRACE_DISABLE:
                r = -EOPNOTSUPP;
                break;
        default:
                return kvm_arch_dev_ioctl(filp, ioctl, arg);
        }
out:
        return r;
}
static struct file_operations kvm_chardev_ops = {
        .unlocked_ioctl = kvm_dev_ioctl,
        .compat_ioctl   = kvm_dev_ioctl,
};

static struct miscdevice kvm_dev = {
        KVM_MINOR,
        "kvm",
        &kvm_chardev_ops,
};
static void hardware_enable(void *junk)
{
        int cpu = raw_smp_processor_id();
        int r;

        if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
                return;

        cpumask_set_cpu(cpu, cpus_hardware_enabled);

        r = kvm_arch_hardware_enable(NULL);

        if (r) {
                cpumask_clear_cpu(cpu, cpus_hardware_enabled);
                atomic_inc(&hardware_enable_failed);
                printk(KERN_INFO "kvm: enabling virtualization on "
                                 "CPU%d failed\n", cpu);
        }
}
static void hardware_disable(void *junk)
{
        int cpu = raw_smp_processor_id();

        if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
                return;
        cpumask_clear_cpu(cpu, cpus_hardware_enabled);
        kvm_arch_hardware_disable(NULL);
}
static void hardware_disable_all_nolock(void)
{
        BUG_ON(!kvm_usage_count);

        kvm_usage_count--;
        if (!kvm_usage_count)
                on_each_cpu(hardware_disable, NULL, 1);
}

static void hardware_disable_all(void)
{
        spin_lock(&kvm_lock);
        hardware_disable_all_nolock();
        spin_unlock(&kvm_lock);
}
static int hardware_enable_all(void)
{
        int r = 0;

        spin_lock(&kvm_lock);

        kvm_usage_count++;
        if (kvm_usage_count == 1) {
                atomic_set(&hardware_enable_failed, 0);
                on_each_cpu(hardware_enable, NULL, 1);

                if (atomic_read(&hardware_enable_failed)) {
                        hardware_disable_all_nolock();
                        r = -EBUSY;
                }
        }

        spin_unlock(&kvm_lock);

        return r;
}
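/*
 * Usage note (a sketch of the pairing used elsewhere in this file): the
 * usage count makes enabling idempotent across VMs; kvm_create_vm() does
 *
 *        r = hardware_enable_all();   // first VM flips VT-x/SVM on
 *
 * and kvm_destroy_vm() calls hardware_disable_all(), so the last VM to go
 * away turns hardware virtualization back off on every CPU.
 */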
static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
                           void *v)
{
        int cpu = (long)v;

        if (!kvm_usage_count)
                return NOTIFY_OK;

        val &= ~CPU_TASKS_FROZEN;
        switch (val) {
        case CPU_DYING:
                printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
                       cpu);
                hardware_disable(NULL);
                break;
        case CPU_UP_CANCELED:
                printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
                       cpu);
                smp_call_function_single(cpu, hardware_disable, NULL, 1);
                break;
        case CPU_ONLINE:
                printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
                       cpu);
                smp_call_function_single(cpu, hardware_enable, NULL, 1);
                break;
        }
        return NOTIFY_OK;
}
asmlinkage void kvm_handle_fault_on_reboot(void)
{
        if (kvm_rebooting)
                /* spin while reset goes on */
                while (true)
                        ;
        /* Fault while not rebooting.  We want the trace. */
        BUG();
}
EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);
static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
                      void *v)
{
        /*
         * Some (well, at least mine) BIOSes hang on reboot if
         * in vmx root mode.
         *
         * And Intel TXT requires VMX to be off on all CPUs when the
         * system shuts down.
         */
        printk(KERN_INFO "kvm: exiting hardware virtualization\n");
        kvm_rebooting = true;
        on_each_cpu(hardware_disable, NULL, 1);
        return NOTIFY_OK;
}

static struct notifier_block kvm_reboot_notifier = {
        .notifier_call = kvm_reboot,
        .priority = 0,
};
void kvm_io_bus_init(struct kvm_io_bus *bus)
{
        memset(bus, 0, sizeof(*bus));
}

void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
        int i;

        for (i = 0; i < bus->dev_count; i++) {
                struct kvm_io_device *pos = bus->devs[i];

                kvm_iodevice_destructor(pos);
        }
}
/* kvm_io_bus_write - called under kvm->slots_lock */
int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr,
                     int len, const void *val)
{
        int i;

        for (i = 0; i < bus->dev_count; i++)
                if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
                        return 0;
        return -EOPNOTSUPP;
}

/* kvm_io_bus_read - called under kvm->slots_lock */
int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, void *val)
{
        int i;

        for (i = 0; i < bus->dev_count; i++)
                if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
                        return 0;
        return -EOPNOTSUPP;
}
int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus,
                            struct kvm_io_device *dev)
{
        int ret;

        down_write(&kvm->slots_lock);
        ret = __kvm_io_bus_register_dev(bus, dev);
        up_write(&kvm->slots_lock);

        return ret;
}

/* An unlocked version. Caller must have write lock on slots_lock. */
int __kvm_io_bus_register_dev(struct kvm_io_bus *bus,
                              struct kvm_io_device *dev)
{
        if (bus->dev_count > NR_IOBUS_DEVS - 1)
                return -ENOSPC;

        bus->devs[bus->dev_count++] = dev;

        return 0;
}
void kvm_io_bus_unregister_dev(struct kvm *kvm,
                               struct kvm_io_bus *bus,
                               struct kvm_io_device *dev)
{
        down_write(&kvm->slots_lock);
        __kvm_io_bus_unregister_dev(bus, dev);
        up_write(&kvm->slots_lock);
}

/* An unlocked version. Caller must have write lock on slots_lock. */
void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus,
                                 struct kvm_io_device *dev)
{
        int i;

        for (i = 0; i < bus->dev_count; i++)
                if (bus->devs[i] == dev) {
                        bus->devs[i] = bus->devs[--bus->dev_count];
                        break;
                }
}
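/*
 * Registration sketch (illustrative; `my_dev` and `my_ops` are
 * assumptions, not defined here): an emulated device hangs itself off a
 * bus with
 *
 *        kvm_iodevice_init(&my_dev->dev, &my_ops); // ops supply read/write
 *        kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &my_dev->dev);
 *
 * after which kvm_io_bus_write()/kvm_io_bus_read() will offer the device
 * every access on that bus.
 */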
static struct notifier_block kvm_cpu_notifier = {
        .notifier_call = kvm_cpu_hotplug,
        .priority = 20, /* must be > scheduler priority */
};
static int vm_stat_get(void *_offset, u64 *val)
{
        unsigned offset = (long)_offset;
        struct kvm *kvm;

        *val = 0;
        spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list)
                *val += *(u32 *)((void *)kvm + offset);
        spin_unlock(&kvm_lock);
        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
static int vcpu_stat_get(void *_offset, u64 *val)
{
        unsigned offset = (long)_offset;
        struct kvm *kvm;
        struct kvm_vcpu *vcpu;
        int i;

        *val = 0;
        spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list)
                kvm_for_each_vcpu(i, vcpu, kvm)
                        *val += *(u32 *)((void *)vcpu + offset);

        spin_unlock(&kvm_lock);
        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
static const struct file_operations *stat_fops[] = {
        [KVM_STAT_VCPU] = &vcpu_stat_fops,
        [KVM_STAT_VM]   = &vm_stat_fops,
};
static void kvm_init_debug(void)
{
        struct kvm_stats_debugfs_item *p;

        kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
        for (p = debugfs_entries; p->name; ++p)
                p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
                                                (void *)(long)p->offset,
                                                stat_fops[p->kind]);
}
static void kvm_exit_debug(void)
{
        struct kvm_stats_debugfs_item *p;

        for (p = debugfs_entries; p->name; ++p)
                debugfs_remove(p->dentry);
        debugfs_remove(kvm_debugfs_dir);
}
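/*
 * The per-stat files created above surface as plain numbers, e.g. (the
 * path assumes debugfs is mounted in its usual location, and the value
 * shown is illustrative):
 *
 *        $ cat /sys/kernel/debug/kvm/remote_tlb_flush
 *        42
 */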
static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
        if (kvm_usage_count)
                hardware_disable(NULL);
        return 0;
}

static int kvm_resume(struct sys_device *dev)
{
        if (kvm_usage_count)
                hardware_enable(NULL);
        return 0;
}

static struct sysdev_class kvm_sysdev_class = {
        .name = "kvm",
        .suspend = kvm_suspend,
        .resume = kvm_resume,
};

static struct sys_device kvm_sysdev = {
        .id = 0,
        .cls = &kvm_sysdev_class,
};
struct page *bad_page;
pfn_t bad_pfn;
static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
        return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

        kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
                          struct task_struct *next)
{
        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

        kvm_arch_vcpu_put(vcpu);
}
int kvm_init(void *opaque, unsigned int vcpu_size,
             struct module *module)
{
        int r;
        int cpu;

        r = kvm_arch_init(opaque);
        if (r)
                goto out_fail;

        bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

        if (bad_page == NULL) {
                r = -ENOMEM;
                goto out;
        }

        bad_pfn = page_to_pfn(bad_page);

        if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
                r = -ENOMEM;
                goto out_free_0;
        }

        r = kvm_arch_hardware_setup();
        if (r < 0)
                goto out_free_0a;

        for_each_online_cpu(cpu) {
                smp_call_function_single(cpu,
                                kvm_arch_check_processor_compat,
                                &r, 1);
                if (r < 0)
                        goto out_free_1;
        }

        r = register_cpu_notifier(&kvm_cpu_notifier);
        if (r)
                goto out_free_2;
        register_reboot_notifier(&kvm_reboot_notifier);

        r = sysdev_class_register(&kvm_sysdev_class);
        if (r)
                goto out_free_3;

        r = sysdev_register(&kvm_sysdev);
        if (r)
                goto out_free_4;

        /* A kmem cache lets us meet the alignment requirements of fx_save. */
        kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
                                           __alignof__(struct kvm_vcpu),
                                           0, NULL);
        if (!kvm_vcpu_cache) {
                r = -ENOMEM;
                goto out_free_5;
        }

        kvm_chardev_ops.owner = module;
        kvm_vm_fops.owner = module;
        kvm_vcpu_fops.owner = module;

        r = misc_register(&kvm_dev);
        if (r) {
                printk(KERN_ERR "kvm: misc device register failed\n");
                goto out_free;
        }

        kvm_preempt_ops.sched_in = kvm_sched_in;
        kvm_preempt_ops.sched_out = kvm_sched_out;

        kvm_init_debug();

        return 0;

out_free:
        kmem_cache_destroy(kvm_vcpu_cache);
out_free_5:
        sysdev_unregister(&kvm_sysdev);
out_free_4:
        sysdev_class_unregister(&kvm_sysdev_class);
out_free_3:
        unregister_reboot_notifier(&kvm_reboot_notifier);
        unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_2:
out_free_1:
        kvm_arch_hardware_unsetup();
out_free_0a:
        free_cpumask_var(cpus_hardware_enabled);
out_free_0:
        __free_page(bad_page);
out:
        kvm_arch_exit();
out_fail:
        return r;
}
EXPORT_SYMBOL_GPL(kvm_init);
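/*
 * Arch wiring sketch (illustrative, based on how the x86 modules use this
 * entry point; the ops symbol is an assumption, not defined here):
 *
 *        static int __init vmx_init(void)
 *        {
 *                return kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
 *                                THIS_MODULE);
 *        }
 */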
void kvm_exit(void)
{
        tracepoint_synchronize_unregister();
        kvm_exit_debug();
        misc_deregister(&kvm_dev);
        kmem_cache_destroy(kvm_vcpu_cache);
        sysdev_unregister(&kvm_sysdev);
        sysdev_class_unregister(&kvm_sysdev_class);
        unregister_reboot_notifier(&kvm_reboot_notifier);
        unregister_cpu_notifier(&kvm_cpu_notifier);
        on_each_cpu(hardware_disable, NULL, 1);
        kvm_arch_hardware_unsetup();
        kvm_arch_exit();
        free_cpumask_var(cpus_hardware_enabled);
        __free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);