/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Avi Kivity   <avi@qumranet.com>
 * Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 */

#include "x86_emulate.h"

#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/sysdev.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>

#include <asm/processor.h>
#include <asm/uaccess.h>

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
static DEFINE_SPINLOCK(kvm_lock);
static LIST_HEAD(vm_list);

static cpumask_t cpus_hardware_enabled;

struct kvm_x86_ops *kvm_x86_ops;
struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);

static __read_mostly struct preempt_ops kvm_preempt_ops;

#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
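/*
 * Illustration (editorial note, not part of the original source):
 * STAT_OFFSET(pf_fixed) expands to offsetof(struct kvm_vcpu, stat.pf_fixed),
 * so each debugfs_entries[] item below records where its counter lives
 * inside struct kvm_vcpu; stat_get() later adds that offset to a vcpu
 * pointer to read the counter.
 */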
static struct kvm_stats_debugfs_item {
	struct dentry *dentry;
} debugfs_entries[] = {
	{ "pf_fixed", STAT_OFFSET(pf_fixed) },
	{ "pf_guest", STAT_OFFSET(pf_guest) },
	{ "tlb_flush", STAT_OFFSET(tlb_flush) },
	{ "invlpg", STAT_OFFSET(invlpg) },
	{ "exits", STAT_OFFSET(exits) },
	{ "io_exits", STAT_OFFSET(io_exits) },
	{ "mmio_exits", STAT_OFFSET(mmio_exits) },
	{ "signal_exits", STAT_OFFSET(signal_exits) },
	{ "irq_window", STAT_OFFSET(irq_window_exits) },
	{ "halt_exits", STAT_OFFSET(halt_exits) },
	{ "halt_wakeup", STAT_OFFSET(halt_wakeup) },
	{ "request_irq", STAT_OFFSET(request_irq_exits) },
	{ "irq_exits", STAT_OFFSET(irq_exits) },
	{ "light_exits", STAT_OFFSET(light_exits) },
	{ "efer_reload", STAT_OFFSET(efer_reload) },

static struct dentry *debugfs_dir;
static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,

static inline int valid_vcpu(int n)
	return likely(n >= 0 && n < KVM_MAX_VCPUS);

void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
	if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
	vcpu->guest_fpu_loaded = 1;
	fx_save(&vcpu->host_fx_image);
	fx_restore(&vcpu->guest_fx_image);
EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);

void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
	if (!vcpu->guest_fpu_loaded)
	vcpu->guest_fpu_loaded = 0;
	fx_save(&vcpu->guest_fx_image);
	fx_restore(&vcpu->host_fx_image);
EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
	mutex_lock(&vcpu->mutex);
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);

void vcpu_put(struct kvm_vcpu *vcpu)
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	mutex_unlock(&vcpu->mutex);
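/*
 * Illustrative sketch (not from the original source): callers are expected
 * to bracket per-vcpu work with vcpu_load()/vcpu_put(), which take the vcpu
 * mutex, register the preemption notifier and bind guest state to the
 * current cpu.  A hypothetical helper would look roughly like this:
 */
#if 0	/* example only */
static int example_vcpu_op(struct kvm_vcpu *vcpu)
{
	int r = 0;

	vcpu_load(vcpu);	/* serialize against other users of this vcpu */
	/* ... inspect or modify vcpu state here ... */
	vcpu_put(vcpu);		/* unbind from this cpu and drop the mutex */
	return r;
}
#endif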
static void ack_flush(void *_completed)

void kvm_flush_remote_tlbs(struct kvm *kvm)
	struct kvm_vcpu *vcpu;

	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		vcpu = kvm->vcpus[i];
		if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
		if (cpu != -1 && cpu != raw_smp_processor_id())
	smp_call_function_mask(cpus, ack_flush, NULL, 1);

int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
	mutex_init(&vcpu->mutex);
	vcpu->mmu.root_hpa = INVALID_PAGE;
	if (!irqchip_in_kernel(kvm) || id == 0)
		vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
		vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;
	init_waitqueue_head(&vcpu->wq);

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	vcpu->run = page_address(page);

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	vcpu->pio_data = page_address(page);

	r = kvm_mmu_create(vcpu);
		goto fail_free_pio_data;

	if (irqchip_in_kernel(kvm)) {
		r = kvm_create_lapic(vcpu);
			goto fail_mmu_destroy;

	kvm_mmu_destroy(vcpu);
	free_page((unsigned long)vcpu->pio_data);
	free_page((unsigned long)vcpu->run);
EXPORT_SYMBOL_GPL(kvm_vcpu_init);

void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
	kvm_free_lapic(vcpu);
	kvm_mmu_destroy(vcpu);
	free_page((unsigned long)vcpu->pio_data);
	free_page((unsigned long)vcpu->run);
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);

static struct kvm *kvm_create_vm(void)
	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
		return ERR_PTR(-ENOMEM);

	kvm_io_bus_init(&kvm->pio_bus);
	mutex_init(&kvm->lock);
	INIT_LIST_HEAD(&kvm->active_mmu_pages);
	kvm_io_bus_init(&kvm->mmio_bus);
	spin_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	spin_unlock(&kvm_lock);
/*
 * Free any memory in @free but not in @dont.
 */
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
				  struct kvm_memory_slot *dont)
	if (!dont || free->rmap != dont->rmap)
	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
		vfree(free->dirty_bitmap);

	free->dirty_bitmap = NULL;

static void kvm_free_physmem(struct kvm *kvm)
	for (i = 0; i < kvm->nmemslots; ++i)
		kvm_free_physmem_slot(&kvm->memslots[i], NULL);

static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
	kvm_mmu_unload(vcpu);

static void kvm_free_vcpus(struct kvm *kvm)
	/*
	 * Unpin any mmu pages first.
	 */
	for (i = 0; i < KVM_MAX_VCPUS; ++i)
		kvm_unload_vcpu_mmu(kvm->vcpus[i]);
	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
		kvm->vcpus[i] = NULL;

static void kvm_destroy_vm(struct kvm *kvm)
	spin_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	spin_unlock(&kvm_lock);
	kvm_io_bus_destroy(&kvm->pio_bus);
	kvm_io_bus_destroy(&kvm->mmio_bus);
	kvm_free_physmem(kvm);

static int kvm_vm_release(struct inode *inode, struct file *filp)
	struct kvm *kvm = filp->private_data;

void fx_init(struct kvm_vcpu *vcpu)
	unsigned after_mxcsr_mask;

	/* Initialize guest FPU by resetting ours and saving into guest's */
	fx_save(&vcpu->host_fx_image);
	fx_save(&vcpu->guest_fx_image);
	fx_restore(&vcpu->host_fx_image);

	vcpu->cr0 |= X86_CR0_ET;
	after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
	vcpu->guest_fx_image.mxcsr = 0x1f80;
	memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask,
	       0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
EXPORT_SYMBOL_GPL(fx_init);
/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding kvm->lock.
 */
int __kvm_set_memory_region(struct kvm *kvm,
			    struct kvm_userspace_memory_region *mem,
	unsigned long npages;
	struct kvm_memory_slot *memslot;
	struct kvm_memory_slot old, new;

	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)

	memslot = &kvm->memslots[mem->slot];
	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	npages = mem->memory_size >> PAGE_SHIFT;
	mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;

	new = old = *memslot;

	new.base_gfn = base_gfn;
	new.flags = mem->flags;

	/* Disallow changing a memory slot's size. */
	if (npages && old.npages && npages != old.npages)

	/* Check for overlaps */
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *s = &kvm->memslots[i];

		if (!((base_gfn + npages <= s->base_gfn) ||
		      (base_gfn >= s->base_gfn + s->npages)))

	/* Free page dirty bitmap if unneeded */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
		new.dirty_bitmap = NULL;

	/* Allocate if a slot is being created */
	if (npages && !new.rmap) {
		new.rmap = vmalloc(npages * sizeof(struct page *));
		memset(new.rmap, 0, npages * sizeof(*new.rmap));

		new.user_alloc = user_alloc;
			new.userspace_addr = mem->userspace_addr;
			down_write(&current->mm->mmap_sem);
			new.userspace_addr = do_mmap(NULL, 0,
						     PROT_READ | PROT_WRITE,
						     MAP_SHARED | MAP_ANONYMOUS,
			up_write(&current->mm->mmap_sem);
			if (IS_ERR((void *)new.userspace_addr))
	if (!old.user_alloc && old.rmap) {

		down_write(&current->mm->mmap_sem);
		ret = do_munmap(current->mm, old.userspace_addr,
				old.npages * PAGE_SIZE);
		up_write(&current->mm->mmap_sem);
			       "kvm_vm_ioctl_set_memory_region: "
			       "failed to munmap memory\n");

	/* Allocate page dirty bitmap if needed */
	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
		unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;

		new.dirty_bitmap = vmalloc(dirty_bytes);
		if (!new.dirty_bitmap)
		memset(new.dirty_bitmap, 0, dirty_bytes);

	if (mem->slot >= kvm->nmemslots)
		kvm->nmemslots = mem->slot + 1;

	if (!kvm->n_requested_mmu_pages) {
		unsigned int n_pages;

			n_pages = npages * KVM_PERMILLE_MMU_PAGES / 1000;
			kvm_mmu_change_mmu_pages(kvm, kvm->n_alloc_mmu_pages +
			unsigned int nr_mmu_pages;

			n_pages = old.npages * KVM_PERMILLE_MMU_PAGES / 1000;
			nr_mmu_pages = kvm->n_alloc_mmu_pages - n_pages;
			nr_mmu_pages = max(nr_mmu_pages,
					   (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
			kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);

	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
	kvm_flush_remote_tlbs(kvm);

	kvm_free_physmem_slot(&old, &new);

	kvm_free_physmem_slot(&new, &old);
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);

int kvm_set_memory_region(struct kvm *kvm,
			  struct kvm_userspace_memory_region *mem,
	mutex_lock(&kvm->lock);
	r = __kvm_set_memory_region(kvm, mem, user_alloc);
	mutex_unlock(&kvm->lock);
EXPORT_SYMBOL_GPL(kvm_set_memory_region);
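/*
 * Illustrative sketch (not from the original source): an in-kernel caller
 * would fill a struct kvm_userspace_memory_region and hand it to
 * kvm_set_memory_region(), which takes kvm->lock itself.  The slot number,
 * sizes and addresses below are made-up example values.
 */
#if 0	/* example only */
static int example_add_slot(struct kvm *kvm)
{
	struct kvm_userspace_memory_region mem = {
		.slot            = 0,
		.flags           = 0,
		.guest_phys_addr = 0,
		.memory_size     = 16 * 1024 * 1024,	/* must be page aligned */
		.userspace_addr  = 0,	/* unused when user_alloc == 0 */
	};

	return kvm_set_memory_region(kvm, &mem, 0);
}
#endif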
int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
				   kvm_userspace_memory_region *mem,
	if (mem->slot >= KVM_MEMORY_SLOTS)
	return kvm_set_memory_region(kvm, mem, user_alloc);

/*
 * Get (and clear) the dirty memory log for a memory slot.
 */
static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
				      struct kvm_dirty_log *log)
	struct kvm_memory_slot *memslot;
	unsigned long any = 0;

	mutex_lock(&kvm->lock);
	if (log->slot >= KVM_MEMORY_SLOTS)
	memslot = &kvm->memslots[log->slot];
	if (!memslot->dirty_bitmap)
	n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;

	for (i = 0; !any && i < n/sizeof(long); ++i)
		any = memslot->dirty_bitmap[i];

	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))

	/* If nothing is dirty, don't bother messing with page tables. */
		kvm_mmu_slot_remove_write_access(kvm, log->slot);
		kvm_flush_remote_tlbs(kvm);
		memset(memslot->dirty_bitmap, 0, n);

	mutex_unlock(&kvm->lock);

int is_error_page(struct page *page)
	return page == bad_page;
EXPORT_SYMBOL_GPL(is_error_page);

gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
	struct kvm_mem_alias *alias;

	for (i = 0; i < kvm->naliases; ++i) {
		alias = &kvm->aliases[i];
		if (gfn >= alias->base_gfn
		    && gfn < alias->base_gfn + alias->npages)
			return alias->target_gfn + gfn - alias->base_gfn;

static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
	for (i = 0; i < kvm->nmemslots; ++i) {
		struct kvm_memory_slot *memslot = &kvm->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
	gfn = unalias_gfn(kvm, gfn);
	return __gfn_to_memslot(kvm, gfn);

int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
	gfn = unalias_gfn(kvm, gfn);
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *memslot = &kvm->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
/*
 * Requires current->mm->mmap_sem to be held
 */
static struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn)
	struct kvm_memory_slot *slot;
	struct page *page[1];

	gfn = unalias_gfn(kvm, gfn);
	slot = __gfn_to_memslot(kvm, gfn);
	npages = get_user_pages(current, current->mm,
				+ (gfn - slot->base_gfn) * PAGE_SIZE, 1,

struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
	down_read(&current->mm->mmap_sem);
	page = __gfn_to_page(kvm, gfn);
	up_read(&current->mm->mmap_sem);
EXPORT_SYMBOL_GPL(gfn_to_page);

void kvm_release_page(struct page *page)
	if (!PageReserved(page))
EXPORT_SYMBOL_GPL(kvm_release_page);

static int next_segment(unsigned long len, int offset)
	if (len > PAGE_SIZE - offset)
		return PAGE_SIZE - offset;

int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
	page = gfn_to_page(kvm, gfn);
	if (is_error_page(page)) {
		kvm_release_page(page);
	page_virt = kmap_atomic(page, KM_USER0);
	memcpy(data, page_virt + offset, len);
	kunmap_atomic(page_virt, KM_USER0);
	kvm_release_page(page);
EXPORT_SYMBOL_GPL(kvm_read_guest_page);

int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int offset = offset_in_page(gpa);

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
EXPORT_SYMBOL_GPL(kvm_read_guest);
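/*
 * Illustrative sketch (not from the original source): kvm_read_guest() and
 * kvm_write_guest() copy arbitrary byte ranges to and from guest physical
 * address space, splitting the range into per-page chunks with
 * next_segment().  A hypothetical caller copying a small value at guest
 * physical address 0x1000 would look like this:
 */
#if 0	/* example only */
static int example_touch_guest_word(struct kvm *kvm)
{
	u32 val;

	/* the range may cross a page boundary; the helpers handle the split */
	if (kvm_read_guest(kvm, 0x1000, &val, sizeof(val)) < 0)
		return -EFAULT;
	return kvm_write_guest(kvm, 0x1000, &val, sizeof(val));
}
#endif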
int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
	page = gfn_to_page(kvm, gfn);
	if (is_error_page(page)) {
		kvm_release_page(page);
	page_virt = kmap_atomic(page, KM_USER0);
	memcpy(page_virt + offset, data, len);
	kunmap_atomic(page_virt, KM_USER0);
	mark_page_dirty(kvm, gfn);
	kvm_release_page(page);
EXPORT_SYMBOL_GPL(kvm_write_guest_page);

int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int offset = offset_in_page(gpa);

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);

int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
	page = gfn_to_page(kvm, gfn);
	if (is_error_page(page)) {
		kvm_release_page(page);
	page_virt = kmap_atomic(page, KM_USER0);
	memset(page_virt + offset, 0, len);
	kunmap_atomic(page_virt, KM_USER0);
	kvm_release_page(page);
EXPORT_SYMBOL_GPL(kvm_clear_guest_page);

int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int offset = offset_in_page(gpa);

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
EXPORT_SYMBOL_GPL(kvm_clear_guest);

/* WARNING: Does not work on aliased pages. */
void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
	struct kvm_memory_slot *memslot;

	memslot = __gfn_to_memslot(kvm, gfn);
	if (memslot && memslot->dirty_bitmap) {
		unsigned long rel_gfn = gfn - memslot->base_gfn;

		if (!test_bit(rel_gfn, memslot->dirty_bitmap))
			set_bit(rel_gfn, memslot->dirty_bitmap);
/*
 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
 */
static void kvm_vcpu_block(struct kvm_vcpu *vcpu)
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&vcpu->wq, &wait);

	/*
	 * We will block until either an interrupt or a signal wakes us up
	 */
	while (!kvm_cpu_has_interrupt(vcpu)
	       && !signal_pending(current)
	       && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE
	       && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) {
		set_current_state(TASK_INTERRUPTIBLE);

	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&vcpu->wq, &wait);

int kvm_emulate_halt(struct kvm_vcpu *vcpu)
	++vcpu->stat.halt_exits;
	if (irqchip_in_kernel(vcpu->kvm)) {
		vcpu->mp_state = VCPU_MP_STATE_HALTED;
		kvm_vcpu_block(vcpu);
		if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE)
		vcpu->run->exit_reason = KVM_EXIT_HLT;
EXPORT_SYMBOL_GPL(kvm_emulate_halt);

int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
	unsigned long nr, a0, a1, a2, a3, ret;

	kvm_x86_ops->cache_regs(vcpu);

	nr = vcpu->regs[VCPU_REGS_RAX];
	a0 = vcpu->regs[VCPU_REGS_RBX];
	a1 = vcpu->regs[VCPU_REGS_RCX];
	a2 = vcpu->regs[VCPU_REGS_RDX];
	a3 = vcpu->regs[VCPU_REGS_RSI];

	if (!is_long_mode(vcpu)) {
	vcpu->regs[VCPU_REGS_RAX] = ret;
	kvm_x86_ops->decache_regs(vcpu);
EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
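/*
 * Illustrative sketch (not from the original source): kvm_emulate_hypercall()
 * reads the hypercall number from RAX and its arguments from RBX, RCX, RDX
 * and RSI, returning the result in RAX.  A one-argument guest-side helper,
 * assumed here to follow the kvm_hypercall ABI in <linux/kvm_para.h> (check
 * that header for the authoritative definition), would look roughly like:
 */
#if 0	/* example only, guest side */
static inline long example_kvm_hypercall1(unsigned long nr, unsigned long p1)
{
	long ret;

	asm volatile("vmcall"		/* Intel VT-x encoding; see kvm_para.h */
		     : "=a"(ret)
		     : "a"(nr), "b"(p1)
		     : "memory");
	return ret;
}
#endif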
int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
	mutex_lock(&vcpu->kvm->lock);

	/*
	 * Blow out the MMU to ensure that no other VCPU has an active mapping
	 * to ensure that the updated hypercall appears atomically across all
	 * VCPUs.
	 */
	kvm_mmu_zap_all(vcpu->kvm);

	kvm_x86_ops->cache_regs(vcpu);
	kvm_x86_ops->patch_hypercall(vcpu, instruction);
	if (emulator_write_emulated(vcpu->rip, instruction, 3, vcpu)

	mutex_unlock(&vcpu->kvm->lock);

static u64 mk_cr_64(u64 curr_cr, u32 new_val)
	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;

void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
	struct descriptor_table dt = { limit, base };

	kvm_x86_ops->set_gdt(vcpu, &dt);

void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
	struct descriptor_table dt = { limit, base };

	kvm_x86_ops->set_idt(vcpu, &dt);

void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
		   unsigned long *rflags)
	*rflags = kvm_x86_ops->get_rflags(vcpu);

unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
	kvm_x86_ops->decache_cr4_guest_bits(vcpu);
		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);

void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
		     unsigned long *rflags)
		set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
		*rflags = kvm_x86_ops->get_rflags(vcpu);
		set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);

void kvm_resched(struct kvm_vcpu *vcpu)
EXPORT_SYMBOL_GPL(kvm_resched);
void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
	struct kvm_cpuid_entry *e, *best;

	kvm_x86_ops->cache_regs(vcpu);
	function = vcpu->regs[VCPU_REGS_RAX];
	vcpu->regs[VCPU_REGS_RAX] = 0;
	vcpu->regs[VCPU_REGS_RBX] = 0;
	vcpu->regs[VCPU_REGS_RCX] = 0;
	vcpu->regs[VCPU_REGS_RDX] = 0;
	for (i = 0; i < vcpu->cpuid_nent; ++i) {
		e = &vcpu->cpuid_entries[i];
		if (e->function == function) {
		/*
		 * Both basic or both extended?
		 */
		if (((e->function ^ function) & 0x80000000) == 0)
			if (!best || e->function > best->function)
		vcpu->regs[VCPU_REGS_RAX] = best->eax;
		vcpu->regs[VCPU_REGS_RBX] = best->ebx;
		vcpu->regs[VCPU_REGS_RCX] = best->ecx;
		vcpu->regs[VCPU_REGS_RDX] = best->edx;
	kvm_x86_ops->decache_regs(vcpu);
	kvm_x86_ops->skip_emulated_instruction(vcpu);
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
/*
 * Check if userspace requested an interrupt window, and that the
 * interrupt window is open.
 *
 * No need to exit to userspace if we already have an interrupt queued.
 */
static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
					struct kvm_run *kvm_run)
	return (!vcpu->irq_summary &&
		kvm_run->request_interrupt_window &&
		vcpu->interrupt_window_open &&
		(kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));

static void post_kvm_run_save(struct kvm_vcpu *vcpu,
			      struct kvm_run *kvm_run)
	kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
	kvm_run->cr8 = get_cr8(vcpu);
	kvm_run->apic_base = kvm_get_apic_base(vcpu);
	if (irqchip_in_kernel(vcpu->kvm))
		kvm_run->ready_for_interrupt_injection = 1;
		kvm_run->ready_for_interrupt_injection =
			(vcpu->interrupt_window_open &&
			 vcpu->irq_summary == 0);

static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
	if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
		pr_debug("vcpu %d received sipi with vector # %x\n",
			 vcpu->vcpu_id, vcpu->sipi_vector);
		kvm_lapic_reset(vcpu);
		r = kvm_x86_ops->vcpu_reset(vcpu);
		vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;

	if (vcpu->guest_debug.enabled)
		kvm_x86_ops->guest_debug_pre(vcpu);

	r = kvm_mmu_reload(vcpu);

	kvm_inject_pending_timer_irqs(vcpu);

	kvm_x86_ops->prepare_guest_switch(vcpu);
	kvm_load_guest_fpu(vcpu);

	local_irq_disable();

	if (signal_pending(current)) {
		kvm_run->exit_reason = KVM_EXIT_INTR;
		++vcpu->stat.signal_exits;

	if (irqchip_in_kernel(vcpu->kvm))
		kvm_x86_ops->inject_pending_irq(vcpu);
	else if (!vcpu->mmio_read_completed)
		kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);

	vcpu->guest_mode = 1;

	if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
		kvm_x86_ops->tlb_flush(vcpu);

	kvm_x86_ops->run(vcpu, kvm_run);

	vcpu->guest_mode = 0;

	/*
	 * We must have an instruction between local_irq_enable() and
	 * kvm_guest_exit(), so the timer interrupt isn't delayed by
	 * the interrupt shadow.  The stat.exits increment will do nicely.
	 * But we need to prevent reordering, hence this barrier():
	 */

	/*
	 * Profile KVM exit RIPs:
	 */
	if (unlikely(prof_on == KVM_PROFILING)) {
		kvm_x86_ops->cache_regs(vcpu);
		profile_hit(KVM_PROFILING, (void *)vcpu->rip);

	r = kvm_x86_ops->handle_exit(kvm_run, vcpu);

	if (dm_request_for_irq_injection(vcpu, kvm_run)) {
		kvm_run->exit_reason = KVM_EXIT_INTR;
		++vcpu->stat.request_irq_exits;
	if (!need_resched()) {
		++vcpu->stat.light_exits;

	post_kvm_run_save(vcpu, kvm_run);
static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
	if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
		kvm_vcpu_block(vcpu);

	if (vcpu->sigset_active)
		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);

	/* re-sync apic's tpr */
	if (!irqchip_in_kernel(vcpu->kvm))
		set_cr8(vcpu, kvm_run->cr8);

	if (vcpu->pio.cur_count) {
		r = complete_pio(vcpu);

#if CONFIG_HAS_IOMEM
	if (vcpu->mmio_needed) {
		memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
		vcpu->mmio_read_completed = 1;
		vcpu->mmio_needed = 0;
		r = emulate_instruction(vcpu, kvm_run,
					vcpu->mmio_fault_cr2, 0, 1);
		if (r == EMULATE_DO_MMIO) {
			/*
			 * Read-modify-write.  Back to userspace.
			 */

	if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
		kvm_x86_ops->cache_regs(vcpu);
		vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
		kvm_x86_ops->decache_regs(vcpu);

	r = __vcpu_run(vcpu, kvm_run);

	if (vcpu->sigset_active)
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
				   struct kvm_regs *regs)
	kvm_x86_ops->cache_regs(vcpu);

	regs->rax = vcpu->regs[VCPU_REGS_RAX];
	regs->rbx = vcpu->regs[VCPU_REGS_RBX];
	regs->rcx = vcpu->regs[VCPU_REGS_RCX];
	regs->rdx = vcpu->regs[VCPU_REGS_RDX];
	regs->rsi = vcpu->regs[VCPU_REGS_RSI];
	regs->rdi = vcpu->regs[VCPU_REGS_RDI];
	regs->rsp = vcpu->regs[VCPU_REGS_RSP];
	regs->rbp = vcpu->regs[VCPU_REGS_RBP];
#ifdef CONFIG_X86_64
	regs->r8 = vcpu->regs[VCPU_REGS_R8];
	regs->r9 = vcpu->regs[VCPU_REGS_R9];
	regs->r10 = vcpu->regs[VCPU_REGS_R10];
	regs->r11 = vcpu->regs[VCPU_REGS_R11];
	regs->r12 = vcpu->regs[VCPU_REGS_R12];
	regs->r13 = vcpu->regs[VCPU_REGS_R13];
	regs->r14 = vcpu->regs[VCPU_REGS_R14];
	regs->r15 = vcpu->regs[VCPU_REGS_R15];

	regs->rip = vcpu->rip;
	regs->rflags = kvm_x86_ops->get_rflags(vcpu);

	/*
	 * Don't leak debug flags in case they were set for guest debugging
	 */
	if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
		regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
				   struct kvm_regs *regs)
	vcpu->regs[VCPU_REGS_RAX] = regs->rax;
	vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
	vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
	vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
	vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
	vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
	vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
	vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
#ifdef CONFIG_X86_64
	vcpu->regs[VCPU_REGS_R8] = regs->r8;
	vcpu->regs[VCPU_REGS_R9] = regs->r9;
	vcpu->regs[VCPU_REGS_R10] = regs->r10;
	vcpu->regs[VCPU_REGS_R11] = regs->r11;
	vcpu->regs[VCPU_REGS_R12] = regs->r12;
	vcpu->regs[VCPU_REGS_R13] = regs->r13;
	vcpu->regs[VCPU_REGS_R14] = regs->r14;
	vcpu->regs[VCPU_REGS_R15] = regs->r15;

	vcpu->rip = regs->rip;
	kvm_x86_ops->set_rflags(vcpu, regs->rflags);

	kvm_x86_ops->decache_regs(vcpu);
static void get_segment(struct kvm_vcpu *vcpu,
			struct kvm_segment *var, int seg)
	return kvm_x86_ops->get_segment(vcpu, var, seg);

static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
				    struct kvm_sregs *sregs)
	struct descriptor_table dt;

	get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
	get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
	get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
	get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
	get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
	get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);

	get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
	get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);

	kvm_x86_ops->get_idt(vcpu, &dt);
	sregs->idt.limit = dt.limit;
	sregs->idt.base = dt.base;
	kvm_x86_ops->get_gdt(vcpu, &dt);
	sregs->gdt.limit = dt.limit;
	sregs->gdt.base = dt.base;

	kvm_x86_ops->decache_cr4_guest_bits(vcpu);
	sregs->cr0 = vcpu->cr0;
	sregs->cr2 = vcpu->cr2;
	sregs->cr3 = vcpu->cr3;
	sregs->cr4 = vcpu->cr4;
	sregs->cr8 = get_cr8(vcpu);
	sregs->efer = vcpu->shadow_efer;
	sregs->apic_base = kvm_get_apic_base(vcpu);

	if (irqchip_in_kernel(vcpu->kvm)) {
		memset(sregs->interrupt_bitmap, 0,
		       sizeof sregs->interrupt_bitmap);
		pending_vec = kvm_x86_ops->get_irq(vcpu);
		if (pending_vec >= 0)
			set_bit(pending_vec,
				(unsigned long *)sregs->interrupt_bitmap);
		memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
		       sizeof sregs->interrupt_bitmap);
static void set_segment(struct kvm_vcpu *vcpu,
			struct kvm_segment *var, int seg)
	return kvm_x86_ops->set_segment(vcpu, var, seg);

static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
				    struct kvm_sregs *sregs)
	int mmu_reset_needed = 0;
	int i, pending_vec, max_bits;
	struct descriptor_table dt;

	dt.limit = sregs->idt.limit;
	dt.base = sregs->idt.base;
	kvm_x86_ops->set_idt(vcpu, &dt);
	dt.limit = sregs->gdt.limit;
	dt.base = sregs->gdt.base;
	kvm_x86_ops->set_gdt(vcpu, &dt);

	vcpu->cr2 = sregs->cr2;
	mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
	vcpu->cr3 = sregs->cr3;

	set_cr8(vcpu, sregs->cr8);

	mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
#ifdef CONFIG_X86_64
	kvm_x86_ops->set_efer(vcpu, sregs->efer);
	kvm_set_apic_base(vcpu, sregs->apic_base);

	kvm_x86_ops->decache_cr4_guest_bits(vcpu);

	mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
	vcpu->cr0 = sregs->cr0;
	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);

	mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
	if (!is_long_mode(vcpu) && is_pae(vcpu))
		load_pdptrs(vcpu, vcpu->cr3);

	if (mmu_reset_needed)
		kvm_mmu_reset_context(vcpu);

	if (!irqchip_in_kernel(vcpu->kvm)) {
		memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
		       sizeof vcpu->irq_pending);
		vcpu->irq_summary = 0;
		for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i)
			if (vcpu->irq_pending[i])
				__set_bit(i, &vcpu->irq_summary);
		max_bits = (sizeof sregs->interrupt_bitmap) << 3;
		pending_vec = find_first_bit(
			(const unsigned long *)sregs->interrupt_bitmap,
		/* Only pending external irq is handled here */
		if (pending_vec < max_bits) {
			kvm_x86_ops->set_irq(vcpu, pending_vec);
			pr_debug("Set back pending irq %d\n",

	set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
	set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
	set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
	set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
	set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
	set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);

	set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
	set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
	struct kvm_segment cs;

	get_segment(vcpu, &cs, VCPU_SREG_CS);
EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);

/*
 * Translate a guest virtual address to a guest physical address.
 */
static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
				    struct kvm_translation *tr)
	unsigned long vaddr = tr->linear_address;

	mutex_lock(&vcpu->kvm->lock);
	gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
	tr->physical_address = gpa;
	tr->valid = gpa != UNMAPPED_GVA;
	mutex_unlock(&vcpu->kvm->lock);

static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
				    struct kvm_interrupt *irq)
	if (irq->irq < 0 || irq->irq >= 256)
	if (irqchip_in_kernel(vcpu->kvm))

	set_bit(irq->irq, vcpu->irq_pending);
	set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);

static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
				      struct kvm_debug_guest *dbg)
	r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
				    unsigned long address,
	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
	unsigned long pgoff;

	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
		page = virt_to_page(vcpu->run);
	else if (pgoff == KVM_PIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->pio_data);
		return NOPAGE_SIGBUS;
		*type = VM_FAULT_MINOR;

static struct vm_operations_struct kvm_vcpu_vm_ops = {
	.nopage = kvm_vcpu_nopage,

static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
	vma->vm_ops = &kvm_vcpu_vm_ops;

static int kvm_vcpu_release(struct inode *inode, struct file *filp)
	struct kvm_vcpu *vcpu = filp->private_data;

	fput(vcpu->kvm->filp);

static struct file_operations kvm_vcpu_fops = {
	.release        = kvm_vcpu_release,
	.unlocked_ioctl = kvm_vcpu_ioctl,
	.compat_ioctl   = kvm_vcpu_ioctl,
	.mmap           = kvm_vcpu_mmap,

/*
 * Allocates an inode for the vcpu.
 */
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
	struct inode *inode;

	r = anon_inode_getfd(&fd, &inode, &file,
			     "kvm-vcpu", &kvm_vcpu_fops, vcpu);
	atomic_inc(&vcpu->kvm->filp->f_count);
/*
 * Creates some virtual cpus.  Good luck creating more than one.
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
	struct kvm_vcpu *vcpu;

	vcpu = kvm_x86_ops->vcpu_create(kvm, n);
		return PTR_ERR(vcpu);

	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);

	/* We do fxsave: this must be aligned. */
	BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);

	r = kvm_x86_ops->vcpu_reset(vcpu);
		r = kvm_mmu_setup(vcpu);

	mutex_lock(&kvm->lock);
	if (kvm->vcpus[n]) {
		mutex_unlock(&kvm->lock);
	kvm->vcpus[n] = vcpu;
	mutex_unlock(&kvm->lock);

	/* Now it's all set up, let userspace reach it */
	r = create_vcpu_fd(vcpu);

	mutex_lock(&kvm->lock);
	kvm->vcpus[n] = NULL;
	mutex_unlock(&kvm->lock);

	kvm_mmu_unload(vcpu);

	kvm_x86_ops->vcpu_free(vcpu);

static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
		vcpu->sigset_active = 1;
		vcpu->sigset = *sigset;
		vcpu->sigset_active = 0;
/*
 * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
 * we have asm/x86/processor.h
 */
	u32 st_space[32];	/* 8*16 bytes for each FP-reg = 128 bytes */
#ifdef CONFIG_X86_64
	u32 xmm_space[64];	/* 16*16 bytes for each XMM-reg = 256 bytes */
	u32 xmm_space[32];	/* 8*16 bytes for each XMM-reg = 128 bytes */

static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
	struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;

	memcpy(fpu->fpr, fxsave->st_space, 128);
	fpu->fcw = fxsave->cwd;
	fpu->fsw = fxsave->swd;
	fpu->ftwx = fxsave->twd;
	fpu->last_opcode = fxsave->fop;
	fpu->last_ip = fxsave->rip;
	fpu->last_dp = fxsave->rdp;
	memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);

static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
	struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;

	memcpy(fxsave->st_space, fpu->fpr, 128);
	fxsave->cwd = fpu->fcw;
	fxsave->swd = fpu->fsw;
	fxsave->twd = fpu->ftwx;
	fxsave->fop = fpu->last_opcode;
	fxsave->rip = fpu->last_ip;
	fxsave->rdp = fpu->last_dp;
	memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
static long kvm_vcpu_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = (void __user *)arg;

		r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
	case KVM_GET_REGS: {
		struct kvm_regs kvm_regs;

		memset(&kvm_regs, 0, sizeof kvm_regs);
		r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
		if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
	case KVM_SET_REGS: {
		struct kvm_regs kvm_regs;

		if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
		r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
	case KVM_GET_SREGS: {
		struct kvm_sregs kvm_sregs;

		memset(&kvm_sregs, 0, sizeof kvm_sregs);
		r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
		if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
	case KVM_SET_SREGS: {
		struct kvm_sregs kvm_sregs;

		if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
		r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
	case KVM_TRANSLATE: {
		struct kvm_translation tr;

		if (copy_from_user(&tr, argp, sizeof tr))
		r = kvm_vcpu_ioctl_translate(vcpu, &tr);
		if (copy_to_user(argp, &tr, sizeof tr))
	case KVM_INTERRUPT: {
		struct kvm_interrupt irq;

		if (copy_from_user(&irq, argp, sizeof irq))
		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
	case KVM_DEBUG_GUEST: {
		struct kvm_debug_guest dbg;

		if (copy_from_user(&dbg, argp, sizeof dbg))
		r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
	case KVM_SET_SIGNAL_MASK: {
		struct kvm_signal_mask __user *sigmask_arg = argp;
		struct kvm_signal_mask kvm_sigmask;
		sigset_t sigset, *p;

			if (copy_from_user(&kvm_sigmask, argp,
					   sizeof kvm_sigmask))
			if (kvm_sigmask.len != sizeof sigset)
			if (copy_from_user(&sigset, sigmask_arg->sigset,
		r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
		memset(&fpu, 0, sizeof fpu);
		r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
		if (copy_to_user(argp, &fpu, sizeof fpu))
		if (copy_from_user(&fpu, argp, sizeof fpu))
		r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
static long kvm_vm_ioctl(struct file *filp,
			 unsigned int ioctl, unsigned long arg)
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;

	case KVM_CREATE_VCPU:
		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
	case KVM_SET_USER_MEMORY_REGION: {
		struct kvm_userspace_memory_region kvm_userspace_mem;

		if (copy_from_user(&kvm_userspace_mem, argp,
				   sizeof kvm_userspace_mem))
		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
	case KVM_GET_DIRTY_LOG: {
		struct kvm_dirty_log log;

		if (copy_from_user(&log, argp, sizeof log))
		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
*kvm_vm_nopage(struct vm_area_struct
*vma
,
1889 unsigned long address
,
1892 struct kvm
*kvm
= vma
->vm_file
->private_data
;
1893 unsigned long pgoff
;
1896 pgoff
= ((address
- vma
->vm_start
) >> PAGE_SHIFT
) + vma
->vm_pgoff
;
1897 if (!kvm_is_visible_gfn(kvm
, pgoff
))
1898 return NOPAGE_SIGBUS
;
1899 /* current->mm->mmap_sem is already held so call lockless version */
1900 page
= __gfn_to_page(kvm
, pgoff
);
1901 if (is_error_page(page
)) {
1902 kvm_release_page(page
);
1903 return NOPAGE_SIGBUS
;
1906 *type
= VM_FAULT_MINOR
;
1911 static struct vm_operations_struct kvm_vm_vm_ops
= {
1912 .nopage
= kvm_vm_nopage
,
1915 static int kvm_vm_mmap(struct file
*file
, struct vm_area_struct
*vma
)
1917 vma
->vm_ops
= &kvm_vm_vm_ops
;
1921 static struct file_operations kvm_vm_fops
= {
1922 .release
= kvm_vm_release
,
1923 .unlocked_ioctl
= kvm_vm_ioctl
,
1924 .compat_ioctl
= kvm_vm_ioctl
,
1925 .mmap
= kvm_vm_mmap
,
1928 static int kvm_dev_ioctl_create_vm(void)
1931 struct inode
*inode
;
1935 kvm
= kvm_create_vm();
1937 return PTR_ERR(kvm
);
1938 r
= anon_inode_getfd(&fd
, &inode
, &file
, "kvm-vm", &kvm_vm_fops
, kvm
);
1940 kvm_destroy_vm(kvm
);
static long kvm_dev_ioctl(struct file *filp,
			  unsigned int ioctl, unsigned long arg)
	void __user *argp = (void __user *)arg;

	case KVM_GET_API_VERSION:
		r = KVM_API_VERSION;
		r = kvm_dev_ioctl_create_vm();
	case KVM_CHECK_EXTENSION: {
		int ext = (long)argp;

		case KVM_CAP_IRQCHIP:
		case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
		case KVM_CAP_USER_MEMORY:
		case KVM_CAP_SET_TSS_ADDR:
	case KVM_GET_VCPU_MMAP_SIZE:
		return kvm_arch_dev_ioctl(filp, ioctl, arg);

static struct file_operations kvm_chardev_ops = {
	.unlocked_ioctl = kvm_dev_ioctl,
	.compat_ioctl   = kvm_dev_ioctl,

static struct miscdevice kvm_dev = {
2013 static void decache_vcpus_on_cpu(int cpu
)
2016 struct kvm_vcpu
*vcpu
;
2019 spin_lock(&kvm_lock
);
2020 list_for_each_entry(vm
, &vm_list
, vm_list
)
2021 for (i
= 0; i
< KVM_MAX_VCPUS
; ++i
) {
2022 vcpu
= vm
->vcpus
[i
];
2026 * If the vcpu is locked, then it is running on some
2027 * other cpu and therefore it is not cached on the
2030 * If it's not locked, check the last cpu it executed
2033 if (mutex_trylock(&vcpu
->mutex
)) {
2034 if (vcpu
->cpu
== cpu
) {
2035 kvm_x86_ops
->vcpu_decache(vcpu
);
2038 mutex_unlock(&vcpu
->mutex
);
2041 spin_unlock(&kvm_lock
);
2044 static void hardware_enable(void *junk
)
2046 int cpu
= raw_smp_processor_id();
2048 if (cpu_isset(cpu
, cpus_hardware_enabled
))
2050 cpu_set(cpu
, cpus_hardware_enabled
);
2051 kvm_x86_ops
->hardware_enable(NULL
);
2054 static void hardware_disable(void *junk
)
2056 int cpu
= raw_smp_processor_id();
2058 if (!cpu_isset(cpu
, cpus_hardware_enabled
))
2060 cpu_clear(cpu
, cpus_hardware_enabled
);
2061 decache_vcpus_on_cpu(cpu
);
2062 kvm_x86_ops
->hardware_disable(NULL
);
2065 static int kvm_cpu_hotplug(struct notifier_block
*notifier
, unsigned long val
,
2072 case CPU_DYING_FROZEN
:
2073 printk(KERN_INFO
"kvm: disabling virtualization on CPU%d\n",
2075 hardware_disable(NULL
);
2077 case CPU_UP_CANCELED
:
2078 case CPU_UP_CANCELED_FROZEN
:
2079 printk(KERN_INFO
"kvm: disabling virtualization on CPU%d\n",
2081 smp_call_function_single(cpu
, hardware_disable
, NULL
, 0, 1);
2084 case CPU_ONLINE_FROZEN
:
2085 printk(KERN_INFO
"kvm: enabling virtualization on CPU%d\n",
2087 smp_call_function_single(cpu
, hardware_enable
, NULL
, 0, 1);
2093 static int kvm_reboot(struct notifier_block
*notifier
, unsigned long val
,
2096 if (val
== SYS_RESTART
) {
2098 * Some (well, at least mine) BIOSes hang on reboot if
2101 printk(KERN_INFO
"kvm: exiting hardware virtualization\n");
2102 on_each_cpu(hardware_disable
, NULL
, 0, 1);
2107 static struct notifier_block kvm_reboot_notifier
= {
2108 .notifier_call
= kvm_reboot
,
void kvm_io_bus_init(struct kvm_io_bus *bus)
	memset(bus, 0, sizeof(*bus));

void kvm_io_bus_destroy(struct kvm_io_bus *bus)
	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->devs[i];

		kvm_iodevice_destructor(pos);

struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->devs[i];

		if (pos->in_range(pos, addr))

void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
	BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));

	bus->devs[bus->dev_count++] = dev;
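/*
 * Illustrative sketch (not from the original source): an emulated device
 * hooks itself onto the pio or mmio bus by filling a struct kvm_io_device
 * with its callbacks (the in_range field name is assumed from this tree's
 * iodev definitions; check the header for the authoritative layout) and
 * calling kvm_io_bus_register_dev(), normally while holding kvm->lock.
 */
#if 0	/* example only */
static int example_dev_in_range(struct kvm_io_device *this, gpa_t addr)
{
	return addr >= 0x70 && addr < 0x72;	/* made-up port range */
}

static void example_register(struct kvm *kvm, struct kvm_io_device *dev)
{
	dev->in_range = example_dev_in_range;
	kvm_io_bus_register_dev(&kvm->pio_bus, dev);
}
#endif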
static struct notifier_block kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_hotplug,
	.priority = 20, /* must be > scheduler priority */

static u64 stat_get(void *_offset)
	unsigned offset = (long)_offset;
	struct kvm_vcpu *vcpu;

	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
			vcpu = kvm->vcpus[i];
				total += *(u32 *)((void *)vcpu + offset);
	spin_unlock(&kvm_lock);

DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n");

static __init void kvm_init_debug(void)
	struct kvm_stats_debugfs_item *p;

	debugfs_dir = debugfs_create_dir("kvm", NULL);
	for (p = debugfs_entries; p->name; ++p)
		p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
						(void *)(long)p->offset,

static void kvm_exit_debug(void)
	struct kvm_stats_debugfs_item *p;

	for (p = debugfs_entries; p->name; ++p)
		debugfs_remove(p->dentry);
	debugfs_remove(debugfs_dir);

static int kvm_suspend(struct sys_device *dev, pm_message_t state)
	hardware_disable(NULL);

static int kvm_resume(struct sys_device *dev)
	hardware_enable(NULL);

static struct sysdev_class kvm_sysdev_class = {
	.suspend = kvm_suspend,
	.resume = kvm_resume,

static struct sys_device kvm_sysdev = {
	.cls = &kvm_sysdev_class,

struct page *bad_page;
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
	return container_of(pn, struct kvm_vcpu, preempt_notifier);

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_x86_ops->vcpu_load(vcpu, cpu);

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_x86_ops->vcpu_put(vcpu);

int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
		 struct module *module)
		printk(KERN_ERR "kvm: already loaded the other module\n");
	if (!ops->cpu_has_kvm_support()) {
		printk(KERN_ERR "kvm: no hardware support\n");
	if (ops->disabled_by_bios()) {
		printk(KERN_ERR "kvm: disabled by bios\n");

	r = kvm_x86_ops->hardware_setup();

	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu,
				kvm_x86_ops->check_processor_compatibility,

	on_each_cpu(hardware_enable, NULL, 0, 1);
	r = register_cpu_notifier(&kvm_cpu_notifier);
	register_reboot_notifier(&kvm_reboot_notifier);

	r = sysdev_class_register(&kvm_sysdev_class);

	r = sysdev_register(&kvm_sysdev);

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
					   __alignof__(struct kvm_vcpu), 0, 0);
	if (!kvm_vcpu_cache) {

	kvm_chardev_ops.owner = module;

	r = misc_register(&kvm_dev);
		printk(KERN_ERR "kvm: misc device register failed\n");

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
	kmem_cache_destroy(kvm_vcpu_cache);
	sysdev_unregister(&kvm_sysdev);
	sysdev_class_unregister(&kvm_sysdev_class);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
	on_each_cpu(hardware_disable, NULL, 0, 1);
	kvm_x86_ops->hardware_unsetup();
EXPORT_SYMBOL_GPL(kvm_init_x86);

void kvm_exit_x86(void)
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	sysdev_unregister(&kvm_sysdev);
	sysdev_class_unregister(&kvm_sysdev_class);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
	on_each_cpu(hardware_disable, NULL, 0, 1);
	kvm_x86_ops->hardware_unsetup();
EXPORT_SYMBOL_GPL(kvm_exit_x86);

static __init int kvm_init(void)
	r = kvm_mmu_module_init();

	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (bad_page == NULL) {

	kvm_mmu_module_exit();

static __exit void kvm_exit(void)
	__free_page(bad_page);
	kvm_mmu_module_exit();

module_init(kvm_init)
module_exit(kvm_exit)