arch/x86/kvm/x86.c

   1 /*
   2  * Kernel-based Virtual Machine driver for Linux
   3  *
   4  * derived from drivers/kvm/kvm_main.c
   5  *
   6  * Copyright (C) 2006 Qumranet, Inc.
   7  * Copyright (C) 2008 Qumranet, Inc.
   8  * Copyright IBM Corporation, 2008
   9  *
  10  * Authors:
  11  *   Avi Kivity   <avi@qumranet.com>
  12  *   Yaniv Kamay  <yaniv@qumranet.com>
  13  *   Amit Shah    <amit.shah@qumranet.com>
  14  *   Ben-Ami Yassour <benami@il.ibm.com>
  15  *
  16  * This work is licensed under the terms of the GNU GPL, version 2.  See
  17  * the COPYING file in the top-level directory.
  18  *
  19  */
  20
  21 #include <linux/kvm_host.h>
  22 #include "irq.h"
  23 #include "mmu.h"
  24 #include "i8254.h"
  25 #include "tss.h"
  26 #include "kvm_cache_regs.h"
  27 #include "x86.h"
  28
  29 #include <linux/clocksource.h>
  30 #include <linux/interrupt.h>
  31 #include <linux/kvm.h>
  32 #include <linux/fs.h>
  33 #include <linux/vmalloc.h>
  34 #include <linux/module.h>
  35 #include <linux/mman.h>
  36 #include <linux/highmem.h>
  37 #include <linux/iommu.h>
  38 #include <linux/intel-iommu.h>
  39 #include <linux/cpufreq.h>
  40
  41 #include <asm/uaccess.h>
  42 #include <asm/msr.h>
  43 #include <asm/desc.h>
  44 #include <asm/mtrr.h>
  45
  46 #define MAX_IO_MSRS 256
     /*
      * NOTE(review): MAX_IO_MSRS is not referenced in this part of the file;
      * presumably it caps the number of MSR entries handled per batched MSR
      * ioctl -- confirm against its user.
      *
      * The *_RESERVED_BITS masks below are the complement of the control
      * register bits a guest may set.  kvm_set_cr0(), kvm_set_cr4() and
      * kvm_set_cr8() inject #GP when a written value has any masked bit set.
      */
  47 #define CR0_RESERVED_BITS                                               \
  48         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
  49                           | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
  50                           | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
     /* VMXE is listed as non-reserved here, but kvm_set_cr4() rejects it separately. */
  51 #define CR4_RESERVED_BITS                                               \
  52         (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
  53                           | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
  54                           | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
  55                           | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
  56
     /* Only the TPR field of CR8 is writable. */
  57 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
  58 /* EFER defaults:
  59  * - enable syscall per default because it is emulated by KVM
  60  * - enable LME and LMA per default on 64 bit KVM
  61  *
      * A bit set in efer_reserved_bits makes set_efer() inject #GP when the
      * guest tries to set that EFER bit.  The 64-bit default (...fafe) leaves
      * bits 0, 8 and 10 writable; the 32-bit default (...fffe) leaves only
      * bit 0 writable.  kvm_enable_efer_bits() clears further bits from the
      * mask.
      */
  62 #ifdef CONFIG_X86_64
  63 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
  64 #else
  65 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
  66 #endif
  67
  68 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
     /*
      * VM_STAT()/VCPU_STAT() each expand to TWO comma-separated initializers:
      * the byte offset of counter 'x' inside struct kvm / struct kvm_vcpu and
      * the matching KVM_STAT_VM / KVM_STAT_VCPU tag.  They are meant to be used
      * inside the debugfs_entries[] initializer.
      */
  69 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
  70
     /* Forward declarations; the definitions are not in this part of the file. */
  71 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
  72                                     struct kvm_cpuid_entry2 __user *entries);
  73 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
  74                                               u32 function, u32 index);
  75
     /*
      * Table of vendor-specific callbacks used throughout this file
      * (set_cr0, set_cr4, set_efer, set_msr, get_msr, queue_exception, ...).
      * NOTE(review): the code that assigns this pointer is not visible here.
      */
  76 struct kvm_x86_ops *kvm_x86_ops;
  77 EXPORT_SYMBOL_GPL(kvm_x86_ops);
  78
  79 struct kvm_stats_debugfs_item debugfs_entries[] = {
             /*
              * One entry per statistics counter: a name string followed by
              * the offset/kind pair produced by VCPU_STAT() (per-vcpu
              * counters in struct kvm_vcpu) or VM_STAT() (per-VM counters in
              * struct kvm).  The table is terminated by a NULL-name entry.
              * NOTE(review): the consumer of this table is outside this file
              * excerpt.
              */
  80         { "pf_fixed", VCPU_STAT(pf_fixed) },
  81         { "pf_guest", VCPU_STAT(pf_guest) },
  82         { "tlb_flush", VCPU_STAT(tlb_flush) },
  83         { "invlpg", VCPU_STAT(invlpg) },
  84         { "exits", VCPU_STAT(exits) },
  85         { "io_exits", VCPU_STAT(io_exits) },
  86         { "mmio_exits", VCPU_STAT(mmio_exits) },
  87         { "signal_exits", VCPU_STAT(signal_exits) },
  88         { "irq_window", VCPU_STAT(irq_window_exits) },
  89         { "nmi_window", VCPU_STAT(nmi_window_exits) },
  90         { "halt_exits", VCPU_STAT(halt_exits) },
  91         { "halt_wakeup", VCPU_STAT(halt_wakeup) },
  92         { "hypercalls", VCPU_STAT(hypercalls) },
  93         { "request_irq", VCPU_STAT(request_irq_exits) },
  94         { "request_nmi", VCPU_STAT(request_nmi_exits) },
  95         { "irq_exits", VCPU_STAT(irq_exits) },
  96         { "host_state_reload", VCPU_STAT(host_state_reload) },
  97         { "efer_reload", VCPU_STAT(efer_reload) },
  98         { "fpu_reload", VCPU_STAT(fpu_reload) },
  99         { "insn_emulation", VCPU_STAT(insn_emulation) },
 100         { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
 101         { "irq_injections", VCPU_STAT(irq_injections) },
 102         { "nmi_injections", VCPU_STAT(nmi_injections) },
             /* per-VM counters from here on */
 103         { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 104         { "mmu_pte_write", VM_STAT(mmu_pte_write) },
 105         { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
 106         { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
 107         { "mmu_flooded", VM_STAT(mmu_flooded) },
 108         { "mmu_recycled", VM_STAT(mmu_recycled) },
 109         { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
 110         { "mmu_unsync", VM_STAT(mmu_unsync) },
 111         { "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
 112         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 113         { "largepages", VM_STAT(lpages) },
 114         { NULL }
 115 };
 116
 117 unsigned long segment_base(u16 selector)
 118 {
 119         struct descriptor_table gdt;
 120         struct desc_struct *d;
 121         unsigned long table_base;
 122         unsigned long v;
 123
 124         if (selector == 0)
 125                 return 0;
 126
 127         asm("sgdt %0" : "=m"(gdt));
 128         table_base = gdt.base;
 129
 130         if (selector & 4) {           /* from ldt */
 131                 u16 ldt_selector;
 132
 133                 asm("sldt %0" : "=g"(ldt_selector));
 134                 table_base = segment_base(ldt_selector);
 135         }
 136         d = (struct desc_struct *)(table_base + (selector & ~7));
 137         v = d->base0 | ((unsigned long)d->base1 << 16) |
 138                 ((unsigned long)d->base2 << 24);
 139 #ifdef CONFIG_X86_64
 140         if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
 141                 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
 142 #endif
 143         return v;
 144 }
 145 EXPORT_SYMBOL_GPL(segment_base);
 146
 147 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 148 {
 149         if (irqchip_in_kernel(vcpu->kvm))
 150                 return vcpu->arch.apic_base;
 151         else
 152                 return vcpu->arch.apic_base;
 153 }
 154 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 155
 156 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
 157 {
 158         /* TODO: reserve bits check */
 159         if (irqchip_in_kernel(vcpu->kvm))
 160                 kvm_lapic_set_base(vcpu, data);
 161         else
 162                 vcpu->arch.apic_base = data;
 163 }
 164 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 165
 166 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 167 {
 168         WARN_ON(vcpu->arch.exception.pending);
 169         vcpu->arch.exception.pending = true;
 170         vcpu->arch.exception.has_error_code = false;
 171         vcpu->arch.exception.nr = nr;
 172 }
 173 EXPORT_SYMBOL_GPL(kvm_queue_exception);
 174
 175 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
 176                            u32 error_code)
 177 {
 178         ++vcpu->stat.pf_guest;
 179
 180         if (vcpu->arch.exception.pending) {
 181                 if (vcpu->arch.exception.nr == PF_VECTOR) {
 182                         printk(KERN_DEBUG "kvm: inject_page_fault:"
 183                                         " double fault 0x%lx\n", addr);
 184                         vcpu->arch.exception.nr = DF_VECTOR;
 185                         vcpu->arch.exception.error_code = 0;
 186                 } else if (vcpu->arch.exception.nr == DF_VECTOR) {
 187                         /* triple fault -> shutdown */
 188                         set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
 189                 }
 190                 return;
 191         }
 192         vcpu->arch.cr2 = addr;
 193         kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
 194 }
 195
 196 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 197 {
 198         vcpu->arch.nmi_pending = 1;
 199 }
 200 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 201
 202 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 203 {
 204         WARN_ON(vcpu->arch.exception.pending);
 205         vcpu->arch.exception.pending = true;
 206         vcpu->arch.exception.has_error_code = true;
 207         vcpu->arch.exception.nr = nr;
 208         vcpu->arch.exception.error_code = error_code;
 209 }
 210 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 211
 212 static void __queue_exception(struct kvm_vcpu *vcpu)
 213 {
 214         kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
 215                                      vcpu->arch.exception.has_error_code,
 216                                      vcpu->arch.exception.error_code);
 217 }
 218
 219 /*
 220  * Load the pae pdptrs.  Return true is they are all valid.
 221  */
 222 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 223 {
 224         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 225         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
 226         int i;
 227         int ret;
 228         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 229
 230         ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
 231                                   offset * sizeof(u64), sizeof(pdpte));
 232         if (ret < 0) {
 233                 ret = 0;
 234                 goto out;
 235         }
 236         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
 237                 if (is_present_pte(pdpte[i]) &&
 238                     (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
 239                         ret = 0;
 240                         goto out;
 241                 }
 242         }
 243         ret = 1;
 244
 245         memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 246 out:
 247
 248         return ret;
 249 }
 250 EXPORT_SYMBOL_GPL(load_pdptrs);
 251
 252 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 253 {
 254         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 255         bool changed = true;
 256         int r;
 257
 258         if (is_long_mode(vcpu) || !is_pae(vcpu))
 259                 return false;
 260
 261         r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
 262         if (r < 0)
 263                 goto out;
 264         changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
 265 out:
 266
 267         return changed;
 268 }
 269
 270 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 271 {
 272         if (cr0 & CR0_RESERVED_BITS) {
 273                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
 274                        cr0, vcpu->arch.cr0);
 275                 kvm_inject_gp(vcpu, 0);
 276                 return;
 277         }
 278
 279         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
 280                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
 281                 kvm_inject_gp(vcpu, 0);
 282                 return;
 283         }
 284
 285         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
 286                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
 287                        "and a clear PE flag\n");
 288                 kvm_inject_gp(vcpu, 0);
 289                 return;
 290         }
 291
 292         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 293 #ifdef CONFIG_X86_64
 294                 if ((vcpu->arch.shadow_efer & EFER_LME)) {
 295                         int cs_db, cs_l;
 296
 297                         if (!is_pae(vcpu)) {
 298                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
 299                                        "in long mode while PAE is disabled\n");
 300                                 kvm_inject_gp(vcpu, 0);
 301                                 return;
 302                         }
 303                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 304                         if (cs_l) {
 305                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
 306                                        "in long mode while CS.L == 1\n");
 307                                 kvm_inject_gp(vcpu, 0);
 308                                 return;
 309
 310                         }
 311                 } else
 312 #endif
 313                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 314                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
 315                                "reserved bits\n");
 316                         kvm_inject_gp(vcpu, 0);
 317                         return;
 318                 }
 319
 320         }
 321
 322         kvm_x86_ops->set_cr0(vcpu, cr0);
 323         vcpu->arch.cr0 = cr0;
 324
 325         kvm_mmu_sync_global(vcpu);
 326         kvm_mmu_reset_context(vcpu);
 327         return;
 328 }
 329 EXPORT_SYMBOL_GPL(kvm_set_cr0);
 330
 331 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 332 {
 333         kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
 334         KVMTRACE_1D(LMSW, vcpu,
 335                     (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
 336                     handler);
 337 }
 338 EXPORT_SYMBOL_GPL(kvm_lmsw);
 339
 340 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 341 {
 342         unsigned long old_cr4 = vcpu->arch.cr4;
 343         unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
 344
 345         if (cr4 & CR4_RESERVED_BITS) {
 346                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
 347                 kvm_inject_gp(vcpu, 0);
 348                 return;
 349         }
 350
 351         if (is_long_mode(vcpu)) {
 352                 if (!(cr4 & X86_CR4_PAE)) {
 353                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
 354                                "in long mode\n");
 355                         kvm_inject_gp(vcpu, 0);
 356                         return;
 357                 }
 358         } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
 359                    && ((cr4 ^ old_cr4) & pdptr_bits)
 360                    && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 361                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
 362                 kvm_inject_gp(vcpu, 0);
 363                 return;
 364         }
 365
 366         if (cr4 & X86_CR4_VMXE) {
 367                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
 368                 kvm_inject_gp(vcpu, 0);
 369                 return;
 370         }
 371         kvm_x86_ops->set_cr4(vcpu, cr4);
 372         vcpu->arch.cr4 = cr4;
 373         vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
 374         kvm_mmu_sync_global(vcpu);
 375         kvm_mmu_reset_context(vcpu);
 376 }
 377 EXPORT_SYMBOL_GPL(kvm_set_cr4);
 378
 379 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 380 {
 381         if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
 382                 kvm_mmu_sync_roots(vcpu);
 383                 kvm_mmu_flush_tlb(vcpu);
 384                 return;
 385         }
 386
 387         if (is_long_mode(vcpu)) {
 388                 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
 389                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
 390                         kvm_inject_gp(vcpu, 0);
 391                         return;
 392                 }
 393         } else {
 394                 if (is_pae(vcpu)) {
 395                         if (cr3 & CR3_PAE_RESERVED_BITS) {
 396                                 printk(KERN_DEBUG
 397                                        "set_cr3: #GP, reserved bits\n");
 398                                 kvm_inject_gp(vcpu, 0);
 399                                 return;
 400                         }
 401                         if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
 402                                 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
 403                                        "reserved bits\n");
 404                                 kvm_inject_gp(vcpu, 0);
 405                                 return;
 406                         }
 407                 }
 408                 /*
 409                  * We don't check reserved bits in nonpae mode, because
 410                  * this isn't enforced, and VMware depends on this.
 411                  */
 412         }
 413
 414         /*
 415          * Does the new cr3 value map to physical memory? (Note, we
 416          * catch an invalid cr3 even in real-mode, because it would
 417          * cause trouble later on when we turn on paging anyway.)
 418          *
 419          * A real CPU would silently accept an invalid cr3 and would
 420          * attempt to use it - with largely undefined (and often hard
 421          * to debug) behavior on the guest side.
 422          */
 423         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
 424                 kvm_inject_gp(vcpu, 0);
 425         else {
 426                 vcpu->arch.cr3 = cr3;
 427                 vcpu->arch.mmu.new_cr3(vcpu);
 428         }
 429 }
 430 EXPORT_SYMBOL_GPL(kvm_set_cr3);
 431
 432 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 433 {
 434         if (cr8 & CR8_RESERVED_BITS) {
 435                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
 436                 kvm_inject_gp(vcpu, 0);
 437                 return;
 438         }
 439         if (irqchip_in_kernel(vcpu->kvm))
 440                 kvm_lapic_set_tpr(vcpu, cr8);
 441         else
 442                 vcpu->arch.cr8 = cr8;
 443 }
 444 EXPORT_SYMBOL_GPL(kvm_set_cr8);
 445
 446 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 447 {
 448         if (irqchip_in_kernel(vcpu->kvm))
 449                 return kvm_lapic_get_cr8(vcpu);
 450         else
 451                 return vcpu->arch.cr8;
 452 }
 453 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 454
 455 static inline u32 bit(int bitno)
 456 {
 457         return 1 << (bitno & 31);
 458 }
 459
 460 /*
 461  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 462  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 463  *
 464  * This list is modified at module load time to reflect the
 465  * capabilities of the host cpu.
      *
      * The syscall-related MSRs inside the #ifdef are only listed on 64-bit
      * builds.
 466  */
 467 static u32 msrs_to_save[] = {
 468         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 469         MSR_K6_STAR,
 470 #ifdef CONFIG_X86_64
 471         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 472 #endif
 473         MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 474         MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
 475 };
 476
     /*
      * NOTE(review): presumably the number of msrs_to_save[] entries that
      * remain valid after the module-load filtering mentioned above; it is
      * assigned outside this part of the file -- confirm.
      */
 477 static unsigned num_msrs_to_save;
 478
     /*
      * MSRs kept separately from msrs_to_save[].  NOTE(review): the name
      * suggests they are emulated purely in software; the consumer is not
      * visible here.  kvm_set_msr_common() stores MSR_IA32_MISC_ENABLE in
      * vcpu->arch.ia32_misc_enable_msr.
      */
 479 static u32 emulated_msrs[] = {
 480         MSR_IA32_MISC_ENABLE,
 481 };
 482
 483 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 484 {
 485         if (efer & efer_reserved_bits) {
 486                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
 487                        efer);
 488                 kvm_inject_gp(vcpu, 0);
 489                 return;
 490         }
 491
 492         if (is_paging(vcpu)
 493             && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
 494                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
 495                 kvm_inject_gp(vcpu, 0);
 496                 return;
 497         }
 498
 499         if (efer & EFER_FFXSR) {
 500                 struct kvm_cpuid_entry2 *feat;
 501
 502                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 503                 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
 504                         printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
 505                         kvm_inject_gp(vcpu, 0);
 506                         return;
 507                 }
 508         }
 509
 510         if (efer & EFER_SVME) {
 511                 struct kvm_cpuid_entry2 *feat;
 512
 513                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 514                 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
 515                         printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
 516                         kvm_inject_gp(vcpu, 0);
 517                         return;
 518                 }
 519         }
 520
 521         kvm_x86_ops->set_efer(vcpu, efer);
 522
 523         efer &= ~EFER_LMA;
 524         efer |= vcpu->arch.shadow_efer & EFER_LMA;
 525
 526         vcpu->arch.shadow_efer = efer;
 527
 528         vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
 529         kvm_mmu_reset_context(vcpu);
 530 }
 531
 532 void kvm_enable_efer_bits(u64 mask)
 533 {
 534        efer_reserved_bits &= ~mask;
 535 }
 536 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
 537
 538
 539 /*
 540  * Writes msr value into into the appropriate "register".
 541  * Returns 0 on success, non-0 otherwise.
 542  * Assumes vcpu_load() was already called.
 543  */
 544 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 545 {
 546         return kvm_x86_ops->set_msr(vcpu, msr_index, data);
 547 }
 548
 549 /*
 550  * Adapt set_msr() to msr_io()'s calling convention
 551  */
 552 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 553 {
 554         return kvm_set_msr(vcpu, index, *data);
 555 }
 556
 557 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 558 {
 559         static int version;
 560         struct pvclock_wall_clock wc;
 561         struct timespec now, sys, boot;
 562
 563         if (!wall_clock)
 564                 return;
 565
 566         version++;
 567
 568         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 569
 570         /*
 571          * The guest calculates current wall clock time by adding
 572          * system time (updated by kvm_write_guest_time below) to the
 573          * wall clock specified here.  guest system time equals host
 574          * system time for us, thus we must fill in host boot time here.
 575          */
 576         now = current_kernel_time();
 577         ktime_get_ts(&sys);
 578         boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
 579
 580         wc.sec = boot.tv_sec;
 581         wc.nsec = boot.tv_nsec;
 582         wc.version = version;
 583
 584         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
 585
 586         version++;
 587         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 588 }
 589
 590 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
 591 {
             /*
              * Returns the low 32 bits of (dividend << 32) / divisor, i.e.
              * dividend/divisor as a 0.32 fixed-point fraction.
              *
              * 'remainder' only exists to give the asm an output for EDX;
              * its value is discarded.
              */
 592         uint32_t quotient, remainder;
 593
 594         /* Don't try to replace with do_div(), this one calculates
 595          * "(dividend << 32) / divisor"
              *
              * The constraints load EAX with 0 ("0" (0)) and EDX with the
              * dividend ("1" (dividend)), so divl divides the 64-bit value
              * EDX:EAX by 'divisor'; the quotient comes back in EAX and the
              * remainder in EDX.
              *
              * NOTE(review): per the x86 DIV definition the instruction
              * faults when the quotient does not fit in 32 bits, so callers
              * must pass dividend < divisor (and divisor != 0).  The only
              * caller visible here, kvm_set_time_scale(), normalises the
              * divisor above the dividend first.
              */
 596         __asm__ ( "divl %4"
 597                   : "=a" (quotient), "=d" (remainder)
 598                   : "0" (0), "1" (dividend), "r" (divisor) );
 599         return quotient;
 600 }
 601
 602 static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
 603 {
 604         uint64_t nsecs = 1000000000LL;
 605         int32_t  shift = 0;
 606         uint64_t tps64;
 607         uint32_t tps32;
 608
 609         tps64 = tsc_khz * 1000LL;
 610         while (tps64 > nsecs*2) {
 611                 tps64 >>= 1;
 612                 shift--;
 613         }
 614
 615         tps32 = (uint32_t)tps64;
 616         while (tps32 <= (uint32_t)nsecs) {
 617                 tps32 <<= 1;
 618                 shift++;
 619         }
 620
 621         hv_clock->tsc_shift = shift;
 622         hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
 623
 624         pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
 625                  __func__, tsc_khz, hv_clock->tsc_shift,
 626                  hv_clock->tsc_to_system_mul);
 627 }
 628
 629 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 630
 631 static void kvm_write_guest_time(struct kvm_vcpu *v)
 632 {
 633         struct timespec ts;
 634         unsigned long flags;
 635         struct kvm_vcpu_arch *vcpu = &v->arch;
 636         void *shared_kaddr;
 637
 638         if ((!vcpu->time_page))
 639                 return;
 640
 641         preempt_disable();
 642         if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) {
 643                 kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock);
 644                 vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz);
 645         }
 646         preempt_enable();
 647
 648         /* Keep irq disabled to prevent changes to the clock */
 649         local_irq_save(flags);
 650         kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
 651                           &vcpu->hv_clock.tsc_timestamp);
 652         ktime_get_ts(&ts);
 653         local_irq_restore(flags);
 654
 655         /* With all the info we got, fill in the values */
 656
 657         vcpu->hv_clock.system_time = ts.tv_nsec +
 658                                      (NSEC_PER_SEC * (u64)ts.tv_sec);
 659         /*
 660          * The interface expects us to write an even number signaling that the
 661          * update is finished. Since the guest won't see the intermediate
 662          * state, we just increase by 2 at the end.
 663          */
 664         vcpu->hv_clock.version += 2;
 665
 666         shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
 667
 668         memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
 669                sizeof(vcpu->hv_clock));
 670
 671         kunmap_atomic(shared_kaddr, KM_USER0);
 672
 673         mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
 674 }
 675
 676 static int kvm_request_guest_time_update(struct kvm_vcpu *v)
 677 {
 678         struct kvm_vcpu_arch *vcpu = &v->arch;
 679
 680         if (!vcpu->time_page)
 681                 return 0;
 682         set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
 683         return 1;
 684 }
 685
 686 static bool msr_mtrr_valid(unsigned msr)
 687 {
 688         switch (msr) {
 689         case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
 690         case MSR_MTRRfix64K_00000:
 691         case MSR_MTRRfix16K_80000:
 692         case MSR_MTRRfix16K_A0000:
 693         case MSR_MTRRfix4K_C0000:
 694         case MSR_MTRRfix4K_C8000:
 695         case MSR_MTRRfix4K_D0000:
 696         case MSR_MTRRfix4K_D8000:
 697         case MSR_MTRRfix4K_E0000:
 698         case MSR_MTRRfix4K_E8000:
 699         case MSR_MTRRfix4K_F0000:
 700         case MSR_MTRRfix4K_F8000:
 701         case MSR_MTRRdefType:
 702         case MSR_IA32_CR_PAT:
 703                 return true;
 704         case 0x2f8:
 705                 return true;
 706         }
 707         return false;
 708 }
 709
 710 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 711 {
 712         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
 713
 714         if (!msr_mtrr_valid(msr))
 715                 return 1;
 716
 717         if (msr == MSR_MTRRdefType) {
 718                 vcpu->arch.mtrr_state.def_type = data;
 719                 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
 720         } else if (msr == MSR_MTRRfix64K_00000)
 721                 p[0] = data;
 722         else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
 723                 p[1 + msr - MSR_MTRRfix16K_80000] = data;
 724         else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
 725                 p[3 + msr - MSR_MTRRfix4K_C0000] = data;
 726         else if (msr == MSR_IA32_CR_PAT)
 727                 vcpu->arch.pat = data;
 728         else {  /* Variable MTRRs */
 729                 int idx, is_mtrr_mask;
 730                 u64 *pt;
 731
 732                 idx = (msr - 0x200) / 2;
 733                 is_mtrr_mask = msr - 0x200 - 2 * idx;
 734                 if (!is_mtrr_mask)
 735                         pt =
 736                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
 737                 else
 738                         pt =
 739                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
 740                 *pt = data;
 741         }
 742
 743         kvm_mmu_reset_context(vcpu);
 744         return 0;
 745 }
 746
 747 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 748 {
 749         switch (msr) {
 750         case MSR_EFER:
 751                 set_efer(vcpu, data);
 752                 break;
 753         case MSR_IA32_MC0_STATUS:
 754                 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
 755                        __func__, data);
 756                 break;
 757         case MSR_IA32_MCG_STATUS:
 758                 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
 759                         __func__, data);
 760                 break;
 761         case MSR_IA32_MCG_CTL:
 762                 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
 763                         __func__, data);
 764                 break;
 765         case MSR_IA32_DEBUGCTLMSR:
 766                 if (!data) {
 767                         /* We support the non-activated case already */
 768                         break;
 769                 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
 770                         /* Values other than LBR and BTF are vendor-specific,
 771                            thus reserved and should throw a #GP */
 772                         return 1;
 773                 }
 774                 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
 775                         __func__, data);
 776                 break;
 777         case MSR_IA32_UCODE_REV:
 778         case MSR_IA32_UCODE_WRITE:
 779         case MSR_VM_HSAVE_PA:
 780                 break;
 781         case 0x200 ... 0x2ff:
 782                 return set_msr_mtrr(vcpu, msr, data);
 783         case MSR_IA32_APICBASE:
 784                 kvm_set_apic_base(vcpu, data);
 785                 break;
 786         case MSR_IA32_MISC_ENABLE:
 787                 vcpu->arch.ia32_misc_enable_msr = data;
 788                 break;
 789         case MSR_KVM_WALL_CLOCK:
 790                 vcpu->kvm->arch.wall_clock = data;
 791                 kvm_write_wall_clock(vcpu->kvm, data);
 792                 break;
 793         case MSR_KVM_SYSTEM_TIME: {
 794                 if (vcpu->arch.time_page) {
 795                         kvm_release_page_dirty(vcpu->arch.time_page);
 796                         vcpu->arch.time_page = NULL;
 797                 }
 798
 799                 vcpu->arch.time = data;
 800
 801                 /* we verify if the enable bit is set... */
 802                 if (!(data & 1))
 803                         break;
 804
 805                 /* ...but clean it before doing the actual write */
 806                 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
 807
 808                 vcpu->arch.time_page =
 809                                 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
 810
 811                 if (is_error_page(vcpu->arch.time_page)) {
 812                         kvm_release_page_clean(vcpu->arch.time_page);
 813                         vcpu->arch.time_page = NULL;
 814                 }
 815
 816                 kvm_request_guest_time_update(vcpu);
 817                 break;
 818         }
 819         default:
 820                 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
 821                 return 1;
 822         }
 823         return 0;
 824 }
 825 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
 826
 827
 828 /*
 829  * Reads an msr value (of 'msr_index') into 'pdata'.
 830  * Returns 0 on success, non-0 otherwise.
 831  * Assumes vcpu_load() was already called.
 832  */
 833 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 834 {
 835         return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
 836 }
 837
 838 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 839 {
 840         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
 841
 842         if (!msr_mtrr_valid(msr))
 843                 return 1;
 844
 845         if (msr == MSR_MTRRdefType)
 846                 *pdata = vcpu->arch.mtrr_state.def_type +
 847                          (vcpu->arch.mtrr_state.enabled << 10);
 848         else if (msr == MSR_MTRRfix64K_00000)
 849                 *pdata = p[0];
 850         else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
 851                 *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
 852         else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
 853                 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
 854         else if (msr == MSR_IA32_CR_PAT)
 855                 *pdata = vcpu->arch.pat;
 856         else {  /* Variable MTRRs */
 857                 int idx, is_mtrr_mask;
 858                 u64 *pt;
 859
 860                 idx = (msr - 0x200) / 2;
 861                 is_mtrr_mask = msr - 0x200 - 2 * idx;
 862                 if (!is_mtrr_mask)
 863                         pt =
 864                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
 865                 else
 866                         pt =
 867                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
 868                 *pdata = *pt;
 869         }
 870
 871         return 0;
 872 }
 873
 874 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 875 {
 876         u64 data;
 877
 878         switch (msr) {
 879         case 0xc0010010: /* SYSCFG */
 880         case 0xc0010015: /* HWCR */
 881         case MSR_IA32_PLATFORM_ID:
 882         case MSR_IA32_P5_MC_ADDR:
 883         case MSR_IA32_P5_MC_TYPE:
 884         case MSR_IA32_MC0_CTL:
 885         case MSR_IA32_MCG_STATUS:
 886         case MSR_IA32_MCG_CAP:
 887         case MSR_IA32_MCG_CTL:
 888         case MSR_IA32_MC0_MISC:
 889         case MSR_IA32_MC0_MISC+4:
 890         case MSR_IA32_MC0_MISC+8:
 891         case MSR_IA32_MC0_MISC+12:
 892         case MSR_IA32_MC0_MISC+16:
 893         case MSR_IA32_MC0_MISC+20:
 894         case MSR_IA32_UCODE_REV:
 895         case MSR_IA32_EBL_CR_POWERON:
 896         case MSR_IA32_DEBUGCTLMSR:
 897         case MSR_IA32_LASTBRANCHFROMIP:
 898         case MSR_IA32_LASTBRANCHTOIP:
 899         case MSR_IA32_LASTINTFROMIP:
 900         case MSR_IA32_LASTINTTOIP:
 901         case MSR_VM_HSAVE_PA:
 902         case MSR_P6_EVNTSEL0:
 903         case MSR_P6_EVNTSEL1:
 904                 data = 0;
 905                 break;
 906         case MSR_MTRRcap:
 907                 data = 0x500 | KVM_NR_VAR_MTRR;
 908                 break;
 909         case 0x200 ... 0x2ff:
 910                 return get_msr_mtrr(vcpu, msr, pdata);
 911         case 0xcd: /* fsb frequency */
 912                 data = 3;
 913                 break;
 914         case MSR_IA32_APICBASE:
 915                 data = kvm_get_apic_base(vcpu);
 916                 break;
 917         case MSR_IA32_MISC_ENABLE:
 918                 data = vcpu->arch.ia32_misc_enable_msr;
 919                 break;
 920         case MSR_IA32_PERF_STATUS:
 921                 /* TSC increment by tick */
 922                 data = 1000ULL;
 923                 /* CPU multiplier */
 924                 data |= (((uint64_t)4ULL) << 40);
 925                 break;
 926         case MSR_EFER:
 927                 data = vcpu->arch.shadow_efer;
 928                 break;
 929         case MSR_KVM_WALL_CLOCK:
 930                 data = vcpu->kvm->arch.wall_clock;
 931                 break;
 932         case MSR_KVM_SYSTEM_TIME:
 933                 data = vcpu->arch.time;
 934                 break;
 935         default:
 936                 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
 937                 return 1;
 938         }
 939         *pdata = data;
 940         return 0;
 941 }
 942 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
 943
 944 /*
 945  * Read or write a bunch of msrs. All parameters are kernel addresses.
 946  *
 947  * @return number of msrs set successfully.
 948  */
 949 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
 950                     struct kvm_msr_entry *entries,
 951                     int (*do_msr)(struct kvm_vcpu *vcpu,
 952                                   unsigned index, u64 *data))
 953 {
 954         int i;
 955
 956         vcpu_load(vcpu);
 957
 958         down_read(&vcpu->kvm->slots_lock);
 959         for (i = 0; i < msrs->nmsrs; ++i)
 960                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
 961                         break;
 962         up_read(&vcpu->kvm->slots_lock);
 963
 964         vcpu_put(vcpu);
 965
 966         return i;
 967 }
 968
 969 /*
 970  * Read or write a bunch of msrs. Parameters are user addresses.
 971  *
 972  * @return number of msrs set successfully.
 973  */
 974 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
 975                   int (*do_msr)(struct kvm_vcpu *vcpu,
 976                                 unsigned index, u64 *data),
 977                   int writeback)
 978 {
 979         struct kvm_msrs msrs;
 980         struct kvm_msr_entry *entries;
 981         int r, n;
 982         unsigned size;
 983
 984         r = -EFAULT;
 985         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
 986                 goto out;
 987
 988         r = -E2BIG;
 989         if (msrs.nmsrs >= MAX_IO_MSRS)
 990                 goto out;
 991
 992         r = -ENOMEM;
 993         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
 994         entries = vmalloc(size);
 995         if (!entries)
 996                 goto out;
 997
 998         r = -EFAULT;
 999         if (copy_from_user(entries, user_msrs->entries, size))
1000                 goto out_free;
1001
1002         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
1003         if (r < 0)
1004                 goto out_free;
1005
1006         r = -EFAULT;
1007         if (writeback && copy_to_user(user_msrs->entries, entries, size))
1008                 goto out_free;
1009
1010         r = n;
1011
1012 out_free:
1013         vfree(entries);
1014 out:
1015         return r;
1016 }
1017
1018 int kvm_dev_ioctl_check_extension(long ext)
1019 {
1020         int r;
1021
1022         switch (ext) {
1023         case KVM_CAP_IRQCHIP:
1024         case KVM_CAP_HLT:
1025         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
1026         case KVM_CAP_SET_TSS_ADDR:
1027         case KVM_CAP_EXT_CPUID:
1028         case KVM_CAP_CLOCKSOURCE:
1029         case KVM_CAP_PIT:
1030         case KVM_CAP_NOP_IO_DELAY:
1031         case KVM_CAP_MP_STATE:
1032         case KVM_CAP_SYNC_MMU:
1033         case KVM_CAP_REINJECT_CONTROL:
1034         case KVM_CAP_IRQ_INJECT_STATUS:
1035         case KVM_CAP_ASSIGN_DEV_IRQ:
1036                 r = 1;
1037                 break;
1038         case KVM_CAP_COALESCED_MMIO:
1039                 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
1040                 break;
1041         case KVM_CAP_VAPIC:
1042                 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
1043                 break;
1044         case KVM_CAP_NR_VCPUS:
1045                 r = KVM_MAX_VCPUS;
1046                 break;
1047         case KVM_CAP_NR_MEMSLOTS:
1048                 r = KVM_MEMORY_SLOTS;
1049                 break;
1050         case KVM_CAP_PV_MMU:
1051                 r = !tdp_enabled;
1052                 break;
1053         case KVM_CAP_IOMMU:
1054                 r = iommu_found();
1055                 break;
1056         default:
1057                 r = 0;
1058                 break;
1059         }
1060         return r;
1061
1062 }
1063
1064 long kvm_arch_dev_ioctl(struct file *filp,
1065                         unsigned int ioctl, unsigned long arg)
1066 {
1067         void __user *argp = (void __user *)arg;
1068         long r;
1069
1070         switch (ioctl) {
1071         case KVM_GET_MSR_INDEX_LIST: {
1072                 struct kvm_msr_list __user *user_msr_list = argp;
1073                 struct kvm_msr_list msr_list;
1074                 unsigned n;
1075
1076                 r = -EFAULT;
1077                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
1078                         goto out;
1079                 n = msr_list.nmsrs;
1080                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
1081                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
1082                         goto out;
1083                 r = -E2BIG;
1084                 if (n < num_msrs_to_save)
1085                         goto out;
1086                 r = -EFAULT;
1087                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
1088                                  num_msrs_to_save * sizeof(u32)))
1089                         goto out;
1090                 if (copy_to_user(user_msr_list->indices
1091                                  + num_msrs_to_save * sizeof(u32),
1092                                  &emulated_msrs,
1093                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
1094                         goto out;
1095                 r = 0;
1096                 break;
1097         }
1098         case KVM_GET_SUPPORTED_CPUID: {
1099                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1100                 struct kvm_cpuid2 cpuid;
1101
1102                 r = -EFAULT;
1103                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1104                         goto out;
1105                 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
1106                                                       cpuid_arg->entries);
1107                 if (r)
1108                         goto out;
1109
1110                 r = -EFAULT;
1111                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1112                         goto out;
1113                 r = 0;
1114                 break;
1115         }
1116         default:
1117                 r = -EINVAL;
1118         }
1119 out:
1120         return r;
1121 }
1122
1123 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1124 {
1125         kvm_x86_ops->vcpu_load(vcpu, cpu);
1126         kvm_request_guest_time_update(vcpu);
1127 }
1128
1129 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1130 {
1131         kvm_x86_ops->vcpu_put(vcpu);
1132         kvm_put_guest_fpu(vcpu);
1133 }
1134
1135 static int is_efer_nx(void)
1136 {
1137         unsigned long long efer = 0;
1138
1139         rdmsrl_safe(MSR_EFER, &efer);
1140         return efer & EFER_NX;
1141 }
1142
1143 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
1144 {
1145         int i;
1146         struct kvm_cpuid_entry2 *e, *entry;
1147
1148         entry = NULL;
1149         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
1150                 e = &vcpu->arch.cpuid_entries[i];
1151                 if (e->function == 0x80000001) {
1152                         entry = e;
1153                         break;
1154                 }
1155         }
1156         if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
1157                 entry->edx &= ~(1 << 20);
1158                 printk(KERN_INFO "kvm: guest NX capability removed\n");
1159         }
1160 }
1161
1162 /* when an old userspace process fills a new kernel module */
1163 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1164                                     struct kvm_cpuid *cpuid,
1165                                     struct kvm_cpuid_entry __user *entries)
1166 {
1167         int r, i;
1168         struct kvm_cpuid_entry *cpuid_entries;
1169
1170         r = -E2BIG;
1171         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1172                 goto out;
1173         r = -ENOMEM;
1174         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
1175         if (!cpuid_entries)
1176                 goto out;
1177         r = -EFAULT;
1178         if (copy_from_user(cpuid_entries, entries,
1179                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1180                 goto out_free;
1181         for (i = 0; i < cpuid->nent; i++) {
1182                 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1183                 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
1184                 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
1185                 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
1186                 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
1187                 vcpu->arch.cpuid_entries[i].index = 0;
1188                 vcpu->arch.cpuid_entries[i].flags = 0;
1189                 vcpu->arch.cpuid_entries[i].padding[0] = 0;
1190                 vcpu->arch.cpuid_entries[i].padding[1] = 0;
1191                 vcpu->arch.cpuid_entries[i].padding[2] = 0;
1192         }
1193         vcpu->arch.cpuid_nent = cpuid->nent;
1194         cpuid_fix_nx_cap(vcpu);
1195         r = 0;
1196
1197 out_free:
1198         vfree(cpuid_entries);
1199 out:
1200         return r;
1201 }
1202
1203 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1204                                      struct kvm_cpuid2 *cpuid,
1205                                      struct kvm_cpuid_entry2 __user *entries)
1206 {
1207         int r;
1208
1209         r = -E2BIG;
1210         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1211                 goto out;
1212         r = -EFAULT;
1213         if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1214                            cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1215                 goto out;
1216         vcpu->arch.cpuid_nent = cpuid->nent;
1217         return 0;
1218
1219 out:
1220         return r;
1221 }
1222
1223 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1224                                      struct kvm_cpuid2 *cpuid,
1225                                      struct kvm_cpuid_entry2 __user *entries)
1226 {
1227         int r;
1228
1229         r = -E2BIG;
1230         if (cpuid->nent < vcpu->arch.cpuid_nent)
1231                 goto out;
1232         r = -EFAULT;
1233         if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
1234                          vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
1235                 goto out;
1236         return 0;
1237
1238 out:
1239         cpuid->nent = vcpu->arch.cpuid_nent;
1240         return r;
1241 }
1242
1243 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1244                            u32 index)
1245 {
1246         entry->function = function;
1247         entry->index = index;
1248         cpuid_count(entry->function, entry->index,
1249                     &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
1250         entry->flags = 0;
1251 }
1252
1253 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1254                          u32 index, int *nent, int maxnent)
1255 {
1256         const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
1257                 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1258                 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1259                 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1260                 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1261                 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
1262                 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1263                 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
1264                 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
1265                 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
1266         const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
1267                 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1268                 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1269                 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1270                 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1271                 bit(X86_FEATURE_PGE) |
1272                 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1273                 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
1274                 bit(X86_FEATURE_SYSCALL) |
1275                 (is_efer_nx() ? bit(X86_FEATURE_NX) : 0) |
1276 #ifdef CONFIG_X86_64
1277                 bit(X86_FEATURE_LM) |
1278 #endif
1279                 bit(X86_FEATURE_FXSR_OPT) |
1280                 bit(X86_FEATURE_MMXEXT) |
1281                 bit(X86_FEATURE_3DNOWEXT) |
1282                 bit(X86_FEATURE_3DNOW);
1283         const u32 kvm_supported_word3_x86_features =
1284                 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
1285         const u32 kvm_supported_word6_x86_features =
1286                 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) |
1287                 bit(X86_FEATURE_SVM);
1288
1289         /* all calls to cpuid_count() should be made on the same cpu */
1290         get_cpu();
1291         do_cpuid_1_ent(entry, function, index);
1292         ++*nent;
1293
1294         switch (function) {
1295         case 0:
1296                 entry->eax = min(entry->eax, (u32)0xb);
1297                 break;
1298         case 1:
1299                 entry->edx &= kvm_supported_word0_x86_features;
1300                 entry->ecx &= kvm_supported_word3_x86_features;
1301                 break;
1302         /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1303          * may return different values. This forces us to get_cpu() before
1304          * issuing the first command, and also to emulate this annoying behavior
1305          * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
1306         case 2: {
1307                 int t, times = entry->eax & 0xff;
1308
1309                 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1310                 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
1311                 for (t = 1; t < times && *nent < maxnent; ++t) {
1312                         do_cpuid_1_ent(&entry[t], function, 0);
1313                         entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1314                         ++*nent;
1315                 }
1316                 break;
1317         }
1318         /* function 4 and 0xb have additional index. */
1319         case 4: {
1320                 int i, cache_type;
1321
1322                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1323                 /* read more entries until cache_type is zero */
1324                 for (i = 1; *nent < maxnent; ++i) {
1325                         cache_type = entry[i - 1].eax & 0x1f;
1326                         if (!cache_type)
1327                                 break;
1328                         do_cpuid_1_ent(&entry[i], function, i);
1329                         entry[i].flags |=
1330                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1331                         ++*nent;
1332                 }
1333                 break;
1334         }
1335         case 0xb: {
1336                 int i, level_type;
1337
1338                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1339                 /* read more entries until level_type is zero */
1340                 for (i = 1; *nent < maxnent; ++i) {
1341                         level_type = entry[i - 1].ecx & 0xff00;
1342                         if (!level_type)
1343                                 break;
1344                         do_cpuid_1_ent(&entry[i], function, i);
1345                         entry[i].flags |=
1346                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1347                         ++*nent;
1348                 }
1349                 break;
1350         }
1351         case 0x80000000:
1352                 entry->eax = min(entry->eax, 0x8000001a);
1353                 break;
1354         case 0x80000001:
1355                 entry->edx &= kvm_supported_word1_x86_features;
1356                 entry->ecx &= kvm_supported_word6_x86_features;
1357                 break;
1358         }
1359         put_cpu();
1360 }
1361
1362 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1363                                      struct kvm_cpuid_entry2 __user *entries)
1364 {
1365         struct kvm_cpuid_entry2 *cpuid_entries;
1366         int limit, nent = 0, r = -E2BIG;
1367         u32 func;
1368
1369         if (cpuid->nent < 1)
1370                 goto out;
1371         r = -ENOMEM;
1372         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
1373         if (!cpuid_entries)
1374                 goto out;
1375
1376         do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
1377         limit = cpuid_entries[0].eax;
1378         for (func = 1; func <= limit && nent < cpuid->nent; ++func)
1379                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1380                              &nent, cpuid->nent);
1381         r = -E2BIG;
1382         if (nent >= cpuid->nent)
1383                 goto out_free;
1384
1385         do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1386         limit = cpuid_entries[nent - 1].eax;
1387         for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1388                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1389                              &nent, cpuid->nent);
1390         r = -EFAULT;
1391         if (copy_to_user(entries, cpuid_entries,
1392                          nent * sizeof(struct kvm_cpuid_entry2)))
1393                 goto out_free;
1394         cpuid->nent = nent;
1395         r = 0;
1396
1397 out_free:
1398         vfree(cpuid_entries);
1399 out:
1400         return r;
1401 }
1402
1403 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1404                                     struct kvm_lapic_state *s)
1405 {
1406         vcpu_load(vcpu);
1407         memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1408         vcpu_put(vcpu);
1409
1410         return 0;
1411 }
1412
1413 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1414                                     struct kvm_lapic_state *s)
1415 {
1416         vcpu_load(vcpu);
1417         memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1418         kvm_apic_post_state_restore(vcpu);
1419         vcpu_put(vcpu);
1420
1421         return 0;
1422 }
1423
1424 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1425                                     struct kvm_interrupt *irq)
1426 {
1427         if (irq->irq < 0 || irq->irq >= 256)
1428                 return -EINVAL;
1429         if (irqchip_in_kernel(vcpu->kvm))
1430                 return -ENXIO;
1431         vcpu_load(vcpu);
1432
1433         set_bit(irq->irq, vcpu->arch.irq_pending);
1434         set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1435
1436         vcpu_put(vcpu);
1437
1438         return 0;
1439 }
1440
/*
 * Call kvm_inject_nmi() on 'vcpu' with the vcpu loaded.  Always returns 0.
 */
static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);

	kvm_inject_nmi(vcpu);

	vcpu_put(vcpu);
	return 0;
}
1449
1450 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1451                                            struct kvm_tpr_access_ctl *tac)
1452 {
1453         if (tac->flags)
1454                 return -EINVAL;
1455         vcpu->arch.tpr_access_reporting = !!tac->enabled;
1456         return 0;
1457 }
1458
1459 long kvm_arch_vcpu_ioctl(struct file *filp,
1460                          unsigned int ioctl, unsigned long arg)
1461 {
1462         struct kvm_vcpu *vcpu = filp->private_data;
1463         void __user *argp = (void __user *)arg;
1464         int r;
1465         struct kvm_lapic_state *lapic = NULL;
1466
1467         switch (ioctl) {
1468         case KVM_GET_LAPIC: {
1469                 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1470
1471                 r = -ENOMEM;
1472                 if (!lapic)
1473                         goto out;
1474                 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
1475                 if (r)
1476                         goto out;
1477                 r = -EFAULT;
1478                 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
1479                         goto out;
1480                 r = 0;
1481                 break;
1482         }
1483         case KVM_SET_LAPIC: {
1484                 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1485                 r = -ENOMEM;
1486                 if (!lapic)
1487                         goto out;
1488                 r = -EFAULT;
1489                 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
1490                         goto out;
1491                 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
1492                 if (r)
1493                         goto out;
1494                 r = 0;
1495                 break;
1496         }
1497         case KVM_INTERRUPT: {
1498                 struct kvm_interrupt irq;
1499
1500                 r = -EFAULT;
1501                 if (copy_from_user(&irq, argp, sizeof irq))
1502                         goto out;
1503                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1504                 if (r)
1505                         goto out;
1506                 r = 0;
1507                 break;
1508         }
1509         case KVM_NMI: {
1510                 r = kvm_vcpu_ioctl_nmi(vcpu);
1511                 if (r)
1512                         goto out;
1513                 r = 0;
1514                 break;
1515         }
1516         case KVM_SET_CPUID: {
1517                 struct kvm_cpuid __user *cpuid_arg = argp;
1518                 struct kvm_cpuid cpuid;
1519
1520                 r = -EFAULT;
1521                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1522                         goto out;
1523                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1524                 if (r)
1525                         goto out;
1526                 break;
1527         }
1528         case KVM_SET_CPUID2: {
1529                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1530                 struct kvm_cpuid2 cpuid;
1531
1532                 r = -EFAULT;
1533                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1534                         goto out;
1535                 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1536                                               cpuid_arg->entries);
1537                 if (r)
1538                         goto out;
1539                 break;
1540         }
1541         case KVM_GET_CPUID2: {
1542                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1543                 struct kvm_cpuid2 cpuid;
1544
1545                 r = -EFAULT;
1546                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1547                         goto out;
1548                 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1549                                               cpuid_arg->entries);
1550                 if (r)
1551                         goto out;
1552                 r = -EFAULT;
1553                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1554                         goto out;
1555                 r = 0;
1556                 break;
1557         }
1558         case KVM_GET_MSRS:
1559                 r = msr_io(vcpu, argp, kvm_get_msr, 1);
1560                 break;
1561         case KVM_SET_MSRS:
1562                 r = msr_io(vcpu, argp, do_set_msr, 0);
1563                 break;
1564         case KVM_TPR_ACCESS_REPORTING: {
1565                 struct kvm_tpr_access_ctl tac;
1566
1567                 r = -EFAULT;
1568                 if (copy_from_user(&tac, argp, sizeof tac))
1569                         goto out;
1570                 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
1571                 if (r)
1572                         goto out;
1573                 r = -EFAULT;
1574                 if (copy_to_user(argp, &tac, sizeof tac))
1575                         goto out;
1576                 r = 0;
1577                 break;
1578         };
1579         case KVM_SET_VAPIC_ADDR: {
1580                 struct kvm_vapic_addr va;
1581
1582                 r = -EINVAL;
1583                 if (!irqchip_in_kernel(vcpu->kvm))
1584                         goto out;
1585                 r = -EFAULT;
1586                 if (copy_from_user(&va, argp, sizeof va))
1587                         goto out;
1588                 r = 0;
1589                 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1590                 break;
1591         }
1592         default:
1593                 r = -EINVAL;
1594         }
1595 out:
1596         kfree(lapic);
1597         return r;
1598 }
1599
1600 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1601 {
1602         int ret;
1603
1604         if (addr > (unsigned int)(-3 * PAGE_SIZE))
1605                 return -1;
1606         ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1607         return ret;
1608 }
1609
1610 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1611                                           u32 kvm_nr_mmu_pages)
1612 {
1613         if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1614                 return -EINVAL;
1615
1616         down_write(&kvm->slots_lock);
1617
1618         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1619         kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1620
1621         up_write(&kvm->slots_lock);
1622         return 0;
1623 }
1624
1625 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1626 {
1627         return kvm->arch.n_alloc_mmu_pages;
1628 }
1629
1630 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1631 {
1632         int i;
1633         struct kvm_mem_alias *alias;
1634
1635         for (i = 0; i < kvm->arch.naliases; ++i) {
1636                 alias = &kvm->arch.aliases[i];
1637                 if (gfn >= alias->base_gfn
1638                     && gfn < alias->base_gfn + alias->npages)
1639                         return alias->target_gfn + gfn - alias->base_gfn;
1640         }
1641         return gfn;
1642 }
1643
1644 /*
1645  * Set a new alias region.  Aliases map a portion of physical memory into
1646  * another portion.  This is useful for memory windows, for example the PC
1647  * VGA region.
1648  */
1649 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1650                                          struct kvm_memory_alias *alias)
1651 {
1652         int r, n;
1653         struct kvm_mem_alias *p;
1654
1655         r = -EINVAL;
1656         /* General sanity checks */
1657         if (alias->memory_size & (PAGE_SIZE - 1))
1658                 goto out;
1659         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
1660                 goto out;
1661         if (alias->slot >= KVM_ALIAS_SLOTS)
1662                 goto out;
1663         if (alias->guest_phys_addr + alias->memory_size
1664             < alias->guest_phys_addr)
1665                 goto out;
1666         if (alias->target_phys_addr + alias->memory_size
1667             < alias->target_phys_addr)
1668                 goto out;
1669
1670         down_write(&kvm->slots_lock);
1671         spin_lock(&kvm->mmu_lock);
1672
1673         p = &kvm->arch.aliases[alias->slot];
1674         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
1675         p->npages = alias->memory_size >> PAGE_SHIFT;
1676         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
1677
1678         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
1679                 if (kvm->arch.aliases[n - 1].npages)
1680                         break;
1681         kvm->arch.naliases = n;
1682
1683         spin_unlock(&kvm->mmu_lock);
1684         kvm_mmu_zap_all(kvm);
1685
1686         up_write(&kvm->slots_lock);
1687
1688         return 0;
1689
1690 out:
1691         return r;
1692 }
1693
1694 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1695 {
1696         int r;
1697
1698         r = 0;
1699         switch (chip->chip_id) {
1700         case KVM_IRQCHIP_PIC_MASTER:
1701                 memcpy(&chip->chip.pic,
1702                         &pic_irqchip(kvm)->pics[0],
1703                         sizeof(struct kvm_pic_state));
1704                 break;
1705         case KVM_IRQCHIP_PIC_SLAVE:
1706                 memcpy(&chip->chip.pic,
1707                         &pic_irqchip(kvm)->pics[1],
1708                         sizeof(struct kvm_pic_state));
1709                 break;
1710         case KVM_IRQCHIP_IOAPIC:
1711                 memcpy(&chip->chip.ioapic,
1712                         ioapic_irqchip(kvm),
1713                         sizeof(struct kvm_ioapic_state));
1714                 break;
1715         default:
1716                 r = -EINVAL;
1717                 break;
1718         }
1719         return r;
1720 }
1721
1722 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1723 {
1724         int r;
1725
1726         r = 0;
1727         switch (chip->chip_id) {
1728         case KVM_IRQCHIP_PIC_MASTER:
1729                 memcpy(&pic_irqchip(kvm)->pics[0],
1730                         &chip->chip.pic,
1731                         sizeof(struct kvm_pic_state));
1732                 break;
1733         case KVM_IRQCHIP_PIC_SLAVE:
1734                 memcpy(&pic_irqchip(kvm)->pics[1],
1735                         &chip->chip.pic,
1736                         sizeof(struct kvm_pic_state));
1737                 break;
1738         case KVM_IRQCHIP_IOAPIC:
1739                 memcpy(ioapic_irqchip(kvm),
1740                         &chip->chip.ioapic,
1741                         sizeof(struct kvm_ioapic_state));
1742                 break;
1743         default:
1744                 r = -EINVAL;
1745                 break;
1746         }
1747         kvm_pic_update_irq(pic_irqchip(kvm));
1748         return r;
1749 }
1750
1751 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1752 {
1753         int r = 0;
1754
1755         memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
1756         return r;
1757 }
1758
1759 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1760 {
1761         int r = 0;
1762
1763         memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
1764         kvm_pit_load_count(kvm, 0, ps->channels[0].count);
1765         return r;
1766 }
1767
1768 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
1769                                  struct kvm_reinject_control *control)
1770 {
1771         if (!kvm->arch.vpit)
1772                 return -ENXIO;
1773         kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
1774         return 0;
1775 }
1776
1777 /*
1778  * Get (and clear) the dirty memory log for a memory slot.
1779  */
1780 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1781                                       struct kvm_dirty_log *log)
1782 {
1783         int r;
1784         int n;
1785         struct kvm_memory_slot *memslot;
1786         int is_dirty = 0;
1787
1788         down_write(&kvm->slots_lock);
1789
1790         r = kvm_get_dirty_log(kvm, log, &is_dirty);
1791         if (r)
1792                 goto out;
1793
1794         /* If nothing is dirty, don't bother messing with page tables. */
1795         if (is_dirty) {
1796                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1797                 kvm_flush_remote_tlbs(kvm);
1798                 memslot = &kvm->memslots[log->slot];
1799                 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1800                 memset(memslot->dirty_bitmap, 0, n);
1801         }
1802         r = 0;
1803 out:
1804         up_write(&kvm->slots_lock);
1805         return r;
1806 }
1807
1808 long kvm_arch_vm_ioctl(struct file *filp,
1809                        unsigned int ioctl, unsigned long arg)
1810 {
1811         struct kvm *kvm = filp->private_data;
1812         void __user *argp = (void __user *)arg;
1813         int r = -EINVAL;
1814         /*
1815          * This union makes it completely explicit to gcc-3.x
1816          * that these two variables' stack usage should be
1817          * combined, not added together.
1818          */
1819         union {
1820                 struct kvm_pit_state ps;
1821                 struct kvm_memory_alias alias;
1822         } u;
1823
1824         switch (ioctl) {
1825         case KVM_SET_TSS_ADDR:
1826                 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1827                 if (r < 0)
1828                         goto out;
1829                 break;
1830         case KVM_SET_MEMORY_REGION: {
1831                 struct kvm_memory_region kvm_mem;
1832                 struct kvm_userspace_memory_region kvm_userspace_mem;
1833
1834                 r = -EFAULT;
1835                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
1836                         goto out;
1837                 kvm_userspace_mem.slot = kvm_mem.slot;
1838                 kvm_userspace_mem.flags = kvm_mem.flags;
1839                 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
1840                 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
1841                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
1842                 if (r)
1843                         goto out;
1844                 break;
1845         }
1846         case KVM_SET_NR_MMU_PAGES:
1847                 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1848                 if (r)
1849                         goto out;
1850                 break;
1851         case KVM_GET_NR_MMU_PAGES:
1852                 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1853                 break;
1854         case KVM_SET_MEMORY_ALIAS:
1855                 r = -EFAULT;
1856                 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
1857                         goto out;
1858                 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
1859                 if (r)
1860                         goto out;
1861                 break;
1862         case KVM_CREATE_IRQCHIP:
1863                 r = -ENOMEM;
1864                 kvm->arch.vpic = kvm_create_pic(kvm);
1865                 if (kvm->arch.vpic) {
1866                         r = kvm_ioapic_init(kvm);
1867                         if (r) {
1868                                 kfree(kvm->arch.vpic);
1869                                 kvm->arch.vpic = NULL;
1870                                 goto out;
1871                         }
1872                 } else
1873                         goto out;
1874                 r = kvm_setup_default_irq_routing(kvm);
1875                 if (r) {
1876                         kfree(kvm->arch.vpic);
1877                         kfree(kvm->arch.vioapic);
1878                         goto out;
1879                 }
1880                 break;
1881         case KVM_CREATE_PIT:
1882                 mutex_lock(&kvm->lock);
1883                 r = -EEXIST;
1884                 if (kvm->arch.vpit)
1885                         goto create_pit_unlock;
1886                 r = -ENOMEM;
1887                 kvm->arch.vpit = kvm_create_pit(kvm);
1888                 if (kvm->arch.vpit)
1889                         r = 0;
1890         create_pit_unlock:
1891                 mutex_unlock(&kvm->lock);
1892                 break;
1893         case KVM_IRQ_LINE_STATUS:
1894         case KVM_IRQ_LINE: {
1895                 struct kvm_irq_level irq_event;
1896
1897                 r = -EFAULT;
1898                 if (copy_from_user(&irq_event, argp, sizeof irq_event))
1899                         goto out;
1900                 if (irqchip_in_kernel(kvm)) {
1901                         __s32 status;
1902                         mutex_lock(&kvm->lock);
1903                         status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
1904                                         irq_event.irq, irq_event.level);
1905                         mutex_unlock(&kvm->lock);
1906                         if (ioctl == KVM_IRQ_LINE_STATUS) {
1907                                 irq_event.status = status;
1908                                 if (copy_to_user(argp, &irq_event,
1909                                                         sizeof irq_event))
1910                                         goto out;
1911                         }
1912                         r = 0;
1913                 }
1914                 break;
1915         }
1916         case KVM_GET_IRQCHIP: {
1917                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1918                 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1919
1920                 r = -ENOMEM;
1921                 if (!chip)
1922                         goto out;
1923                 r = -EFAULT;
1924                 if (copy_from_user(chip, argp, sizeof *chip))
1925                         goto get_irqchip_out;
1926                 r = -ENXIO;
1927                 if (!irqchip_in_kernel(kvm))
1928                         goto get_irqchip_out;
1929                 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
1930                 if (r)
1931                         goto get_irqchip_out;
1932                 r = -EFAULT;
1933                 if (copy_to_user(argp, chip, sizeof *chip))
1934                         goto get_irqchip_out;
1935                 r = 0;
1936         get_irqchip_out:
1937                 kfree(chip);
1938                 if (r)
1939                         goto out;
1940                 break;
1941         }
1942         case KVM_SET_IRQCHIP: {
1943                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1944                 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1945
1946                 r = -ENOMEM;
1947                 if (!chip)
1948                         goto out;
1949                 r = -EFAULT;
1950                 if (copy_from_user(chip, argp, sizeof *chip))
1951                         goto set_irqchip_out;
1952                 r = -ENXIO;
1953                 if (!irqchip_in_kernel(kvm))
1954                         goto set_irqchip_out;
1955                 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
1956                 if (r)
1957                         goto set_irqchip_out;
1958                 r = 0;
1959         set_irqchip_out:
1960                 kfree(chip);
1961                 if (r)
1962                         goto out;
1963                 break;
1964         }
1965         case KVM_GET_PIT: {
1966                 r = -EFAULT;
1967                 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
1968                         goto out;
1969                 r = -ENXIO;
1970                 if (!kvm->arch.vpit)
1971                         goto out;
1972                 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
1973                 if (r)
1974                         goto out;
1975                 r = -EFAULT;
1976                 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
1977                         goto out;
1978                 r = 0;
1979                 break;
1980         }
1981         case KVM_SET_PIT: {
1982                 r = -EFAULT;
1983                 if (copy_from_user(&u.ps, argp, sizeof u.ps))
1984                         goto out;
1985                 r = -ENXIO;
1986                 if (!kvm->arch.vpit)
1987                         goto out;
1988                 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
1989                 if (r)
1990                         goto out;
1991                 r = 0;
1992                 break;
1993         }
1994         case KVM_REINJECT_CONTROL: {
1995                 struct kvm_reinject_control control;
1996                 r =  -EFAULT;
1997                 if (copy_from_user(&control, argp, sizeof(control)))
1998                         goto out;
1999                 r = kvm_vm_ioctl_reinject(kvm, &control);
2000                 if (r)
2001                         goto out;
2002                 r = 0;
2003                 break;
2004         }
2005         default:
2006                 ;
2007         }
2008 out:
2009         return r;
2010 }
2011
2012 static void kvm_init_msr_list(void)
2013 {
2014         u32 dummy[2];
2015         unsigned i, j;
2016
2017         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
2018                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2019                         continue;
2020                 if (j < i)
2021                         msrs_to_save[j] = msrs_to_save[i];
2022                 j++;
2023         }
2024         num_msrs_to_save = j;
2025 }
2026
2027 /*
2028  * Only apic need an MMIO device hook, so shortcut now..
2029  */
2030 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
2031                                                 gpa_t addr, int len,
2032                                                 int is_write)
2033 {
2034         struct kvm_io_device *dev;
2035
2036         if (vcpu->arch.apic) {
2037                 dev = &vcpu->arch.apic->dev;
2038                 if (dev->in_range(dev, addr, len, is_write))
2039                         return dev;
2040         }
2041         return NULL;
2042 }
2043
2044
2045 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
2046                                                 gpa_t addr, int len,
2047                                                 int is_write)
2048 {
2049         struct kvm_io_device *dev;
2050
2051         dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
2052         if (dev == NULL)
2053                 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
2054                                           is_write);
2055         return dev;
2056 }
2057
2058 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
2059                                struct kvm_vcpu *vcpu)
2060 {
2061         void *data = val;
2062         int r = X86EMUL_CONTINUE;
2063
2064         while (bytes) {
2065                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2066                 unsigned offset = addr & (PAGE_SIZE-1);
2067                 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
2068                 int ret;
2069
2070                 if (gpa == UNMAPPED_GVA) {
2071                         r = X86EMUL_PROPAGATE_FAULT;
2072                         goto out;
2073                 }
2074                 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
2075                 if (ret < 0) {
2076                         r = X86EMUL_UNHANDLEABLE;
2077                         goto out;
2078                 }
2079
2080                 bytes -= toread;
2081                 data += toread;
2082                 addr += toread;
2083         }
2084 out:
2085         return r;
2086 }
2087
2088 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
2089                                 struct kvm_vcpu *vcpu)
2090 {
2091         void *data = val;
2092         int r = X86EMUL_CONTINUE;
2093
2094         while (bytes) {
2095                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2096                 unsigned offset = addr & (PAGE_SIZE-1);
2097                 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
2098                 int ret;
2099
2100                 if (gpa == UNMAPPED_GVA) {
2101                         r = X86EMUL_PROPAGATE_FAULT;
2102                         goto out;
2103                 }
2104                 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
2105                 if (ret < 0) {
2106                         r = X86EMUL_UNHANDLEABLE;
2107                         goto out;
2108                 }
2109
2110                 bytes -= towrite;
2111                 data += towrite;
2112                 addr += towrite;
2113         }
2114 out:
2115         return r;
2116 }
2117
2118
2119 static int emulator_read_emulated(unsigned long addr,
2120                                   void *val,
2121                                   unsigned int bytes,
2122                                   struct kvm_vcpu *vcpu)
2123 {
2124         struct kvm_io_device *mmio_dev;
2125         gpa_t                 gpa;
2126
2127         if (vcpu->mmio_read_completed) {
2128                 memcpy(val, vcpu->mmio_data, bytes);
2129                 vcpu->mmio_read_completed = 0;
2130                 return X86EMUL_CONTINUE;
2131         }
2132
2133         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2134
2135         /* For APIC access vmexit */
2136         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2137                 goto mmio;
2138
2139         if (kvm_read_guest_virt(addr, val, bytes, vcpu)
2140                                 == X86EMUL_CONTINUE)
2141                 return X86EMUL_CONTINUE;
2142         if (gpa == UNMAPPED_GVA)
2143                 return X86EMUL_PROPAGATE_FAULT;
2144
2145 mmio:
2146         /*
2147          * Is this MMIO handled locally?
2148          */
2149         mutex_lock(&vcpu->kvm->lock);
2150         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
2151         if (mmio_dev) {
2152                 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
2153                 mutex_unlock(&vcpu->kvm->lock);
2154                 return X86EMUL_CONTINUE;
2155         }
2156         mutex_unlock(&vcpu->kvm->lock);
2157
2158         vcpu->mmio_needed = 1;
2159         vcpu->mmio_phys_addr = gpa;
2160         vcpu->mmio_size = bytes;
2161         vcpu->mmio_is_write = 0;
2162
2163         return X86EMUL_UNHANDLEABLE;
2164 }
2165
2166 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
2167                           const void *val, int bytes)
2168 {
2169         int ret;
2170
2171         ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
2172         if (ret < 0)
2173                 return 0;
2174         kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
2175         return 1;
2176 }
2177
2178 static int emulator_write_emulated_onepage(unsigned long addr,
2179                                            const void *val,
2180                                            unsigned int bytes,
2181                                            struct kvm_vcpu *vcpu)
2182 {
2183         struct kvm_io_device *mmio_dev;
2184         gpa_t                 gpa;
2185
2186         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2187
2188         if (gpa == UNMAPPED_GVA) {
2189                 kvm_inject_page_fault(vcpu, addr, 2);
2190                 return X86EMUL_PROPAGATE_FAULT;
2191         }
2192
2193         /* For APIC access vmexit */
2194         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2195                 goto mmio;
2196
2197         if (emulator_write_phys(vcpu, gpa, val, bytes))
2198                 return X86EMUL_CONTINUE;
2199
2200 mmio:
2201         /*
2202          * Is this MMIO handled locally?
2203          */
2204         mutex_lock(&vcpu->kvm->lock);
2205         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
2206         if (mmio_dev) {
2207                 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
2208                 mutex_unlock(&vcpu->kvm->lock);
2209                 return X86EMUL_CONTINUE;
2210         }
2211         mutex_unlock(&vcpu->kvm->lock);
2212
2213         vcpu->mmio_needed = 1;
2214         vcpu->mmio_phys_addr = gpa;
2215         vcpu->mmio_size = bytes;
2216         vcpu->mmio_is_write = 1;
2217         memcpy(vcpu->mmio_data, val, bytes);
2218
2219         return X86EMUL_CONTINUE;
2220 }
2221
2222 int emulator_write_emulated(unsigned long addr,
2223                                    const void *val,
2224                                    unsigned int bytes,
2225                                    struct kvm_vcpu *vcpu)
2226 {
2227         /* Crossing a page boundary? */
2228         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
2229                 int rc, now;
2230
2231                 now = -addr & ~PAGE_MASK;
2232                 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
2233                 if (rc != X86EMUL_CONTINUE)
2234                         return rc;
2235                 addr += now;
2236                 val += now;
2237                 bytes -= now;
2238         }
2239         return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
2240 }
2241 EXPORT_SYMBOL_GPL(emulator_write_emulated);
2242
2243 static int emulator_cmpxchg_emulated(unsigned long addr,
2244                                      const void *old,
2245                                      const void *new,
2246                                      unsigned int bytes,
2247                                      struct kvm_vcpu *vcpu)
2248 {
2249         static int reported;
2250
2251         if (!reported) {
2252                 reported = 1;
2253                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
2254         }
2255 #ifndef CONFIG_X86_64
2256         /* guests cmpxchg8b have to be emulated atomically */
2257         if (bytes == 8) {
2258                 gpa_t gpa;
2259                 struct page *page;
2260                 char *kaddr;
2261                 u64 val;
2262
2263                 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2264
2265                 if (gpa == UNMAPPED_GVA ||
2266                    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2267                         goto emul_write;
2268
2269                 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
2270                         goto emul_write;
2271
2272                 val = *(u64 *)new;
2273
2274                 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2275
2276                 kaddr = kmap_atomic(page, KM_USER0);
2277                 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
2278                 kunmap_atomic(kaddr, KM_USER0);
2279                 kvm_release_page_dirty(page);
2280         }
2281 emul_write:
2282 #endif
2283
2284         return emulator_write_emulated(addr, new, bytes, vcpu);
2285 }
2286
2287 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
2288 {
2289         return kvm_x86_ops->get_segment_base(vcpu, seg);
2290 }
2291
2292 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2293 {
2294         kvm_mmu_invlpg(vcpu, address);
2295         return X86EMUL_CONTINUE;
2296 }
2297
2298 int emulate_clts(struct kvm_vcpu *vcpu)
2299 {
2300         KVMTRACE_0D(CLTS, vcpu, handler);
2301         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
2302         return X86EMUL_CONTINUE;
2303 }
2304
2305 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
2306 {
2307         struct kvm_vcpu *vcpu = ctxt->vcpu;
2308
2309         switch (dr) {
2310         case 0 ... 3:
2311                 *dest = kvm_x86_ops->get_dr(vcpu, dr);
2312                 return X86EMUL_CONTINUE;
2313         default:
2314                 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
2315                 return X86EMUL_UNHANDLEABLE;
2316         }
2317 }
2318
2319 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2320 {
2321         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
2322         int exception;
2323
2324         kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
2325         if (exception) {
2326                 /* FIXME: better handling */
2327                 return X86EMUL_UNHANDLEABLE;
2328         }
2329         return X86EMUL_CONTINUE;
2330 }
2331
2332 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2333 {
2334         u8 opcodes[4];
2335         unsigned long rip = kvm_rip_read(vcpu);
2336         unsigned long rip_linear;
2337
2338         if (!printk_ratelimit())
2339                 return;
2340
2341         rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2342
2343         kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);
2344
2345         printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2346                context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2347 }
2348 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2349
/*
 * Guest memory access callbacks handed to x86_decode_insn() and
 * x86_emulate_insn() by emulate_instruction().
 */
static struct x86_emulate_ops emulate_ops = {
        .read_std            = kvm_read_guest_virt,
        .read_emulated       = emulator_read_emulated,
        .write_emulated      = emulator_write_emulated,
        .cmpxchg_emulated    = emulator_cmpxchg_emulated,
};
2356
2357 static void cache_all_regs(struct kvm_vcpu *vcpu)
2358 {
2359         kvm_register_read(vcpu, VCPU_REGS_RAX);
2360         kvm_register_read(vcpu, VCPU_REGS_RSP);
2361         kvm_register_read(vcpu, VCPU_REGS_RIP);
2362         vcpu->arch.regs_dirty = ~0;
2363 }
2364
2365 int emulate_instruction(struct kvm_vcpu *vcpu,
2366                         struct kvm_run *run,
2367                         unsigned long cr2,
2368                         u16 error_code,
2369                         int emulation_type)
2370 {
2371         int r;
2372         struct decode_cache *c;
2373
2374         kvm_clear_exception_queue(vcpu);
2375         vcpu->arch.mmio_fault_cr2 = cr2;
2376         /*
2377          * TODO: fix x86_emulate.c to use guest_read/write_register
2378          * instead of direct ->regs accesses, can save hundred cycles
2379          * on Intel for instructions that don't read/change RSP, for
2380          * for example.
2381          */
2382         cache_all_regs(vcpu);
2383
2384         vcpu->mmio_is_write = 0;
2385         vcpu->arch.pio.string = 0;
2386
2387         if (!(emulation_type & EMULTYPE_NO_DECODE)) {
2388                 int cs_db, cs_l;
2389                 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2390
2391                 vcpu->arch.emulate_ctxt.vcpu = vcpu;
2392                 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
2393                 vcpu->arch.emulate_ctxt.mode =
2394                         (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
2395                         ? X86EMUL_MODE_REAL : cs_l
2396                         ? X86EMUL_MODE_PROT64 : cs_db
2397                         ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2398
2399                 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2400
2401                 /* Reject the instructions other than VMCALL/VMMCALL when
2402                  * try to emulate invalid opcode */
2403                 c = &vcpu->arch.emulate_ctxt.decode;
2404                 if ((emulation_type & EMULTYPE_TRAP_UD) &&
2405                     (!(c->twobyte && c->b == 0x01 &&
2406                       (c->modrm_reg == 0 || c->modrm_reg == 3) &&
2407                        c->modrm_mod == 3 && c->modrm_rm == 1)))
2408                         return EMULATE_FAIL;
2409
2410                 ++vcpu->stat.insn_emulation;
2411                 if (r)  {
2412                         ++vcpu->stat.insn_emulation_fail;
2413                         if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2414                                 return EMULATE_DONE;
2415                         return EMULATE_FAIL;
2416                 }
2417         }
2418
2419         r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2420
2421         if (vcpu->arch.pio.string)
2422                 return EMULATE_DO_MMIO;
2423
2424         if ((r || vcpu->mmio_is_write) && run) {
2425                 run->exit_reason = KVM_EXIT_MMIO;
2426                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
2427                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
2428                 run->mmio.len = vcpu->mmio_size;
2429                 run->mmio.is_write = vcpu->mmio_is_write;
2430         }
2431
2432         if (r) {
2433                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2434                         return EMULATE_DONE;
2435                 if (!vcpu->mmio_needed) {
2436                         kvm_report_emulation_failure(vcpu, "mmio");
2437                         return EMULATE_FAIL;
2438                 }
2439                 return EMULATE_DO_MMIO;
2440         }
2441
2442         kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2443
2444         if (vcpu->mmio_is_write) {
2445                 vcpu->mmio_needed = 0;
2446                 return EMULATE_DO_MMIO;
2447         }
2448
2449         return EMULATE_DONE;
2450 }
2451 EXPORT_SYMBOL_GPL(emulate_instruction);
2452
2453 static int pio_copy_data(struct kvm_vcpu *vcpu)
2454 {
2455         void *p = vcpu->arch.pio_data;
2456         gva_t q = vcpu->arch.pio.guest_gva;
2457         unsigned bytes;
2458         int ret;
2459
2460         bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
2461         if (vcpu->arch.pio.in)
2462                 ret = kvm_write_guest_virt(q, p, bytes, vcpu);
2463         else
2464                 ret = kvm_read_guest_virt(q, p, bytes, vcpu);
2465         return ret;
2466 }
2467
2468 int complete_pio(struct kvm_vcpu *vcpu)
2469 {
2470         struct kvm_pio_request *io = &vcpu->arch.pio;
2471         long delta;
2472         int r;
2473         unsigned long val;
2474
2475         if (!io->string) {
2476                 if (io->in) {
2477                         val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2478                         memcpy(&val, vcpu->arch.pio_data, io->size);
2479                         kvm_register_write(vcpu, VCPU_REGS_RAX, val);
2480                 }
2481         } else {
2482                 if (io->in) {
2483                         r = pio_copy_data(vcpu);
2484                         if (r)
2485                                 return r;
2486                 }
2487
2488                 delta = 1;
2489                 if (io->rep) {
2490                         delta *= io->cur_count;
2491                         /*
2492                          * The size of the register should really depend on
2493                          * current address size.
2494                          */
2495                         val = kvm_register_read(vcpu, VCPU_REGS_RCX);
2496                         val -= delta;
2497                         kvm_register_write(vcpu, VCPU_REGS_RCX, val);
2498                 }
2499                 if (io->down)
2500                         delta = -delta;
2501                 delta *= io->size;
2502                 if (io->in) {
2503                         val = kvm_register_read(vcpu, VCPU_REGS_RDI);
2504                         val += delta;
2505                         kvm_register_write(vcpu, VCPU_REGS_RDI, val);
2506                 } else {
2507                         val = kvm_register_read(vcpu, VCPU_REGS_RSI);
2508                         val += delta;
2509                         kvm_register_write(vcpu, VCPU_REGS_RSI, val);
2510                 }
2511         }
2512
2513         io->count -= io->cur_count;
2514         io->cur_count = 0;
2515
2516         return 0;
2517 }
2518
2519 static void kernel_pio(struct kvm_io_device *pio_dev,
2520                        struct kvm_vcpu *vcpu,
2521                        void *pd)
2522 {
2523         /* TODO: String I/O for in kernel device */
2524
2525         mutex_lock(&vcpu->kvm->lock);
2526         if (vcpu->arch.pio.in)
2527                 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
2528                                   vcpu->arch.pio.size,
2529                                   pd);
2530         else
2531                 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
2532                                    vcpu->arch.pio.size,
2533                                    pd);
2534         mutex_unlock(&vcpu->kvm->lock);
2535 }
2536
2537 static void pio_string_write(struct kvm_io_device *pio_dev,
2538                              struct kvm_vcpu *vcpu)
2539 {
2540         struct kvm_pio_request *io = &vcpu->arch.pio;
2541         void *pd = vcpu->arch.pio_data;
2542         int i;
2543
2544         mutex_lock(&vcpu->kvm->lock);
2545         for (i = 0; i < io->cur_count; i++) {
2546                 kvm_iodevice_write(pio_dev, io->port,
2547                                    io->size,
2548                                    pd);
2549                 pd += io->size;
2550         }
2551         mutex_unlock(&vcpu->kvm->lock);
2552 }
2553
2554 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2555                                                gpa_t addr, int len,
2556                                                int is_write)
2557 {
2558         return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
2559 }
2560
2561 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2562                   int size, unsigned port)
2563 {
2564         struct kvm_io_device *pio_dev;
2565         unsigned long val;
2566
2567         vcpu->run->exit_reason = KVM_EXIT_IO;
2568         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2569         vcpu->run->io.size = vcpu->arch.pio.size = size;
2570         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2571         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
2572         vcpu->run->io.port = vcpu->arch.pio.port = port;
2573         vcpu->arch.pio.in = in;
2574         vcpu->arch.pio.string = 0;
2575         vcpu->arch.pio.down = 0;
2576         vcpu->arch.pio.rep = 0;
2577
2578         if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
2579                 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
2580                             handler);
2581         else
2582                 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2583                             handler);
2584
2585         val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2586         memcpy(vcpu->arch.pio_data, &val, 4);
2587
2588         pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
2589         if (pio_dev) {
2590                 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2591                 complete_pio(vcpu);
2592                 return 1;
2593         }
2594         return 0;
2595 }
2596 EXPORT_SYMBOL_GPL(kvm_emulate_pio);
2597
2598 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2599                   int size, unsigned long count, int down,
2600                   gva_t address, int rep, unsigned port)
2601 {
2602         unsigned now, in_page;
2603         int ret = 0;
2604         struct kvm_io_device *pio_dev;
2605
2606         vcpu->run->exit_reason = KVM_EXIT_IO;
2607         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2608         vcpu->run->io.size = vcpu->arch.pio.size = size;
2609         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2610         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
2611         vcpu->run->io.port = vcpu->arch.pio.port = port;
2612         vcpu->arch.pio.in = in;
2613         vcpu->arch.pio.string = 1;
2614         vcpu->arch.pio.down = down;
2615         vcpu->arch.pio.rep = rep;
2616
2617         if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
2618                 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
2619                             handler);
2620         else
2621                 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2622                             handler);
2623
2624         if (!count) {
2625                 kvm_x86_ops->skip_emulated_instruction(vcpu);
2626                 return 1;
2627         }
2628
2629         if (!down)
2630                 in_page = PAGE_SIZE - offset_in_page(address);
2631         else
2632                 in_page = offset_in_page(address) + size;
2633         now = min(count, (unsigned long)in_page / size);
2634         if (!now)
2635                 now = 1;
2636         if (down) {
2637                 /*
2638                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
2639                  */
2640                 pr_unimpl(vcpu, "guest string pio down\n");
2641                 kvm_inject_gp(vcpu, 0);
2642                 return 1;
2643         }
2644         vcpu->run->io.count = now;
2645         vcpu->arch.pio.cur_count = now;
2646
2647         if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
2648                 kvm_x86_ops->skip_emulated_instruction(vcpu);
2649
2650         vcpu->arch.pio.guest_gva = address;
2651
2652         pio_dev = vcpu_find_pio_dev(vcpu, port,
2653                                     vcpu->arch.pio.cur_count,
2654                                     !vcpu->arch.pio.in);
2655         if (!vcpu->arch.pio.in) {
2656                 /* string PIO write */
2657                 ret = pio_copy_data(vcpu);
2658                 if (ret == X86EMUL_PROPAGATE_FAULT) {
2659                         kvm_inject_gp(vcpu, 0);
2660                         return 1;
2661                 }
2662                 if (ret == 0 && pio_dev) {
2663                         pio_string_write(pio_dev, vcpu);
2664                         complete_pio(vcpu);
2665                         if (vcpu->arch.pio.count == 0)
2666                                 ret = 1;
2667                 }
2668         } else if (pio_dev)
2669                 pr_unimpl(vcpu, "no string pio read support yet, "
2670                        "port %x size %d count %ld\n",
2671                         port, size, count);
2672
2673         return ret;
2674 }
2675 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
2676
/*
 * Deliberately empty cross-CPU callback: kvmclock_cpufreq_notifier() runs
 * it on another CPU purely for the side effect of interrupting that CPU.
 */
static void bounce_off(void *info)
{
	/* intentionally does nothing */
}
2681
/*
 * Reference point for rescaling the per-cpu TSC rate on cpufreq changes:
 * ref_freq is latched from freq->old by the first transition notification,
 * tsc_khz_ref is the tsc_khz value sampled in kvm_arch_init().
 */
static unsigned int ref_freq;
static unsigned long tsc_khz_ref;
2684
2685 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
2686                                      void *data)
2687 {
2688         struct cpufreq_freqs *freq = data;
2689         struct kvm *kvm;
2690         struct kvm_vcpu *vcpu;
2691         int i, send_ipi = 0;
2692
2693         if (!ref_freq)
2694                 ref_freq = freq->old;
2695
2696         if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
2697                 return 0;
2698         if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
2699                 return 0;
2700         per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
2701
2702         spin_lock(&kvm_lock);
2703         list_for_each_entry(kvm, &vm_list, vm_list) {
2704                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
2705                         vcpu = kvm->vcpus[i];
2706                         if (!vcpu)
2707                                 continue;
2708                         if (vcpu->cpu != freq->cpu)
2709                                 continue;
2710                         if (!kvm_request_guest_time_update(vcpu))
2711                                 continue;
2712                         if (vcpu->cpu != smp_processor_id())
2713                                 send_ipi++;
2714                 }
2715         }
2716         spin_unlock(&kvm_lock);
2717
2718         if (freq->old < freq->new && send_ipi) {
2719                 /*
2720                  * We upscale the frequency.  Must make the guest
2721                  * doesn't see old kvmclock values while running with
2722                  * the new frequency, otherwise we risk the guest sees
2723                  * time go backwards.
2724                  *
2725                  * In case we update the frequency for another cpu
2726                  * (which might be in guest context) send an interrupt
2727                  * to kick the cpu out of guest context.  Next time
2728                  * guest context is entered kvmclock will be updated,
2729                  * so the guest will not see stale values.
2730                  */
2731                 smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
2732         }
2733         return 0;
2734 }
2735
2736 static struct notifier_block kvmclock_cpufreq_notifier_block = {
2737         .notifier_call  = kvmclock_cpufreq_notifier
2738 };
2739
2740 int kvm_arch_init(void *opaque)
2741 {
2742         int r, cpu;
2743         struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2744
2745         if (kvm_x86_ops) {
2746                 printk(KERN_ERR "kvm: already loaded the other module\n");
2747                 r = -EEXIST;
2748                 goto out;
2749         }
2750
2751         if (!ops->cpu_has_kvm_support()) {
2752                 printk(KERN_ERR "kvm: no hardware support\n");
2753                 r = -EOPNOTSUPP;
2754                 goto out;
2755         }
2756         if (ops->disabled_by_bios()) {
2757                 printk(KERN_ERR "kvm: disabled by bios\n");
2758                 r = -EOPNOTSUPP;
2759                 goto out;
2760         }
2761
2762         r = kvm_mmu_module_init();
2763         if (r)
2764                 goto out;
2765
2766         kvm_init_msr_list();
2767
2768         kvm_x86_ops = ops;
2769         kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2770         kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
2771         kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
2772                         PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
2773
2774         for_each_possible_cpu(cpu)
2775                 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
2776         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
2777                 tsc_khz_ref = tsc_khz;
2778                 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
2779                                           CPUFREQ_TRANSITION_NOTIFIER);
2780         }
2781
2782         return 0;
2783
2784 out:
2785         return r;
2786 }
2787
2788 void kvm_arch_exit(void)
2789 {
2790         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
2791                 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
2792                                             CPUFREQ_TRANSITION_NOTIFIER);
2793         kvm_x86_ops = NULL;
2794         kvm_mmu_module_exit();
2795 }
2796
2797 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2798 {
2799         ++vcpu->stat.halt_exits;
2800         KVMTRACE_0D(HLT, vcpu, handler);
2801         if (irqchip_in_kernel(vcpu->kvm)) {
2802                 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
2803                 return 1;
2804         } else {
2805                 vcpu->run->exit_reason = KVM_EXIT_HLT;
2806                 return 0;
2807         }
2808 }
2809 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2810
2811 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
2812                            unsigned long a1)
2813 {
2814         if (is_long_mode(vcpu))
2815                 return a0;
2816         else
2817                 return a0 | ((gpa_t)a1 << 32);
2818 }
2819
2820 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2821 {
2822         unsigned long nr, a0, a1, a2, a3, ret;
2823         int r = 1;
2824
2825         nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
2826         a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
2827         a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
2828         a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
2829         a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
2830
2831         KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
2832
2833         if (!is_long_mode(vcpu)) {
2834                 nr &= 0xFFFFFFFF;
2835                 a0 &= 0xFFFFFFFF;
2836                 a1 &= 0xFFFFFFFF;
2837                 a2 &= 0xFFFFFFFF;
2838                 a3 &= 0xFFFFFFFF;
2839         }
2840
2841         switch (nr) {
2842         case KVM_HC_VAPIC_POLL_IRQ:
2843                 ret = 0;
2844                 break;
2845         case KVM_HC_MMU_OP:
2846                 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
2847                 break;
2848         default:
2849                 ret = -KVM_ENOSYS;
2850                 break;
2851         }
2852         kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
2853         ++vcpu->stat.hypercalls;
2854         return r;
2855 }
2856 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
2857
2858 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2859 {
2860         char instruction[3];
2861         int ret = 0;
2862         unsigned long rip = kvm_rip_read(vcpu);
2863
2864
2865         /*
2866          * Blow out the MMU to ensure that no other VCPU has an active mapping
2867          * to ensure that the updated hypercall appears atomically across all
2868          * VCPUs.
2869          */
2870         kvm_mmu_zap_all(vcpu->kvm);
2871
2872         kvm_x86_ops->patch_hypercall(vcpu, instruction);
2873         if (emulator_write_emulated(rip, instruction, 3, vcpu)
2874             != X86EMUL_CONTINUE)
2875                 ret = -EFAULT;
2876
2877         return ret;
2878 }
2879
2880 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
2881 {
2882         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2883 }
2884
2885 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2886 {
2887         struct descriptor_table dt = { limit, base };
2888
2889         kvm_x86_ops->set_gdt(vcpu, &dt);
2890 }
2891
2892 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2893 {
2894         struct descriptor_table dt = { limit, base };
2895
2896         kvm_x86_ops->set_idt(vcpu, &dt);
2897 }
2898
2899 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2900                    unsigned long *rflags)
2901 {
2902         kvm_lmsw(vcpu, msw);
2903         *rflags = kvm_x86_ops->get_rflags(vcpu);
2904 }
2905
2906 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2907 {
2908         unsigned long value;
2909
2910         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2911         switch (cr) {
2912         case 0:
2913                 value = vcpu->arch.cr0;
2914                 break;
2915         case 2:
2916                 value = vcpu->arch.cr2;
2917                 break;
2918         case 3:
2919                 value = vcpu->arch.cr3;
2920                 break;
2921         case 4:
2922                 value = vcpu->arch.cr4;
2923                 break;
2924         case 8:
2925                 value = kvm_get_cr8(vcpu);
2926                 break;
2927         default:
2928                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2929                 return 0;
2930         }
2931         KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
2932                     (u32)((u64)value >> 32), handler);
2933
2934         return value;
2935 }
2936
2937 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2938                      unsigned long *rflags)
2939 {
2940         KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
2941                     (u32)((u64)val >> 32), handler);
2942
2943         switch (cr) {
2944         case 0:
2945                 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
2946                 *rflags = kvm_x86_ops->get_rflags(vcpu);
2947                 break;
2948         case 2:
2949                 vcpu->arch.cr2 = val;
2950                 break;
2951         case 3:
2952                 kvm_set_cr3(vcpu, val);
2953                 break;
2954         case 4:
2955                 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
2956                 break;
2957         case 8:
2958                 kvm_set_cr8(vcpu, val & 0xfUL);
2959                 break;
2960         default:
2961                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2962         }
2963 }
2964
2965 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
2966 {
2967         struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
2968         int j, nent = vcpu->arch.cpuid_nent;
2969
2970         e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
2971         /* when no next entry is found, the current entry[i] is reselected */
2972         for (j = i + 1; ; j = (j + 1) % nent) {
2973                 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
2974                 if (ej->function == e->function) {
2975                         ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
2976                         return j;
2977                 }
2978         }
2979         return 0; /* silence gcc, even though control never reaches here */
2980 }
2981
2982 /* find an entry with matching function, matching index (if needed), and that
2983  * should be read next (if it's stateful) */
2984 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
2985         u32 function, u32 index)
2986 {
2987         if (e->function != function)
2988                 return 0;
2989         if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
2990                 return 0;
2991         if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
2992             !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
2993                 return 0;
2994         return 1;
2995 }
2996
2997 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
2998                                               u32 function, u32 index)
2999 {
3000         int i;
3001         struct kvm_cpuid_entry2 *best = NULL;
3002
3003         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
3004                 struct kvm_cpuid_entry2 *e;
3005
3006                 e = &vcpu->arch.cpuid_entries[i];
3007                 if (is_matching_cpuid_entry(e, function, index)) {
3008                         if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
3009                                 move_to_next_stateful_cpuid_entry(vcpu, i);
3010                         best = e;
3011                         break;
3012                 }
3013                 /*
3014                  * Both basic or both extended?
3015                  */
3016                 if (((e->function ^ function) & 0x80000000) == 0)
3017                         if (!best || e->function > best->function)
3018                                 best = e;
3019         }
3020         return best;
3021 }
3022
3023 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
3024 {
3025         struct kvm_cpuid_entry2 *best;
3026
3027         best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
3028         if (best)
3029                 return best->eax & 0xff;
3030         return 36;
3031 }
3032
3033 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
3034 {
3035         u32 function, index;
3036         struct kvm_cpuid_entry2 *best;
3037
3038         function = kvm_register_read(vcpu, VCPU_REGS_RAX);
3039         index = kvm_register_read(vcpu, VCPU_REGS_RCX);
3040         kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
3041         kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
3042         kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
3043         kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
3044         best = kvm_find_cpuid_entry(vcpu, function, index);
3045         if (best) {
3046                 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
3047                 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
3048                 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
3049                 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
3050         }
3051         kvm_x86_ops->skip_emulated_instruction(vcpu);
3052         KVMTRACE_5D(CPUID, vcpu, function,
3053                     (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
3054                     (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
3055                     (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
3056                     (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
3057 }
3058 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3059
3060 /*
3061  * Check if userspace requested an interrupt window, and that the
3062  * interrupt window is open.
3063  *
3064  * No need to exit to userspace if we already have an interrupt queued.
3065  */
3066 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
3067                                           struct kvm_run *kvm_run)
3068 {
3069         return (!vcpu->arch.irq_summary &&
3070                 kvm_run->request_interrupt_window &&
3071                 vcpu->arch.interrupt_window_open &&
3072                 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
3073 }
3074
3075 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
3076                               struct kvm_run *kvm_run)
3077 {
3078         kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
3079         kvm_run->cr8 = kvm_get_cr8(vcpu);
3080         kvm_run->apic_base = kvm_get_apic_base(vcpu);
3081         if (irqchip_in_kernel(vcpu->kvm))
3082                 kvm_run->ready_for_interrupt_injection = 1;
3083         else
3084                 kvm_run->ready_for_interrupt_injection =
3085                                         (vcpu->arch.interrupt_window_open &&
3086                                          vcpu->arch.irq_summary == 0);
3087 }
3088
3089 static void vapic_enter(struct kvm_vcpu *vcpu)
3090 {
3091         struct kvm_lapic *apic = vcpu->arch.apic;
3092         struct page *page;
3093
3094         if (!apic || !apic->vapic_addr)
3095                 return;
3096
3097         page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3098
3099         vcpu->arch.apic->vapic_page = page;
3100 }
3101
3102 static void vapic_exit(struct kvm_vcpu *vcpu)
3103 {
3104         struct kvm_lapic *apic = vcpu->arch.apic;
3105
3106         if (!apic || !apic->vapic_addr)
3107                 return;
3108
3109         down_read(&vcpu->kvm->slots_lock);
3110         kvm_release_page_dirty(apic->vapic_page);
3111         mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3112         up_read(&vcpu->kvm->slots_lock);
3113 }
3114
3115 static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3116 {
3117         int r;
3118
3119         if (vcpu->requests)
3120                 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
3121                         kvm_mmu_unload(vcpu);
3122
3123         r = kvm_mmu_reload(vcpu);
3124         if (unlikely(r))
3125                 goto out;
3126
3127         if (vcpu->requests) {
3128                 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
3129                         __kvm_migrate_timers(vcpu);
3130                 if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
3131                         kvm_write_guest_time(vcpu);
3132                 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
3133                         kvm_mmu_sync_roots(vcpu);
3134                 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
3135                         kvm_x86_ops->tlb_flush(vcpu);
3136                 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
3137                                        &vcpu->requests)) {
3138                         kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
3139                         r = 0;
3140                         goto out;
3141                 }
3142                 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
3143                         kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
3144                         r = 0;
3145                         goto out;
3146                 }
3147         }
3148
3149         preempt_disable();
3150
3151         kvm_x86_ops->prepare_guest_switch(vcpu);
3152         kvm_load_guest_fpu(vcpu);
3153
3154         local_irq_disable();
3155
3156         if (vcpu->requests || need_resched() || signal_pending(current)) {
3157                 local_irq_enable();
3158                 preempt_enable();
3159                 r = 1;
3160                 goto out;
3161         }
3162
3163         vcpu->guest_mode = 1;
3164         /*
3165          * Make sure that guest_mode assignment won't happen after
3166          * testing the pending IRQ vector bitmap.
3167          */
3168         smp_wmb();
3169
3170         if (vcpu->arch.exception.pending)
3171                 __queue_exception(vcpu);
3172         else if (irqchip_in_kernel(vcpu->kvm))
3173                 kvm_x86_ops->inject_pending_irq(vcpu);
3174         else
3175                 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
3176
3177         kvm_lapic_sync_to_vapic(vcpu);
3178
3179         up_read(&vcpu->kvm->slots_lock);
3180
3181         kvm_guest_enter();
3182
3183         get_debugreg(vcpu->arch.host_dr6, 6);
3184         get_debugreg(vcpu->arch.host_dr7, 7);
3185         if (unlikely(vcpu->arch.switch_db_regs)) {
3186                 get_debugreg(vcpu->arch.host_db[0], 0);
3187                 get_debugreg(vcpu->arch.host_db[1], 1);
3188                 get_debugreg(vcpu->arch.host_db[2], 2);
3189                 get_debugreg(vcpu->arch.host_db[3], 3);
3190
3191                 set_debugreg(0, 7);
3192                 set_debugreg(vcpu->arch.eff_db[0], 0);
3193                 set_debugreg(vcpu->arch.eff_db[1], 1);
3194                 set_debugreg(vcpu->arch.eff_db[2], 2);
3195                 set_debugreg(vcpu->arch.eff_db[3], 3);
3196         }
3197
3198         KVMTRACE_0D(VMENTRY, vcpu, entryexit);
3199         kvm_x86_ops->run(vcpu, kvm_run);
3200
3201         if (unlikely(vcpu->arch.switch_db_regs)) {
3202                 set_debugreg(0, 7);
3203                 set_debugreg(vcpu->arch.host_db[0], 0);
3204                 set_debugreg(vcpu->arch.host_db[1], 1);
3205                 set_debugreg(vcpu->arch.host_db[2], 2);
3206                 set_debugreg(vcpu->arch.host_db[3], 3);
3207         }
3208         set_debugreg(vcpu->arch.host_dr6, 6);
3209         set_debugreg(vcpu->arch.host_dr7, 7);
3210
3211         vcpu->guest_mode = 0;
3212         local_irq_enable();
3213
3214         ++vcpu->stat.exits;
3215
3216         /*
3217          * We must have an instruction between local_irq_enable() and
3218          * kvm_guest_exit(), so the timer interrupt isn't delayed by
3219          * the interrupt shadow.  The stat.exits increment will do nicely.
3220          * But we need to prevent reordering, hence this barrier():
3221          */
3222         barrier();
3223
3224         kvm_guest_exit();
3225
3226         preempt_enable();
3227
3228         down_read(&vcpu->kvm->slots_lock);
3229
3230         /*
3231          * Profile KVM exit RIPs:
3232          */
3233         if (unlikely(prof_on == KVM_PROFILING)) {
3234                 unsigned long rip = kvm_rip_read(vcpu);
3235                 profile_hit(KVM_PROFILING, (void *)rip);
3236         }
3237
3238         if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
3239                 vcpu->arch.exception.pending = false;
3240
3241         kvm_lapic_sync_from_vapic(vcpu);
3242
3243         r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
3244 out:
3245         return r;
3246 }
3247
3248
3249 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3250 {
3251         int r;
3252
3253         if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
3254                 pr_debug("vcpu %d received sipi with vector # %x\n",
3255                          vcpu->vcpu_id, vcpu->arch.sipi_vector);
3256                 kvm_lapic_reset(vcpu);
3257                 r = kvm_arch_vcpu_reset(vcpu);
3258                 if (r)
3259                         return r;
3260                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3261         }
3262
3263         down_read(&vcpu->kvm->slots_lock);
3264         vapic_enter(vcpu);
3265
3266         r = 1;
3267         while (r > 0) {
3268                 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
3269                         r = vcpu_enter_guest(vcpu, kvm_run);
3270                 else {
3271                         up_read(&vcpu->kvm->slots_lock);
3272                         kvm_vcpu_block(vcpu);
3273                         down_read(&vcpu->kvm->slots_lock);
3274                         if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
3275                         {
3276                                 switch(vcpu->arch.mp_state) {
3277                                 case KVM_MP_STATE_HALTED:
3278                                         vcpu->arch.mp_state =
3279                                                 KVM_MP_STATE_RUNNABLE;
3280                                 case KVM_MP_STATE_RUNNABLE:
3281                                         break;
3282                                 case KVM_MP_STATE_SIPI_RECEIVED:
3283                                 default:
3284                                         r = -EINTR;
3285                                         break;
3286                                 }
3287                         }
3288                 }
3289
3290                 if (r <= 0)
3291                         break;
3292
3293                 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
3294                 if (kvm_cpu_has_pending_timer(vcpu))
3295                         kvm_inject_pending_timer_irqs(vcpu);
3296
3297                 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
3298                         r = -EINTR;
3299                         kvm_run->exit_reason = KVM_EXIT_INTR;
3300                         ++vcpu->stat.request_irq_exits;
3301                 }
3302                 if (signal_pending(current)) {
3303                         r = -EINTR;
3304                         kvm_run->exit_reason = KVM_EXIT_INTR;
3305                         ++vcpu->stat.signal_exits;
3306                 }
3307                 if (need_resched()) {
3308                         up_read(&vcpu->kvm->slots_lock);
3309                         kvm_resched(vcpu);
3310                         down_read(&vcpu->kvm->slots_lock);
3311                 }
3312         }
3313
3314         up_read(&vcpu->kvm->slots_lock);
3315         post_kvm_run_save(vcpu, kvm_run);
3316
3317         vapic_exit(vcpu);
3318
3319         return r;
3320 }
3321
3322 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3323 {
3324         int r;
3325         sigset_t sigsaved;
3326
3327         vcpu_load(vcpu);
3328
3329         if (vcpu->sigset_active)
3330                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
3331
3332         if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
3333                 kvm_vcpu_block(vcpu);
3334                 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
3335                 r = -EAGAIN;
3336                 goto out;
3337         }
3338
3339         /* re-sync apic's tpr */
3340         if (!irqchip_in_kernel(vcpu->kvm))
3341                 kvm_set_cr8(vcpu, kvm_run->cr8);
3342
3343         if (vcpu->arch.pio.cur_count) {
3344                 r = complete_pio(vcpu);
3345                 if (r)
3346                         goto out;
3347         }
3348 #if CONFIG_HAS_IOMEM
3349         if (vcpu->mmio_needed) {
3350                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3351                 vcpu->mmio_read_completed = 1;
3352                 vcpu->mmio_needed = 0;
3353
3354                 down_read(&vcpu->kvm->slots_lock);
3355                 r = emulate_instruction(vcpu, kvm_run,
3356                                         vcpu->arch.mmio_fault_cr2, 0,
3357                                         EMULTYPE_NO_DECODE);
3358                 up_read(&vcpu->kvm->slots_lock);
3359                 if (r == EMULATE_DO_MMIO) {
3360                         /*
3361                          * Read-modify-write.  Back to userspace.
3362                          */
3363                         r = 0;
3364                         goto out;
3365                 }
3366         }
3367 #endif
3368         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3369                 kvm_register_write(vcpu, VCPU_REGS_RAX,
3370                                      kvm_run->hypercall.ret);
3371
3372         r = __vcpu_run(vcpu, kvm_run);
3373
3374 out:
3375         if (vcpu->sigset_active)
3376                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
3377
3378         vcpu_put(vcpu);
3379         return r;
3380 }
3381
3382 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3383 {
3384         vcpu_load(vcpu);
3385
3386         regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3387         regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3388         regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3389         regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3390         regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3391         regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3392         regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3393         regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3394 #ifdef CONFIG_X86_64
3395         regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
3396         regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
3397         regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
3398         regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
3399         regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
3400         regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
3401         regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
3402         regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
3403 #endif
3404
3405         regs->rip = kvm_rip_read(vcpu);
3406         regs->rflags = kvm_x86_ops->get_rflags(vcpu);
3407
3408         /*
3409          * Don't leak debug flags in case they were set for guest debugging
3410          */
3411         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3412                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3413
3414         vcpu_put(vcpu);
3415
3416         return 0;
3417 }
3418
3419 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3420 {
3421         vcpu_load(vcpu);
3422
3423         kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
3424         kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
3425         kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
3426         kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
3427         kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
3428         kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
3429         kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
3430         kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
3431 #ifdef CONFIG_X86_64
3432         kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
3433         kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
3434         kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
3435         kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
3436         kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
3437         kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3438         kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3439         kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3440
3441 #endif
3442
3443         kvm_rip_write(vcpu, regs->rip);
3444         kvm_x86_ops->set_rflags(vcpu, regs->rflags);
3445
3446
3447         vcpu->arch.exception.pending = false;
3448
3449         vcpu_put(vcpu);
3450
3451         return 0;
3452 }
3453
3454 void kvm_get_segment(struct kvm_vcpu *vcpu,
3455                      struct kvm_segment *var, int seg)
3456 {
3457         kvm_x86_ops->get_segment(vcpu, var, seg);
3458 }
3459
3460 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3461 {
3462         struct kvm_segment cs;
3463
3464         kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
3465         *db = cs.db;
3466         *l = cs.l;
3467 }
3468 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
3469
3470 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3471                                   struct kvm_sregs *sregs)
3472 {
3473         struct descriptor_table dt;
3474         int pending_vec;
3475
3476         vcpu_load(vcpu);
3477
3478         kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3479         kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3480         kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3481         kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3482         kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3483         kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3484
3485         kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3486         kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3487
3488         kvm_x86_ops->get_idt(vcpu, &dt);
3489         sregs->idt.limit = dt.limit;
3490         sregs->idt.base = dt.base;
3491         kvm_x86_ops->get_gdt(vcpu, &dt);
3492         sregs->gdt.limit = dt.limit;
3493         sregs->gdt.base = dt.base;
3494
3495         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3496         sregs->cr0 = vcpu->arch.cr0;
3497         sregs->cr2 = vcpu->arch.cr2;
3498         sregs->cr3 = vcpu->arch.cr3;
3499         sregs->cr4 = vcpu->arch.cr4;
3500         sregs->cr8 = kvm_get_cr8(vcpu);
3501         sregs->efer = vcpu->arch.shadow_efer;
3502         sregs->apic_base = kvm_get_apic_base(vcpu);
3503
3504         if (irqchip_in_kernel(vcpu->kvm)) {
3505                 memset(sregs->interrupt_bitmap, 0,
3506                        sizeof sregs->interrupt_bitmap);
3507                 pending_vec = kvm_x86_ops->get_irq(vcpu);
3508                 if (pending_vec >= 0)
3509                         set_bit(pending_vec,
3510                                 (unsigned long *)sregs->interrupt_bitmap);
3511         } else
3512                 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
3513                        sizeof sregs->interrupt_bitmap);
3514
3515         vcpu_put(vcpu);
3516
3517         return 0;
3518 }
3519
3520 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
3521                                     struct kvm_mp_state *mp_state)
3522 {
3523         vcpu_load(vcpu);
3524         mp_state->mp_state = vcpu->arch.mp_state;
3525         vcpu_put(vcpu);
3526         return 0;
3527 }
3528
3529 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
3530                                     struct kvm_mp_state *mp_state)
3531 {
3532         vcpu_load(vcpu);
3533         vcpu->arch.mp_state = mp_state->mp_state;
3534         vcpu_put(vcpu);
3535         return 0;
3536 }
3537
3538 static void kvm_set_segment(struct kvm_vcpu *vcpu,
3539                         struct kvm_segment *var, int seg)
3540 {
3541         kvm_x86_ops->set_segment(vcpu, var, seg);
3542 }
3543
3544 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3545                                    struct kvm_segment *kvm_desct)
3546 {
3547         kvm_desct->base = seg_desc->base0;
3548         kvm_desct->base |= seg_desc->base1 << 16;
3549         kvm_desct->base |= seg_desc->base2 << 24;
3550         kvm_desct->limit = seg_desc->limit0;
3551         kvm_desct->limit |= seg_desc->limit << 16;
3552         if (seg_desc->g) {
3553                 kvm_desct->limit <<= 12;
3554                 kvm_desct->limit |= 0xfff;
3555         }
3556         kvm_desct->selector = selector;
3557         kvm_desct->type = seg_desc->type;
3558         kvm_desct->present = seg_desc->p;
3559         kvm_desct->dpl = seg_desc->dpl;
3560         kvm_desct->db = seg_desc->d;
3561         kvm_desct->s = seg_desc->s;
3562         kvm_desct->l = seg_desc->l;
3563         kvm_desct->g = seg_desc->g;
3564         kvm_desct->avl = seg_desc->avl;
3565         if (!selector)
3566                 kvm_desct->unusable = 1;
3567         else
3568                 kvm_desct->unusable = 0;
3569         kvm_desct->padding = 0;
3570 }
3571
3572 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
3573                                           u16 selector,
3574                                           struct descriptor_table *dtable)
3575 {
3576         if (selector & 1 << 2) {
3577                 struct kvm_segment kvm_seg;
3578
3579                 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
3580
3581                 if (kvm_seg.unusable)
3582                         dtable->limit = 0;
3583                 else
3584                         dtable->limit = kvm_seg.limit;
3585                 dtable->base = kvm_seg.base;
3586         }
3587         else
3588                 kvm_x86_ops->get_gdt(vcpu, dtable);
3589 }
3590
3591 /* allowed just for 8 bytes segments */
3592 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3593                                          struct desc_struct *seg_desc)
3594 {
3595         gpa_t gpa;
3596         struct descriptor_table dtable;
3597         u16 index = selector >> 3;
3598
3599         get_segment_descriptor_dtable(vcpu, selector, &dtable);
3600
3601         if (dtable.limit < index * 8 + 7) {
3602                 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
3603                 return 1;
3604         }
3605         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3606         gpa += index * 8;
3607         return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
3608 }
3609
3610 /* allowed just for 8 bytes segments */
3611 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3612                                          struct desc_struct *seg_desc)
3613 {
3614         gpa_t gpa;
3615         struct descriptor_table dtable;
3616         u16 index = selector >> 3;
3617
3618         get_segment_descriptor_dtable(vcpu, selector, &dtable);
3619
3620         if (dtable.limit < index * 8 + 7)
3621                 return 1;
3622         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3623         gpa += index * 8;
3624         return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
3625 }
3626
3627 static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
3628                              struct desc_struct *seg_desc)
3629 {
3630         u32 base_addr;
3631
3632         base_addr = seg_desc->base0;
3633         base_addr |= (seg_desc->base1 << 16);
3634         base_addr |= (seg_desc->base2 << 24);
3635
3636         return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
3637 }
3638
3639 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
3640 {
3641         struct kvm_segment kvm_seg;
3642
3643         kvm_get_segment(vcpu, &kvm_seg, seg);
3644         return kvm_seg.selector;
3645 }
3646
3647 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
3648                                                 u16 selector,
3649                                                 struct kvm_segment *kvm_seg)
3650 {
3651         struct desc_struct seg_desc;
3652
3653         if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
3654                 return 1;
3655         seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
3656         return 0;
3657 }
3658
3659 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
3660 {
3661         struct kvm_segment segvar = {
3662                 .base = selector << 4,
3663                 .limit = 0xffff,
3664                 .selector = selector,
3665                 .type = 3,
3666                 .present = 1,
3667                 .dpl = 3,
3668                 .db = 0,
3669                 .s = 1,
3670                 .l = 0,
3671                 .g = 0,
3672                 .avl = 0,
3673                 .unusable = 0,
3674         };
3675         kvm_x86_ops->set_segment(vcpu, &segvar, seg);
3676         return 0;
3677 }
3678
3679 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3680                                 int type_bits, int seg)
3681 {
3682         struct kvm_segment kvm_seg;
3683
3684         if (!(vcpu->arch.cr0 & X86_CR0_PE))
3685                 return kvm_load_realmode_segment(vcpu, selector, seg);
3686         if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
3687                 return 1;
3688         kvm_seg.type |= type_bits;
3689
3690         if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
3691             seg != VCPU_SREG_LDTR)
3692                 if (!kvm_seg.s)
3693                         kvm_seg.unusable = 1;
3694
3695         kvm_set_segment(vcpu, &kvm_seg, seg);
3696         return 0;
3697 }
3698
3699 static void save_state_to_tss32(struct kvm_vcpu *vcpu,
3700                                 struct tss_segment_32 *tss)
3701 {
3702         tss->cr3 = vcpu->arch.cr3;
3703         tss->eip = kvm_rip_read(vcpu);
3704         tss->eflags = kvm_x86_ops->get_rflags(vcpu);
3705         tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3706         tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3707         tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3708         tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3709         tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3710         tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3711         tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3712         tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3713         tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3714         tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3715         tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
3716         tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
3717         tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
3718         tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
3719         tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3720 }
3721
3722 static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3723                                   struct tss_segment_32 *tss)
3724 {
3725         kvm_set_cr3(vcpu, tss->cr3);
3726
3727         kvm_rip_write(vcpu, tss->eip);
3728         kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
3729
3730         kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
3731         kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
3732         kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
3733         kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
3734         kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
3735         kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
3736         kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
3737         kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
3738
3739         if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
3740                 return 1;
3741
3742         if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3743                 return 1;
3744
3745         if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3746                 return 1;
3747
3748         if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3749                 return 1;
3750
3751         if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3752                 return 1;
3753
3754         if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
3755                 return 1;
3756
3757         if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
3758                 return 1;
3759         return 0;
3760 }
3761
3762 static void save_state_to_tss16(struct kvm_vcpu *vcpu,
3763                                 struct tss_segment_16 *tss)
3764 {
3765         tss->ip = kvm_rip_read(vcpu);
3766         tss->flag = kvm_x86_ops->get_rflags(vcpu);
3767         tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3768         tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3769         tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3770         tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3771         tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3772         tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3773         tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
3774         tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
3775
3776         tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3777         tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3778         tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
3779         tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
3780         tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3781         tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
3782 }
3783
3784 static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3785                                  struct tss_segment_16 *tss)
3786 {
3787         kvm_rip_write(vcpu, tss->ip);
3788         kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
3789         kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
3790         kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
3791         kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
3792         kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
3793         kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
3794         kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
3795         kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
3796         kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
3797
3798         if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
3799                 return 1;
3800
3801         if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3802                 return 1;
3803
3804         if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3805                 return 1;
3806
3807         if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3808                 return 1;
3809
3810         if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3811                 return 1;
3812         return 0;
3813 }
3814
3815 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3816                               u16 old_tss_sel, u32 old_tss_base,
3817                               struct desc_struct *nseg_desc)
3818 {
3819         struct tss_segment_16 tss_segment_16;
3820         int ret = 0;
3821
3822         if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3823                            sizeof tss_segment_16))
3824                 goto out;
3825
3826         save_state_to_tss16(vcpu, &tss_segment_16);
3827
3828         if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3829                             sizeof tss_segment_16))
3830                 goto out;
3831
3832         if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3833                            &tss_segment_16, sizeof tss_segment_16))
3834                 goto out;
3835
3836         if (old_tss_sel != 0xffff) {
3837                 tss_segment_16.prev_task_link = old_tss_sel;
3838
3839                 if (kvm_write_guest(vcpu->kvm,
3840                                     get_tss_base_addr(vcpu, nseg_desc),
3841                                     &tss_segment_16.prev_task_link,
3842                                     sizeof tss_segment_16.prev_task_link))
3843                         goto out;
3844         }
3845
3846         if (load_state_from_tss16(vcpu, &tss_segment_16))
3847                 goto out;
3848
3849         ret = 1;
3850 out:
3851         return ret;
3852 }
3853
3854 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3855                        u16 old_tss_sel, u32 old_tss_base,
3856                        struct desc_struct *nseg_desc)
3857 {
3858         struct tss_segment_32 tss_segment_32;
3859         int ret = 0;
3860
3861         if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3862                            sizeof tss_segment_32))
3863                 goto out;
3864
3865         save_state_to_tss32(vcpu, &tss_segment_32);
3866
3867         if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3868                             sizeof tss_segment_32))
3869                 goto out;
3870
3871         if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3872                            &tss_segment_32, sizeof tss_segment_32))
3873                 goto out;
3874
3875         if (old_tss_sel != 0xffff) {
3876                 tss_segment_32.prev_task_link = old_tss_sel;
3877
3878                 if (kvm_write_guest(vcpu->kvm,
3879                                     get_tss_base_addr(vcpu, nseg_desc),
3880                                     &tss_segment_32.prev_task_link,
3881                                     sizeof tss_segment_32.prev_task_link))
3882                         goto out;
3883         }
3884
3885         if (load_state_from_tss32(vcpu, &tss_segment_32))
3886                 goto out;
3887
3888         ret = 1;
3889 out:
3890         return ret;
3891 }
3892
3893 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3894 {
3895         struct kvm_segment tr_seg;
3896         struct desc_struct cseg_desc;
3897         struct desc_struct nseg_desc;
3898         int ret = 0;
3899         u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
3900         u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
3901
3902         old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
3903
3904         /* FIXME: Handle errors. Failure to read either TSS or their
3905          * descriptors should generate a pagefault.
3906          */
3907         if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
3908                 goto out;
3909
3910         if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
3911                 goto out;
3912
3913         if (reason != TASK_SWITCH_IRET) {
3914                 int cpl;
3915
3916                 cpl = kvm_x86_ops->get_cpl(vcpu);
3917                 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
3918                         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
3919                         return 1;
3920                 }
3921         }
3922
3923         if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) {
3924                 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
3925                 return 1;
3926         }
3927
3928         if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
3929                 cseg_desc.type &= ~(1 << 1); //clear the B flag
3930                 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
3931         }
3932
3933         if (reason == TASK_SWITCH_IRET) {
3934                 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
3935                 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
3936         }
3937
3938         kvm_x86_ops->skip_emulated_instruction(vcpu);
3939
3940         /* set back link to prev task only if NT bit is set in eflags
3941            note that old_tss_sel is not used afetr this point */
3942         if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
3943                 old_tss_sel = 0xffff;
3944
3945         if (nseg_desc.type & 8)
3946                 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
3947                                          old_tss_base, &nseg_desc);
3948         else
3949                 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
3950                                          old_tss_base, &nseg_desc);
3951
3952         if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
3953                 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
3954                 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT);
3955         }
3956
3957         if (reason != TASK_SWITCH_IRET) {
3958                 nseg_desc.type |= (1 << 1);
3959                 save_guest_segment_descriptor(vcpu, tss_selector,
3960                                               &nseg_desc);
3961         }
3962
3963         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
3964         seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
3965         tr_seg.type = 11;
3966         kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
3967 out:
3968         return ret;
3969 }
3970 EXPORT_SYMBOL_GPL(kvm_task_switch);
3971
3972 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3973                                   struct kvm_sregs *sregs)
3974 {
3975         int mmu_reset_needed = 0;
3976         int i, pending_vec, max_bits;
3977         struct descriptor_table dt;
3978
3979         vcpu_load(vcpu);
3980
3981         dt.limit = sregs->idt.limit;
3982         dt.base = sregs->idt.base;
3983         kvm_x86_ops->set_idt(vcpu, &dt);
3984         dt.limit = sregs->gdt.limit;
3985         dt.base = sregs->gdt.base;
3986         kvm_x86_ops->set_gdt(vcpu, &dt);
3987
3988         vcpu->arch.cr2 = sregs->cr2;
3989         mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
3990         vcpu->arch.cr3 = sregs->cr3;
3991
3992         kvm_set_cr8(vcpu, sregs->cr8);
3993
3994         mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
3995         kvm_x86_ops->set_efer(vcpu, sregs->efer);
3996         kvm_set_apic_base(vcpu, sregs->apic_base);
3997
3998         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3999
4000         mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
4001         kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
4002         vcpu->arch.cr0 = sregs->cr0;
4003
4004         mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
4005         kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4006         if (!is_long_mode(vcpu) && is_pae(vcpu))
4007                 load_pdptrs(vcpu, vcpu->arch.cr3);
4008
4009         if (mmu_reset_needed)
4010                 kvm_mmu_reset_context(vcpu);
4011
4012         if (!irqchip_in_kernel(vcpu->kvm)) {
4013                 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
4014                        sizeof vcpu->arch.irq_pending);
4015                 vcpu->arch.irq_summary = 0;
4016                 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
4017                         if (vcpu->arch.irq_pending[i])
4018                                 __set_bit(i, &vcpu->arch.irq_summary);
4019         } else {
4020                 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
4021                 pending_vec = find_first_bit(
4022                         (const unsigned long *)sregs->interrupt_bitmap,
4023                         max_bits);
4024                 /* Only pending external irq is handled here */
4025                 if (pending_vec < max_bits) {
4026                         kvm_x86_ops->set_irq(vcpu, pending_vec);
4027                         pr_debug("Set back pending irq %d\n",
4028                                  pending_vec);
4029                 }
4030                 kvm_pic_clear_isr_ack(vcpu->kvm);
4031         }
4032
4033         kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
4034         kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
4035         kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
4036         kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
4037         kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
4038         kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
4039
4040         kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
4041         kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
4042
4043         /* Older userspace won't unhalt the vcpu on reset. */
4044         if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
4045             sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
4046             !(vcpu->arch.cr0 & X86_CR0_PE))
4047                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4048
4049         vcpu_put(vcpu);
4050
4051         return 0;
4052 }
4053
4054 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4055                                         struct kvm_guest_debug *dbg)
4056 {
4057         int i, r;
4058
4059         vcpu_load(vcpu);
4060
4061         if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) ==
4062             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) {
4063                 for (i = 0; i < KVM_NR_DB_REGS; ++i)
4064                         vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
4065                 vcpu->arch.switch_db_regs =
4066                         (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
4067         } else {
4068                 for (i = 0; i < KVM_NR_DB_REGS; i++)
4069                         vcpu->arch.eff_db[i] = vcpu->arch.db[i];
4070                 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
4071         }
4072
4073         r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
4074
4075         if (dbg->control & KVM_GUESTDBG_INJECT_DB)
4076                 kvm_queue_exception(vcpu, DB_VECTOR);
4077         else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
4078                 kvm_queue_exception(vcpu, BP_VECTOR);
4079
4080         vcpu_put(vcpu);
4081
4082         return r;
4083 }
4084
4085 /*
4086  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
4087  * we have asm/x86/processor.h
4088  */
4089 struct fxsave {
4090         u16     cwd;
4091         u16     swd;
4092         u16     twd;
4093         u16     fop;
4094         u64     rip;
4095         u64     rdp;
4096         u32     mxcsr;
4097         u32     mxcsr_mask;
4098         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
4099 #ifdef CONFIG_X86_64
4100         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
4101 #else
4102         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
4103 #endif
4104 };
4105
4106 /*
4107  * Translate a guest virtual address to a guest physical address.
4108  */
4109 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
4110                                     struct kvm_translation *tr)
4111 {
4112         unsigned long vaddr = tr->linear_address;
4113         gpa_t gpa;
4114
4115         vcpu_load(vcpu);
4116         down_read(&vcpu->kvm->slots_lock);
4117         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
4118         up_read(&vcpu->kvm->slots_lock);
4119         tr->physical_address = gpa;
4120         tr->valid = gpa != UNMAPPED_GVA;
4121         tr->writeable = 1;
4122         tr->usermode = 0;
4123         vcpu_put(vcpu);
4124
4125         return 0;
4126 }
4127
4128 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
4129 {
4130         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
4131
4132         vcpu_load(vcpu);
4133
4134         memcpy(fpu->fpr, fxsave->st_space, 128);
4135         fpu->fcw = fxsave->cwd;
4136         fpu->fsw = fxsave->swd;
4137         fpu->ftwx = fxsave->twd;
4138         fpu->last_opcode = fxsave->fop;
4139         fpu->last_ip = fxsave->rip;
4140         fpu->last_dp = fxsave->rdp;
4141         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
4142
4143         vcpu_put(vcpu);
4144
4145         return 0;
4146 }
4147
4148 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
4149 {
4150         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
4151
4152         vcpu_load(vcpu);
4153
4154         memcpy(fxsave->st_space, fpu->fpr, 128);
4155         fxsave->cwd = fpu->fcw;
4156         fxsave->swd = fpu->fsw;
4157         fxsave->twd = fpu->ftwx;
4158         fxsave->fop = fpu->last_opcode;
4159         fxsave->rip = fpu->last_ip;
4160         fxsave->rdp = fpu->last_dp;
4161         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
4162
4163         vcpu_put(vcpu);
4164
4165         return 0;
4166 }
4167
4168 void fx_init(struct kvm_vcpu *vcpu)
4169 {
4170         unsigned after_mxcsr_mask;
4171
4172         /*
4173          * Touch the fpu the first time in non atomic context as if
4174          * this is the first fpu instruction the exception handler
4175          * will fire before the instruction returns and it'll have to
4176          * allocate ram with GFP_KERNEL.
4177          */
4178         if (!used_math())
4179                 kvm_fx_save(&vcpu->arch.host_fx_image);
4180
4181         /* Initialize guest FPU by resetting ours and saving into guest's */
4182         preempt_disable();
4183         kvm_fx_save(&vcpu->arch.host_fx_image);
4184         kvm_fx_finit();
4185         kvm_fx_save(&vcpu->arch.guest_fx_image);
4186         kvm_fx_restore(&vcpu->arch.host_fx_image);
4187         preempt_enable();
4188
4189         vcpu->arch.cr0 |= X86_CR0_ET;
4190         after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
4191         vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
4192         memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
4193                0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
4194 }
4195 EXPORT_SYMBOL_GPL(fx_init);
4196
4197 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
4198 {
4199         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
4200                 return;
4201
4202         vcpu->guest_fpu_loaded = 1;
4203         kvm_fx_save(&vcpu->arch.host_fx_image);
4204         kvm_fx_restore(&vcpu->arch.guest_fx_image);
4205 }
4206 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
4207
4208 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4209 {
4210         if (!vcpu->guest_fpu_loaded)
4211                 return;
4212
4213         vcpu->guest_fpu_loaded = 0;
4214         kvm_fx_save(&vcpu->arch.guest_fx_image);
4215         kvm_fx_restore(&vcpu->arch.host_fx_image);
4216         ++vcpu->stat.fpu_reload;
4217 }
4218 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
4219
4220 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
4221 {
4222         if (vcpu->arch.time_page) {
4223                 kvm_release_page_dirty(vcpu->arch.time_page);
4224                 vcpu->arch.time_page = NULL;
4225         }
4226
4227         kvm_x86_ops->vcpu_free(vcpu);
4228 }
4229
4230 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
4231                                                 unsigned int id)
4232 {
4233         return kvm_x86_ops->vcpu_create(kvm, id);
4234 }
4235
4236 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
4237 {
4238         int r;
4239
4240         /* We do fxsave: this must be aligned. */
4241         BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
4242
4243         vcpu->arch.mtrr_state.have_fixed = 1;
4244         vcpu_load(vcpu);
4245         r = kvm_arch_vcpu_reset(vcpu);
4246         if (r == 0)
4247                 r = kvm_mmu_setup(vcpu);
4248         vcpu_put(vcpu);
4249         if (r < 0)
4250                 goto free_vcpu;
4251
4252         return 0;
4253 free_vcpu:
4254         kvm_x86_ops->vcpu_free(vcpu);
4255         return r;
4256 }
4257
4258 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
4259 {
4260         vcpu_load(vcpu);
4261         kvm_mmu_unload(vcpu);
4262         vcpu_put(vcpu);
4263
4264         kvm_x86_ops->vcpu_free(vcpu);
4265 }
4266
4267 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
4268 {
4269         vcpu->arch.nmi_pending = false;
4270         vcpu->arch.nmi_injected = false;
4271
4272         vcpu->arch.switch_db_regs = 0;
4273         memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
4274         vcpu->arch.dr6 = DR6_FIXED_1;
4275         vcpu->arch.dr7 = DR7_FIXED_1;
4276
4277         return kvm_x86_ops->vcpu_reset(vcpu);
4278 }
4279
4280 void kvm_arch_hardware_enable(void *garbage)
4281 {
4282         kvm_x86_ops->hardware_enable(garbage);
4283 }
4284
4285 void kvm_arch_hardware_disable(void *garbage)
4286 {
4287         kvm_x86_ops->hardware_disable(garbage);
4288 }
4289
4290 int kvm_arch_hardware_setup(void)
4291 {
4292         return kvm_x86_ops->hardware_setup();
4293 }
4294
4295 void kvm_arch_hardware_unsetup(void)
4296 {
4297         kvm_x86_ops->hardware_unsetup();
4298 }
4299
4300 void kvm_arch_check_processor_compat(void *rtn)
4301 {
4302         kvm_x86_ops->check_processor_compatibility(rtn);
4303 }
4304
4305 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
4306 {
4307         struct page *page;
4308         struct kvm *kvm;
4309         int r;
4310
4311         BUG_ON(vcpu->kvm == NULL);
4312         kvm = vcpu->kvm;
4313
4314         vcpu->arch.mmu.root_hpa = INVALID_PAGE;
4315         if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
4316                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4317         else
4318                 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
4319
4320         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
4321         if (!page) {
4322                 r = -ENOMEM;
4323                 goto fail;
4324         }
4325         vcpu->arch.pio_data = page_address(page);
4326
4327         r = kvm_mmu_create(vcpu);
4328         if (r < 0)
4329                 goto fail_free_pio_data;
4330
4331         if (irqchip_in_kernel(kvm)) {
4332                 r = kvm_create_lapic(vcpu);
4333                 if (r < 0)
4334                         goto fail_mmu_destroy;
4335         }
4336
4337         return 0;
4338
4339 fail_mmu_destroy:
4340         kvm_mmu_destroy(vcpu);
4341 fail_free_pio_data:
4342         free_page((unsigned long)vcpu->arch.pio_data);
4343 fail:
4344         return r;
4345 }
4346
4347 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
4348 {
4349         kvm_free_lapic(vcpu);
4350         down_read(&vcpu->kvm->slots_lock);
4351         kvm_mmu_destroy(vcpu);
4352         up_read(&vcpu->kvm->slots_lock);
4353         free_page((unsigned long)vcpu->arch.pio_data);
4354 }
4355
4356 struct  kvm *kvm_arch_create_vm(void)
4357 {
4358         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
4359
4360         if (!kvm)
4361                 return ERR_PTR(-ENOMEM);
4362
4363         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4364         INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
4365         INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
4366
4367         /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
4368         set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
4369
4370         rdtscll(kvm->arch.vm_init_tsc);
4371
4372         return kvm;
4373 }
4374
/* Unload @vcpu's MMU; the unload is done with the vcpu loaded. */
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
        vcpu_load(vcpu);
        kvm_mmu_unload(vcpu);
        vcpu_put(vcpu);
}
4381
4382 static void kvm_free_vcpus(struct kvm *kvm)
4383 {
4384         unsigned int i;
4385
4386         /*
4387          * Unpin any mmu pages first.
4388          */
4389         for (i = 0; i < KVM_MAX_VCPUS; ++i)
4390                 if (kvm->vcpus[i])
4391                         kvm_unload_vcpu_mmu(kvm->vcpus[i]);
4392         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
4393                 if (kvm->vcpus[i]) {
4394                         kvm_arch_vcpu_free(kvm->vcpus[i]);
4395                         kvm->vcpus[i] = NULL;
4396                 }
4397         }
4398
4399 }
4400
/* x86 hook for syncing events: frees all devices assigned to @kvm. */
void kvm_arch_sync_events(struct kvm *kvm)
{
        kvm_free_all_assigned_devices(kvm);
}
4405
4406 void kvm_arch_destroy_vm(struct kvm *kvm)
4407 {
4408         kvm_iommu_unmap_guest(kvm);
4409         kvm_free_pit(kvm);
4410         kfree(kvm->arch.vpic);
4411         kfree(kvm->arch.vioapic);
4412         kvm_free_vcpus(kvm);
4413         kvm_free_physmem(kvm);
4414         if (kvm->arch.apic_access_page)
4415                 put_page(kvm->arch.apic_access_page);
4416         if (kvm->arch.ept_identity_pagetable)
4417                 put_page(kvm->arch.ept_identity_pagetable);
4418         kfree(kvm);
4419 }
4420
4421 int kvm_arch_set_memory_region(struct kvm *kvm,
4422                                 struct kvm_userspace_memory_region *mem,
4423                                 struct kvm_memory_slot old,
4424                                 int user_alloc)
4425 {
4426         int npages = mem->memory_size >> PAGE_SHIFT;
4427         struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
4428
4429         /*To keep backward compatibility with older userspace,
4430          *x86 needs to hanlde !user_alloc case.
4431          */
4432         if (!user_alloc) {
4433                 if (npages && !old.rmap) {
4434                         unsigned long userspace_addr;
4435
4436                         down_write(&current->mm->mmap_sem);
4437                         userspace_addr = do_mmap(NULL, 0,
4438                                                  npages * PAGE_SIZE,
4439                                                  PROT_READ | PROT_WRITE,
4440                                                  MAP_PRIVATE | MAP_ANONYMOUS,
4441                                                  0);
4442                         up_write(&current->mm->mmap_sem);
4443
4444                         if (IS_ERR((void *)userspace_addr))
4445                                 return PTR_ERR((void *)userspace_addr);
4446
4447                         /* set userspace_addr atomically for kvm_hva_to_rmapp */
4448                         spin_lock(&kvm->mmu_lock);
4449                         memslot->userspace_addr = userspace_addr;
4450                         spin_unlock(&kvm->mmu_lock);
4451                 } else {
4452                         if (!old.user_alloc && old.rmap) {
4453                                 int ret;
4454
4455                                 down_write(&current->mm->mmap_sem);
4456                                 ret = do_munmap(current->mm, old.userspace_addr,
4457                                                 old.npages * PAGE_SIZE);
4458                                 up_write(&current->mm->mmap_sem);
4459                                 if (ret < 0)
4460                                         printk(KERN_WARNING
4461                                        "kvm_vm_ioctl_set_memory_region: "
4462                                        "failed to munmap memory\n");
4463                         }
4464                 }
4465         }
4466
4467         if (!kvm->arch.n_requested_mmu_pages) {
4468                 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
4469                 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
4470         }
4471
4472         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
4473         kvm_flush_remote_tlbs(kvm);
4474
4475         return 0;
4476 }
4477
/* x86 hook for flushing shadow state: zaps all of @kvm's MMU pages. */
void kvm_arch_flush_shadow(struct kvm *kvm)
{
        kvm_mmu_zap_all(kvm);
}
4482
4483 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4484 {
4485         return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
4486                || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
4487                || vcpu->arch.nmi_pending;
4488 }
4489
/*
 * Cross-CPU callback used by kvm_vcpu_kick().  It does no work of its own;
 * with DEBUG defined it only logs the vcpu pointer it was given.
 */
static void vcpu_kick_intr(void *info)
{
#ifdef DEBUG
        struct kvm_vcpu *vcpu = info;

        printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
#endif
}
4497
4498 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
4499 {
4500         int ipi_pcpu = vcpu->cpu;
4501         int cpu = get_cpu();
4502
4503         if (waitqueue_active(&vcpu->wq)) {
4504                 wake_up_interruptible(&vcpu->wq);
4505                 ++vcpu->stat.halt_wakeup;
4506         }
4507         /*
4508          * We may be called synchronously with irqs disabled in guest mode,
4509          * So need not to call smp_call_function_single() in that case.
4510          */
4511         if (vcpu->guest_mode && vcpu->cpu != cpu)
4512                 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
4513         put_cpu();
4514 }
4515
4516 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
4517 {
4518         return kvm_x86_ops->interrupt_allowed(vcpu);
4519 }