arch/x86/kvm/x86.c

   1 /*
   2  * Kernel-based Virtual Machine driver for Linux
   3  *
   4  * derived from drivers/kvm/kvm_main.c
   5  *
   6  * Copyright (C) 2006 Qumranet, Inc.
   7  *
   8  * Authors:
   9  *   Avi Kivity   <avi@qumranet.com>
  10  *   Yaniv Kamay  <yaniv@qumranet.com>
  11  *
  12  * This work is licensed under the terms of the GNU GPL, version 2.  See
  13  * the COPYING file in the top-level directory.
  14  *
  15  */
  16
  17 #include <linux/kvm_host.h>
  18 #include "irq.h"
  19 #include "mmu.h"
  20
  21 #include <linux/clocksource.h>
  22 #include <linux/kvm.h>
  23 #include <linux/fs.h>
  24 #include <linux/vmalloc.h>
  25 #include <linux/module.h>
  26 #include <linux/mman.h>
  27 #include <linux/highmem.h>
  28
  29 #include <asm/uaccess.h>
  30 #include <asm/msr.h>
  31 #include <asm/desc.h>
  32
  33 #define MAX_IO_MSRS 256
  34 #define CR0_RESERVED_BITS                                               \
  35         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
  36                           | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
  37                           | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
  38 #define CR4_RESERVED_BITS                                               \
  39         (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
  40                           | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
  41                           | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
  42                           | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
  43
  44 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
  45 /* EFER defaults:
  46  * - enable syscall per default because its emulated by KVM
  47  * - enable LME and LMA per default on 64 bit KVM
  48  */
  49 #ifdef CONFIG_X86_64
  50 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
  51 #else
  52 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
  53 #endif
  54
  55 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
  56 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
  57
  58 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
  59                                     struct kvm_cpuid_entry2 __user *entries);
  60
  61 struct kvm_x86_ops *kvm_x86_ops;
  62
  63 struct kvm_stats_debugfs_item debugfs_entries[] = {
  64         { "pf_fixed", VCPU_STAT(pf_fixed) },
  65         { "pf_guest", VCPU_STAT(pf_guest) },
  66         { "tlb_flush", VCPU_STAT(tlb_flush) },
  67         { "invlpg", VCPU_STAT(invlpg) },
  68         { "exits", VCPU_STAT(exits) },
  69         { "io_exits", VCPU_STAT(io_exits) },
  70         { "mmio_exits", VCPU_STAT(mmio_exits) },
  71         { "signal_exits", VCPU_STAT(signal_exits) },
  72         { "irq_window", VCPU_STAT(irq_window_exits) },
  73         { "halt_exits", VCPU_STAT(halt_exits) },
  74         { "halt_wakeup", VCPU_STAT(halt_wakeup) },
  75         { "hypercalls", VCPU_STAT(hypercalls) },
  76         { "request_irq", VCPU_STAT(request_irq_exits) },
  77         { "irq_exits", VCPU_STAT(irq_exits) },
  78         { "host_state_reload", VCPU_STAT(host_state_reload) },
  79         { "efer_reload", VCPU_STAT(efer_reload) },
  80         { "fpu_reload", VCPU_STAT(fpu_reload) },
  81         { "insn_emulation", VCPU_STAT(insn_emulation) },
  82         { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
  83         { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
  84         { "mmu_pte_write", VM_STAT(mmu_pte_write) },
  85         { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
  86         { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
  87         { "mmu_flooded", VM_STAT(mmu_flooded) },
  88         { "mmu_recycled", VM_STAT(mmu_recycled) },
  89         { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
  90         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
  91         { "largepages", VM_STAT(lpages) },
  92         { NULL }
  93 };
  94
  95
  96 unsigned long segment_base(u16 selector)
  97 {
  98         struct descriptor_table gdt;
  99         struct desc_struct *d;
 100         unsigned long table_base;
 101         unsigned long v;
 102
 103         if (selector == 0)
 104                 return 0;
 105
 106         asm("sgdt %0" : "=m"(gdt));
 107         table_base = gdt.base;
 108
 109         if (selector & 4) {           /* from ldt */
 110                 u16 ldt_selector;
 111
 112                 asm("sldt %0" : "=g"(ldt_selector));
 113                 table_base = segment_base(ldt_selector);
 114         }
 115         d = (struct desc_struct *)(table_base + (selector & ~7));
 116         v = d->base0 | ((unsigned long)d->base1 << 16) |
 117                 ((unsigned long)d->base2 << 24);
 118 #ifdef CONFIG_X86_64
 119         if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
 120                 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
 121 #endif
 122         return v;
 123 }
 124 EXPORT_SYMBOL_GPL(segment_base);
 125
 126 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 127 {
 128         if (irqchip_in_kernel(vcpu->kvm))
 129                 return vcpu->arch.apic_base;
 130         else
 131                 return vcpu->arch.apic_base;
 132 }
 133 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 134
 135 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
 136 {
 137         /* TODO: reserve bits check */
 138         if (irqchip_in_kernel(vcpu->kvm))
 139                 kvm_lapic_set_base(vcpu, data);
 140         else
 141                 vcpu->arch.apic_base = data;
 142 }
 143 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 144
 145 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 146 {
 147         WARN_ON(vcpu->arch.exception.pending);
 148         vcpu->arch.exception.pending = true;
 149         vcpu->arch.exception.has_error_code = false;
 150         vcpu->arch.exception.nr = nr;
 151 }
 152 EXPORT_SYMBOL_GPL(kvm_queue_exception);
 153
 154 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
 155                            u32 error_code)
 156 {
 157         ++vcpu->stat.pf_guest;
 158         if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
 159                 printk(KERN_DEBUG "kvm: inject_page_fault:"
 160                        " double fault 0x%lx\n", addr);
 161                 vcpu->arch.exception.nr = DF_VECTOR;
 162                 vcpu->arch.exception.error_code = 0;
 163                 return;
 164         }
 165         vcpu->arch.cr2 = addr;
 166         kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
 167 }
 168
 169 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 170 {
 171         WARN_ON(vcpu->arch.exception.pending);
 172         vcpu->arch.exception.pending = true;
 173         vcpu->arch.exception.has_error_code = true;
 174         vcpu->arch.exception.nr = nr;
 175         vcpu->arch.exception.error_code = error_code;
 176 }
 177 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 178
 179 static void __queue_exception(struct kvm_vcpu *vcpu)
 180 {
 181         kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
 182                                      vcpu->arch.exception.has_error_code,
 183                                      vcpu->arch.exception.error_code);
 184 }
 185
 186 /*
 187  * Load the pae pdptrs.  Return true is they are all valid.
 188  */
 189 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 190 {
 191         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 192         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
 193         int i;
 194         int ret;
 195         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 196
 197         down_read(&vcpu->kvm->slots_lock);
 198         ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
 199                                   offset * sizeof(u64), sizeof(pdpte));
 200         if (ret < 0) {
 201                 ret = 0;
 202                 goto out;
 203         }
 204         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
 205                 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
 206                         ret = 0;
 207                         goto out;
 208                 }
 209         }
 210         ret = 1;
 211
 212         memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 213 out:
 214         up_read(&vcpu->kvm->slots_lock);
 215
 216         return ret;
 217 }
 218 EXPORT_SYMBOL_GPL(load_pdptrs);
 219
 220 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 221 {
 222         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 223         bool changed = true;
 224         int r;
 225
 226         if (is_long_mode(vcpu) || !is_pae(vcpu))
 227                 return false;
 228
 229         down_read(&vcpu->kvm->slots_lock);
 230         r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
 231         if (r < 0)
 232                 goto out;
 233         changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
 234 out:
 235         up_read(&vcpu->kvm->slots_lock);
 236
 237         return changed;
 238 }
 239
 240 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 241 {
 242         if (cr0 & CR0_RESERVED_BITS) {
 243                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
 244                        cr0, vcpu->arch.cr0);
 245                 kvm_inject_gp(vcpu, 0);
 246                 return;
 247         }
 248
 249         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
 250                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
 251                 kvm_inject_gp(vcpu, 0);
 252                 return;
 253         }
 254
 255         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
 256                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
 257                        "and a clear PE flag\n");
 258                 kvm_inject_gp(vcpu, 0);
 259                 return;
 260         }
 261
 262         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 263 #ifdef CONFIG_X86_64
 264                 if ((vcpu->arch.shadow_efer & EFER_LME)) {
 265                         int cs_db, cs_l;
 266
 267                         if (!is_pae(vcpu)) {
 268                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
 269                                        "in long mode while PAE is disabled\n");
 270                                 kvm_inject_gp(vcpu, 0);
 271                                 return;
 272                         }
 273                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 274                         if (cs_l) {
 275                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
 276                                        "in long mode while CS.L == 1\n");
 277                                 kvm_inject_gp(vcpu, 0);
 278                                 return;
 279
 280                         }
 281                 } else
 282 #endif
 283                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 284                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
 285                                "reserved bits\n");
 286                         kvm_inject_gp(vcpu, 0);
 287                         return;
 288                 }
 289
 290         }
 291
 292         kvm_x86_ops->set_cr0(vcpu, cr0);
 293         vcpu->arch.cr0 = cr0;
 294
 295         kvm_mmu_reset_context(vcpu);
 296         return;
 297 }
 298 EXPORT_SYMBOL_GPL(set_cr0);
 299
 300 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 301 {
 302         set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
 303 }
 304 EXPORT_SYMBOL_GPL(lmsw);
 305
 306 void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 307 {
 308         if (cr4 & CR4_RESERVED_BITS) {
 309                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
 310                 kvm_inject_gp(vcpu, 0);
 311                 return;
 312         }
 313
 314         if (is_long_mode(vcpu)) {
 315                 if (!(cr4 & X86_CR4_PAE)) {
 316                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
 317                                "in long mode\n");
 318                         kvm_inject_gp(vcpu, 0);
 319                         return;
 320                 }
 321         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
 322                    && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 323                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
 324                 kvm_inject_gp(vcpu, 0);
 325                 return;
 326         }
 327
 328         if (cr4 & X86_CR4_VMXE) {
 329                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
 330                 kvm_inject_gp(vcpu, 0);
 331                 return;
 332         }
 333         kvm_x86_ops->set_cr4(vcpu, cr4);
 334         vcpu->arch.cr4 = cr4;
 335         kvm_mmu_reset_context(vcpu);
 336 }
 337 EXPORT_SYMBOL_GPL(set_cr4);
 338
 339 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 340 {
 341         if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
 342                 kvm_mmu_flush_tlb(vcpu);
 343                 return;
 344         }
 345
 346         if (is_long_mode(vcpu)) {
 347                 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
 348                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
 349                         kvm_inject_gp(vcpu, 0);
 350                         return;
 351                 }
 352         } else {
 353                 if (is_pae(vcpu)) {
 354                         if (cr3 & CR3_PAE_RESERVED_BITS) {
 355                                 printk(KERN_DEBUG
 356                                        "set_cr3: #GP, reserved bits\n");
 357                                 kvm_inject_gp(vcpu, 0);
 358                                 return;
 359                         }
 360                         if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
 361                                 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
 362                                        "reserved bits\n");
 363                                 kvm_inject_gp(vcpu, 0);
 364                                 return;
 365                         }
 366                 }
 367                 /*
 368                  * We don't check reserved bits in nonpae mode, because
 369                  * this isn't enforced, and VMware depends on this.
 370                  */
 371         }
 372
 373         down_read(&vcpu->kvm->slots_lock);
 374         /*
 375          * Does the new cr3 value map to physical memory? (Note, we
 376          * catch an invalid cr3 even in real-mode, because it would
 377          * cause trouble later on when we turn on paging anyway.)
 378          *
 379          * A real CPU would silently accept an invalid cr3 and would
 380          * attempt to use it - with largely undefined (and often hard
 381          * to debug) behavior on the guest side.
 382          */
 383         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
 384                 kvm_inject_gp(vcpu, 0);
 385         else {
 386                 vcpu->arch.cr3 = cr3;
 387                 vcpu->arch.mmu.new_cr3(vcpu);
 388         }
 389         up_read(&vcpu->kvm->slots_lock);
 390 }
 391 EXPORT_SYMBOL_GPL(set_cr3);
 392
 393 void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 394 {
 395         if (cr8 & CR8_RESERVED_BITS) {
 396                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
 397                 kvm_inject_gp(vcpu, 0);
 398                 return;
 399         }
 400         if (irqchip_in_kernel(vcpu->kvm))
 401                 kvm_lapic_set_tpr(vcpu, cr8);
 402         else
 403                 vcpu->arch.cr8 = cr8;
 404 }
 405 EXPORT_SYMBOL_GPL(set_cr8);
 406
 407 unsigned long get_cr8(struct kvm_vcpu *vcpu)
 408 {
 409         if (irqchip_in_kernel(vcpu->kvm))
 410                 return kvm_lapic_get_cr8(vcpu);
 411         else
 412                 return vcpu->arch.cr8;
 413 }
 414 EXPORT_SYMBOL_GPL(get_cr8);
 415
 416 /*
 417  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 418  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 419  *
 420  * This list is modified at module load time to reflect the
 421  * capabilities of the host cpu.
 422  */
 423 static u32 msrs_to_save[] = {
 424         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 425         MSR_K6_STAR,
 426 #ifdef CONFIG_X86_64
 427         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 428 #endif
 429         MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 430         MSR_IA32_PERF_STATUS,
 431 };
 432
 433 static unsigned num_msrs_to_save;
 434
 435 static u32 emulated_msrs[] = {
 436         MSR_IA32_MISC_ENABLE,
 437 };
 438
 439 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 440 {
 441         if (efer & efer_reserved_bits) {
 442                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
 443                        efer);
 444                 kvm_inject_gp(vcpu, 0);
 445                 return;
 446         }
 447
 448         if (is_paging(vcpu)
 449             && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
 450                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
 451                 kvm_inject_gp(vcpu, 0);
 452                 return;
 453         }
 454
 455         kvm_x86_ops->set_efer(vcpu, efer);
 456
 457         efer &= ~EFER_LMA;
 458         efer |= vcpu->arch.shadow_efer & EFER_LMA;
 459
 460         vcpu->arch.shadow_efer = efer;
 461 }
 462
 463 void kvm_enable_efer_bits(u64 mask)
 464 {
 465        efer_reserved_bits &= ~mask;
 466 }
 467 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
 468
 469
 470 /*
 471  * Writes msr value into into the appropriate "register".
 472  * Returns 0 on success, non-0 otherwise.
 473  * Assumes vcpu_load() was already called.
 474  */
 475 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 476 {
 477         return kvm_x86_ops->set_msr(vcpu, msr_index, data);
 478 }
 479
 480 /*
 481  * Adapt set_msr() to msr_io()'s calling convention
 482  */
 483 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 484 {
 485         return kvm_set_msr(vcpu, index, *data);
 486 }
 487
 488 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 489 {
 490         static int version;
 491         struct kvm_wall_clock wc;
 492         struct timespec wc_ts;
 493
 494         if (!wall_clock)
 495                 return;
 496
 497         version++;
 498
 499         down_read(&kvm->slots_lock);
 500         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 501
 502         wc_ts = current_kernel_time();
 503         wc.wc_sec = wc_ts.tv_sec;
 504         wc.wc_nsec = wc_ts.tv_nsec;
 505         wc.wc_version = version;
 506
 507         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
 508
 509         version++;
 510         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 511         up_read(&kvm->slots_lock);
 512 }
 513
 514 static void kvm_write_guest_time(struct kvm_vcpu *v)
 515 {
 516         struct timespec ts;
 517         unsigned long flags;
 518         struct kvm_vcpu_arch *vcpu = &v->arch;
 519         void *shared_kaddr;
 520
 521         if ((!vcpu->time_page))
 522                 return;
 523
 524         /* Keep irq disabled to prevent changes to the clock */
 525         local_irq_save(flags);
 526         kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
 527                           &vcpu->hv_clock.tsc_timestamp);
 528         ktime_get_ts(&ts);
 529         local_irq_restore(flags);
 530
 531         /* With all the info we got, fill in the values */
 532
 533         vcpu->hv_clock.system_time = ts.tv_nsec +
 534                                      (NSEC_PER_SEC * (u64)ts.tv_sec);
 535         /*
 536          * The interface expects us to write an even number signaling that the
 537          * update is finished. Since the guest won't see the intermediate
 538          * state, we just write "2" at the end
 539          */
 540         vcpu->hv_clock.version = 2;
 541
 542         shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
 543
 544         memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
 545                 sizeof(vcpu->hv_clock));
 546
 547         kunmap_atomic(shared_kaddr, KM_USER0);
 548
 549         mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
 550 }
 551
 552
 553 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 554 {
 555         switch (msr) {
 556         case MSR_EFER:
 557                 set_efer(vcpu, data);
 558                 break;
 559         case MSR_IA32_MC0_STATUS:
 560                 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
 561                        __FUNCTION__, data);
 562                 break;
 563         case MSR_IA32_MCG_STATUS:
 564                 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
 565                         __FUNCTION__, data);
 566                 break;
 567         case MSR_IA32_MCG_CTL:
 568                 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
 569                         __FUNCTION__, data);
 570                 break;
 571         case MSR_IA32_UCODE_REV:
 572         case MSR_IA32_UCODE_WRITE:
 573         case 0x200 ... 0x2ff: /* MTRRs */
 574                 break;
 575         case MSR_IA32_APICBASE:
 576                 kvm_set_apic_base(vcpu, data);
 577                 break;
 578         case MSR_IA32_MISC_ENABLE:
 579                 vcpu->arch.ia32_misc_enable_msr = data;
 580                 break;
 581         case MSR_KVM_WALL_CLOCK:
 582                 vcpu->kvm->arch.wall_clock = data;
 583                 kvm_write_wall_clock(vcpu->kvm, data);
 584                 break;
 585         case MSR_KVM_SYSTEM_TIME: {
 586                 if (vcpu->arch.time_page) {
 587                         kvm_release_page_dirty(vcpu->arch.time_page);
 588                         vcpu->arch.time_page = NULL;
 589                 }
 590
 591                 vcpu->arch.time = data;
 592
 593                 /* we verify if the enable bit is set... */
 594                 if (!(data & 1))
 595                         break;
 596
 597                 /* ...but clean it before doing the actual write */
 598                 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
 599
 600                 vcpu->arch.hv_clock.tsc_to_system_mul =
 601                                         clocksource_khz2mult(tsc_khz, 22);
 602                 vcpu->arch.hv_clock.tsc_shift = 22;
 603
 604                 down_read(&current->mm->mmap_sem);
 605                 down_read(&vcpu->kvm->slots_lock);
 606                 vcpu->arch.time_page =
 607                                 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
 608                 up_read(&vcpu->kvm->slots_lock);
 609                 up_read(&current->mm->mmap_sem);
 610
 611                 if (is_error_page(vcpu->arch.time_page)) {
 612                         kvm_release_page_clean(vcpu->arch.time_page);
 613                         vcpu->arch.time_page = NULL;
 614                 }
 615
 616                 kvm_write_guest_time(vcpu);
 617                 break;
 618         }
 619         default:
 620                 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
 621                 return 1;
 622         }
 623         return 0;
 624 }
 625 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
 626
 627
 628 /*
 629  * Reads an msr value (of 'msr_index') into 'pdata'.
 630  * Returns 0 on success, non-0 otherwise.
 631  * Assumes vcpu_load() was already called.
 632  */
 633 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 634 {
 635         return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
 636 }
 637
 638 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 639 {
 640         u64 data;
 641
 642         switch (msr) {
 643         case 0xc0010010: /* SYSCFG */
 644         case 0xc0010015: /* HWCR */
 645         case MSR_IA32_PLATFORM_ID:
 646         case MSR_IA32_P5_MC_ADDR:
 647         case MSR_IA32_P5_MC_TYPE:
 648         case MSR_IA32_MC0_CTL:
 649         case MSR_IA32_MCG_STATUS:
 650         case MSR_IA32_MCG_CAP:
 651         case MSR_IA32_MCG_CTL:
 652         case MSR_IA32_MC0_MISC:
 653         case MSR_IA32_MC0_MISC+4:
 654         case MSR_IA32_MC0_MISC+8:
 655         case MSR_IA32_MC0_MISC+12:
 656         case MSR_IA32_MC0_MISC+16:
 657         case MSR_IA32_UCODE_REV:
 658         case MSR_IA32_EBL_CR_POWERON:
 659                 /* MTRR registers */
 660         case 0xfe:
 661         case 0x200 ... 0x2ff:
 662                 data = 0;
 663                 break;
 664         case 0xcd: /* fsb frequency */
 665                 data = 3;
 666                 break;
 667         case MSR_IA32_APICBASE:
 668                 data = kvm_get_apic_base(vcpu);
 669                 break;
 670         case MSR_IA32_MISC_ENABLE:
 671                 data = vcpu->arch.ia32_misc_enable_msr;
 672                 break;
 673         case MSR_IA32_PERF_STATUS:
 674                 /* TSC increment by tick */
 675                 data = 1000ULL;
 676                 /* CPU multiplier */
 677                 data |= (((uint64_t)4ULL) << 40);
 678                 break;
 679         case MSR_EFER:
 680                 data = vcpu->arch.shadow_efer;
 681                 break;
 682         case MSR_KVM_WALL_CLOCK:
 683                 data = vcpu->kvm->arch.wall_clock;
 684                 break;
 685         case MSR_KVM_SYSTEM_TIME:
 686                 data = vcpu->arch.time;
 687                 break;
 688         default:
 689                 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
 690                 return 1;
 691         }
 692         *pdata = data;
 693         return 0;
 694 }
 695 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
 696
 697 /*
 698  * Read or write a bunch of msrs. All parameters are kernel addresses.
 699  *
 700  * @return number of msrs set successfully.
 701  */
 702 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
 703                     struct kvm_msr_entry *entries,
 704                     int (*do_msr)(struct kvm_vcpu *vcpu,
 705                                   unsigned index, u64 *data))
 706 {
 707         int i;
 708
 709         vcpu_load(vcpu);
 710
 711         for (i = 0; i < msrs->nmsrs; ++i)
 712                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
 713                         break;
 714
 715         vcpu_put(vcpu);
 716
 717         return i;
 718 }
 719
 720 /*
 721  * Read or write a bunch of msrs. Parameters are user addresses.
 722  *
 723  * @return number of msrs set successfully.
 724  */
 725 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
 726                   int (*do_msr)(struct kvm_vcpu *vcpu,
 727                                 unsigned index, u64 *data),
 728                   int writeback)
 729 {
 730         struct kvm_msrs msrs;
 731         struct kvm_msr_entry *entries;
 732         int r, n;
 733         unsigned size;
 734
 735         r = -EFAULT;
 736         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
 737                 goto out;
 738
 739         r = -E2BIG;
 740         if (msrs.nmsrs >= MAX_IO_MSRS)
 741                 goto out;
 742
 743         r = -ENOMEM;
 744         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
 745         entries = vmalloc(size);
 746         if (!entries)
 747                 goto out;
 748
 749         r = -EFAULT;
 750         if (copy_from_user(entries, user_msrs->entries, size))
 751                 goto out_free;
 752
 753         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
 754         if (r < 0)
 755                 goto out_free;
 756
 757         r = -EFAULT;
 758         if (writeback && copy_to_user(user_msrs->entries, entries, size))
 759                 goto out_free;
 760
 761         r = n;
 762
 763 out_free:
 764         vfree(entries);
 765 out:
 766         return r;
 767 }
 768
 769 /*
 770  * Make sure that a cpu that is being hot-unplugged does not have any vcpus
 771  * cached on it.
 772  */
 773 void decache_vcpus_on_cpu(int cpu)
 774 {
 775         struct kvm *vm;
 776         struct kvm_vcpu *vcpu;
 777         int i;
 778
 779         spin_lock(&kvm_lock);
 780         list_for_each_entry(vm, &vm_list, vm_list)
 781                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
 782                         vcpu = vm->vcpus[i];
 783                         if (!vcpu)
 784                                 continue;
 785                         /*
 786                          * If the vcpu is locked, then it is running on some
 787                          * other cpu and therefore it is not cached on the
 788                          * cpu in question.
 789                          *
 790                          * If it's not locked, check the last cpu it executed
 791                          * on.
 792                          */
 793                         if (mutex_trylock(&vcpu->mutex)) {
 794                                 if (vcpu->cpu == cpu) {
 795                                         kvm_x86_ops->vcpu_decache(vcpu);
 796                                         vcpu->cpu = -1;
 797                                 }
 798                                 mutex_unlock(&vcpu->mutex);
 799                         }
 800                 }
 801         spin_unlock(&kvm_lock);
 802 }
 803
 804 int kvm_dev_ioctl_check_extension(long ext)
 805 {
 806         int r;
 807
 808         switch (ext) {
 809         case KVM_CAP_IRQCHIP:
 810         case KVM_CAP_HLT:
 811         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
 812         case KVM_CAP_USER_MEMORY:
 813         case KVM_CAP_SET_TSS_ADDR:
 814         case KVM_CAP_EXT_CPUID:
 815         case KVM_CAP_CLOCKSOURCE:
 816                 r = 1;
 817                 break;
 818         case KVM_CAP_VAPIC:
 819                 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
 820                 break;
 821         case KVM_CAP_NR_VCPUS:
 822                 r = KVM_MAX_VCPUS;
 823                 break;
 824         case KVM_CAP_NR_MEMSLOTS:
 825                 r = KVM_MEMORY_SLOTS;
 826                 break;
 827         default:
 828                 r = 0;
 829                 break;
 830         }
 831         return r;
 832
 833 }
 834
 835 long kvm_arch_dev_ioctl(struct file *filp,
 836                         unsigned int ioctl, unsigned long arg)
 837 {
 838         void __user *argp = (void __user *)arg;
 839         long r;
 840
 841         switch (ioctl) {
 842         case KVM_GET_MSR_INDEX_LIST: {
 843                 struct kvm_msr_list __user *user_msr_list = argp;
 844                 struct kvm_msr_list msr_list;
 845                 unsigned n;
 846
 847                 r = -EFAULT;
 848                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
 849                         goto out;
 850                 n = msr_list.nmsrs;
 851                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
 852                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
 853                         goto out;
 854                 r = -E2BIG;
 855                 if (n < num_msrs_to_save)
 856                         goto out;
 857                 r = -EFAULT;
 858                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
 859                                  num_msrs_to_save * sizeof(u32)))
 860                         goto out;
 861                 if (copy_to_user(user_msr_list->indices
 862                                  + num_msrs_to_save * sizeof(u32),
 863                                  &emulated_msrs,
 864                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
 865                         goto out;
 866                 r = 0;
 867                 break;
 868         }
 869         case KVM_GET_SUPPORTED_CPUID: {
 870                 struct kvm_cpuid2 __user *cpuid_arg = argp;
 871                 struct kvm_cpuid2 cpuid;
 872
 873                 r = -EFAULT;
 874                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
 875                         goto out;
 876                 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
 877                         cpuid_arg->entries);
 878                 if (r)
 879                         goto out;
 880
 881                 r = -EFAULT;
 882                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
 883                         goto out;
 884                 r = 0;
 885                 break;
 886         }
 887         default:
 888                 r = -EINVAL;
 889         }
 890 out:
 891         return r;
 892 }
 893
 894 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 895 {
 896         kvm_x86_ops->vcpu_load(vcpu, cpu);
 897         kvm_write_guest_time(vcpu);
 898 }
 899
 900 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 901 {
 902         kvm_x86_ops->vcpu_put(vcpu);
 903         kvm_put_guest_fpu(vcpu);
 904 }
 905
 906 static int is_efer_nx(void)
 907 {
 908         u64 efer;
 909
 910         rdmsrl(MSR_EFER, efer);
 911         return efer & EFER_NX;
 912 }
 913
 914 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
 915 {
 916         int i;
 917         struct kvm_cpuid_entry2 *e, *entry;
 918
 919         entry = NULL;
 920         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
 921                 e = &vcpu->arch.cpuid_entries[i];
 922                 if (e->function == 0x80000001) {
 923                         entry = e;
 924                         break;
 925                 }
 926         }
 927         if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
 928                 entry->edx &= ~(1 << 20);
 929                 printk(KERN_INFO "kvm: guest NX capability removed\n");
 930         }
 931 }
 932
 933 /* when an old userspace process fills a new kernel module */
 934 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 935                                     struct kvm_cpuid *cpuid,
 936                                     struct kvm_cpuid_entry __user *entries)
 937 {
 938         int r, i;
 939         struct kvm_cpuid_entry *cpuid_entries;
 940
 941         r = -E2BIG;
 942         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
 943                 goto out;
 944         r = -ENOMEM;
 945         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
 946         if (!cpuid_entries)
 947                 goto out;
 948         r = -EFAULT;
 949         if (copy_from_user(cpuid_entries, entries,
 950                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
 951                 goto out_free;
 952         for (i = 0; i < cpuid->nent; i++) {
 953                 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
 954                 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
 955                 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
 956                 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
 957                 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
 958                 vcpu->arch.cpuid_entries[i].index = 0;
 959                 vcpu->arch.cpuid_entries[i].flags = 0;
 960                 vcpu->arch.cpuid_entries[i].padding[0] = 0;
 961                 vcpu->arch.cpuid_entries[i].padding[1] = 0;
 962                 vcpu->arch.cpuid_entries[i].padding[2] = 0;
 963         }
 964         vcpu->arch.cpuid_nent = cpuid->nent;
 965         cpuid_fix_nx_cap(vcpu);
 966         r = 0;
 967
 968 out_free:
 969         vfree(cpuid_entries);
 970 out:
 971         return r;
 972 }
 973
 974 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
 975                                     struct kvm_cpuid2 *cpuid,
 976                                     struct kvm_cpuid_entry2 __user *entries)
 977 {
 978         int r;
 979
 980         r = -E2BIG;
 981         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
 982                 goto out;
 983         r = -EFAULT;
 984         if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
 985                            cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
 986                 goto out;
 987         vcpu->arch.cpuid_nent = cpuid->nent;
 988         return 0;
 989
 990 out:
 991         return r;
 992 }
 993
 994 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 995                                     struct kvm_cpuid2 *cpuid,
 996                                     struct kvm_cpuid_entry2 __user *entries)
 997 {
 998         int r;
 999
1000         r = -E2BIG;
1001         if (cpuid->nent < vcpu->arch.cpuid_nent)
1002                 goto out;
1003         r = -EFAULT;
1004         if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
1005                            vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
1006                 goto out;
1007         return 0;
1008
1009 out:
1010         cpuid->nent = vcpu->arch.cpuid_nent;
1011         return r;
1012 }
1013
1014 static inline u32 bit(int bitno)
1015 {
1016         return 1 << (bitno & 31);
1017 }
1018
1019 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1020                           u32 index)
1021 {
1022         entry->function = function;
1023         entry->index = index;
1024         cpuid_count(entry->function, entry->index,
1025                 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
1026         entry->flags = 0;
1027 }
1028
1029 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1030                          u32 index, int *nent, int maxnent)
1031 {
1032         const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
1033                 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1034                 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1035                 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1036                 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1037                 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
1038                 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1039                 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
1040                 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
1041                 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
1042         const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
1043                 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1044                 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1045                 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1046                 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1047                 bit(X86_FEATURE_PGE) |
1048                 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1049                 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
1050                 bit(X86_FEATURE_SYSCALL) |
1051                 (bit(X86_FEATURE_NX) && is_efer_nx()) |
1052 #ifdef CONFIG_X86_64
1053                 bit(X86_FEATURE_LM) |
1054 #endif
1055                 bit(X86_FEATURE_MMXEXT) |
1056                 bit(X86_FEATURE_3DNOWEXT) |
1057                 bit(X86_FEATURE_3DNOW);
1058         const u32 kvm_supported_word3_x86_features =
1059                 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
1060         const u32 kvm_supported_word6_x86_features =
1061                 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
1062
1063         /* all func 2 cpuid_count() should be called on the same cpu */
1064         get_cpu();
1065         do_cpuid_1_ent(entry, function, index);
1066         ++*nent;
1067
1068         switch (function) {
1069         case 0:
1070                 entry->eax = min(entry->eax, (u32)0xb);
1071                 break;
1072         case 1:
1073                 entry->edx &= kvm_supported_word0_x86_features;
1074                 entry->ecx &= kvm_supported_word3_x86_features;
1075                 break;
1076         /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1077          * may return different values. This forces us to get_cpu() before
1078          * issuing the first command, and also to emulate this annoying behavior
1079          * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
1080         case 2: {
1081                 int t, times = entry->eax & 0xff;
1082
1083                 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1084                 for (t = 1; t < times && *nent < maxnent; ++t) {
1085                         do_cpuid_1_ent(&entry[t], function, 0);
1086                         entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1087                         ++*nent;
1088                 }
1089                 break;
1090         }
1091         /* function 4 and 0xb have additional index. */
1092         case 4: {
1093                 int i, cache_type;
1094
1095                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1096                 /* read more entries until cache_type is zero */
1097                 for (i = 1; *nent < maxnent; ++i) {
1098                         cache_type = entry[i - 1].eax & 0x1f;
1099                         if (!cache_type)
1100                                 break;
1101                         do_cpuid_1_ent(&entry[i], function, i);
1102                         entry[i].flags |=
1103                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1104                         ++*nent;
1105                 }
1106                 break;
1107         }
1108         case 0xb: {
1109                 int i, level_type;
1110
1111                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1112                 /* read more entries until level_type is zero */
1113                 for (i = 1; *nent < maxnent; ++i) {
1114                         level_type = entry[i - 1].ecx & 0xff;
1115                         if (!level_type)
1116                                 break;
1117                         do_cpuid_1_ent(&entry[i], function, i);
1118                         entry[i].flags |=
1119                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1120                         ++*nent;
1121                 }
1122                 break;
1123         }
1124         case 0x80000000:
1125                 entry->eax = min(entry->eax, 0x8000001a);
1126                 break;
1127         case 0x80000001:
1128                 entry->edx &= kvm_supported_word1_x86_features;
1129                 entry->ecx &= kvm_supported_word6_x86_features;
1130                 break;
1131         }
1132         put_cpu();
1133 }
1134
1135 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1136                                     struct kvm_cpuid_entry2 __user *entries)
1137 {
1138         struct kvm_cpuid_entry2 *cpuid_entries;
1139         int limit, nent = 0, r = -E2BIG;
1140         u32 func;
1141
1142         if (cpuid->nent < 1)
1143                 goto out;
1144         r = -ENOMEM;
1145         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
1146         if (!cpuid_entries)
1147                 goto out;
1148
1149         do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
1150         limit = cpuid_entries[0].eax;
1151         for (func = 1; func <= limit && nent < cpuid->nent; ++func)
1152                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1153                                 &nent, cpuid->nent);
1154         r = -E2BIG;
1155         if (nent >= cpuid->nent)
1156                 goto out_free;
1157
1158         do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1159         limit = cpuid_entries[nent - 1].eax;
1160         for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1161                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1162                                &nent, cpuid->nent);
1163         r = -EFAULT;
1164         if (copy_to_user(entries, cpuid_entries,
1165                         nent * sizeof(struct kvm_cpuid_entry2)))
1166                 goto out_free;
1167         cpuid->nent = nent;
1168         r = 0;
1169
1170 out_free:
1171         vfree(cpuid_entries);
1172 out:
1173         return r;
1174 }
1175
1176 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1177                                     struct kvm_lapic_state *s)
1178 {
1179         vcpu_load(vcpu);
1180         memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1181         vcpu_put(vcpu);
1182
1183         return 0;
1184 }
1185
1186 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1187                                     struct kvm_lapic_state *s)
1188 {
1189         vcpu_load(vcpu);
1190         memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1191         kvm_apic_post_state_restore(vcpu);
1192         vcpu_put(vcpu);
1193
1194         return 0;
1195 }
1196
1197 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1198                                     struct kvm_interrupt *irq)
1199 {
1200         if (irq->irq < 0 || irq->irq >= 256)
1201                 return -EINVAL;
1202         if (irqchip_in_kernel(vcpu->kvm))
1203                 return -ENXIO;
1204         vcpu_load(vcpu);
1205
1206         set_bit(irq->irq, vcpu->arch.irq_pending);
1207         set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1208
1209         vcpu_put(vcpu);
1210
1211         return 0;
1212 }
1213
1214 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1215                                            struct kvm_tpr_access_ctl *tac)
1216 {
1217         if (tac->flags)
1218                 return -EINVAL;
1219         vcpu->arch.tpr_access_reporting = !!tac->enabled;
1220         return 0;
1221 }
1222
1223 long kvm_arch_vcpu_ioctl(struct file *filp,
1224                          unsigned int ioctl, unsigned long arg)
1225 {
1226         struct kvm_vcpu *vcpu = filp->private_data;
1227         void __user *argp = (void __user *)arg;
1228         int r;
1229
1230         switch (ioctl) {
1231         case KVM_GET_LAPIC: {
1232                 struct kvm_lapic_state lapic;
1233
1234                 memset(&lapic, 0, sizeof lapic);
1235                 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
1236                 if (r)
1237                         goto out;
1238                 r = -EFAULT;
1239                 if (copy_to_user(argp, &lapic, sizeof lapic))
1240                         goto out;
1241                 r = 0;
1242                 break;
1243         }
1244         case KVM_SET_LAPIC: {
1245                 struct kvm_lapic_state lapic;
1246
1247                 r = -EFAULT;
1248                 if (copy_from_user(&lapic, argp, sizeof lapic))
1249                         goto out;
1250                 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);;
1251                 if (r)
1252                         goto out;
1253                 r = 0;
1254                 break;
1255         }
1256         case KVM_INTERRUPT: {
1257                 struct kvm_interrupt irq;
1258
1259                 r = -EFAULT;
1260                 if (copy_from_user(&irq, argp, sizeof irq))
1261                         goto out;
1262                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1263                 if (r)
1264                         goto out;
1265                 r = 0;
1266                 break;
1267         }
1268         case KVM_SET_CPUID: {
1269                 struct kvm_cpuid __user *cpuid_arg = argp;
1270                 struct kvm_cpuid cpuid;
1271
1272                 r = -EFAULT;
1273                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1274                         goto out;
1275                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1276                 if (r)
1277                         goto out;
1278                 break;
1279         }
1280         case KVM_SET_CPUID2: {
1281                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1282                 struct kvm_cpuid2 cpuid;
1283
1284                 r = -EFAULT;
1285                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1286                         goto out;
1287                 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1288                                 cpuid_arg->entries);
1289                 if (r)
1290                         goto out;
1291                 break;
1292         }
1293         case KVM_GET_CPUID2: {
1294                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1295                 struct kvm_cpuid2 cpuid;
1296
1297                 r = -EFAULT;
1298                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1299                         goto out;
1300                 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1301                                 cpuid_arg->entries);
1302                 if (r)
1303                         goto out;
1304                 r = -EFAULT;
1305                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1306                         goto out;
1307                 r = 0;
1308                 break;
1309         }
1310         case KVM_GET_MSRS:
1311                 r = msr_io(vcpu, argp, kvm_get_msr, 1);
1312                 break;
1313         case KVM_SET_MSRS:
1314                 r = msr_io(vcpu, argp, do_set_msr, 0);
1315                 break;
1316         case KVM_TPR_ACCESS_REPORTING: {
1317                 struct kvm_tpr_access_ctl tac;
1318
1319                 r = -EFAULT;
1320                 if (copy_from_user(&tac, argp, sizeof tac))
1321                         goto out;
1322                 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
1323                 if (r)
1324                         goto out;
1325                 r = -EFAULT;
1326                 if (copy_to_user(argp, &tac, sizeof tac))
1327                         goto out;
1328                 r = 0;
1329                 break;
1330         };
1331         case KVM_SET_VAPIC_ADDR: {
1332                 struct kvm_vapic_addr va;
1333
1334                 r = -EINVAL;
1335                 if (!irqchip_in_kernel(vcpu->kvm))
1336                         goto out;
1337                 r = -EFAULT;
1338                 if (copy_from_user(&va, argp, sizeof va))
1339                         goto out;
1340                 r = 0;
1341                 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1342                 break;
1343         }
1344         default:
1345                 r = -EINVAL;
1346         }
1347 out:
1348         return r;
1349 }
1350
1351 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1352 {
1353         int ret;
1354
1355         if (addr > (unsigned int)(-3 * PAGE_SIZE))
1356                 return -1;
1357         ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1358         return ret;
1359 }
1360
1361 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1362                                           u32 kvm_nr_mmu_pages)
1363 {
1364         if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1365                 return -EINVAL;
1366
1367         down_write(&kvm->slots_lock);
1368
1369         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1370         kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1371
1372         up_write(&kvm->slots_lock);
1373         return 0;
1374 }
1375
1376 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1377 {
1378         return kvm->arch.n_alloc_mmu_pages;
1379 }
1380
1381 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1382 {
1383         int i;
1384         struct kvm_mem_alias *alias;
1385
1386         for (i = 0; i < kvm->arch.naliases; ++i) {
1387                 alias = &kvm->arch.aliases[i];
1388                 if (gfn >= alias->base_gfn
1389                     && gfn < alias->base_gfn + alias->npages)
1390                         return alias->target_gfn + gfn - alias->base_gfn;
1391         }
1392         return gfn;
1393 }
1394
1395 /*
1396  * Set a new alias region.  Aliases map a portion of physical memory into
1397  * another portion.  This is useful for memory windows, for example the PC
1398  * VGA region.
1399  */
1400 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1401                                          struct kvm_memory_alias *alias)
1402 {
1403         int r, n;
1404         struct kvm_mem_alias *p;
1405
1406         r = -EINVAL;
1407         /* General sanity checks */
1408         if (alias->memory_size & (PAGE_SIZE - 1))
1409                 goto out;
1410         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
1411                 goto out;
1412         if (alias->slot >= KVM_ALIAS_SLOTS)
1413                 goto out;
1414         if (alias->guest_phys_addr + alias->memory_size
1415             < alias->guest_phys_addr)
1416                 goto out;
1417         if (alias->target_phys_addr + alias->memory_size
1418             < alias->target_phys_addr)
1419                 goto out;
1420
1421         down_write(&kvm->slots_lock);
1422
1423         p = &kvm->arch.aliases[alias->slot];
1424         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
1425         p->npages = alias->memory_size >> PAGE_SHIFT;
1426         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
1427
1428         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
1429                 if (kvm->arch.aliases[n - 1].npages)
1430                         break;
1431         kvm->arch.naliases = n;
1432
1433         kvm_mmu_zap_all(kvm);
1434
1435         up_write(&kvm->slots_lock);
1436
1437         return 0;
1438
1439 out:
1440         return r;
1441 }
1442
1443 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1444 {
1445         int r;
1446
1447         r = 0;
1448         switch (chip->chip_id) {
1449         case KVM_IRQCHIP_PIC_MASTER:
1450                 memcpy(&chip->chip.pic,
1451                         &pic_irqchip(kvm)->pics[0],
1452                         sizeof(struct kvm_pic_state));
1453                 break;
1454         case KVM_IRQCHIP_PIC_SLAVE:
1455                 memcpy(&chip->chip.pic,
1456                         &pic_irqchip(kvm)->pics[1],
1457                         sizeof(struct kvm_pic_state));
1458                 break;
1459         case KVM_IRQCHIP_IOAPIC:
1460                 memcpy(&chip->chip.ioapic,
1461                         ioapic_irqchip(kvm),
1462                         sizeof(struct kvm_ioapic_state));
1463                 break;
1464         default:
1465                 r = -EINVAL;
1466                 break;
1467         }
1468         return r;
1469 }
1470
1471 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1472 {
1473         int r;
1474
1475         r = 0;
1476         switch (chip->chip_id) {
1477         case KVM_IRQCHIP_PIC_MASTER:
1478                 memcpy(&pic_irqchip(kvm)->pics[0],
1479                         &chip->chip.pic,
1480                         sizeof(struct kvm_pic_state));
1481                 break;
1482         case KVM_IRQCHIP_PIC_SLAVE:
1483                 memcpy(&pic_irqchip(kvm)->pics[1],
1484                         &chip->chip.pic,
1485                         sizeof(struct kvm_pic_state));
1486                 break;
1487         case KVM_IRQCHIP_IOAPIC:
1488                 memcpy(ioapic_irqchip(kvm),
1489                         &chip->chip.ioapic,
1490                         sizeof(struct kvm_ioapic_state));
1491                 break;
1492         default:
1493                 r = -EINVAL;
1494                 break;
1495         }
1496         kvm_pic_update_irq(pic_irqchip(kvm));
1497         return r;
1498 }
1499
1500 /*
1501  * Get (and clear) the dirty memory log for a memory slot.
1502  */
1503 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1504                                       struct kvm_dirty_log *log)
1505 {
1506         int r;
1507         int n;
1508         struct kvm_memory_slot *memslot;
1509         int is_dirty = 0;
1510
1511         down_write(&kvm->slots_lock);
1512
1513         r = kvm_get_dirty_log(kvm, log, &is_dirty);
1514         if (r)
1515                 goto out;
1516
1517         /* If nothing is dirty, don't bother messing with page tables. */
1518         if (is_dirty) {
1519                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1520                 kvm_flush_remote_tlbs(kvm);
1521                 memslot = &kvm->memslots[log->slot];
1522                 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1523                 memset(memslot->dirty_bitmap, 0, n);
1524         }
1525         r = 0;
1526 out:
1527         up_write(&kvm->slots_lock);
1528         return r;
1529 }
1530
1531 long kvm_arch_vm_ioctl(struct file *filp,
1532                        unsigned int ioctl, unsigned long arg)
1533 {
1534         struct kvm *kvm = filp->private_data;
1535         void __user *argp = (void __user *)arg;
1536         int r = -EINVAL;
1537
1538         switch (ioctl) {
1539         case KVM_SET_TSS_ADDR:
1540                 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1541                 if (r < 0)
1542                         goto out;
1543                 break;
1544         case KVM_SET_MEMORY_REGION: {
1545                 struct kvm_memory_region kvm_mem;
1546                 struct kvm_userspace_memory_region kvm_userspace_mem;
1547
1548                 r = -EFAULT;
1549                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
1550                         goto out;
1551                 kvm_userspace_mem.slot = kvm_mem.slot;
1552                 kvm_userspace_mem.flags = kvm_mem.flags;
1553                 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
1554                 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
1555                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
1556                 if (r)
1557                         goto out;
1558                 break;
1559         }
1560         case KVM_SET_NR_MMU_PAGES:
1561                 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1562                 if (r)
1563                         goto out;
1564                 break;
1565         case KVM_GET_NR_MMU_PAGES:
1566                 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1567                 break;
1568         case KVM_SET_MEMORY_ALIAS: {
1569                 struct kvm_memory_alias alias;
1570
1571                 r = -EFAULT;
1572                 if (copy_from_user(&alias, argp, sizeof alias))
1573                         goto out;
1574                 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
1575                 if (r)
1576                         goto out;
1577                 break;
1578         }
1579         case KVM_CREATE_IRQCHIP:
1580                 r = -ENOMEM;
1581                 kvm->arch.vpic = kvm_create_pic(kvm);
1582                 if (kvm->arch.vpic) {
1583                         r = kvm_ioapic_init(kvm);
1584                         if (r) {
1585                                 kfree(kvm->arch.vpic);
1586                                 kvm->arch.vpic = NULL;
1587                                 goto out;
1588                         }
1589                 } else
1590                         goto out;
1591                 break;
1592         case KVM_IRQ_LINE: {
1593                 struct kvm_irq_level irq_event;
1594
1595                 r = -EFAULT;
1596                 if (copy_from_user(&irq_event, argp, sizeof irq_event))
1597                         goto out;
1598                 if (irqchip_in_kernel(kvm)) {
1599                         mutex_lock(&kvm->lock);
1600                         if (irq_event.irq < 16)
1601                                 kvm_pic_set_irq(pic_irqchip(kvm),
1602                                         irq_event.irq,
1603                                         irq_event.level);
1604                         kvm_ioapic_set_irq(kvm->arch.vioapic,
1605                                         irq_event.irq,
1606                                         irq_event.level);
1607                         mutex_unlock(&kvm->lock);
1608                         r = 0;
1609                 }
1610                 break;
1611         }
1612         case KVM_GET_IRQCHIP: {
1613                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1614                 struct kvm_irqchip chip;
1615
1616                 r = -EFAULT;
1617                 if (copy_from_user(&chip, argp, sizeof chip))
1618                         goto out;
1619                 r = -ENXIO;
1620                 if (!irqchip_in_kernel(kvm))
1621                         goto out;
1622                 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
1623                 if (r)
1624                         goto out;
1625                 r = -EFAULT;
1626                 if (copy_to_user(argp, &chip, sizeof chip))
1627                         goto out;
1628                 r = 0;
1629                 break;
1630         }
1631         case KVM_SET_IRQCHIP: {
1632                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1633                 struct kvm_irqchip chip;
1634
1635                 r = -EFAULT;
1636                 if (copy_from_user(&chip, argp, sizeof chip))
1637                         goto out;
1638                 r = -ENXIO;
1639                 if (!irqchip_in_kernel(kvm))
1640                         goto out;
1641                 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
1642                 if (r)
1643                         goto out;
1644                 r = 0;
1645                 break;
1646         }
1647         default:
1648                 ;
1649         }
1650 out:
1651         return r;
1652 }
1653
1654 static void kvm_init_msr_list(void)
1655 {
1656         u32 dummy[2];
1657         unsigned i, j;
1658
1659         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
1660                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
1661                         continue;
1662                 if (j < i)
1663                         msrs_to_save[j] = msrs_to_save[i];
1664                 j++;
1665         }
1666         num_msrs_to_save = j;
1667 }
1668
1669 /*
1670  * Only apic need an MMIO device hook, so shortcut now..
1671  */
1672 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1673                                                 gpa_t addr)
1674 {
1675         struct kvm_io_device *dev;
1676
1677         if (vcpu->arch.apic) {
1678                 dev = &vcpu->arch.apic->dev;
1679                 if (dev->in_range(dev, addr))
1680                         return dev;
1681         }
1682         return NULL;
1683 }
1684
1685
1686 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1687                                                 gpa_t addr)
1688 {
1689         struct kvm_io_device *dev;
1690
1691         dev = vcpu_find_pervcpu_dev(vcpu, addr);
1692         if (dev == NULL)
1693                 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1694         return dev;
1695 }
1696
1697 int emulator_read_std(unsigned long addr,
1698                              void *val,
1699                              unsigned int bytes,
1700                              struct kvm_vcpu *vcpu)
1701 {
1702         void *data = val;
1703         int r = X86EMUL_CONTINUE;
1704
1705         down_read(&vcpu->kvm->slots_lock);
1706         while (bytes) {
1707                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1708                 unsigned offset = addr & (PAGE_SIZE-1);
1709                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1710                 int ret;
1711
1712                 if (gpa == UNMAPPED_GVA) {
1713                         r = X86EMUL_PROPAGATE_FAULT;
1714                         goto out;
1715                 }
1716                 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
1717                 if (ret < 0) {
1718                         r = X86EMUL_UNHANDLEABLE;
1719                         goto out;
1720                 }
1721
1722                 bytes -= tocopy;
1723                 data += tocopy;
1724                 addr += tocopy;
1725         }
1726 out:
1727         up_read(&vcpu->kvm->slots_lock);
1728         return r;
1729 }
1730 EXPORT_SYMBOL_GPL(emulator_read_std);
1731
1732 static int emulator_read_emulated(unsigned long addr,
1733                                   void *val,
1734                                   unsigned int bytes,
1735                                   struct kvm_vcpu *vcpu)
1736 {
1737         struct kvm_io_device *mmio_dev;
1738         gpa_t                 gpa;
1739
1740         if (vcpu->mmio_read_completed) {
1741                 memcpy(val, vcpu->mmio_data, bytes);
1742                 vcpu->mmio_read_completed = 0;
1743                 return X86EMUL_CONTINUE;
1744         }
1745
1746         down_read(&vcpu->kvm->slots_lock);
1747         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1748         up_read(&vcpu->kvm->slots_lock);
1749
1750         /* For APIC access vmexit */
1751         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1752                 goto mmio;
1753
1754         if (emulator_read_std(addr, val, bytes, vcpu)
1755                         == X86EMUL_CONTINUE)
1756                 return X86EMUL_CONTINUE;
1757         if (gpa == UNMAPPED_GVA)
1758                 return X86EMUL_PROPAGATE_FAULT;
1759
1760 mmio:
1761         /*
1762          * Is this MMIO handled locally?
1763          */
1764         mutex_lock(&vcpu->kvm->lock);
1765         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1766         if (mmio_dev) {
1767                 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1768                 mutex_unlock(&vcpu->kvm->lock);
1769                 return X86EMUL_CONTINUE;
1770         }
1771         mutex_unlock(&vcpu->kvm->lock);
1772
1773         vcpu->mmio_needed = 1;
1774         vcpu->mmio_phys_addr = gpa;
1775         vcpu->mmio_size = bytes;
1776         vcpu->mmio_is_write = 0;
1777
1778         return X86EMUL_UNHANDLEABLE;
1779 }
1780
1781 static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1782                                const void *val, int bytes)
1783 {
1784         int ret;
1785
1786         down_read(&vcpu->kvm->slots_lock);
1787         ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1788         if (ret < 0) {
1789                 up_read(&vcpu->kvm->slots_lock);
1790                 return 0;
1791         }
1792         kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1793         up_read(&vcpu->kvm->slots_lock);
1794         return 1;
1795 }
1796
1797 static int emulator_write_emulated_onepage(unsigned long addr,
1798                                            const void *val,
1799                                            unsigned int bytes,
1800                                            struct kvm_vcpu *vcpu)
1801 {
1802         struct kvm_io_device *mmio_dev;
1803         gpa_t                 gpa;
1804
1805         down_read(&vcpu->kvm->slots_lock);
1806         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1807         up_read(&vcpu->kvm->slots_lock);
1808
1809         if (gpa == UNMAPPED_GVA) {
1810                 kvm_inject_page_fault(vcpu, addr, 2);
1811                 return X86EMUL_PROPAGATE_FAULT;
1812         }
1813
1814         /* For APIC access vmexit */
1815         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1816                 goto mmio;
1817
1818         if (emulator_write_phys(vcpu, gpa, val, bytes))
1819                 return X86EMUL_CONTINUE;
1820
1821 mmio:
1822         /*
1823          * Is this MMIO handled locally?
1824          */
1825         mutex_lock(&vcpu->kvm->lock);
1826         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1827         if (mmio_dev) {
1828                 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1829                 mutex_unlock(&vcpu->kvm->lock);
1830                 return X86EMUL_CONTINUE;
1831         }
1832         mutex_unlock(&vcpu->kvm->lock);
1833
1834         vcpu->mmio_needed = 1;
1835         vcpu->mmio_phys_addr = gpa;
1836         vcpu->mmio_size = bytes;
1837         vcpu->mmio_is_write = 1;
1838         memcpy(vcpu->mmio_data, val, bytes);
1839
1840         return X86EMUL_CONTINUE;
1841 }
1842
1843 int emulator_write_emulated(unsigned long addr,
1844                                    const void *val,
1845                                    unsigned int bytes,
1846                                    struct kvm_vcpu *vcpu)
1847 {
1848         /* Crossing a page boundary? */
1849         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
1850                 int rc, now;
1851
1852                 now = -addr & ~PAGE_MASK;
1853                 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
1854                 if (rc != X86EMUL_CONTINUE)
1855                         return rc;
1856                 addr += now;
1857                 val += now;
1858                 bytes -= now;
1859         }
1860         return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
1861 }
1862 EXPORT_SYMBOL_GPL(emulator_write_emulated);
1863
1864 static int emulator_cmpxchg_emulated(unsigned long addr,
1865                                      const void *old,
1866                                      const void *new,
1867                                      unsigned int bytes,
1868                                      struct kvm_vcpu *vcpu)
1869 {
1870         static int reported;
1871
1872         if (!reported) {
1873                 reported = 1;
1874                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1875         }
1876 #ifndef CONFIG_X86_64
1877         /* guests cmpxchg8b have to be emulated atomically */
1878         if (bytes == 8) {
1879                 gpa_t gpa;
1880                 struct page *page;
1881                 char *kaddr;
1882                 u64 val;
1883
1884                 down_read(&vcpu->kvm->slots_lock);
1885                 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1886
1887                 if (gpa == UNMAPPED_GVA ||
1888                    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1889                         goto emul_write;
1890
1891                 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
1892                         goto emul_write;
1893
1894                 val = *(u64 *)new;
1895
1896                 down_read(&current->mm->mmap_sem);
1897                 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1898                 up_read(&current->mm->mmap_sem);
1899
1900                 kaddr = kmap_atomic(page, KM_USER0);
1901                 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
1902                 kunmap_atomic(kaddr, KM_USER0);
1903                 kvm_release_page_dirty(page);
1904         emul_write:
1905                 up_read(&vcpu->kvm->slots_lock);
1906         }
1907 #endif
1908
1909         return emulator_write_emulated(addr, new, bytes, vcpu);
1910 }
1911
1912 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1913 {
1914         return kvm_x86_ops->get_segment_base(vcpu, seg);
1915 }
1916
1917 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1918 {
1919         return X86EMUL_CONTINUE;
1920 }
1921
1922 int emulate_clts(struct kvm_vcpu *vcpu)
1923 {
1924         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
1925         return X86EMUL_CONTINUE;
1926 }
1927
1928 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1929 {
1930         struct kvm_vcpu *vcpu = ctxt->vcpu;
1931
1932         switch (dr) {
1933         case 0 ... 3:
1934                 *dest = kvm_x86_ops->get_dr(vcpu, dr);
1935                 return X86EMUL_CONTINUE;
1936         default:
1937                 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
1938                 return X86EMUL_UNHANDLEABLE;
1939         }
1940 }
1941
1942 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1943 {
1944         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1945         int exception;
1946
1947         kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1948         if (exception) {
1949                 /* FIXME: better handling */
1950                 return X86EMUL_UNHANDLEABLE;
1951         }
1952         return X86EMUL_CONTINUE;
1953 }
1954
1955 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
1956 {
1957         static int reported;
1958         u8 opcodes[4];
1959         unsigned long rip = vcpu->arch.rip;
1960         unsigned long rip_linear;
1961
1962         rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
1963
1964         if (reported)
1965                 return;
1966
1967         emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
1968
1969         printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
1970                context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1971         reported = 1;
1972 }
1973 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
1974
1975 static struct x86_emulate_ops emulate_ops = {
1976         .read_std            = emulator_read_std,
1977         .read_emulated       = emulator_read_emulated,
1978         .write_emulated      = emulator_write_emulated,
1979         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1980 };
1981
1982 int emulate_instruction(struct kvm_vcpu *vcpu,
1983                         struct kvm_run *run,
1984                         unsigned long cr2,
1985                         u16 error_code,
1986                         int emulation_type)
1987 {
1988         int r;
1989         struct decode_cache *c;
1990
1991         vcpu->arch.mmio_fault_cr2 = cr2;
1992         kvm_x86_ops->cache_regs(vcpu);
1993
1994         vcpu->mmio_is_write = 0;
1995         vcpu->arch.pio.string = 0;
1996
1997         if (!(emulation_type & EMULTYPE_NO_DECODE)) {
1998                 int cs_db, cs_l;
1999                 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2000
2001                 vcpu->arch.emulate_ctxt.vcpu = vcpu;
2002                 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
2003                 vcpu->arch.emulate_ctxt.mode =
2004                         (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
2005                         ? X86EMUL_MODE_REAL : cs_l
2006                         ? X86EMUL_MODE_PROT64 : cs_db
2007                         ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2008
2009                 if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
2010                         vcpu->arch.emulate_ctxt.cs_base = 0;
2011                         vcpu->arch.emulate_ctxt.ds_base = 0;
2012                         vcpu->arch.emulate_ctxt.es_base = 0;
2013                         vcpu->arch.emulate_ctxt.ss_base = 0;
2014                 } else {
2015                         vcpu->arch.emulate_ctxt.cs_base =
2016                                         get_segment_base(vcpu, VCPU_SREG_CS);
2017                         vcpu->arch.emulate_ctxt.ds_base =
2018                                         get_segment_base(vcpu, VCPU_SREG_DS);
2019                         vcpu->arch.emulate_ctxt.es_base =
2020                                         get_segment_base(vcpu, VCPU_SREG_ES);
2021                         vcpu->arch.emulate_ctxt.ss_base =
2022                                         get_segment_base(vcpu, VCPU_SREG_SS);
2023                 }
2024
2025                 vcpu->arch.emulate_ctxt.gs_base =
2026                                         get_segment_base(vcpu, VCPU_SREG_GS);
2027                 vcpu->arch.emulate_ctxt.fs_base =
2028                                         get_segment_base(vcpu, VCPU_SREG_FS);
2029
2030                 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2031
2032                 /* Reject the instructions other than VMCALL/VMMCALL when
2033                  * try to emulate invalid opcode */
2034                 c = &vcpu->arch.emulate_ctxt.decode;
2035                 if ((emulation_type & EMULTYPE_TRAP_UD) &&
2036                     (!(c->twobyte && c->b == 0x01 &&
2037                       (c->modrm_reg == 0 || c->modrm_reg == 3) &&
2038                        c->modrm_mod == 3 && c->modrm_rm == 1)))
2039                         return EMULATE_FAIL;
2040
2041                 ++vcpu->stat.insn_emulation;
2042                 if (r)  {
2043                         ++vcpu->stat.insn_emulation_fail;
2044                         if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2045                                 return EMULATE_DONE;
2046                         return EMULATE_FAIL;
2047                 }
2048         }
2049
2050         r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2051
2052         if (vcpu->arch.pio.string)
2053                 return EMULATE_DO_MMIO;
2054
2055         if ((r || vcpu->mmio_is_write) && run) {
2056                 run->exit_reason = KVM_EXIT_MMIO;
2057                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
2058                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
2059                 run->mmio.len = vcpu->mmio_size;
2060                 run->mmio.is_write = vcpu->mmio_is_write;
2061         }
2062
2063         if (r) {
2064                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2065                         return EMULATE_DONE;
2066                 if (!vcpu->mmio_needed) {
2067                         kvm_report_emulation_failure(vcpu, "mmio");
2068                         return EMULATE_FAIL;
2069                 }
2070                 return EMULATE_DO_MMIO;
2071         }
2072
2073         kvm_x86_ops->decache_regs(vcpu);
2074         kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2075
2076         if (vcpu->mmio_is_write) {
2077                 vcpu->mmio_needed = 0;
2078                 return EMULATE_DO_MMIO;
2079         }
2080
2081         return EMULATE_DONE;
2082 }
2083 EXPORT_SYMBOL_GPL(emulate_instruction);
2084
2085 static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
2086 {
2087         int i;
2088
2089         for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
2090                 if (vcpu->arch.pio.guest_pages[i]) {
2091                         kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
2092                         vcpu->arch.pio.guest_pages[i] = NULL;
2093                 }
2094 }
2095
2096 static int pio_copy_data(struct kvm_vcpu *vcpu)
2097 {
2098         void *p = vcpu->arch.pio_data;
2099         void *q;
2100         unsigned bytes;
2101         int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
2102
2103         q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
2104                  PAGE_KERNEL);
2105         if (!q) {
2106                 free_pio_guest_pages(vcpu);
2107                 return -ENOMEM;
2108         }
2109         q += vcpu->arch.pio.guest_page_offset;
2110         bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
2111         if (vcpu->arch.pio.in)
2112                 memcpy(q, p, bytes);
2113         else
2114                 memcpy(p, q, bytes);
2115         q -= vcpu->arch.pio.guest_page_offset;
2116         vunmap(q);
2117         free_pio_guest_pages(vcpu);
2118         return 0;
2119 }
2120
2121 int complete_pio(struct kvm_vcpu *vcpu)
2122 {
2123         struct kvm_pio_request *io = &vcpu->arch.pio;
2124         long delta;
2125         int r;
2126
2127         kvm_x86_ops->cache_regs(vcpu);
2128
2129         if (!io->string) {
2130                 if (io->in)
2131                         memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
2132                                io->size);
2133         } else {
2134                 if (io->in) {
2135                         r = pio_copy_data(vcpu);
2136                         if (r) {
2137                                 kvm_x86_ops->cache_regs(vcpu);
2138                                 return r;
2139                         }
2140                 }
2141
2142                 delta = 1;
2143                 if (io->rep) {
2144                         delta *= io->cur_count;
2145                         /*
2146                          * The size of the register should really depend on
2147                          * current address size.
2148                          */
2149                         vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
2150                 }
2151                 if (io->down)
2152                         delta = -delta;
2153                 delta *= io->size;
2154                 if (io->in)
2155                         vcpu->arch.regs[VCPU_REGS_RDI] += delta;
2156                 else
2157                         vcpu->arch.regs[VCPU_REGS_RSI] += delta;
2158         }
2159
2160         kvm_x86_ops->decache_regs(vcpu);
2161
2162         io->count -= io->cur_count;
2163         io->cur_count = 0;
2164
2165         return 0;
2166 }
2167
2168 static void kernel_pio(struct kvm_io_device *pio_dev,
2169                        struct kvm_vcpu *vcpu,
2170                        void *pd)
2171 {
2172         /* TODO: String I/O for in kernel device */
2173
2174         mutex_lock(&vcpu->kvm->lock);
2175         if (vcpu->arch.pio.in)
2176                 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
2177                                   vcpu->arch.pio.size,
2178                                   pd);
2179         else
2180                 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
2181                                    vcpu->arch.pio.size,
2182                                    pd);
2183         mutex_unlock(&vcpu->kvm->lock);
2184 }
2185
2186 static void pio_string_write(struct kvm_io_device *pio_dev,
2187                              struct kvm_vcpu *vcpu)
2188 {
2189         struct kvm_pio_request *io = &vcpu->arch.pio;
2190         void *pd = vcpu->arch.pio_data;
2191         int i;
2192
2193         mutex_lock(&vcpu->kvm->lock);
2194         for (i = 0; i < io->cur_count; i++) {
2195                 kvm_iodevice_write(pio_dev, io->port,
2196                                    io->size,
2197                                    pd);
2198                 pd += io->size;
2199         }
2200         mutex_unlock(&vcpu->kvm->lock);
2201 }
2202
2203 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2204                                                gpa_t addr)
2205 {
2206         return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
2207 }
2208
2209 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2210                   int size, unsigned port)
2211 {
2212         struct kvm_io_device *pio_dev;
2213
2214         vcpu->run->exit_reason = KVM_EXIT_IO;
2215         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2216         vcpu->run->io.size = vcpu->arch.pio.size = size;
2217         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2218         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
2219         vcpu->run->io.port = vcpu->arch.pio.port = port;
2220         vcpu->arch.pio.in = in;
2221         vcpu->arch.pio.string = 0;
2222         vcpu->arch.pio.down = 0;
2223         vcpu->arch.pio.guest_page_offset = 0;
2224         vcpu->arch.pio.rep = 0;
2225
2226         kvm_x86_ops->cache_regs(vcpu);
2227         memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
2228         kvm_x86_ops->decache_regs(vcpu);
2229
2230         kvm_x86_ops->skip_emulated_instruction(vcpu);
2231
2232         pio_dev = vcpu_find_pio_dev(vcpu, port);
2233         if (pio_dev) {
2234                 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2235                 complete_pio(vcpu);
2236                 return 1;
2237         }
2238         return 0;
2239 }
2240 EXPORT_SYMBOL_GPL(kvm_emulate_pio);
2241
2242 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2243                   int size, unsigned long count, int down,
2244                   gva_t address, int rep, unsigned port)
2245 {
2246         unsigned now, in_page;
2247         int i, ret = 0;
2248         int nr_pages = 1;
2249         struct page *page;
2250         struct kvm_io_device *pio_dev;
2251
2252         vcpu->run->exit_reason = KVM_EXIT_IO;
2253         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2254         vcpu->run->io.size = vcpu->arch.pio.size = size;
2255         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2256         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
2257         vcpu->run->io.port = vcpu->arch.pio.port = port;
2258         vcpu->arch.pio.in = in;
2259         vcpu->arch.pio.string = 1;
2260         vcpu->arch.pio.down = down;
2261         vcpu->arch.pio.guest_page_offset = offset_in_page(address);
2262         vcpu->arch.pio.rep = rep;
2263
2264         if (!count) {
2265                 kvm_x86_ops->skip_emulated_instruction(vcpu);
2266                 return 1;
2267         }
2268
2269         if (!down)
2270                 in_page = PAGE_SIZE - offset_in_page(address);
2271         else
2272                 in_page = offset_in_page(address) + size;
2273         now = min(count, (unsigned long)in_page / size);
2274         if (!now) {
2275                 /*
2276                  * String I/O straddles page boundary.  Pin two guest pages
2277                  * so that we satisfy atomicity constraints.  Do just one
2278                  * transaction to avoid complexity.
2279                  */
2280                 nr_pages = 2;
2281                 now = 1;
2282         }
2283         if (down) {
2284                 /*
2285                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
2286                  */
2287                 pr_unimpl(vcpu, "guest string pio down\n");
2288                 kvm_inject_gp(vcpu, 0);
2289                 return 1;
2290         }
2291         vcpu->run->io.count = now;
2292         vcpu->arch.pio.cur_count = now;
2293
2294         if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
2295                 kvm_x86_ops->skip_emulated_instruction(vcpu);
2296
2297         for (i = 0; i < nr_pages; ++i) {
2298                 down_read(&vcpu->kvm->slots_lock);
2299                 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
2300                 vcpu->arch.pio.guest_pages[i] = page;
2301                 up_read(&vcpu->kvm->slots_lock);
2302                 if (!page) {
2303                         kvm_inject_gp(vcpu, 0);
2304                         free_pio_guest_pages(vcpu);
2305                         return 1;
2306                 }
2307         }
2308
2309         pio_dev = vcpu_find_pio_dev(vcpu, port);
2310         if (!vcpu->arch.pio.in) {
2311                 /* string PIO write */
2312                 ret = pio_copy_data(vcpu);
2313                 if (ret >= 0 && pio_dev) {
2314                         pio_string_write(pio_dev, vcpu);
2315                         complete_pio(vcpu);
2316                         if (vcpu->arch.pio.count == 0)
2317                                 ret = 1;
2318                 }
2319         } else if (pio_dev)
2320                 pr_unimpl(vcpu, "no string pio read support yet, "
2321                        "port %x size %d count %ld\n",
2322                         port, size, count);
2323
2324         return ret;
2325 }
2326 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
2327
2328 int kvm_arch_init(void *opaque)
2329 {
2330         int r;
2331         struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2332
2333         if (kvm_x86_ops) {
2334                 printk(KERN_ERR "kvm: already loaded the other module\n");
2335                 r = -EEXIST;
2336                 goto out;
2337         }
2338
2339         if (!ops->cpu_has_kvm_support()) {
2340                 printk(KERN_ERR "kvm: no hardware support\n");
2341                 r = -EOPNOTSUPP;
2342                 goto out;
2343         }
2344         if (ops->disabled_by_bios()) {
2345                 printk(KERN_ERR "kvm: disabled by bios\n");
2346                 r = -EOPNOTSUPP;
2347                 goto out;
2348         }
2349
2350         r = kvm_mmu_module_init();
2351         if (r)
2352                 goto out;
2353
2354         kvm_init_msr_list();
2355
2356         kvm_x86_ops = ops;
2357         kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2358         return 0;
2359
2360 out:
2361         return r;
2362 }
2363
2364 void kvm_arch_exit(void)
2365 {
2366         kvm_x86_ops = NULL;
2367         kvm_mmu_module_exit();
2368 }
2369
2370 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2371 {
2372         ++vcpu->stat.halt_exits;
2373         if (irqchip_in_kernel(vcpu->kvm)) {
2374                 vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
2375                 kvm_vcpu_block(vcpu);
2376                 if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
2377                         return -EINTR;
2378                 return 1;
2379         } else {
2380                 vcpu->run->exit_reason = KVM_EXIT_HLT;
2381                 return 0;
2382         }
2383 }
2384 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2385
2386 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2387 {
2388         unsigned long nr, a0, a1, a2, a3, ret;
2389
2390         kvm_x86_ops->cache_regs(vcpu);
2391
2392         nr = vcpu->arch.regs[VCPU_REGS_RAX];
2393         a0 = vcpu->arch.regs[VCPU_REGS_RBX];
2394         a1 = vcpu->arch.regs[VCPU_REGS_RCX];
2395         a2 = vcpu->arch.regs[VCPU_REGS_RDX];
2396         a3 = vcpu->arch.regs[VCPU_REGS_RSI];
2397
2398         if (!is_long_mode(vcpu)) {
2399                 nr &= 0xFFFFFFFF;
2400                 a0 &= 0xFFFFFFFF;
2401                 a1 &= 0xFFFFFFFF;
2402                 a2 &= 0xFFFFFFFF;
2403                 a3 &= 0xFFFFFFFF;
2404         }
2405
2406         switch (nr) {
2407         case KVM_HC_VAPIC_POLL_IRQ:
2408                 ret = 0;
2409                 break;
2410         default:
2411                 ret = -KVM_ENOSYS;
2412                 break;
2413         }
2414         vcpu->arch.regs[VCPU_REGS_RAX] = ret;
2415         kvm_x86_ops->decache_regs(vcpu);
2416         ++vcpu->stat.hypercalls;
2417         return 0;
2418 }
2419 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
2420
2421 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2422 {
2423         char instruction[3];
2424         int ret = 0;
2425
2426
2427         /*
2428          * Blow out the MMU to ensure that no other VCPU has an active mapping
2429          * to ensure that the updated hypercall appears atomically across all
2430          * VCPUs.
2431          */
2432         kvm_mmu_zap_all(vcpu->kvm);
2433
2434         kvm_x86_ops->cache_regs(vcpu);
2435         kvm_x86_ops->patch_hypercall(vcpu, instruction);
2436         if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
2437             != X86EMUL_CONTINUE)
2438                 ret = -EFAULT;
2439
2440         return ret;
2441 }
2442
2443 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
2444 {
2445         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2446 }
2447
2448 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2449 {
2450         struct descriptor_table dt = { limit, base };
2451
2452         kvm_x86_ops->set_gdt(vcpu, &dt);
2453 }
2454
2455 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2456 {
2457         struct descriptor_table dt = { limit, base };
2458
2459         kvm_x86_ops->set_idt(vcpu, &dt);
2460 }
2461
2462 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2463                    unsigned long *rflags)
2464 {
2465         lmsw(vcpu, msw);
2466         *rflags = kvm_x86_ops->get_rflags(vcpu);
2467 }
2468
2469 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2470 {
2471         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2472         switch (cr) {
2473         case 0:
2474                 return vcpu->arch.cr0;
2475         case 2:
2476                 return vcpu->arch.cr2;
2477         case 3:
2478                 return vcpu->arch.cr3;
2479         case 4:
2480                 return vcpu->arch.cr4;
2481         case 8:
2482                 return get_cr8(vcpu);
2483         default:
2484                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2485                 return 0;
2486         }
2487 }
2488
2489 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2490                      unsigned long *rflags)
2491 {
2492         switch (cr) {
2493         case 0:
2494                 set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
2495                 *rflags = kvm_x86_ops->get_rflags(vcpu);
2496                 break;
2497         case 2:
2498                 vcpu->arch.cr2 = val;
2499                 break;
2500         case 3:
2501                 set_cr3(vcpu, val);
2502                 break;
2503         case 4:
2504                 set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
2505                 break;
2506         case 8:
2507                 set_cr8(vcpu, val & 0xfUL);
2508                 break;
2509         default:
2510                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2511         }
2512 }
2513
2514 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
2515 {
2516         struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
2517         int j, nent = vcpu->arch.cpuid_nent;
2518
2519         e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
2520         /* when no next entry is found, the current entry[i] is reselected */
2521         for (j = i + 1; j == i; j = (j + 1) % nent) {
2522                 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
2523                 if (ej->function == e->function) {
2524                         ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
2525                         return j;
2526                 }
2527         }
2528         return 0; /* silence gcc, even though control never reaches here */
2529 }
2530
2531 /* find an entry with matching function, matching index (if needed), and that
2532  * should be read next (if it's stateful) */
2533 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
2534         u32 function, u32 index)
2535 {
2536         if (e->function != function)
2537                 return 0;
2538         if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
2539                 return 0;
2540         if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
2541                 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
2542                 return 0;
2543         return 1;
2544 }
2545
2546 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2547 {
2548         int i;
2549         u32 function, index;
2550         struct kvm_cpuid_entry2 *e, *best;
2551
2552         kvm_x86_ops->cache_regs(vcpu);
2553         function = vcpu->arch.regs[VCPU_REGS_RAX];
2554         index = vcpu->arch.regs[VCPU_REGS_RCX];
2555         vcpu->arch.regs[VCPU_REGS_RAX] = 0;
2556         vcpu->arch.regs[VCPU_REGS_RBX] = 0;
2557         vcpu->arch.regs[VCPU_REGS_RCX] = 0;
2558         vcpu->arch.regs[VCPU_REGS_RDX] = 0;
2559         best = NULL;
2560         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2561                 e = &vcpu->arch.cpuid_entries[i];
2562                 if (is_matching_cpuid_entry(e, function, index)) {
2563                         if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
2564                                 move_to_next_stateful_cpuid_entry(vcpu, i);
2565                         best = e;
2566                         break;
2567                 }
2568                 /*
2569                  * Both basic or both extended?
2570                  */
2571                 if (((e->function ^ function) & 0x80000000) == 0)
2572                         if (!best || e->function > best->function)
2573                                 best = e;
2574         }
2575         if (best) {
2576                 vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
2577                 vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
2578                 vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
2579                 vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
2580         }
2581         kvm_x86_ops->decache_regs(vcpu);
2582         kvm_x86_ops->skip_emulated_instruction(vcpu);
2583 }
2584 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
2585
2586 /*
2587  * Check if userspace requested an interrupt window, and that the
2588  * interrupt window is open.
2589  *
2590  * No need to exit to userspace if we already have an interrupt queued.
2591  */
2592 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
2593                                           struct kvm_run *kvm_run)
2594 {
2595         return (!vcpu->arch.irq_summary &&
2596                 kvm_run->request_interrupt_window &&
2597                 vcpu->arch.interrupt_window_open &&
2598                 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
2599 }
2600
2601 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
2602                               struct kvm_run *kvm_run)
2603 {
2604         kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
2605         kvm_run->cr8 = get_cr8(vcpu);
2606         kvm_run->apic_base = kvm_get_apic_base(vcpu);
2607         if (irqchip_in_kernel(vcpu->kvm))
2608                 kvm_run->ready_for_interrupt_injection = 1;
2609         else
2610                 kvm_run->ready_for_interrupt_injection =
2611                                         (vcpu->arch.interrupt_window_open &&
2612                                          vcpu->arch.irq_summary == 0);
2613 }
2614
2615 static void vapic_enter(struct kvm_vcpu *vcpu)
2616 {
2617         struct kvm_lapic *apic = vcpu->arch.apic;
2618         struct page *page;
2619
2620         if (!apic || !apic->vapic_addr)
2621                 return;
2622
2623         down_read(&current->mm->mmap_sem);
2624         page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2625         up_read(&current->mm->mmap_sem);
2626
2627         vcpu->arch.apic->vapic_page = page;
2628 }
2629
2630 static void vapic_exit(struct kvm_vcpu *vcpu)
2631 {
2632         struct kvm_lapic *apic = vcpu->arch.apic;
2633
2634         if (!apic || !apic->vapic_addr)
2635                 return;
2636
2637         kvm_release_page_dirty(apic->vapic_page);
2638         mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2639 }
2640
2641 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2642 {
2643         int r;
2644
2645         if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
2646                 pr_debug("vcpu %d received sipi with vector # %x\n",
2647                        vcpu->vcpu_id, vcpu->arch.sipi_vector);
2648                 kvm_lapic_reset(vcpu);
2649                 r = kvm_x86_ops->vcpu_reset(vcpu);
2650                 if (r)
2651                         return r;
2652                 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
2653         }
2654
2655         vapic_enter(vcpu);
2656
2657 preempted:
2658         if (vcpu->guest_debug.enabled)
2659                 kvm_x86_ops->guest_debug_pre(vcpu);
2660
2661 again:
2662         if (vcpu->requests)
2663                 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
2664                         kvm_mmu_unload(vcpu);
2665
2666         r = kvm_mmu_reload(vcpu);
2667         if (unlikely(r))
2668                 goto out;
2669
2670         if (vcpu->requests) {
2671                 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2672                         __kvm_migrate_apic_timer(vcpu);
2673                 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
2674                                        &vcpu->requests)) {
2675                         kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
2676                         r = 0;
2677                         goto out;
2678                 }
2679         }
2680
2681         kvm_inject_pending_timer_irqs(vcpu);
2682
2683         preempt_disable();
2684
2685         kvm_x86_ops->prepare_guest_switch(vcpu);
2686         kvm_load_guest_fpu(vcpu);
2687
2688         local_irq_disable();
2689
2690         if (need_resched()) {
2691                 local_irq_enable();
2692                 preempt_enable();
2693                 r = 1;
2694                 goto out;
2695         }
2696
2697         if (vcpu->requests)
2698                 if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
2699                         local_irq_enable();
2700                         preempt_enable();
2701                         r = 1;
2702                         goto out;
2703                 }
2704
2705         if (signal_pending(current)) {
2706                 local_irq_enable();
2707                 preempt_enable();
2708                 r = -EINTR;
2709                 kvm_run->exit_reason = KVM_EXIT_INTR;
2710                 ++vcpu->stat.signal_exits;
2711                 goto out;
2712         }
2713
2714         if (vcpu->arch.exception.pending)
2715                 __queue_exception(vcpu);
2716         else if (irqchip_in_kernel(vcpu->kvm))
2717                 kvm_x86_ops->inject_pending_irq(vcpu);
2718         else
2719                 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2720
2721         kvm_lapic_sync_to_vapic(vcpu);
2722
2723         vcpu->guest_mode = 1;
2724         kvm_guest_enter();
2725
2726         if (vcpu->requests)
2727                 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2728                         kvm_x86_ops->tlb_flush(vcpu);
2729
2730         kvm_x86_ops->run(vcpu, kvm_run);
2731
2732         vcpu->guest_mode = 0;
2733         local_irq_enable();
2734
2735         ++vcpu->stat.exits;
2736
2737         /*
2738          * We must have an instruction between local_irq_enable() and
2739          * kvm_guest_exit(), so the timer interrupt isn't delayed by
2740          * the interrupt shadow.  The stat.exits increment will do nicely.
2741          * But we need to prevent reordering, hence this barrier():
2742          */
2743         barrier();
2744
2745         kvm_guest_exit();
2746
2747         preempt_enable();
2748
2749         /*
2750          * Profile KVM exit RIPs:
2751          */
2752         if (unlikely(prof_on == KVM_PROFILING)) {
2753                 kvm_x86_ops->cache_regs(vcpu);
2754                 profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
2755         }
2756
2757         if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
2758                 vcpu->arch.exception.pending = false;
2759
2760         kvm_lapic_sync_from_vapic(vcpu);
2761
2762         r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2763
2764         if (r > 0) {
2765                 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
2766                         r = -EINTR;
2767                         kvm_run->exit_reason = KVM_EXIT_INTR;
2768                         ++vcpu->stat.request_irq_exits;
2769                         goto out;
2770                 }
2771                 if (!need_resched())
2772                         goto again;
2773         }
2774
2775 out:
2776         if (r > 0) {
2777                 kvm_resched(vcpu);
2778                 goto preempted;
2779         }
2780
2781         post_kvm_run_save(vcpu, kvm_run);
2782
2783         vapic_exit(vcpu);
2784
2785         return r;
2786 }
2787
2788 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2789 {
2790         int r;
2791         sigset_t sigsaved;
2792
2793         vcpu_load(vcpu);
2794
2795         if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
2796                 kvm_vcpu_block(vcpu);
2797                 vcpu_put(vcpu);
2798                 return -EAGAIN;
2799         }
2800
2801         if (vcpu->sigset_active)
2802                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2803
2804         /* re-sync apic's tpr */
2805         if (!irqchip_in_kernel(vcpu->kvm))
2806                 set_cr8(vcpu, kvm_run->cr8);
2807
2808         if (vcpu->arch.pio.cur_count) {
2809                 r = complete_pio(vcpu);
2810                 if (r)
2811                         goto out;
2812         }
2813 #if CONFIG_HAS_IOMEM
2814         if (vcpu->mmio_needed) {
2815                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
2816                 vcpu->mmio_read_completed = 1;
2817                 vcpu->mmio_needed = 0;
2818                 r = emulate_instruction(vcpu, kvm_run,
2819                                         vcpu->arch.mmio_fault_cr2, 0,
2820                                         EMULTYPE_NO_DECODE);
2821                 if (r == EMULATE_DO_MMIO) {
2822                         /*
2823                          * Read-modify-write.  Back to userspace.
2824                          */
2825                         r = 0;
2826                         goto out;
2827                 }
2828         }
2829 #endif
2830         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
2831                 kvm_x86_ops->cache_regs(vcpu);
2832                 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
2833                 kvm_x86_ops->decache_regs(vcpu);
2834         }
2835
2836         r = __vcpu_run(vcpu, kvm_run);
2837
2838 out:
2839         if (vcpu->sigset_active)
2840                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
2841
2842         vcpu_put(vcpu);
2843         return r;
2844 }
2845
2846 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2847 {
2848         vcpu_load(vcpu);
2849
2850         kvm_x86_ops->cache_regs(vcpu);
2851
2852         regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
2853         regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
2854         regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
2855         regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
2856         regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
2857         regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
2858         regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2859         regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
2860 #ifdef CONFIG_X86_64
2861         regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
2862         regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
2863         regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
2864         regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
2865         regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
2866         regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
2867         regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
2868         regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
2869 #endif
2870
2871         regs->rip = vcpu->arch.rip;
2872         regs->rflags = kvm_x86_ops->get_rflags(vcpu);
2873
2874         /*
2875          * Don't leak debug flags in case they were set for guest debugging
2876          */
2877         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2878                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2879
2880         vcpu_put(vcpu);
2881
2882         return 0;
2883 }
2884
2885 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2886 {
2887         vcpu_load(vcpu);
2888
2889         vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
2890         vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
2891         vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
2892         vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
2893         vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
2894         vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
2895         vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
2896         vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
2897 #ifdef CONFIG_X86_64
2898         vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
2899         vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
2900         vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
2901         vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
2902         vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
2903         vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
2904         vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
2905         vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
2906 #endif
2907
2908         vcpu->arch.rip = regs->rip;
2909         kvm_x86_ops->set_rflags(vcpu, regs->rflags);
2910
2911         kvm_x86_ops->decache_regs(vcpu);
2912
2913         vcpu_put(vcpu);
2914
2915         return 0;
2916 }
2917
2918 static void get_segment(struct kvm_vcpu *vcpu,
2919                         struct kvm_segment *var, int seg)
2920 {
2921         kvm_x86_ops->get_segment(vcpu, var, seg);
2922 }
2923
2924 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2925 {
2926         struct kvm_segment cs;
2927
2928         get_segment(vcpu, &cs, VCPU_SREG_CS);
2929         *db = cs.db;
2930         *l = cs.l;
2931 }
2932 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2933
2934 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2935                                   struct kvm_sregs *sregs)
2936 {
2937         struct descriptor_table dt;
2938         int pending_vec;
2939
2940         vcpu_load(vcpu);
2941
2942         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2943         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2944         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2945         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2946         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2947         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2948
2949         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2950         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2951
2952         kvm_x86_ops->get_idt(vcpu, &dt);
2953         sregs->idt.limit = dt.limit;
2954         sregs->idt.base = dt.base;
2955         kvm_x86_ops->get_gdt(vcpu, &dt);
2956         sregs->gdt.limit = dt.limit;
2957         sregs->gdt.base = dt.base;
2958
2959         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2960         sregs->cr0 = vcpu->arch.cr0;
2961         sregs->cr2 = vcpu->arch.cr2;
2962         sregs->cr3 = vcpu->arch.cr3;
2963         sregs->cr4 = vcpu->arch.cr4;
2964         sregs->cr8 = get_cr8(vcpu);
2965         sregs->efer = vcpu->arch.shadow_efer;
2966         sregs->apic_base = kvm_get_apic_base(vcpu);
2967
2968         if (irqchip_in_kernel(vcpu->kvm)) {
2969                 memset(sregs->interrupt_bitmap, 0,
2970                        sizeof sregs->interrupt_bitmap);
2971                 pending_vec = kvm_x86_ops->get_irq(vcpu);
2972                 if (pending_vec >= 0)
2973                         set_bit(pending_vec,
2974                                 (unsigned long *)sregs->interrupt_bitmap);
2975         } else
2976                 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
2977                        sizeof sregs->interrupt_bitmap);
2978
2979         vcpu_put(vcpu);
2980
2981         return 0;
2982 }
2983
2984 static void set_segment(struct kvm_vcpu *vcpu,
2985                         struct kvm_segment *var, int seg)
2986 {
2987         kvm_x86_ops->set_segment(vcpu, var, seg);
2988 }
2989
2990 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2991                                   struct kvm_sregs *sregs)
2992 {
2993         int mmu_reset_needed = 0;
2994         int i, pending_vec, max_bits;
2995         struct descriptor_table dt;
2996
2997         vcpu_load(vcpu);
2998
2999         dt.limit = sregs->idt.limit;
3000         dt.base = sregs->idt.base;
3001         kvm_x86_ops->set_idt(vcpu, &dt);
3002         dt.limit = sregs->gdt.limit;
3003         dt.base = sregs->gdt.base;
3004         kvm_x86_ops->set_gdt(vcpu, &dt);
3005
3006         vcpu->arch.cr2 = sregs->cr2;
3007         mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
3008         vcpu->arch.cr3 = sregs->cr3;
3009
3010         set_cr8(vcpu, sregs->cr8);
3011
3012         mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
3013         kvm_x86_ops->set_efer(vcpu, sregs->efer);
3014         kvm_set_apic_base(vcpu, sregs->apic_base);
3015
3016         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3017
3018         mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
3019         kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
3020         vcpu->arch.cr0 = sregs->cr0;
3021
3022         mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
3023         kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
3024         if (!is_long_mode(vcpu) && is_pae(vcpu))
3025                 load_pdptrs(vcpu, vcpu->arch.cr3);
3026
3027         if (mmu_reset_needed)
3028                 kvm_mmu_reset_context(vcpu);
3029
3030         if (!irqchip_in_kernel(vcpu->kvm)) {
3031                 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
3032                        sizeof vcpu->arch.irq_pending);
3033                 vcpu->arch.irq_summary = 0;
3034                 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
3035                         if (vcpu->arch.irq_pending[i])
3036                                 __set_bit(i, &vcpu->arch.irq_summary);
3037         } else {
3038                 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
3039                 pending_vec = find_first_bit(
3040                         (const unsigned long *)sregs->interrupt_bitmap,
3041                         max_bits);
3042                 /* Only pending external irq is handled here */
3043                 if (pending_vec < max_bits) {
3044                         kvm_x86_ops->set_irq(vcpu, pending_vec);
3045                         pr_debug("Set back pending irq %d\n",
3046                                  pending_vec);
3047                 }
3048         }
3049
3050         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3051         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3052         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3053         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3054         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3055         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3056
3057         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3058         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3059
3060         vcpu_put(vcpu);
3061
3062         return 0;
3063 }
3064
3065 int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
3066                                     struct kvm_debug_guest *dbg)
3067 {
3068         int r;
3069
3070         vcpu_load(vcpu);
3071
3072         r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
3073
3074         vcpu_put(vcpu);
3075
3076         return r;
3077 }
3078
3079 /*
3080  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
3081  * we have asm/x86/processor.h
3082  */
3083 struct fxsave {
3084         u16     cwd;
3085         u16     swd;
3086         u16     twd;
3087         u16     fop;
3088         u64     rip;
3089         u64     rdp;
3090         u32     mxcsr;
3091         u32     mxcsr_mask;
3092         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
3093 #ifdef CONFIG_X86_64
3094         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
3095 #else
3096         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
3097 #endif
3098 };
3099
3100 /*
3101  * Translate a guest virtual address to a guest physical address.
3102  */
3103 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
3104                                     struct kvm_translation *tr)
3105 {
3106         unsigned long vaddr = tr->linear_address;
3107         gpa_t gpa;
3108
3109         vcpu_load(vcpu);
3110         down_read(&vcpu->kvm->slots_lock);
3111         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
3112         up_read(&vcpu->kvm->slots_lock);
3113         tr->physical_address = gpa;
3114         tr->valid = gpa != UNMAPPED_GVA;
3115         tr->writeable = 1;
3116         tr->usermode = 0;
3117         vcpu_put(vcpu);
3118
3119         return 0;
3120 }
3121
3122 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
3123 {
3124         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
3125
3126         vcpu_load(vcpu);
3127
3128         memcpy(fpu->fpr, fxsave->st_space, 128);
3129         fpu->fcw = fxsave->cwd;
3130         fpu->fsw = fxsave->swd;
3131         fpu->ftwx = fxsave->twd;
3132         fpu->last_opcode = fxsave->fop;
3133         fpu->last_ip = fxsave->rip;
3134         fpu->last_dp = fxsave->rdp;
3135         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
3136
3137         vcpu_put(vcpu);
3138
3139         return 0;
3140 }
3141
3142 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
3143 {
3144         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
3145
3146         vcpu_load(vcpu);
3147
3148         memcpy(fxsave->st_space, fpu->fpr, 128);
3149         fxsave->cwd = fpu->fcw;
3150         fxsave->swd = fpu->fsw;
3151         fxsave->twd = fpu->ftwx;
3152         fxsave->fop = fpu->last_opcode;
3153         fxsave->rip = fpu->last_ip;
3154         fxsave->rdp = fpu->last_dp;
3155         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
3156
3157         vcpu_put(vcpu);
3158
3159         return 0;
3160 }
3161
3162 void fx_init(struct kvm_vcpu *vcpu)
3163 {
3164         unsigned after_mxcsr_mask;
3165
3166         /* Initialize guest FPU by resetting ours and saving into guest's */
3167         preempt_disable();
3168         fx_save(&vcpu->arch.host_fx_image);
3169         fpu_init();
3170         fx_save(&vcpu->arch.guest_fx_image);
3171         fx_restore(&vcpu->arch.host_fx_image);
3172         preempt_enable();
3173
3174         vcpu->arch.cr0 |= X86_CR0_ET;
3175         after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
3176         vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
3177         memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
3178                0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
3179 }
3180 EXPORT_SYMBOL_GPL(fx_init);
3181
3182 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
3183 {
3184         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
3185                 return;
3186
3187         vcpu->guest_fpu_loaded = 1;
3188         fx_save(&vcpu->arch.host_fx_image);
3189         fx_restore(&vcpu->arch.guest_fx_image);
3190 }
3191 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
3192
3193 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
3194 {
3195         if (!vcpu->guest_fpu_loaded)
3196                 return;
3197
3198         vcpu->guest_fpu_loaded = 0;
3199         fx_save(&vcpu->arch.guest_fx_image);
3200         fx_restore(&vcpu->arch.host_fx_image);
3201         ++vcpu->stat.fpu_reload;
3202 }
3203 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
3204
3205 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
3206 {
3207         kvm_x86_ops->vcpu_free(vcpu);
3208 }
3209
3210 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
3211                                                 unsigned int id)
3212 {
3213         return kvm_x86_ops->vcpu_create(kvm, id);
3214 }
3215
3216 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
3217 {
3218         int r;
3219
3220         /* We do fxsave: this must be aligned. */
3221         BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
3222
3223         vcpu_load(vcpu);
3224         r = kvm_arch_vcpu_reset(vcpu);
3225         if (r == 0)
3226                 r = kvm_mmu_setup(vcpu);
3227         vcpu_put(vcpu);
3228         if (r < 0)
3229                 goto free_vcpu;
3230
3231         return 0;
3232 free_vcpu:
3233         kvm_x86_ops->vcpu_free(vcpu);
3234         return r;
3235 }
3236
3237 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
3238 {
3239         vcpu_load(vcpu);
3240         kvm_mmu_unload(vcpu);
3241         vcpu_put(vcpu);
3242
3243         kvm_x86_ops->vcpu_free(vcpu);
3244 }
3245
3246 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
3247 {
3248         return kvm_x86_ops->vcpu_reset(vcpu);
3249 }
3250
3251 void kvm_arch_hardware_enable(void *garbage)
3252 {
3253         kvm_x86_ops->hardware_enable(garbage);
3254 }
3255
3256 void kvm_arch_hardware_disable(void *garbage)
3257 {
3258         kvm_x86_ops->hardware_disable(garbage);
3259 }
3260
3261 int kvm_arch_hardware_setup(void)
3262 {
3263         return kvm_x86_ops->hardware_setup();
3264 }
3265
3266 void kvm_arch_hardware_unsetup(void)
3267 {
3268         kvm_x86_ops->hardware_unsetup();
3269 }
3270
3271 void kvm_arch_check_processor_compat(void *rtn)
3272 {
3273         kvm_x86_ops->check_processor_compatibility(rtn);
3274 }
3275
3276 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
3277 {
3278         struct page *page;
3279         struct kvm *kvm;
3280         int r;
3281
3282         BUG_ON(vcpu->kvm == NULL);
3283         kvm = vcpu->kvm;
3284
3285         vcpu->arch.mmu.root_hpa = INVALID_PAGE;
3286         if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
3287                 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
3288         else
3289                 vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED;
3290
3291         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
3292         if (!page) {
3293                 r = -ENOMEM;
3294                 goto fail;
3295         }
3296         vcpu->arch.pio_data = page_address(page);
3297
3298         r = kvm_mmu_create(vcpu);
3299         if (r < 0)
3300                 goto fail_free_pio_data;
3301
3302         if (irqchip_in_kernel(kvm)) {
3303                 r = kvm_create_lapic(vcpu);
3304                 if (r < 0)
3305                         goto fail_mmu_destroy;
3306         }
3307
3308         return 0;
3309
3310 fail_mmu_destroy:
3311         kvm_mmu_destroy(vcpu);
3312 fail_free_pio_data:
3313         free_page((unsigned long)vcpu->arch.pio_data);
3314 fail:
3315         return r;
3316 }
3317
3318 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
3319 {
3320         kvm_free_lapic(vcpu);
3321         kvm_mmu_destroy(vcpu);
3322         free_page((unsigned long)vcpu->arch.pio_data);
3323 }
3324
3325 struct  kvm *kvm_arch_create_vm(void)
3326 {
3327         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
3328
3329         if (!kvm)
3330                 return ERR_PTR(-ENOMEM);
3331
3332         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
3333
3334         return kvm;
3335 }
3336
3337 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
3338 {
3339         vcpu_load(vcpu);
3340         kvm_mmu_unload(vcpu);
3341         vcpu_put(vcpu);
3342 }
3343
3344 static void kvm_free_vcpus(struct kvm *kvm)
3345 {
3346         unsigned int i;
3347
3348         /*
3349          * Unpin any mmu pages first.
3350          */
3351         for (i = 0; i < KVM_MAX_VCPUS; ++i)
3352                 if (kvm->vcpus[i])
3353                         kvm_unload_vcpu_mmu(kvm->vcpus[i]);
3354         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3355                 if (kvm->vcpus[i]) {
3356                         kvm_arch_vcpu_free(kvm->vcpus[i]);
3357                         kvm->vcpus[i] = NULL;
3358                 }
3359         }
3360
3361 }
3362
3363 void kvm_arch_destroy_vm(struct kvm *kvm)
3364 {
3365         kfree(kvm->arch.vpic);
3366         kfree(kvm->arch.vioapic);
3367         kvm_free_vcpus(kvm);
3368         kvm_free_physmem(kvm);
3369         kfree(kvm);
3370 }
3371
3372 int kvm_arch_set_memory_region(struct kvm *kvm,
3373                                 struct kvm_userspace_memory_region *mem,
3374                                 struct kvm_memory_slot old,
3375                                 int user_alloc)
3376 {
3377         int npages = mem->memory_size >> PAGE_SHIFT;
3378         struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
3379
3380         /*To keep backward compatibility with older userspace,
3381          *x86 needs to hanlde !user_alloc case.
3382          */
3383         if (!user_alloc) {
3384                 if (npages && !old.rmap) {
3385                         down_write(&current->mm->mmap_sem);
3386                         memslot->userspace_addr = do_mmap(NULL, 0,
3387                                                      npages * PAGE_SIZE,
3388                                                      PROT_READ | PROT_WRITE,
3389                                                      MAP_SHARED | MAP_ANONYMOUS,
3390                                                      0);
3391                         up_write(&current->mm->mmap_sem);
3392
3393                         if (IS_ERR((void *)memslot->userspace_addr))
3394                                 return PTR_ERR((void *)memslot->userspace_addr);
3395                 } else {
3396                         if (!old.user_alloc && old.rmap) {
3397                                 int ret;
3398
3399                                 down_write(&current->mm->mmap_sem);
3400                                 ret = do_munmap(current->mm, old.userspace_addr,
3401                                                 old.npages * PAGE_SIZE);
3402                                 up_write(&current->mm->mmap_sem);
3403                                 if (ret < 0)
3404                                         printk(KERN_WARNING
3405                                        "kvm_vm_ioctl_set_memory_region: "
3406                                        "failed to munmap memory\n");
3407                         }
3408                 }
3409         }
3410
3411         if (!kvm->arch.n_requested_mmu_pages) {
3412                 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
3413                 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
3414         }
3415
3416         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
3417         kvm_flush_remote_tlbs(kvm);
3418
3419         return 0;
3420 }
3421
3422 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
3423 {
3424         return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE
3425                || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED;
3426 }
3427
3428 static void vcpu_kick_intr(void *info)
3429 {
3430 #ifdef DEBUG
3431         struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
3432         printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
3433 #endif
3434 }
3435
3436 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3437 {
3438         int ipi_pcpu = vcpu->cpu;
3439
3440         if (waitqueue_active(&vcpu->wq)) {
3441                 wake_up_interruptible(&vcpu->wq);
3442                 ++vcpu->stat.halt_wakeup;
3443         }
3444         if (vcpu->guest_mode)
3445                 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
3446 }