Merge branch 'next' of git://git.kernel.org/pub/scm/virt/kvm/kvm

author Linus Torvalds <torvalds@linux-foundation.org>

Thu, 5 Sep 2013 01:15:06 +0000 (18:15 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 5 Sep 2013 01:15:06 +0000 (18:15 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 5 Sep 2013 01:15:06 +0000 (18:15 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 5 Sep 2013 01:15:06 +0000 (18:15 -0700)
diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt

index 83afe65d4966d0664186e2163083510da5aba012..22ff659bc0fb644704933e50cb931d0e2681232b 100644 (file)
--- a/Documentation/virtual/kvm/cpuid.txt
+++ b/Documentation/virtual/kvm/cpuid.txt
@@ -43,6 +43,10 @@ KVM_FEATURE_CLOCKSOURCE2           ||     3 || kvmclock available at msrs
  KVM_FEATURE_ASYNC_PF               ||     4 || async pf can be enabled by
                                     ||       || writing to msr 0x4b564d02
  ------------------------------------------------------------------------------
+KVM_FEATURE_PV_UNHALT              ||     7 || guest checks this feature bit
+                                   ||       || before enabling paravirtualized
+                                   ||       || spinlock support.
+------------------------------------------------------------------------------
  KVM_FEATURE_CLOCKSOURCE_STABLE_BIT ||    24 || host will warn if no guest-side
                                     ||       || per-cpu warps are expected in
                                     ||       || kvmclock.
diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt

index ea113b5d87a4a8530fcc32f57a4da78c6e9e7281..022198e389d7e6066b8101292199fa25bcde84e4 100644 (file)
--- a/Documentation/virtual/kvm/hypercalls.txt
+++ b/Documentation/virtual/kvm/hypercalls.txt
@@ -64,3 +64,17 @@ Purpose: To enable communication between the hypervisor and guest there is a
  shared page that contains parts of supervisor visible register state.
  The guest can map this shared page to access its supervisor register through
  memory using this hypercall.
+
+5. KVM_HC_KICK_CPU
+------------------------
+Architecture: x86
+Status: active
+Purpose: Hypercall used to wake up a vcpu from HLT state
+Usage example: A vcpu of a paravirtualized guest that is busy-waiting in guest
+kernel mode for an event to occur (e.g. a spinlock to become available) can
+execute the HLT instruction once it has busy-waited for more than a threshold
+time interval. Execution of the HLT instruction causes the hypervisor to put
+the vcpu to sleep until the occurrence of an appropriate event. Another vcpu of
+the same guest can wake up the sleeping vcpu by issuing the KVM_HC_KICK_CPU
+hypercall, specifying the APIC ID (a1) of the vcpu to be woken up. An additional
+argument (a0) is reserved in the hypercall for future use.
diff --git a/arch/arm/configs/keystone_defconfig b/arch/arm/configs/keystone_defconfig

index 62e968cac9dc132f66c7c2d397c689a19aeda295..1f36b823905f1d6e1c16eeb9e0554efc1678bcf0 100644 (file)
--- a/arch/arm/configs/keystone_defconfig
+++ b/arch/arm/configs/keystone_defconfig
@@ -104,6 +104,7 @@ CONFIG_IP_SCTP=y
  CONFIG_VLAN_8021Q=y
  CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
  CONFIG_CMA=y
+CONFIG_DMA_CMA=y
  CONFIG_MTD=y
  CONFIG_MTD_CMDLINE_PARTS=y
  CONFIG_MTD_BLOCK=y
diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig

index 5339e6a4d639dccaca9144eb0df3101b692d0df2..5465f564fdf3c8e5a9af125182f6fdfd983e5ab1 100644 (file)
--- a/arch/arm/configs/omap2plus_defconfig
+++ b/arch/arm/configs/omap2plus_defconfig
@@ -78,6 +78,7 @@ CONFIG_MAC80211_RC_PID=y
  CONFIG_MAC80211_RC_DEFAULT_PID=y
  CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
  CONFIG_CMA=y
+CONFIG_DMA_CMA=y
  CONFIG_CONNECTOR=y
  CONFIG_DEVTMPFS=y
  CONFIG_DEVTMPFS_MOUNT=y
diff --git a/arch/arm/configs/tegra_defconfig b/arch/arm/configs/tegra_defconfig

index 1effb43dab80833f2b0df2a520fc61f7f2db6028..92d0a149aeb5e58094ad3d3765fec5e878065d4a 100644 (file)
--- a/arch/arm/configs/tegra_defconfig
+++ b/arch/arm/configs/tegra_defconfig
@@ -79,6 +79,7 @@ CONFIG_DEVTMPFS=y
  CONFIG_DEVTMPFS_MOUNT=y
  # CONFIG_FIRMWARE_IN_KERNEL is not set
  CONFIG_CMA=y
+CONFIG_DMA_CMA=y
  CONFIG_MTD=y
  CONFIG_MTD_M25P80=y
  CONFIG_PROC_DEVICETREE=y
diff --git a/arch/arm/include/asm/dma-contiguous.h b/arch/arm/include/asm/dma-contiguous.h

index 3ed37b4d93dade5c688410eae876ccb53364ef72..e072bb2ba1b12761d579b44cf951ba97d28b0386 100644 (file)
--- a/arch/arm/include/asm/dma-contiguous.h
+++ b/arch/arm/include/asm/dma-contiguous.h
@@ -2,7 +2,7 @@
  #define ASMARM_DMA_CONTIGUOUS_H
  
  #ifdef __KERNEL__
-#ifdef CONFIG_CMA
+#ifdef CONFIG_DMA_CMA
  
  #include <linux/types.h>
  #include <asm-generic/dma-contiguous.h>
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h

index 472ac7091003ac0cbae073ec1794413d80a04262..9b28c41f4ba916a569bf1f105064f092ab434dda 100644 (file)
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -64,7 +64,7 @@ void kvm_clear_hyp_idmap(void);
  
  static inline void kvm_set_pte(pte_t *pte, pte_t new_pte)
  {
-       pte_val(*pte) = new_pte;
+       *pte = new_pte;
         /*
          * flush_pmd_entry just takes a void pointer and cleans the necessary
          * cache entries, so we can reuse the function for ptes.
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c

index 741f66a2edbd77bf3ddce5a53c6cb760e491c7fe..9c697db2787e2a524cf59db4519f5272fa2918b0 100644 (file)
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -219,6 +219,10 @@ long kvm_arch_dev_ioctl(struct file *filp,
         return -EINVAL;
  }
  
+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+}
+
  int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                    struct kvm_memory_slot *memslot,
                                    struct kvm_userspace_memory_region *mem,
diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S

index 16cd4ba5d7fd6d0ff0781da759712167cbc17508..85dd84b10687136b15367e433ce4e716b3e2848f 100644 (file)
--- a/arch/arm/kvm/interrupts.S
+++ b/arch/arm/kvm/interrupts.S
@@ -492,10 +492,10 @@ __kvm_hyp_code_end:
         .section ".rodata"
  
  und_die_str:
-       .ascii  "unexpected undefined exception in Hyp mode at: %#08x"
+       .ascii  "unexpected undefined exception in Hyp mode at: %#08x\n"
  pabt_die_str:
-       .ascii  "unexpected prefetch abort in Hyp mode at: %#08x"
+       .ascii  "unexpected prefetch abort in Hyp mode at: %#08x\n"
  dabt_die_str:
-       .ascii  "unexpected data abort in Hyp mode at: %#08x"
+       .ascii  "unexpected data abort in Hyp mode at: %#08x\n"
  svc_die_str:
-       .ascii  "unexpected HVC/SVC trap in Hyp mode at: %#08x"
+       .ascii  "unexpected HVC/SVC trap in Hyp mode at: %#08x\n"
diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c

index b7840e7aa4529ac73e34c29e26c0002ef033306d..71e08baee209387f899e14a32fa32e02682b0ae8 100644 (file)
--- a/arch/arm/kvm/reset.c
+++ b/arch/arm/kvm/reset.c
@@ -40,7 +40,7 @@ static struct kvm_regs a15_regs_reset = {
  };
  
  static const struct kvm_irq_level a15_vtimer_irq = {
-       .irq = 27,
+       { .irq = 27 },
         .level = 1,
  };
  
diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h

index a8e73ed5ad5b710fc73de0006255bd1dfd6713ef..b1d640f78623971337ad08072efdcc981c124c0f 100644 (file)
--- a/arch/arm/kvm/trace.h
+++ b/arch/arm/kvm/trace.h
@@ -59,10 +59,9 @@ TRACE_EVENT(kvm_guest_fault,
                 __entry->ipa                    = ipa;
         ),
  
-       TP_printk("guest fault at PC %#08lx (hxfar %#08lx, "
-                 "ipa %#16llx, hsr %#08lx",
-                 __entry->vcpu_pc, __entry->hxfar,
-                 __entry->ipa, __entry->hsr)
+       TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx",
+                 __entry->ipa, __entry->hsr,
+                 __entry->hxfar, __entry->vcpu_pc)
  );
  
  TRACE_EVENT(kvm_irq_line,
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c

index 7f9b1798c6cf12f08e55496b27082722e16a30f3..dbddc07a3bbd9abc6d2ba9169085457d4fc86dc5 100644 (file)
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -358,7 +358,7 @@ static int __init atomic_pool_init(void)
         if (!pages)
                 goto no_pages;
  
-       if (IS_ENABLED(CONFIG_CMA))
+       if (IS_ENABLED(CONFIG_DMA_CMA))
                 ptr = __alloc_from_contiguous(NULL, pool->size, prot, &page,
                                               atomic_pool_init);
         else
@@ -670,7 +670,7 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
                 addr = __alloc_simple_buffer(dev, size, gfp, &page);
         else if (!(gfp & __GFP_WAIT))
                 addr = __alloc_from_pool(size, &page);
-       else if (!IS_ENABLED(CONFIG_CMA))
+       else if (!IS_ENABLED(CONFIG_DMA_CMA))
                 addr = __alloc_remap_buffer(dev, size, gfp, prot, &page, caller);
         else
                 addr = __alloc_from_contiguous(dev, size, prot, &page, caller);
@@ -759,7 +759,7 @@ static void __arm_dma_free(struct device *dev, size_t size, void *cpu_addr,
                 __dma_free_buffer(page, size);
         } else if (__free_from_pool(cpu_addr, size)) {
                 return;
-       } else if (!IS_ENABLED(CONFIG_CMA)) {
+       } else if (!IS_ENABLED(CONFIG_DMA_CMA)) {
                 __dma_free_remap(cpu_addr, size);
                 __dma_free_buffer(page, size);
         } else {
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c

index 5b2dc0d10c8f4211d28e044a2071306d1ef955ed..bdfd8789b37661da691bdf320841a21043cd4cd4 100644 (file)
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1560,6 +1560,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
         return 0;
  }
  
+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+}
+
  int kvm_arch_prepare_memory_region(struct kvm *kvm,
                 struct kvm_memory_slot *memslot,
                 struct kvm_userspace_memory_region *mem,
diff --git a/arch/mips/kvm/kvm_locore.S b/arch/mips/kvm/kvm_locore.S

index dca2aa66599371f376845e8ab598c752ac7b08a6..bbace092ad0addef5277391a14944d5590b97be4 100644 (file)
--- a/arch/mips/kvm/kvm_locore.S
+++ b/arch/mips/kvm/kvm_locore.S
@@ -1,13 +1,13 @@
  /*
-* This file is subject to the terms and conditions of the GNU General Public
-* License.  See the file "COPYING" in the main directory of this archive
-* for more details.
-*
-* Main entry point for the guest, exception handling.
-*
-* Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
-* Authors: Sanjay Lal <sanjayl@kymasys.com>
-*/
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Main entry point for the guest, exception handling.
+ *
+ * Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+ * Authors: Sanjay Lal <sanjayl@kymasys.com>
+ */
  
  #include <asm/asm.h>
  #include <asm/asmmacro.h>
@@ -55,195 +55,193 @@
   * a0: run
   * a1: vcpu
   */
+       .set    noreorder
+       .set    noat
  
  FEXPORT(__kvm_mips_vcpu_run)
-    .set    push
-    .set    noreorder
-    .set    noat
-
-    /* k0/k1 not being used in host kernel context */
-       addiu           k1,sp, -PT_SIZE
-    LONG_S         $0, PT_R0(k1)
-    LONG_S             $1, PT_R1(k1)
-    LONG_S             $2, PT_R2(k1)
-    LONG_S             $3, PT_R3(k1)
-
-    LONG_S             $4, PT_R4(k1)
-    LONG_S             $5, PT_R5(k1)
-    LONG_S             $6, PT_R6(k1)
-    LONG_S             $7, PT_R7(k1)
-
-    LONG_S             $8,  PT_R8(k1)
-    LONG_S             $9,  PT_R9(k1)
-    LONG_S             $10, PT_R10(k1)
-    LONG_S             $11, PT_R11(k1)
-    LONG_S             $12, PT_R12(k1)
-    LONG_S             $13, PT_R13(k1)
-    LONG_S             $14, PT_R14(k1)
-    LONG_S             $15, PT_R15(k1)
-    LONG_S             $16, PT_R16(k1)
-    LONG_S             $17, PT_R17(k1)
-
-    LONG_S             $18, PT_R18(k1)
-    LONG_S             $19, PT_R19(k1)
-    LONG_S             $20, PT_R20(k1)
-    LONG_S             $21, PT_R21(k1)
-    LONG_S             $22, PT_R22(k1)
-    LONG_S             $23, PT_R23(k1)
-    LONG_S             $24, PT_R24(k1)
-    LONG_S             $25, PT_R25(k1)
+       /* k0/k1 not being used in host kernel context */
+       INT_ADDIU k1, sp, -PT_SIZE
+       LONG_S  $0, PT_R0(k1)
+       LONG_S  $1, PT_R1(k1)
+       LONG_S  $2, PT_R2(k1)
+       LONG_S  $3, PT_R3(k1)
+
+       LONG_S  $4, PT_R4(k1)
+       LONG_S  $5, PT_R5(k1)
+       LONG_S  $6, PT_R6(k1)
+       LONG_S  $7, PT_R7(k1)
+
+       LONG_S  $8,  PT_R8(k1)
+       LONG_S  $9,  PT_R9(k1)
+       LONG_S  $10, PT_R10(k1)
+       LONG_S  $11, PT_R11(k1)
+       LONG_S  $12, PT_R12(k1)
+       LONG_S  $13, PT_R13(k1)
+       LONG_S  $14, PT_R14(k1)
+       LONG_S  $15, PT_R15(k1)
+       LONG_S  $16, PT_R16(k1)
+       LONG_S  $17, PT_R17(k1)
+
+       LONG_S  $18, PT_R18(k1)
+       LONG_S  $19, PT_R19(k1)
+       LONG_S  $20, PT_R20(k1)
+       LONG_S  $21, PT_R21(k1)
+       LONG_S  $22, PT_R22(k1)
+       LONG_S  $23, PT_R23(k1)
+       LONG_S  $24, PT_R24(k1)
+       LONG_S  $25, PT_R25(k1)
  
         /* XXXKYMA k0/k1 not saved, not being used if we got here through an ioctl() */
  
-    LONG_S             $28, PT_R28(k1)
-    LONG_S             $29, PT_R29(k1)
-    LONG_S             $30, PT_R30(k1)
-    LONG_S             $31, PT_R31(k1)
+       LONG_S  $28, PT_R28(k1)
+       LONG_S  $29, PT_R29(k1)
+       LONG_S  $30, PT_R30(k1)
+       LONG_S  $31, PT_R31(k1)
  
-    /* Save hi/lo */
-       mflo            v0
-       LONG_S          v0, PT_LO(k1)
-       mfhi            v1
-       LONG_S          v1, PT_HI(k1)
+       /* Save hi/lo */
+       mflo    v0
+       LONG_S  v0, PT_LO(k1)
+       mfhi    v1
+       LONG_S  v1, PT_HI(k1)
  
         /* Save host status */
-       mfc0            v0, CP0_STATUS
-       LONG_S          v0, PT_STATUS(k1)
+       mfc0    v0, CP0_STATUS
+       LONG_S  v0, PT_STATUS(k1)
  
         /* Save host ASID, shove it into the BVADDR location */
-       mfc0            v1,CP0_ENTRYHI
-       andi            v1, 0xff
-       LONG_S          v1, PT_HOST_ASID(k1)
+       mfc0    v1, CP0_ENTRYHI
+       andi    v1, 0xff
+       LONG_S  v1, PT_HOST_ASID(k1)
  
-    /* Save DDATA_LO, will be used to store pointer to vcpu */
-    mfc0        v1, CP0_DDATA_LO
-    LONG_S      v1, PT_HOST_USERLOCAL(k1)
+       /* Save DDATA_LO, will be used to store pointer to vcpu */
+       mfc0    v1, CP0_DDATA_LO
+       LONG_S  v1, PT_HOST_USERLOCAL(k1)
  
-    /* DDATA_LO has pointer to vcpu */
-    mtc0        a1,CP0_DDATA_LO
+       /* DDATA_LO has pointer to vcpu */
+       mtc0    a1, CP0_DDATA_LO
  
-    /* Offset into vcpu->arch */
-       addiu           k1, a1, VCPU_HOST_ARCH
+       /* Offset into vcpu->arch */
+       INT_ADDIU k1, a1, VCPU_HOST_ARCH
  
-    /* Save the host stack to VCPU, used for exception processing when we exit from the Guest */
-    LONG_S      sp, VCPU_HOST_STACK(k1)
+       /*
+        * Save the host stack to VCPU, used for exception processing
+        * when we exit from the Guest
+        */
+       LONG_S  sp, VCPU_HOST_STACK(k1)
  
-    /* Save the kernel gp as well */
-    LONG_S      gp, VCPU_HOST_GP(k1)
+       /* Save the kernel gp as well */
+       LONG_S  gp, VCPU_HOST_GP(k1)
  
         /* Setup status register for running the guest in UM, interrupts are disabled */
-       li                      k0,(ST0_EXL | KSU_USER| ST0_BEV)
-       mtc0            k0,CP0_STATUS
-    ehb
-
-    /* load up the new EBASE */
-    LONG_L      k0, VCPU_GUEST_EBASE(k1)
-    mtc0        k0,CP0_EBASE
-
-    /* Now that the new EBASE has been loaded, unset BEV, set interrupt mask as it was
-     * but make sure that timer interrupts are enabled
-     */
-    li          k0,(ST0_EXL | KSU_USER | ST0_IE)
-    andi        v0, v0, ST0_IM
-    or          k0, k0, v0
-    mtc0        k0,CP0_STATUS
-    ehb
+       li      k0, (ST0_EXL | KSU_USER | ST0_BEV)
+       mtc0    k0, CP0_STATUS
+       ehb
+
+       /* load up the new EBASE */
+       LONG_L  k0, VCPU_GUEST_EBASE(k1)
+       mtc0    k0, CP0_EBASE
+
+       /*
+        * Now that the new EBASE has been loaded, unset BEV, set
+        * interrupt mask as it was but make sure that timer interrupts
+        * are enabled
+        */
+       li      k0, (ST0_EXL | KSU_USER | ST0_IE)
+       andi    v0, v0, ST0_IM
+       or      k0, k0, v0
+       mtc0    k0, CP0_STATUS
+       ehb
  
  
         /* Set Guest EPC */
-       LONG_L          t0, VCPU_PC(k1)
-       mtc0            t0, CP0_EPC
+       LONG_L  t0, VCPU_PC(k1)
+       mtc0    t0, CP0_EPC
  
  FEXPORT(__kvm_mips_load_asid)
-    /* Set the ASID for the Guest Kernel */
-    sll         t0, t0, 1                       /* with kseg0 @ 0x40000000, kernel */
-                                                /* addresses shift to 0x80000000 */
-    bltz        t0, 1f                          /* If kernel */
-       addiu       t1, k1, VCPU_GUEST_KERNEL_ASID  /* (BD)  */
-    addiu       t1, k1, VCPU_GUEST_USER_ASID    /* else user */
+       /* Set the ASID for the Guest Kernel */
+       INT_SLL t0, t0, 1       /* with kseg0 @ 0x40000000, kernel */
+                               /* addresses shift to 0x80000000 */
+       bltz    t0, 1f          /* If kernel */
+        INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID  /* (BD)  */
+       INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID    /* else user */
  1:
-    /* t1: contains the base of the ASID array, need to get the cpu id  */
-    LONG_L      t2, TI_CPU($28)             /* smp_processor_id */
-    sll         t2, t2, 2                   /* x4 */
-    addu        t3, t1, t2
-    LONG_L      k0, (t3)
-    andi        k0, k0, 0xff
-       mtc0            k0,CP0_ENTRYHI
-    ehb
-
-    /* Disable RDHWR access */
-    mtc0    zero,  CP0_HWRENA
-
-    /* Now load up the Guest Context from VCPU */
-    LONG_L             $1, VCPU_R1(k1)
-    LONG_L             $2, VCPU_R2(k1)
-    LONG_L             $3, VCPU_R3(k1)
-
-    LONG_L             $4, VCPU_R4(k1)
-    LONG_L             $5, VCPU_R5(k1)
-    LONG_L             $6, VCPU_R6(k1)
-    LONG_L             $7, VCPU_R7(k1)
-
-    LONG_L             $8,  VCPU_R8(k1)
-    LONG_L             $9,  VCPU_R9(k1)
-    LONG_L             $10, VCPU_R10(k1)
-    LONG_L             $11, VCPU_R11(k1)
-    LONG_L             $12, VCPU_R12(k1)
-    LONG_L             $13, VCPU_R13(k1)
-    LONG_L             $14, VCPU_R14(k1)
-    LONG_L             $15, VCPU_R15(k1)
-    LONG_L             $16, VCPU_R16(k1)
-    LONG_L             $17, VCPU_R17(k1)
-    LONG_L             $18, VCPU_R18(k1)
-    LONG_L             $19, VCPU_R19(k1)
-    LONG_L             $20, VCPU_R20(k1)
-    LONG_L             $21, VCPU_R21(k1)
-    LONG_L             $22, VCPU_R22(k1)
-    LONG_L             $23, VCPU_R23(k1)
-    LONG_L             $24, VCPU_R24(k1)
-    LONG_L             $25, VCPU_R25(k1)
-
-    /* k0/k1 loaded up later */
-
-    LONG_L             $28, VCPU_R28(k1)
-    LONG_L             $29, VCPU_R29(k1)
-    LONG_L             $30, VCPU_R30(k1)
-    LONG_L             $31, VCPU_R31(k1)
-
-    /* Restore hi/lo */
-       LONG_L          k0, VCPU_LO(k1)
-       mtlo            k0
-
-       LONG_L          k0, VCPU_HI(k1)
-       mthi            k0
+            /* t1: contains the base of the ASID array, need to get the cpu id  */
+       LONG_L  t2, TI_CPU($28)             /* smp_processor_id */
+       INT_SLL t2, t2, 2                   /* x4 */
+       REG_ADDU t3, t1, t2
+       LONG_L  k0, (t3)
+       andi    k0, k0, 0xff
+       mtc0    k0, CP0_ENTRYHI
+       ehb
+
+       /* Disable RDHWR access */
+       mtc0    zero, CP0_HWRENA
+
+       /* Now load up the Guest Context from VCPU */
+       LONG_L  $1, VCPU_R1(k1)
+       LONG_L  $2, VCPU_R2(k1)
+       LONG_L  $3, VCPU_R3(k1)
+
+       LONG_L  $4, VCPU_R4(k1)
+       LONG_L  $5, VCPU_R5(k1)
+       LONG_L  $6, VCPU_R6(k1)
+       LONG_L  $7, VCPU_R7(k1)
+
+       LONG_L  $8, VCPU_R8(k1)
+       LONG_L  $9, VCPU_R9(k1)
+       LONG_L  $10, VCPU_R10(k1)
+       LONG_L  $11, VCPU_R11(k1)
+       LONG_L  $12, VCPU_R12(k1)
+       LONG_L  $13, VCPU_R13(k1)
+       LONG_L  $14, VCPU_R14(k1)
+       LONG_L  $15, VCPU_R15(k1)
+       LONG_L  $16, VCPU_R16(k1)
+       LONG_L  $17, VCPU_R17(k1)
+       LONG_L  $18, VCPU_R18(k1)
+       LONG_L  $19, VCPU_R19(k1)
+       LONG_L  $20, VCPU_R20(k1)
+       LONG_L  $21, VCPU_R21(k1)
+       LONG_L  $22, VCPU_R22(k1)
+       LONG_L  $23, VCPU_R23(k1)
+       LONG_L  $24, VCPU_R24(k1)
+       LONG_L  $25, VCPU_R25(k1)
+
+       /* k0/k1 loaded up later */
+
+       LONG_L  $28, VCPU_R28(k1)
+       LONG_L  $29, VCPU_R29(k1)
+       LONG_L  $30, VCPU_R30(k1)
+       LONG_L  $31, VCPU_R31(k1)
+
+       /* Restore hi/lo */
+       LONG_L  k0, VCPU_LO(k1)
+       mtlo    k0
+
+       LONG_L  k0, VCPU_HI(k1)
+       mthi    k0
  
  FEXPORT(__kvm_mips_load_k0k1)
         /* Restore the guest's k0/k1 registers */
-    LONG_L             k0, VCPU_R26(k1)
-    LONG_L             k1, VCPU_R27(k1)
+       LONG_L  k0, VCPU_R26(k1)
+       LONG_L  k1, VCPU_R27(k1)
  
-    /* Jump to guest */
+       /* Jump to guest */
         eret
-       .set    pop
  
  VECTOR(MIPSX(exception), unknown)
  /*
   * Find out what mode we came from and jump to the proper handler.
   */
-    .set    push
-       .set    noat
-    .set    noreorder
-    mtc0    k0, CP0_ERROREPC    #01: Save guest k0
-    ehb                         #02:
-
-    mfc0    k0, CP0_EBASE       #02: Get EBASE
-    srl     k0, k0, 10          #03: Get rid of CPUNum
-    sll     k0, k0, 10          #04
-    LONG_S  k1, 0x3000(k0)      #05: Save k1 @ offset 0x3000
-    addiu   k0, k0, 0x2000      #06: Exception handler is installed @ offset 0x2000
-       j       k0                                      #07: jump to the function
-       nop                                             #08: branch delay slot
-       .set    push
+       mtc0    k0, CP0_ERROREPC        #01: Save guest k0
+       ehb                             #02:
+
+       mfc0    k0, CP0_EBASE           #02: Get EBASE
+       INT_SRL k0, k0, 10              #03: Get rid of CPUNum
+       INT_SLL k0, k0, 10              #04
+       LONG_S  k1, 0x3000(k0)          #05: Save k1 @ offset 0x3000
+       INT_ADDIU k0, k0, 0x2000                #06: Exception handler is installed @ offset 0x2000
+       j       k0                      #07: jump to the function
+        nop                            #08: branch delay slot
  VECTOR_END(MIPSX(exceptionEnd))
  .end MIPSX(exception)
  
@@ -253,329 +251,327 @@ VECTOR_END(MIPSX(exceptionEnd))
   *
   */
  NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra)
-    .set    push
-    .set    noat
-    .set    noreorder
-
-    /* Get the VCPU pointer from DDTATA_LO */
-    mfc0        k1, CP0_DDATA_LO
-       addiu           k1, k1, VCPU_HOST_ARCH
-
-    /* Start saving Guest context to VCPU */
-    LONG_S  $0, VCPU_R0(k1)
-    LONG_S  $1, VCPU_R1(k1)
-    LONG_S  $2, VCPU_R2(k1)
-    LONG_S  $3, VCPU_R3(k1)
-    LONG_S  $4, VCPU_R4(k1)
-    LONG_S  $5, VCPU_R5(k1)
-    LONG_S  $6, VCPU_R6(k1)
-    LONG_S  $7, VCPU_R7(k1)
-    LONG_S  $8, VCPU_R8(k1)
-    LONG_S  $9, VCPU_R9(k1)
-    LONG_S  $10, VCPU_R10(k1)
-    LONG_S  $11, VCPU_R11(k1)
-    LONG_S  $12, VCPU_R12(k1)
-    LONG_S  $13, VCPU_R13(k1)
-    LONG_S  $14, VCPU_R14(k1)
-    LONG_S  $15, VCPU_R15(k1)
-    LONG_S  $16, VCPU_R16(k1)
-    LONG_S  $17,VCPU_R17(k1)
-    LONG_S  $18, VCPU_R18(k1)
-    LONG_S  $19, VCPU_R19(k1)
-    LONG_S  $20, VCPU_R20(k1)
-    LONG_S  $21, VCPU_R21(k1)
-    LONG_S  $22, VCPU_R22(k1)
-    LONG_S  $23, VCPU_R23(k1)
-    LONG_S  $24, VCPU_R24(k1)
-    LONG_S  $25, VCPU_R25(k1)
-
-    /* Guest k0/k1 saved later */
-
-    LONG_S  $28, VCPU_R28(k1)
-    LONG_S  $29, VCPU_R29(k1)
-    LONG_S  $30, VCPU_R30(k1)
-    LONG_S  $31, VCPU_R31(k1)
-
-    /* We need to save hi/lo and restore them on
-     * the way out
-     */
-    mfhi    t0
-    LONG_S  t0, VCPU_HI(k1)
-
-    mflo    t0
-    LONG_S  t0, VCPU_LO(k1)
-
-    /* Finally save guest k0/k1 to VCPU */
-    mfc0    t0, CP0_ERROREPC
-    LONG_S  t0, VCPU_R26(k1)
-
-    /* Get GUEST k1 and save it in VCPU */
-    la      t1, ~0x2ff
-    mfc0    t0, CP0_EBASE
-    and     t0, t0, t1
-    LONG_L  t0, 0x3000(t0)
-    LONG_S  t0, VCPU_R27(k1)
-
-    /* Now that context has been saved, we can use other registers */
-
-    /* Restore vcpu */
-    mfc0        a1, CP0_DDATA_LO
-    move        s1, a1
-
-   /* Restore run (vcpu->run) */
-    LONG_L      a0, VCPU_RUN(a1)
-    /* Save pointer to run in s0, will be saved by the compiler */
-    move        s0, a0
-
-
-    /* Save Host level EPC, BadVaddr and Cause to VCPU, useful to process the exception */
-    mfc0    k0,CP0_EPC
-    LONG_S  k0, VCPU_PC(k1)
-
-    mfc0    k0, CP0_BADVADDR
-    LONG_S  k0, VCPU_HOST_CP0_BADVADDR(k1)
-
-    mfc0    k0, CP0_CAUSE
-    LONG_S  k0, VCPU_HOST_CP0_CAUSE(k1)
-
-    mfc0    k0, CP0_ENTRYHI
-    LONG_S  k0, VCPU_HOST_ENTRYHI(k1)
-
-    /* Now restore the host state just enough to run the handlers */
-
-    /* Swtich EBASE to the one used by Linux */
-    /* load up the host EBASE */
-    mfc0        v0, CP0_STATUS
-
-    .set at
-       or          k0, v0, ST0_BEV
-    .set noat
-
-    mtc0        k0, CP0_STATUS
-    ehb
-
-    LONG_L      k0, VCPU_HOST_EBASE(k1)
-    mtc0        k0,CP0_EBASE
-
-
-    /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
-    .set at
-       and         v0, v0, ~(ST0_EXL | KSU_USER | ST0_IE)
-    or          v0, v0, ST0_CU0
-    .set noat
-    mtc0        v0, CP0_STATUS
-    ehb
-
-    /* Load up host GP */
-    LONG_L  gp, VCPU_HOST_GP(k1)
-
-    /* Need a stack before we can jump to "C" */
-    LONG_L  sp, VCPU_HOST_STACK(k1)
-
-    /* Saved host state */
-    addiu   sp,sp, -PT_SIZE
+       /* Get the VCPU pointer from DDTATA_LO */
+       mfc0    k1, CP0_DDATA_LO
+       INT_ADDIU k1, k1, VCPU_HOST_ARCH
+
+       /* Start saving Guest context to VCPU */
+       LONG_S  $0, VCPU_R0(k1)
+       LONG_S  $1, VCPU_R1(k1)
+       LONG_S  $2, VCPU_R2(k1)
+       LONG_S  $3, VCPU_R3(k1)
+       LONG_S  $4, VCPU_R4(k1)
+       LONG_S  $5, VCPU_R5(k1)
+       LONG_S  $6, VCPU_R6(k1)
+       LONG_S  $7, VCPU_R7(k1)
+       LONG_S  $8, VCPU_R8(k1)
+       LONG_S  $9, VCPU_R9(k1)
+       LONG_S  $10, VCPU_R10(k1)
+       LONG_S  $11, VCPU_R11(k1)
+       LONG_S  $12, VCPU_R12(k1)
+       LONG_S  $13, VCPU_R13(k1)
+       LONG_S  $14, VCPU_R14(k1)
+       LONG_S  $15, VCPU_R15(k1)
+       LONG_S  $16, VCPU_R16(k1)
+       LONG_S  $17, VCPU_R17(k1)
+       LONG_S  $18, VCPU_R18(k1)
+       LONG_S  $19, VCPU_R19(k1)
+       LONG_S  $20, VCPU_R20(k1)
+       LONG_S  $21, VCPU_R21(k1)
+       LONG_S  $22, VCPU_R22(k1)
+       LONG_S  $23, VCPU_R23(k1)
+       LONG_S  $24, VCPU_R24(k1)
+       LONG_S  $25, VCPU_R25(k1)
+
+       /* Guest k0/k1 saved later */
+
+       LONG_S  $28, VCPU_R28(k1)
+       LONG_S  $29, VCPU_R29(k1)
+       LONG_S  $30, VCPU_R30(k1)
+       LONG_S  $31, VCPU_R31(k1)
+
+       /* We need to save hi/lo and restore them on
+        * the way out
+        */
+       mfhi    t0
+       LONG_S  t0, VCPU_HI(k1)
+
+       mflo    t0
+       LONG_S  t0, VCPU_LO(k1)
+
+       /* Finally save guest k0/k1 to VCPU */
+       mfc0    t0, CP0_ERROREPC
+       LONG_S  t0, VCPU_R26(k1)
+
+       /* Get GUEST k1 and save it in VCPU */
+       PTR_LI  t1, ~0x2ff
+       mfc0    t0, CP0_EBASE
+       and     t0, t0, t1
+       LONG_L  t0, 0x3000(t0)
+       LONG_S  t0, VCPU_R27(k1)
+
+       /* Now that context has been saved, we can use other registers */
+
+       /* Restore vcpu */
+       mfc0    a1, CP0_DDATA_LO
+       move    s1, a1
+
+       /* Restore run (vcpu->run) */
+       LONG_L  a0, VCPU_RUN(a1)
+       /* Save pointer to run in s0, will be saved by the compiler */
+       move    s0, a0
+
+       /* Save Host level EPC, BadVaddr and Cause to VCPU, useful to
+        * process the exception */
+       mfc0    k0,CP0_EPC
+       LONG_S  k0, VCPU_PC(k1)
+
+       mfc0    k0, CP0_BADVADDR
+       LONG_S  k0, VCPU_HOST_CP0_BADVADDR(k1)
+
+       mfc0    k0, CP0_CAUSE
+       LONG_S  k0, VCPU_HOST_CP0_CAUSE(k1)
+
+       mfc0    k0, CP0_ENTRYHI
+       LONG_S  k0, VCPU_HOST_ENTRYHI(k1)
+
+       /* Now restore the host state just enough to run the handlers */
+
+       /* Switch EBASE to the one used by Linux */
+       /* load up the host EBASE */
+       mfc0    v0, CP0_STATUS
+
+       .set    at
+       or      k0, v0, ST0_BEV
+       .set    noat
+
+       mtc0    k0, CP0_STATUS
+       ehb
+
+       LONG_L  k0, VCPU_HOST_EBASE(k1)
+       mtc0    k0,CP0_EBASE
+
  
-    /* XXXKYMA do we need to load the host ASID, maybe not because the
-     * kernel entries are marked GLOBAL, need to verify
-     */
+       /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
+       .set    at
+       and     v0, v0, ~(ST0_EXL | KSU_USER | ST0_IE)
+       or      v0, v0, ST0_CU0
+       .set    noat
+       mtc0    v0, CP0_STATUS
+       ehb
+
+       /* Load up host GP */
+       LONG_L  gp, VCPU_HOST_GP(k1)
+
+       /* Need a stack before we can jump to "C" */
+       LONG_L  sp, VCPU_HOST_STACK(k1)
+
+       /* Saved host state */
+       INT_ADDIU sp, sp, -PT_SIZE
  
-    /* Restore host DDATA_LO */
-    LONG_L      k0, PT_HOST_USERLOCAL(sp)
-    mtc0        k0, CP0_DDATA_LO
+       /* XXXKYMA do we need to load the host ASID, maybe not because the
+        * kernel entries are marked GLOBAL, need to verify
+        */
  
-    /* Restore RDHWR access */
-    la      k0, 0x2000000F
-    mtc0    k0,  CP0_HWRENA
+       /* Restore host DDATA_LO */
+       LONG_L  k0, PT_HOST_USERLOCAL(sp)
+       mtc0    k0, CP0_DDATA_LO
  
-    /* Jump to handler */
+       /* Restore RDHWR access */
+       PTR_LI  k0, 0x2000000F
+       mtc0    k0, CP0_HWRENA
+
+       /* Jump to handler */
  FEXPORT(__kvm_mips_jump_to_handler)
-    /* XXXKYMA: not sure if this is safe, how large is the stack?? */
-    /* Now jump to the kvm_mips_handle_exit() to see if we can deal with this in the kernel */
-    la          t9,kvm_mips_handle_exit
-    jalr.hb     t9
-    addiu       sp,sp, -CALLFRAME_SIZ           /* BD Slot */
-
-    /* Return from handler Make sure interrupts are disabled */
-    di
-    ehb
-
-    /* XXXKYMA: k0/k1 could have been blown away if we processed an exception
-     * while we were handling the exception from the guest, reload k1
-     */
-    move        k1, s1
-       addiu           k1, k1, VCPU_HOST_ARCH
-
-    /* Check return value, should tell us if we are returning to the host (handle I/O etc)
-     * or resuming the guest
-     */
-    andi        t0, v0, RESUME_HOST
-    bnez        t0, __kvm_mips_return_to_host
-    nop
+       /* XXXKYMA: not sure if this is safe, how large is the stack??
+        * Now jump to the kvm_mips_handle_exit() to see if we can deal
+        * with this in the kernel */
+       PTR_LA  t9, kvm_mips_handle_exit
+       jalr.hb t9
+        INT_ADDIU sp, sp, -CALLFRAME_SIZ           /* BD Slot */
+
+       /* Return from handler Make sure interrupts are disabled */
+       di
+       ehb
+
+       /* XXXKYMA: k0/k1 could have been blown away if we processed
+        * an exception while we were handling the exception from the
+        * guest, reload k1
+        */
+
+       move    k1, s1
+       INT_ADDIU k1, k1, VCPU_HOST_ARCH
+
+       /* Check return value, should tell us if we are returning to the
+        * host (handle I/O etc.) or resuming the guest
+        */
+       andi    t0, v0, RESUME_HOST
+       bnez    t0, __kvm_mips_return_to_host
+        nop
  
  __kvm_mips_return_to_guest:
-    /* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */
-    mtc0        s1, CP0_DDATA_LO
-
-    /* Load up the Guest EBASE to minimize the window where BEV is set */
-    LONG_L      t0, VCPU_GUEST_EBASE(k1)
-
-    /* Switch EBASE back to the one used by KVM */
-    mfc0        v1, CP0_STATUS
-    .set at
-       or          k0, v1, ST0_BEV
-    .set noat
-    mtc0        k0, CP0_STATUS
-    ehb
-    mtc0        t0,CP0_EBASE
-
-    /* Setup status register for running guest in UM */
-    .set at
-    or     v1, v1, (ST0_EXL | KSU_USER | ST0_IE)
-    and     v1, v1, ~ST0_CU0
-    .set noat
-    mtc0    v1, CP0_STATUS
-    ehb
+       /* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */
+       mtc0    s1, CP0_DDATA_LO
  
+       /* Load up the Guest EBASE to minimize the window where BEV is set */
+       LONG_L  t0, VCPU_GUEST_EBASE(k1)
+
+       /* Switch EBASE back to the one used by KVM */
+       mfc0    v1, CP0_STATUS
+       .set    at
+       or      k0, v1, ST0_BEV
+       .set    noat
+       mtc0    k0, CP0_STATUS
+       ehb
+       mtc0    t0, CP0_EBASE
+
+       /* Setup status register for running guest in UM */
+       .set    at
+       or      v1, v1, (ST0_EXL | KSU_USER | ST0_IE)
+       and     v1, v1, ~ST0_CU0
+       .set    noat
+       mtc0    v1, CP0_STATUS
+       ehb
  
         /* Set Guest EPC */
-       LONG_L          t0, VCPU_PC(k1)
-       mtc0            t0, CP0_EPC
-
-    /* Set the ASID for the Guest Kernel */
-    sll         t0, t0, 1                       /* with kseg0 @ 0x40000000, kernel */
-                                                /* addresses shift to 0x80000000 */
-    bltz        t0, 1f                          /* If kernel */
-       addiu       t1, k1, VCPU_GUEST_KERNEL_ASID  /* (BD)  */
-    addiu       t1, k1, VCPU_GUEST_USER_ASID    /* else user */
+       LONG_L  t0, VCPU_PC(k1)
+       mtc0    t0, CP0_EPC
+
+       /* Set the ASID for the Guest Kernel */
+       INT_SLL t0, t0, 1       /* with kseg0 @ 0x40000000, kernel */
+                               /* addresses shift to 0x80000000 */
+       bltz    t0, 1f          /* If kernel */
+        INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID  /* (BD)  */
+       INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID    /* else user */
  1:
-    /* t1: contains the base of the ASID array, need to get the cpu id  */
-    LONG_L      t2, TI_CPU($28)             /* smp_processor_id */
-    sll         t2, t2, 2                   /* x4 */
-    addu        t3, t1, t2
-    LONG_L      k0, (t3)
-    andi        k0, k0, 0xff
-       mtc0            k0,CP0_ENTRYHI
-    ehb
-
-    /* Disable RDHWR access */
-    mtc0    zero,  CP0_HWRENA
-
-    /* load the guest context from VCPU and return */
-    LONG_L  $0, VCPU_R0(k1)
-    LONG_L  $1, VCPU_R1(k1)
-    LONG_L  $2, VCPU_R2(k1)
-    LONG_L  $3, VCPU_R3(k1)
-    LONG_L  $4, VCPU_R4(k1)
-    LONG_L  $5, VCPU_R5(k1)
-    LONG_L  $6, VCPU_R6(k1)
-    LONG_L  $7, VCPU_R7(k1)
-    LONG_L  $8, VCPU_R8(k1)
-    LONG_L  $9, VCPU_R9(k1)
-    LONG_L  $10, VCPU_R10(k1)
-    LONG_L  $11, VCPU_R11(k1)
-    LONG_L  $12, VCPU_R12(k1)
-    LONG_L  $13, VCPU_R13(k1)
-    LONG_L  $14, VCPU_R14(k1)
-    LONG_L  $15, VCPU_R15(k1)
-    LONG_L  $16, VCPU_R16(k1)
-    LONG_L  $17, VCPU_R17(k1)
-    LONG_L  $18, VCPU_R18(k1)
-    LONG_L  $19, VCPU_R19(k1)
-    LONG_L  $20, VCPU_R20(k1)
-    LONG_L  $21, VCPU_R21(k1)
-    LONG_L  $22, VCPU_R22(k1)
-    LONG_L  $23, VCPU_R23(k1)
-    LONG_L  $24, VCPU_R24(k1)
-    LONG_L  $25, VCPU_R25(k1)
-
-    /* $/k1 loaded later */
-    LONG_L  $28, VCPU_R28(k1)
-    LONG_L  $29, VCPU_R29(k1)
-    LONG_L  $30, VCPU_R30(k1)
-    LONG_L  $31, VCPU_R31(k1)
+       /* t1: contains the base of the ASID array, need to get the cpu id  */
+       LONG_L  t2, TI_CPU($28)         /* smp_processor_id */
+       INT_SLL t2, t2, 2               /* x4 */
+       REG_ADDU t3, t1, t2
+       LONG_L  k0, (t3)
+       andi    k0, k0, 0xff
+       mtc0    k0,CP0_ENTRYHI
+       ehb
+
+       /* Disable RDHWR access */
+       mtc0    zero,  CP0_HWRENA
+
+       /* load the guest context from VCPU and return */
+       LONG_L  $0, VCPU_R0(k1)
+       LONG_L  $1, VCPU_R1(k1)
+       LONG_L  $2, VCPU_R2(k1)
+       LONG_L  $3, VCPU_R3(k1)
+       LONG_L  $4, VCPU_R4(k1)
+       LONG_L  $5, VCPU_R5(k1)
+       LONG_L  $6, VCPU_R6(k1)
+       LONG_L  $7, VCPU_R7(k1)
+       LONG_L  $8, VCPU_R8(k1)
+       LONG_L  $9, VCPU_R9(k1)
+       LONG_L  $10, VCPU_R10(k1)
+       LONG_L  $11, VCPU_R11(k1)
+       LONG_L  $12, VCPU_R12(k1)
+       LONG_L  $13, VCPU_R13(k1)
+       LONG_L  $14, VCPU_R14(k1)
+       LONG_L  $15, VCPU_R15(k1)
+       LONG_L  $16, VCPU_R16(k1)
+       LONG_L  $17, VCPU_R17(k1)
+       LONG_L  $18, VCPU_R18(k1)
+       LONG_L  $19, VCPU_R19(k1)
+       LONG_L  $20, VCPU_R20(k1)
+       LONG_L  $21, VCPU_R21(k1)
+       LONG_L  $22, VCPU_R22(k1)
+       LONG_L  $23, VCPU_R23(k1)
+       LONG_L  $24, VCPU_R24(k1)
+       LONG_L  $25, VCPU_R25(k1)
+
+       /* k0/k1 loaded later */
+       LONG_L  $28, VCPU_R28(k1)
+       LONG_L  $29, VCPU_R29(k1)
+       LONG_L  $30, VCPU_R30(k1)
+       LONG_L  $31, VCPU_R31(k1)
  
  FEXPORT(__kvm_mips_skip_guest_restore)
-    LONG_L  k0, VCPU_HI(k1)
-    mthi    k0
+       LONG_L  k0, VCPU_HI(k1)
+       mthi    k0
  
-    LONG_L  k0, VCPU_LO(k1)
-    mtlo    k0
+       LONG_L  k0, VCPU_LO(k1)
+       mtlo    k0
  
-    LONG_L  k0, VCPU_R26(k1)
-    LONG_L  k1, VCPU_R27(k1)
+       LONG_L  k0, VCPU_R26(k1)
+       LONG_L  k1, VCPU_R27(k1)
  
-    eret
+       eret
  
  __kvm_mips_return_to_host:
-    /* EBASE is already pointing to Linux */
-    LONG_L  k1, VCPU_HOST_STACK(k1)
-       addiu   k1,k1, -PT_SIZE
-
-    /* Restore host DDATA_LO */
-    LONG_L      k0, PT_HOST_USERLOCAL(k1)
-    mtc0        k0, CP0_DDATA_LO
-
-    /* Restore host ASID */
-    LONG_L      k0, PT_HOST_ASID(sp)
-    andi        k0, 0xff
-    mtc0        k0,CP0_ENTRYHI
-    ehb
-
-    /* Load context saved on the host stack */
-    LONG_L  $0, PT_R0(k1)
-    LONG_L  $1, PT_R1(k1)
-
-    /* r2/v0 is the return code, shift it down by 2 (arithmetic) to recover the err code  */
-    sra     k0, v0, 2
-    move    $2, k0
-
-    LONG_L  $3, PT_R3(k1)
-    LONG_L  $4, PT_R4(k1)
-    LONG_L  $5, PT_R5(k1)
-    LONG_L  $6, PT_R6(k1)
-    LONG_L  $7, PT_R7(k1)
-    LONG_L  $8, PT_R8(k1)
-    LONG_L  $9, PT_R9(k1)
-    LONG_L  $10, PT_R10(k1)
-    LONG_L  $11, PT_R11(k1)
-    LONG_L  $12, PT_R12(k1)
-    LONG_L  $13, PT_R13(k1)
-    LONG_L  $14, PT_R14(k1)
-    LONG_L  $15, PT_R15(k1)
-    LONG_L  $16, PT_R16(k1)
-    LONG_L  $17, PT_R17(k1)
-    LONG_L  $18, PT_R18(k1)
-    LONG_L  $19, PT_R19(k1)
-    LONG_L  $20, PT_R20(k1)
-    LONG_L  $21, PT_R21(k1)
-    LONG_L  $22, PT_R22(k1)
-    LONG_L  $23, PT_R23(k1)
-    LONG_L  $24, PT_R24(k1)
-    LONG_L  $25, PT_R25(k1)
-
-    /* Host k0/k1 were not saved */
-
-    LONG_L  $28, PT_R28(k1)
-    LONG_L  $29, PT_R29(k1)
-    LONG_L  $30, PT_R30(k1)
-
-    LONG_L  k0, PT_HI(k1)
-    mthi    k0
-
-    LONG_L  k0, PT_LO(k1)
-    mtlo    k0
-
-    /* Restore RDHWR access */
-    la      k0, 0x2000000F
-    mtc0    k0,  CP0_HWRENA
-
-
-    /* Restore RA, which is the address we will return to */
-    LONG_L  ra, PT_R31(k1)
-    j       ra
-    nop
-
-    .set    pop
+       /* EBASE is already pointing to Linux */
+       LONG_L  k1, VCPU_HOST_STACK(k1)
+       INT_ADDIU k1,k1, -PT_SIZE
+
+       /* Restore host DDATA_LO */
+       LONG_L  k0, PT_HOST_USERLOCAL(k1)
+       mtc0    k0, CP0_DDATA_LO
+
+       /* Restore host ASID */
+       LONG_L  k0, PT_HOST_ASID(sp)
+       andi    k0, 0xff
+       mtc0    k0,CP0_ENTRYHI
+       ehb
+
+       /* Load context saved on the host stack */
+       LONG_L  $0, PT_R0(k1)
+       LONG_L  $1, PT_R1(k1)
+
+       /* r2/v0 is the return code, shift it down by 2 (arithmetic)
+        * to recover the err code  */
+       INT_SRA k0, v0, 2
+       move    $2, k0
+
+       LONG_L  $3, PT_R3(k1)
+       LONG_L  $4, PT_R4(k1)
+       LONG_L  $5, PT_R5(k1)
+       LONG_L  $6, PT_R6(k1)
+       LONG_L  $7, PT_R7(k1)
+       LONG_L  $8, PT_R8(k1)
+       LONG_L  $9, PT_R9(k1)
+       LONG_L  $10, PT_R10(k1)
+       LONG_L  $11, PT_R11(k1)
+       LONG_L  $12, PT_R12(k1)
+       LONG_L  $13, PT_R13(k1)
+       LONG_L  $14, PT_R14(k1)
+       LONG_L  $15, PT_R15(k1)
+       LONG_L  $16, PT_R16(k1)
+       LONG_L  $17, PT_R17(k1)
+       LONG_L  $18, PT_R18(k1)
+       LONG_L  $19, PT_R19(k1)
+       LONG_L  $20, PT_R20(k1)
+       LONG_L  $21, PT_R21(k1)
+       LONG_L  $22, PT_R22(k1)
+       LONG_L  $23, PT_R23(k1)
+       LONG_L  $24, PT_R24(k1)
+       LONG_L  $25, PT_R25(k1)
+
+       /* Host k0/k1 were not saved */
+
+       LONG_L  $28, PT_R28(k1)
+       LONG_L  $29, PT_R29(k1)
+       LONG_L  $30, PT_R30(k1)
+
+       LONG_L  k0, PT_HI(k1)
+       mthi    k0
+
+       LONG_L  k0, PT_LO(k1)
+       mtlo    k0
+
+       /* Restore RDHWR access */
+       PTR_LI  k0, 0x2000000F
+       mtc0    k0,  CP0_HWRENA
+
+
+       /* Restore RA, which is the address we will return to */
+       LONG_L  ra, PT_R31(k1)
+       j       ra
+        nop
+
  VECTOR_END(MIPSX(GuestExceptionEnd))
  .end MIPSX(GuestException)
  
@@ -627,24 +623,23 @@ MIPSX(exceptions):
  
  #define HW_SYNCI_Step       $1
  LEAF(MIPSX(SyncICache))
-    .set    push
+       .set    push
         .set    mips32r2
-    beq     a1, zero, 20f
-    nop
-    addu    a1, a0, a1
-    rdhwr   v0, HW_SYNCI_Step
-    beq     v0, zero, 20f
-    nop
-
+       beq     a1, zero, 20f
+        nop
+       REG_ADDU a1, a0, a1
+       rdhwr   v0, HW_SYNCI_Step
+       beq     v0, zero, 20f
+        nop
  10:
-    synci   0(a0)
-    addu    a0, a0, v0
-    sltu    v1, a0, a1
-    bne     v1, zero, 10b
-    nop
-    sync
+       synci   0(a0)
+       REG_ADDU a0, a0, v0
+       sltu    v1, a0, a1
+       bne     v1, zero, 10b
+        nop
+       sync
  20:
-    jr.hb   ra
-    nop
-    .set pop
+       jr.hb   ra
+        nop
+       .set    pop
  END(MIPSX(SyncICache))
diff --git a/arch/mips/kvm/kvm_mips.c b/arch/mips/kvm/kvm_mips.c

index dd203e59e6fd650767a3ae5286e0599f4dbc15b7..a7b044536de48e0f804d2270fc69a3cdbe6a3335 100644 (file)
--- a/arch/mips/kvm/kvm_mips.c
+++ b/arch/mips/kvm/kvm_mips.c
@@ -208,6 +208,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
         return 0;
  }
  
+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+}
+
  int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                  struct kvm_memory_slot *memslot,
                                  struct kvm_userspace_memory_region *mem,
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h

index 08891d07aeb656e891633d96cd8b2ab09b7dc6d4..fa19e2f1a874b3878d0cd556bc5c39417ff1a08d 100644 (file)
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -334,6 +334,27 @@ static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
         return r;
  }
  
+/*
+ * Like kvmppc_get_last_inst(), but for fetching a sc instruction.
+ * Because the sc instruction sets SRR0 to point to the following
+ * instruction, we have to fetch from pc - 4.
+ */
+static inline u32 kvmppc_get_last_sc(struct kvm_vcpu *vcpu)
+{
+       ulong pc = kvmppc_get_pc(vcpu) - 4;
+       struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+       u32 r;
+
+       /* Load the instruction manually if it failed to do so in the
+        * exit path */
+       if (svcpu->last_inst == KVM_INST_FETCH_FAILED)
+               kvmppc_ld(vcpu, &pc, sizeof(u32), &svcpu->last_inst, false);
+
+       r = svcpu->last_inst;
+       svcpu_put(svcpu);
+       return r;
+}
+
  static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
  {
         struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
@@ -446,6 +467,23 @@ static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
         return vcpu->arch.last_inst;
  }
  
+/*
+ * Like kvmppc_get_last_inst(), but for fetching a sc instruction.
+ * Because the sc instruction sets SRR0 to point to the following
+ * instruction, we have to fetch from pc - 4.
+ */
+static inline u32 kvmppc_get_last_sc(struct kvm_vcpu *vcpu)
+{
+       ulong pc = kvmppc_get_pc(vcpu) - 4;
+
+       /* Load the instruction manually if it failed to do so in the
+        * exit path */
+       if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED)
+               kvmppc_ld(vcpu, &pc, sizeof(u32), &vcpu->arch.last_inst, false);
+
+       return vcpu->arch.last_inst;
+}
+
  static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
  {
         return vcpu->arch.fault_dar;
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h

index a1ecb14e4442d564fac6208e4bc445825beb0ff6..86d638a3b359e1d0f8e03c09c08adafcf9a7aa28 100644 (file)
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -37,7 +37,7 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
  
  #ifdef CONFIG_KVM_BOOK3S_64_HV
  #define KVM_DEFAULT_HPT_ORDER  24      /* 16MB HPT by default */
-extern int kvm_hpt_order;              /* order of preallocated HPTs */
+extern unsigned long kvm_rma_pages;
  #endif
  
  #define VRMA_VSID      0x1ffffffUL     /* 1TB VSID reserved for VRMA */
@@ -100,7 +100,7 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
                         /* (masks depend on page size) */
                         rb |= 0x1000;           /* page encoding in LP field */
                         rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
-                       rb |= (va_low & 0xfe);  /* AVAL field (P7 doesn't seem to care) */
+                       rb |= ((va_low << 4) & 0xf0);   /* AVAL field (P7 doesn't seem to care) */
                 }
         } else {
                 /* 4kB page */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h

index af326cde7cb62bf2f07c70e6d0992e154504d5e0..33283532e9d8fd8c14bbd376b3112136a134e233 100644 (file)
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -183,13 +183,9 @@ struct kvmppc_spapr_tce_table {
         struct page *pages[0];
  };
  
-struct kvmppc_linear_info {
-       void            *base_virt;
-       unsigned long    base_pfn;
-       unsigned long    npages;
-       struct list_head list;
-       atomic_t         use_count;
-       int              type;
+struct kvm_rma_info {
+       atomic_t use_count;
+       unsigned long base_pfn;
  };
  
  /* XICS components, defined in book3s_xics.c */
@@ -246,7 +242,7 @@ struct kvm_arch {
         int tlbie_lock;
         unsigned long lpcr;
         unsigned long rmor;
-       struct kvmppc_linear_info *rma;
+       struct kvm_rma_info *rma;
         unsigned long vrma_slb_v;
         int rma_setup_done;
         int using_mmu_notifiers;
@@ -259,7 +255,7 @@ struct kvm_arch {
         spinlock_t slot_phys_lock;
         cpumask_t need_tlb_flush;
         struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
-       struct kvmppc_linear_info *hpt_li;
+       int hpt_cma_alloc;
  #endif /* CONFIG_KVM_BOOK3S_64_HV */
  #ifdef CONFIG_PPC_BOOK3S_64
         struct list_head spapr_tce_tables;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h

index a5287fe03d773e7d541e26757b8b5fad4a9a3af3..b15554a26c20b8fd6bd25896164e91f91e27c5a4 100644 (file)
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -137,10 +137,10 @@ extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
                              unsigned long ioba, unsigned long tce);
  extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
                                 struct kvm_allocate_rma *rma);
-extern struct kvmppc_linear_info *kvm_alloc_rma(void);
-extern void kvm_release_rma(struct kvmppc_linear_info *ri);
-extern struct kvmppc_linear_info *kvm_alloc_hpt(void);
-extern void kvm_release_hpt(struct kvmppc_linear_info *li);
+extern struct kvm_rma_info *kvm_alloc_rma(void);
+extern void kvm_release_rma(struct kvm_rma_info *ri);
+extern struct page *kvm_alloc_hpt(unsigned long nr_pages);
+extern void kvm_release_hpt(struct page *page, unsigned long nr_pages);
  extern int kvmppc_core_init_vm(struct kvm *kvm);
  extern void kvmppc_core_destroy_vm(struct kvm *kvm);
  extern void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
@@ -261,6 +261,7 @@ void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
  struct openpic;
  
  #ifdef CONFIG_KVM_BOOK3S_64_HV
+extern void kvm_cma_reserve(void) __init;
  static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
  {
         paca[cpu].kvm_hstate.xics_phys = addr;
@@ -281,13 +282,12 @@ static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
  }
  
  extern void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu);
-extern void kvm_linear_init(void);
  
  #else
-static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
+static inline void __init kvm_cma_reserve(void)
  {}
  
-static inline void kvm_linear_init(void)
+static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
  {}
  
  static inline u32 kvmppc_get_xics_latch(void)
@@ -394,10 +394,15 @@ static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
         }
  }
  
-/* Please call after prepare_to_enter. This function puts the lazy ee state
-   back to normal mode, without actually enabling interrupts. */
-static inline void kvmppc_lazy_ee_enable(void)
+/*
+ * Please call after prepare_to_enter. This function puts the lazy ee and irq
+ * disabled tracking state back to normal mode, without actually enabling
+ * interrupts.
+ */
+static inline void kvmppc_fix_ee_before_entry(void)
  {
+       trace_hardirqs_on();
+
  #ifdef CONFIG_PPC64
         /* Only need to enable IRQs by hard enabling them after this */
         local_paca->irq_happened = 0;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c

index 8207459efe5619e4b9e2d824221104d111507e67..d8958be5f31a18b7c0bae16509749067aae75265 100644 (file)
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -454,6 +454,7 @@ int main(void)
         DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
         DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
  #endif
+       DEFINE(VCPU_SHARED_SPRG3, offsetof(struct kvm_vcpu_arch_shared, sprg3));
         DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4));
         DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5));
         DEFINE(VCPU_SHARED_SPRG6, offsetof(struct kvm_vcpu_arch_shared, sprg6));
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c

index 389fb8077cc9cea25746b12497673dd573d35c56..fe6a58c9f0b7f36598822d9131bbdd90b6e8bf4a 100644 (file)
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -229,6 +229,8 @@ void __init early_setup(unsigned long dt_ptr)
         /* Initialize the hash table or TLB handling */
         early_init_mmu();
  
+       kvm_cma_reserve();
+
         /*
          * Reserve any gigantic pages requested on the command line.
          * memblock needs to have been initialized by the time this is
@@ -609,8 +611,6 @@ void __init setup_arch(char **cmdline_p)
         /* Initialize the MMU context management stuff */
         mmu_context_init();
  
-       kvm_linear_init();
-
         /* Interrupt code needs to be 64K-aligned */
         if ((unsigned long)_stext & 0xffff)
                 panic("Kernelbase not 64K-aligned (0x%lx)!\n",
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig

index eb643f8625796711f93fbad6d92022f957517df9..ffaef2cb101a4ef50c77f1dc58c3cdac1f2da105 100644 (file)
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -72,6 +72,7 @@ config KVM_BOOK3S_64_HV
         bool "KVM support for POWER7 and PPC970 using hypervisor mode in host"
         depends on KVM_BOOK3S_64
         select MMU_NOTIFIER
+       select CMA
         ---help---
           Support running unmodified book3s_64 guest kernels in
           virtual machines on POWER7 and PPC970 processors that have
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile

index 008cd856c5b52942c7be2d1e136650ea9aca8922..6646c952c5e37a80c19d4d86296e7bbdc0e2f3ad 100644 (file)
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -81,6 +81,7 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
         book3s_64_vio_hv.o \
         book3s_hv_ras.o \
         book3s_hv_builtin.o \
+       book3s_hv_cma.o \
         $(kvm-book3s_64-builtin-xics-objs-y)
  
  kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c

index 739bfbadb85e3218bf39a293af377d7fe99eb218..7e345e00661a73065a85fc1c6d0a2e4ef7e44108 100644 (file)
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -182,10 +182,13 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
         hva_t ptegp;
         u64 pteg[16];
         u64 avpn = 0;
+       u64 v, r;
+       u64 v_val, v_mask;
+       u64 eaddr_mask;
         int i;
-       u8 key = 0;
+       u8 pp, key = 0;
         bool found = false;
-       int second = 0;
+       bool second = false;
         ulong mp_ea = vcpu->arch.magic_page_ea;
  
         /* Magic page override */
@@ -208,8 +211,16 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
                 goto no_seg_found;
  
         avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr);
+       v_val = avpn & HPTE_V_AVPN;
+
         if (slbe->tb)
-               avpn |= SLB_VSID_B_1T;
+               v_val |= SLB_VSID_B_1T;
+       if (slbe->large)
+               v_val |= HPTE_V_LARGE;
+       v_val |= HPTE_V_VALID;
+
+       v_mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_LARGE | HPTE_V_VALID |
+               HPTE_V_SECONDARY;
  
  do_second:
         ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second);
@@ -227,91 +238,74 @@ do_second:
                 key = 4;
  
         for (i=0; i<16; i+=2) {
-               u64 v = pteg[i];
-               u64 r = pteg[i+1];
-
-               /* Valid check */
-               if (!(v & HPTE_V_VALID))
-                       continue;
-               /* Hash check */
-               if ((v & HPTE_V_SECONDARY) != second)
-                       continue;
-
-               /* AVPN compare */
-               if (HPTE_V_COMPARE(avpn, v)) {
-                       u8 pp = (r & HPTE_R_PP) | key;
-                       int eaddr_mask = 0xFFF;
-
-                       gpte->eaddr = eaddr;
-                       gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu,
-                                                                   eaddr,
-                                                                   data);
-                       if (slbe->large)
-                               eaddr_mask = 0xFFFFFF;
-                       gpte->raddr = (r & HPTE_R_RPN) | (eaddr & eaddr_mask);
-                       gpte->may_execute = ((r & HPTE_R_N) ? false : true);
-                       gpte->may_read = false;
-                       gpte->may_write = false;
-
-                       switch (pp) {
-                       case 0:
-                       case 1:
-                       case 2:
-                       case 6:
-                               gpte->may_write = true;
-                               /* fall through */
-                       case 3:
-                       case 5:
-                       case 7:
-                               gpte->may_read = true;
-                               break;
-                       }
-
-                       dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx "
-                               "-> 0x%lx\n",
-                               eaddr, avpn, gpte->vpage, gpte->raddr);
+               /* Check all relevant fields of 1st dword */
+               if ((pteg[i] & v_mask) == v_val) {
                         found = true;
                         break;
                 }
         }
  
-       /* Update PTE R and C bits, so the guest's swapper knows we used the
-        * page */
-       if (found) {
-               u32 oldr = pteg[i+1];
+       if (!found) {
+               if (second)
+                       goto no_page_found;
+               v_val |= HPTE_V_SECONDARY;
+               second = true;
+               goto do_second;
+       }
  
-               if (gpte->may_read) {
-                       /* Set the accessed flag */
-                       pteg[i+1] |= HPTE_R_R;
-               }
-               if (gpte->may_write) {
-                       /* Set the dirty flag */
-                       pteg[i+1] |= HPTE_R_C;
-               } else {
-                       dprintk("KVM: Mapping read-only page!\n");
-               }
+       v = pteg[i];
+       r = pteg[i+1];
+       pp = (r & HPTE_R_PP) | key;
+       eaddr_mask = 0xFFF;
+
+       gpte->eaddr = eaddr;
+       gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data);
+       if (slbe->large)
+               eaddr_mask = 0xFFFFFF;
+       gpte->raddr = (r & HPTE_R_RPN & ~eaddr_mask) | (eaddr & eaddr_mask);
+       gpte->may_execute = ((r & HPTE_R_N) ? false : true);
+       gpte->may_read = false;
+       gpte->may_write = false;
+
+       switch (pp) {
+       case 0:
+       case 1:
+       case 2:
+       case 6:
+               gpte->may_write = true;
+               /* fall through */
+       case 3:
+       case 5:
+       case 7:
+               gpte->may_read = true;
+               break;
+       }
  
-               /* Write back into the PTEG */
-               if (pteg[i+1] != oldr)
-                       copy_to_user((void __user *)ptegp, pteg, sizeof(pteg));
+       dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx "
+               "-> 0x%lx\n",
+               eaddr, avpn, gpte->vpage, gpte->raddr);
  
-               if (!gpte->may_read)
-                       return -EPERM;
-               return 0;
-       } else {
-               dprintk("KVM MMU: No PTE found (ea=0x%lx sdr1=0x%llx "
-                       "ptegp=0x%lx)\n",
-                       eaddr, to_book3s(vcpu)->sdr1, ptegp);
-               for (i = 0; i < 16; i += 2)
-                       dprintk("   %02d: 0x%llx - 0x%llx (0x%llx)\n",
-                               i, pteg[i], pteg[i+1], avpn);
-
-               if (!second) {
-                       second = HPTE_V_SECONDARY;
-                       goto do_second;
-               }
+       /* Update PTE R and C bits, so the guest's swapper knows we used the
+        * page */
+       if (gpte->may_read) {
+               /* Set the accessed flag */
+               r |= HPTE_R_R;
+       }
+       if (data && gpte->may_write) {
+               /* Set the dirty flag -- XXX even if not writing */
+               r |= HPTE_R_C;
+       }
+
+       /* Write back into the PTEG */
+       if (pteg[i+1] != r) {
+               pteg[i+1] = r;
+               copy_to_user((void __user *)ptegp, pteg, sizeof(pteg));
         }
  
+       if (!gpte->may_read)
+               return -EPERM;
+       return 0;
+
  no_page_found:
         return -ENOENT;
  
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c

index 710d31317d812efe73086e33a89bcdced65ba349..043eec8461e779bfc6e1c25b678116d5f963a59d 100644 (file)
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -37,6 +37,8 @@
  #include <asm/ppc-opcode.h>
  #include <asm/cputable.h>
  
+#include "book3s_hv_cma.h"
+
  /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
  #define MAX_LPID_970   63
  
@@ -52,8 +54,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
  {
         unsigned long hpt;
         struct revmap_entry *rev;
-       struct kvmppc_linear_info *li;
-       long order = kvm_hpt_order;
+       struct page *page = NULL;
+       long order = KVM_DEFAULT_HPT_ORDER;
  
         if (htab_orderp) {
                 order = *htab_orderp;
@@ -61,26 +63,23 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
                         order = PPC_MIN_HPT_ORDER;
         }
  
+       kvm->arch.hpt_cma_alloc = 0;
         /*
-        * If the user wants a different size from default,
          * try first to allocate it from the kernel page allocator.
+        * The CMA pool is kept in reserve in case that allocation fails.
          */
-       hpt = 0;
-       if (order != kvm_hpt_order) {
-               hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
-                                      __GFP_NOWARN, order - PAGE_SHIFT);
-               if (!hpt)
-                       --order;
-       }
+       hpt = __get_free_pages(GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT |
+                              __GFP_NOWARN, order - PAGE_SHIFT);
  
         /* Next try to allocate from the preallocated pool */
         if (!hpt) {
-               li = kvm_alloc_hpt();
-               if (li) {
-                       hpt = (ulong)li->base_virt;
-                       kvm->arch.hpt_li = li;
-                       order = kvm_hpt_order;
-               }
+               VM_BUG_ON(order < KVM_CMA_CHUNK_ORDER);
+               page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT));
+               if (page) {
+                       hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+                       kvm->arch.hpt_cma_alloc = 1;
+               } else
+                       --order;
         }
  
         /* Lastly try successively smaller sizes from the page allocator */
@@ -118,8 +117,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
         return 0;
  
   out_freehpt:
-       if (kvm->arch.hpt_li)
-               kvm_release_hpt(kvm->arch.hpt_li);
+       if (kvm->arch.hpt_cma_alloc)
+               kvm_release_hpt(page, 1 << (order - PAGE_SHIFT));
         else
                 free_pages(hpt, order - PAGE_SHIFT);
         return -ENOMEM;
@@ -165,8 +164,9 @@ void kvmppc_free_hpt(struct kvm *kvm)
  {
         kvmppc_free_lpid(kvm->arch.lpid);
         vfree(kvm->arch.revmap);
-       if (kvm->arch.hpt_li)
-               kvm_release_hpt(kvm->arch.hpt_li);
+       if (kvm->arch.hpt_cma_alloc)
+               kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt),
+                               1 << (kvm->arch.hpt_order - PAGE_SHIFT));
         else
                 free_pages(kvm->arch.hpt_virt,
                            kvm->arch.hpt_order - PAGE_SHIFT);
@@ -1579,7 +1579,7 @@ int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
         ctx->first_pass = 1;
  
         rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
-       ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag);
+       ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC);
         if (ret < 0) {
                 kvm_put_kvm(kvm);
                 return ret;
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c

index b2d3f3b2de72a7a796e88bee5e28184adc3ae0a6..54cf9bc94dadfe3a1debf64f03cea511fc0aa97b 100644 (file)
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -136,7 +136,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
         mutex_unlock(&kvm->lock);
  
         return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
-                               stt, O_RDWR);
+                               stt, O_RDWR | O_CLOEXEC);
  
  fail:
         if (stt) {
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c

index 1f6344c4408d6deba440fefab77829486f424d6c..360ce68c98099f1cc3495107f112f4b19e82f56f 100644 (file)
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -458,6 +458,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
         case SPRN_PMC4_GEKKO:
         case SPRN_WPAR_GEKKO:
         case SPRN_MSSSR0:
+       case SPRN_DABR:
                 break;
  unprivileged:
         default:
@@ -555,6 +556,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
         case SPRN_PMC4_GEKKO:
         case SPRN_WPAR_GEKKO:
         case SPRN_MSSSR0:
+       case SPRN_DABR:
                 *spr_val = 0;
                 break;
         default:
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c

index 7629cd3eb91ad69e9460204a7f7c2e8d081a1084..b0ee3bc9ca763c475e12edf3bcad79275dc92203 100644 (file)
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -680,13 +680,12 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
  }
  
  int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
-                                  struct kvm_sregs *sregs)
+                                 struct kvm_sregs *sregs)
  {
         int i;
  
-       sregs->pvr = vcpu->arch.pvr;
-
         memset(sregs, 0, sizeof(struct kvm_sregs));
+       sregs->pvr = vcpu->arch.pvr;
         for (i = 0; i < vcpu->arch.slb_max; i++) {
                 sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
                 sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
@@ -696,7 +695,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
  }
  
  int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
-                                  struct kvm_sregs *sregs)
+                                 struct kvm_sregs *sregs)
  {
         int i, j;
  
@@ -1511,10 +1510,10 @@ static inline int lpcr_rmls(unsigned long rma_size)
  
  static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
  {
-       struct kvmppc_linear_info *ri = vma->vm_file->private_data;
         struct page *page;
+       struct kvm_rma_info *ri = vma->vm_file->private_data;
  
-       if (vmf->pgoff >= ri->npages)
+       if (vmf->pgoff >= kvm_rma_pages)
                 return VM_FAULT_SIGBUS;
  
         page = pfn_to_page(ri->base_pfn + vmf->pgoff);
@@ -1536,7 +1535,7 @@ static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
  
  static int kvm_rma_release(struct inode *inode, struct file *filp)
  {
-       struct kvmppc_linear_info *ri = filp->private_data;
+       struct kvm_rma_info *ri = filp->private_data;
  
         kvm_release_rma(ri);
         return 0;
@@ -1549,18 +1548,27 @@ static const struct file_operations kvm_rma_fops = {
  
  long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
  {
-       struct kvmppc_linear_info *ri;
         long fd;
+       struct kvm_rma_info *ri;
+       /*
+        * Only do this on PPC970 in HV mode
+        */
+       if (!cpu_has_feature(CPU_FTR_HVMODE) ||
+           !cpu_has_feature(CPU_FTR_ARCH_201))
+               return -EINVAL;
+
+       if (!kvm_rma_pages)
+               return -EINVAL;
  
         ri = kvm_alloc_rma();
         if (!ri)
                 return -ENOMEM;
  
-       fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR);
+       fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR | O_CLOEXEC);
         if (fd < 0)
                 kvm_release_rma(ri);
  
-       ret->rma_size = ri->npages << PAGE_SHIFT;
+       ret->rma_size = kvm_rma_pages << PAGE_SHIFT;
         return fd;
  }
  
@@ -1725,7 +1733,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
  {
         int err = 0;
         struct kvm *kvm = vcpu->kvm;
-       struct kvmppc_linear_info *ri = NULL;
+       struct kvm_rma_info *ri = NULL;
         unsigned long hva;
         struct kvm_memory_slot *memslot;
         struct vm_area_struct *vma;
@@ -1803,7 +1811,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
  
         } else {
                 /* Set up to use an RMO region */
-               rma_size = ri->npages;
+               rma_size = kvm_rma_pages;
                 if (rma_size > memslot->npages)
                         rma_size = memslot->npages;
                 rma_size <<= PAGE_SHIFT;
@@ -1831,14 +1839,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
                         /* POWER7 */
                         lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L);
                         lpcr |= rmls << LPCR_RMLS_SH;
-                       kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
+                       kvm->arch.rmor = ri->base_pfn << PAGE_SHIFT;
                 }
                 kvm->arch.lpcr = lpcr;
                 pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n",
                         ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
  
                 /* Initialize phys addrs of pages in RMO */
-               npages = ri->npages;
+               npages = kvm_rma_pages;
                 porder = __ilog2(npages);
                 physp = memslot->arch.slot_phys;
                 if (physp) {
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c

index ec0a9e5de1005331e7372a5c76d3a153d2a4dfd5..8cd0daebb82deea5b6785dbba4e55d12d3d6ac88 100644 (file)
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -13,33 +13,34 @@
  #include <linux/spinlock.h>
  #include <linux/bootmem.h>
  #include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/sizes.h>
  
  #include <asm/cputable.h>
  #include <asm/kvm_ppc.h>
  #include <asm/kvm_book3s.h>
  
-#define KVM_LINEAR_RMA         0
-#define KVM_LINEAR_HPT         1
-
-static void __init kvm_linear_init_one(ulong size, int count, int type);
-static struct kvmppc_linear_info *kvm_alloc_linear(int type);
-static void kvm_release_linear(struct kvmppc_linear_info *ri);
-
-int kvm_hpt_order = KVM_DEFAULT_HPT_ORDER;
-EXPORT_SYMBOL_GPL(kvm_hpt_order);
-
-/*************** RMA *************/
-
+#include "book3s_hv_cma.h"
+/*
+ * Hash page table alignment on newer CPUs (CPU_FTR_ARCH_206)
+ * should be a power of 2.
+ */
+#define HPT_ALIGN_PAGES                ((1 << 18) >> PAGE_SHIFT) /* 256k */
+/*
+ * By default we reserve 5% of memory for hash pagetable allocation.
+ */
+static unsigned long kvm_cma_resv_ratio = 5;
  /*
- * This maintains a list of RMAs (real mode areas) for KVM guests to use.
+ * We allocate RMAs (real mode areas) for KVM guests from the KVM CMA area.
   * Each RMA has to be physically contiguous and of a size that the
   * hardware supports.  PPC970 and POWER7 support 64MB, 128MB and 256MB,
   * and other larger sizes.  Since we are unlikely to be allocate that
   * much physically contiguous memory after the system is up and running,
- * we preallocate a set of RMAs in early boot for KVM to use.
+ * we preallocate a set of RMAs in early boot using CMA.
+ * The RMA size (kvm_rma_pages) should be a power of 2.
   */
-static unsigned long kvm_rma_size = 64 << 20;  /* 64MB */
-static unsigned long kvm_rma_count;
+unsigned long kvm_rma_pages = (1 << 27) >> PAGE_SHIFT; /* 128MB */
+EXPORT_SYMBOL_GPL(kvm_rma_pages);
  
  /* Work out RMLS (real mode limit selector) field value for a given RMA size.
     Assumes POWER7 or PPC970. */
@@ -69,165 +70,114 @@ static inline int lpcr_rmls(unsigned long rma_size)
  
  static int __init early_parse_rma_size(char *p)
  {
-       if (!p)
-               return 1;
+       unsigned long kvm_rma_size;
  
+       pr_debug("%s(%s)\n", __func__, p);
+       if (!p)
+               return -EINVAL;
         kvm_rma_size = memparse(p, &p);
-
+       /*
+        * Check that the requested size is one supported in hardware
+        */
+       if (lpcr_rmls(kvm_rma_size) < 0) {
+               pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size);
+               return -EINVAL;
+       }
+       kvm_rma_pages = kvm_rma_size >> PAGE_SHIFT;
         return 0;
  }
  early_param("kvm_rma_size", early_parse_rma_size);
  
-static int __init early_parse_rma_count(char *p)
+struct kvm_rma_info *kvm_alloc_rma()
  {
-       if (!p)
-               return 1;
-
-       kvm_rma_count = simple_strtoul(p, NULL, 0);
-
-       return 0;
-}
-early_param("kvm_rma_count", early_parse_rma_count);
-
-struct kvmppc_linear_info *kvm_alloc_rma(void)
-{
-       return kvm_alloc_linear(KVM_LINEAR_RMA);
+       struct page *page;
+       struct kvm_rma_info *ri;
+
+       ri = kmalloc(sizeof(struct kvm_rma_info), GFP_KERNEL);
+       if (!ri)
+               return NULL;
+       page = kvm_alloc_cma(kvm_rma_pages, kvm_rma_pages);
+       if (!page)
+               goto err_out;
+       atomic_set(&ri->use_count, 1);
+       ri->base_pfn = page_to_pfn(page);
+       return ri;
+err_out:
+       kfree(ri);
+       return NULL;
  }
  EXPORT_SYMBOL_GPL(kvm_alloc_rma);
  
-void kvm_release_rma(struct kvmppc_linear_info *ri)
+void kvm_release_rma(struct kvm_rma_info *ri)
  {
-       kvm_release_linear(ri);
+       if (atomic_dec_and_test(&ri->use_count)) {
+               kvm_release_cma(pfn_to_page(ri->base_pfn), kvm_rma_pages);
+               kfree(ri);
+       }
  }
  EXPORT_SYMBOL_GPL(kvm_release_rma);
  
-/*************** HPT *************/
-
-/*
- * This maintains a list of big linear HPT tables that contain the GVA->HPA
- * memory mappings. If we don't reserve those early on, we might not be able
- * to get a big (usually 16MB) linear memory region from the kernel anymore.
- */
-
-static unsigned long kvm_hpt_count;
-
-static int __init early_parse_hpt_count(char *p)
+static int __init early_parse_kvm_cma_resv(char *p)
  {
+       pr_debug("%s(%s)\n", __func__, p);
         if (!p)
-               return 1;
-
-       kvm_hpt_count = simple_strtoul(p, NULL, 0);
-
-       return 0;
+               return -EINVAL;
+       return kstrtoul(p, 0, &kvm_cma_resv_ratio);
  }
-early_param("kvm_hpt_count", early_parse_hpt_count);
+early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv);
  
-struct kvmppc_linear_info *kvm_alloc_hpt(void)
+struct page *kvm_alloc_hpt(unsigned long nr_pages)
  {
-       return kvm_alloc_linear(KVM_LINEAR_HPT);
+       unsigned long align_pages = HPT_ALIGN_PAGES;
+
+       /* Old CPUs require HPT aligned on a multiple of its size */
+       if (!cpu_has_feature(CPU_FTR_ARCH_206))
+               align_pages = nr_pages;
+       return kvm_alloc_cma(nr_pages, align_pages);
  }
  EXPORT_SYMBOL_GPL(kvm_alloc_hpt);
  
-void kvm_release_hpt(struct kvmppc_linear_info *li)
+void kvm_release_hpt(struct page *page, unsigned long nr_pages)
  {
-       kvm_release_linear(li);
+       kvm_release_cma(page, nr_pages);
  }
  EXPORT_SYMBOL_GPL(kvm_release_hpt);
  
-/*************** generic *************/
-
-static LIST_HEAD(free_linears);
-static DEFINE_SPINLOCK(linear_lock);
-
-static void __init kvm_linear_init_one(ulong size, int count, int type)
-{
-       unsigned long i;
-       unsigned long j, npages;
-       void *linear;
-       struct page *pg;
-       const char *typestr;
-       struct kvmppc_linear_info *linear_info;
-
-       if (!count)
-               return;
-
-       typestr = (type == KVM_LINEAR_RMA) ? "RMA" : "HPT";
-
-       npages = size >> PAGE_SHIFT;
-       linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info));
-       for (i = 0; i < count; ++i) {
-               linear = alloc_bootmem_align(size, size);
-               pr_debug("Allocated KVM %s at %p (%ld MB)\n", typestr, linear,
-                        size >> 20);
-               linear_info[i].base_virt = linear;
-               linear_info[i].base_pfn = __pa(linear) >> PAGE_SHIFT;
-               linear_info[i].npages = npages;
-               linear_info[i].type = type;
-               list_add_tail(&linear_info[i].list, &free_linears);
-               atomic_set(&linear_info[i].use_count, 0);
-
-               pg = pfn_to_page(linear_info[i].base_pfn);
-               for (j = 0; j < npages; ++j) {
-                       atomic_inc(&pg->_count);
-                       ++pg;
-               }
-       }
-}
-
-static struct kvmppc_linear_info *kvm_alloc_linear(int type)
-{
-       struct kvmppc_linear_info *ri, *ret;
-
-       ret = NULL;
-       spin_lock(&linear_lock);
-       list_for_each_entry(ri, &free_linears, list) {
-               if (ri->type != type)
-                       continue;
-
-               list_del(&ri->list);
-               atomic_inc(&ri->use_count);
-               memset(ri->base_virt, 0, ri->npages << PAGE_SHIFT);
-               ret = ri;
-               break;
-       }
-       spin_unlock(&linear_lock);
-       return ret;
-}
-
-static void kvm_release_linear(struct kvmppc_linear_info *ri)
-{
-       if (atomic_dec_and_test(&ri->use_count)) {
-               spin_lock(&linear_lock);
-               list_add_tail(&ri->list, &free_linears);
-               spin_unlock(&linear_lock);
-
-       }
-}
-
-/*
- * Called at boot time while the bootmem allocator is active,
- * to allocate contiguous physical memory for the hash page
- * tables for guests.
+/**
+ * kvm_cma_reserve() - reserve area for kvm hash pagetable
+ *
+ * This function reserves memory from early allocator. It should be
+ * called by arch specific code once the early allocator (memblock or bootmem)
+ * has been activated and all other subsystems have already allocated/reserved
+ * memory.
   */
-void __init kvm_linear_init(void)
+void __init kvm_cma_reserve(void)
  {
-       /* HPT */
-       kvm_linear_init_one(1 << kvm_hpt_order, kvm_hpt_count, KVM_LINEAR_HPT);
-
-       /* RMA */
-       /* Only do this on PPC970 in HV mode */
-       if (!cpu_has_feature(CPU_FTR_HVMODE) ||
-           !cpu_has_feature(CPU_FTR_ARCH_201))
-               return;
-
-       if (!kvm_rma_size || !kvm_rma_count)
-               return;
-
-       /* Check that the requested size is one supported in hardware */
-       if (lpcr_rmls(kvm_rma_size) < 0) {
-               pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size);
-               return;
+       unsigned long align_size;
+       struct memblock_region *reg;
+       phys_addr_t selected_size = 0;
+       /*
+        * We cannot use memblock_phys_mem_size() here, because
+        * memblock_analyze() has not been called yet.
+        */
+       for_each_memblock(memory, reg)
+               selected_size += memblock_region_memory_end_pfn(reg) -
+                                memblock_region_memory_base_pfn(reg);
+
+       selected_size = (selected_size * kvm_cma_resv_ratio / 100) << PAGE_SHIFT;
+       if (selected_size) {
+               pr_debug("%s: reserving %ld MiB for global area\n", __func__,
+                        (unsigned long)selected_size / SZ_1M);
+               /*
+                * Old CPUs require the HPT to be aligned on a multiple of its size,
+                * so for them use the largest size we could request as the alignment.
+                */
+               if (!cpu_has_feature(CPU_FTR_ARCH_206))
+                       align_size = __rounddown_pow_of_two(selected_size);
+               else
+                       align_size = HPT_ALIGN_PAGES << PAGE_SHIFT;
+
+               align_size = max(kvm_rma_pages << PAGE_SHIFT, align_size);
+               kvm_cma_declare_contiguous(selected_size, align_size);
         }
-
-       kvm_linear_init_one(kvm_rma_size, kvm_rma_count, KVM_LINEAR_RMA);
  }
diff --git a/arch/powerpc/kvm/book3s_hv_cma.c b/arch/powerpc/kvm/book3s_hv_cma.c

new file mode 100644 (file)

index 0000000..d9d3d85
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_cma.c
@@ -0,0 +1,240 @@
+/*
+ * Contiguous Memory Allocator for ppc KVM hash pagetable  based on CMA
+ * for DMA mapping framework
+ *
+ * Copyright IBM Corporation, 2013
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License or (at your option) any later version of the license.
+ *
+ */
+#define pr_fmt(fmt) "kvm_cma: " fmt
+
+#ifdef CONFIG_CMA_DEBUG
+#ifndef DEBUG
+#  define DEBUG
+#endif
+#endif
+
+#include <linux/memblock.h>
+#include <linux/mutex.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+
+#include "book3s_hv_cma.h"
+
+struct kvm_cma {
+       unsigned long   base_pfn;
+       unsigned long   count;
+       unsigned long   *bitmap;
+};
+
+static DEFINE_MUTEX(kvm_cma_mutex);
+static struct kvm_cma kvm_cma_area;
+
+/**
+ * kvm_cma_declare_contiguous() - reserve area for contiguous memory handling
+ *                               for kvm hash pagetable
+ * @size:  Size of the reserved memory.
+ * @alignment:  Alignment for the contiguous memory area
+ *
+ * This function reserves memory for kvm cma area. It should be
+ * called by arch code when early allocator (memblock or bootmem)
+ * is still active.
+ */
+long __init kvm_cma_declare_contiguous(phys_addr_t size, phys_addr_t alignment)
+{
+       long base_pfn;
+       phys_addr_t addr;
+       struct kvm_cma *cma = &kvm_cma_area;
+
+       pr_debug("%s(size %lx)\n", __func__, (unsigned long)size);
+
+       if (!size)
+               return -EINVAL;
+       /*
+        * Sanitise input arguments.
+        * We should be pageblock aligned for CMA.
+        */
+       alignment = max(alignment, (phys_addr_t)(PAGE_SIZE << pageblock_order));
+       size = ALIGN(size, alignment);
+       /*
+        * Reserve memory
+        * Use __memblock_alloc_base() since
+        * memblock_alloc_base() panic()s.
+        */
+       addr = __memblock_alloc_base(size, alignment, 0);
+       if (!addr) {
+               base_pfn = -ENOMEM;
+               goto err;
+       } else
+               base_pfn = PFN_DOWN(addr);
+
+       /*
+        * Each reserved area must be initialised later, when more kernel
+        * subsystems (like slab allocator) are available.
+        */
+       cma->base_pfn = base_pfn;
+       cma->count    = size >> PAGE_SHIFT;
+       pr_info("CMA: reserved %ld MiB\n", (unsigned long)size / SZ_1M);
+       return 0;
+err:
+       pr_err("CMA: failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
+       return base_pfn;
+}
+
+/**
+ * kvm_alloc_cma() - allocate pages from contiguous area
+ * @nr_pages: Requested number of pages.
+ * @align_pages: Requested alignment in number of pages
+ *
+ * This function allocates memory buffer for hash pagetable.
+ */
+struct page *kvm_alloc_cma(unsigned long nr_pages, unsigned long align_pages)
+{
+       int ret;
+       struct page *page = NULL;
+       struct kvm_cma *cma = &kvm_cma_area;
+       unsigned long chunk_count, nr_chunk;
+       unsigned long mask, pfn, pageno, start = 0;
+
+
+       if (!cma || !cma->count)
+               return NULL;
+
+       pr_debug("%s(cma %p, count %lu, align pages %lu)\n", __func__,
+                (void *)cma, nr_pages, align_pages);
+
+       if (!nr_pages)
+               return NULL;
+       /*
+        * Align the mask to the chunk size; each bitmap bit tracks one chunk of pages.
+        */
+       VM_BUG_ON(!is_power_of_2(align_pages));
+       mask = (align_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)) - 1;
+       BUILD_BUG_ON(PAGE_SHIFT > KVM_CMA_CHUNK_ORDER);
+
+       chunk_count = cma->count >>  (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
+       nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
+
+       mutex_lock(&kvm_cma_mutex);
+       for (;;) {
+               pageno = bitmap_find_next_zero_area(cma->bitmap, chunk_count,
+                                                   start, nr_chunk, mask);
+               if (pageno >= chunk_count)
+                       break;
+
+               pfn = cma->base_pfn + (pageno << (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT));
+               ret = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_CMA);
+               if (ret == 0) {
+                       bitmap_set(cma->bitmap, pageno, nr_chunk);
+                       page = pfn_to_page(pfn);
+                       memset(pfn_to_kaddr(pfn), 0, nr_pages << PAGE_SHIFT);
+                       break;
+               } else if (ret != -EBUSY) {
+                       break;
+               }
+               pr_debug("%s(): memory range at %p is busy, retrying\n",
+                        __func__, pfn_to_page(pfn));
+               /* try again with a bit different memory target */
+               start = pageno + mask + 1;
+       }
+       mutex_unlock(&kvm_cma_mutex);
+       pr_debug("%s(): returned %p\n", __func__, page);
+       return page;
+}
+
+/**
+ * kvm_release_cma() - release allocated pages for hash pagetable
+ * @pages: Allocated pages.
+ * @nr_pages: Number of allocated pages.
+ *
+ * This function releases memory allocated by kvm_alloc_cma().
+ * It returns false when provided pages do not belong to contiguous area and
+ * true otherwise.
+ */
+bool kvm_release_cma(struct page *pages, unsigned long nr_pages)
+{
+       unsigned long pfn;
+       unsigned long nr_chunk;
+       struct kvm_cma *cma = &kvm_cma_area;
+
+       if (!cma || !pages)
+               return false;
+
+       pr_debug("%s(page %p count %lu)\n", __func__, (void *)pages, nr_pages);
+
+       pfn = page_to_pfn(pages);
+
+       if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
+               return false;
+
+       VM_BUG_ON(pfn + nr_pages > cma->base_pfn + cma->count);
+       nr_chunk = nr_pages >>  (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
+
+       mutex_lock(&kvm_cma_mutex);
+       bitmap_clear(cma->bitmap,
+                    (pfn - cma->base_pfn) >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT),
+                    nr_chunk);
+       free_contig_range(pfn, nr_pages);
+       mutex_unlock(&kvm_cma_mutex);
+
+       return true;
+}
+
+static int __init kvm_cma_activate_area(unsigned long base_pfn,
+                                       unsigned long count)
+{
+       unsigned long pfn = base_pfn;
+       unsigned i = count >> pageblock_order;
+       struct zone *zone;
+
+       WARN_ON_ONCE(!pfn_valid(pfn));
+       zone = page_zone(pfn_to_page(pfn));
+       do {
+               unsigned j;
+               base_pfn = pfn;
+               for (j = pageblock_nr_pages; j; --j, pfn++) {
+                       WARN_ON_ONCE(!pfn_valid(pfn));
+                       /*
+                        * alloc_contig_range requires the pfn range
+                        * specified to be in the same zone. Make this
+                        * simple by forcing the entire CMA resv range
+                        * to be in the same zone.
+                        */
+                       if (page_zone(pfn_to_page(pfn)) != zone)
+                               return -EINVAL;
+               }
+               init_cma_reserved_pageblock(pfn_to_page(base_pfn));
+       } while (--i);
+       return 0;
+}
+
+static int __init kvm_cma_init_reserved_areas(void)
+{
+       int bitmap_size, ret;
+       unsigned long chunk_count;
+       struct kvm_cma *cma = &kvm_cma_area;
+
+       pr_debug("%s()\n", __func__);
+       if (!cma->count)
+               return 0;
+       chunk_count = cma->count >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
+       bitmap_size = BITS_TO_LONGS(chunk_count) * sizeof(long);
+       cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+       if (!cma->bitmap)
+               return -ENOMEM;
+
+       ret = kvm_cma_activate_area(cma->base_pfn, cma->count);
+       if (ret)
+               goto error;
+       return 0;
+
+error:
+       kfree(cma->bitmap);
+       return ret;
+}
+core_initcall(kvm_cma_init_reserved_areas);
diff --git a/arch/powerpc/kvm/book3s_hv_cma.h b/arch/powerpc/kvm/book3s_hv_cma.h

new file mode 100644 (file)

index 0000000..655144f
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_cma.h
@@ -0,0 +1,27 @@
+/*
+ * Contiguous Memory Allocator for ppc KVM hash pagetable  based on CMA
+ * for DMA mapping framework
+ *
+ * Copyright IBM Corporation, 2013
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License or (at your option) any later version of the license.
+ *
+ */
+
+#ifndef __POWERPC_KVM_CMA_ALLOC_H__
+#define __POWERPC_KVM_CMA_ALLOC_H__
+/*
+ * Both RMA and Hash page allocation will be multiple of 256K.
+ */
+#define KVM_CMA_CHUNK_ORDER    18
+
+extern struct page *kvm_alloc_cma(unsigned long nr_pages,
+                                 unsigned long align_pages);
+extern bool kvm_release_cma(struct page *pages, unsigned long nr_pages);
+extern long kvm_cma_declare_contiguous(phys_addr_t size,
+                                      phys_addr_t alignment) __init;
+#endif
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c

index fc25689a9f35076e61d83ca024a08e2bdf7564c1..45e30d6e462b02e03894917a3abe8ae93d83f25a 100644 (file)
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -383,6 +383,80 @@ static inline int try_lock_tlbie(unsigned int *lock)
         return old == 0;
  }
  
+/*
+ * tlbie/tlbiel is a bit different on the PPC970 compared to later
+ * processors such as POWER7; the large page bit is in the instruction
+ * not RB, and the top 16 bits and the bottom 12 bits of the VA
+ * in RB must be 0.
+ */
+static void do_tlbies_970(struct kvm *kvm, unsigned long *rbvalues,
+                         long npages, int global, bool need_sync)
+{
+       long i;
+
+       if (global) {
+               while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
+                       cpu_relax();
+               if (need_sync)
+                       asm volatile("ptesync" : : : "memory");
+               for (i = 0; i < npages; ++i) {
+                       unsigned long rb = rbvalues[i];
+
+                       if (rb & 1)             /* large page */
+                               asm volatile("tlbie %0,1" : :
+                                            "r" (rb & 0x0000fffffffff000ul));
+                       else
+                               asm volatile("tlbie %0,0" : :
+                                            "r" (rb & 0x0000fffffffff000ul));
+               }
+               asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+               kvm->arch.tlbie_lock = 0;
+       } else {
+               if (need_sync)
+                       asm volatile("ptesync" : : : "memory");
+               for (i = 0; i < npages; ++i) {
+                       unsigned long rb = rbvalues[i];
+
+                       if (rb & 1)             /* large page */
+                               asm volatile("tlbiel %0,1" : :
+                                            "r" (rb & 0x0000fffffffff000ul));
+                       else
+                               asm volatile("tlbiel %0,0" : :
+                                            "r" (rb & 0x0000fffffffff000ul));
+               }
+               asm volatile("ptesync" : : : "memory");
+       }
+}
+
+static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
+                     long npages, int global, bool need_sync)
+{
+       long i;
+
+       if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+               /* PPC970 tlbie instruction is a bit different */
+               do_tlbies_970(kvm, rbvalues, npages, global, need_sync);
+               return;
+       }
+       if (global) {
+               while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
+                       cpu_relax();
+               if (need_sync)
+                       asm volatile("ptesync" : : : "memory");
+               for (i = 0; i < npages; ++i)
+                       asm volatile(PPC_TLBIE(%1,%0) : :
+                                    "r" (rbvalues[i]), "r" (kvm->arch.lpid));
+               asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+               kvm->arch.tlbie_lock = 0;
+       } else {
+               if (need_sync)
+                       asm volatile("ptesync" : : : "memory");
+               for (i = 0; i < npages; ++i)
+                       asm volatile("tlbiel %0" : : "r" (rbvalues[i]));
+               asm volatile("ptesync" : : : "memory");
+       }
+}
+
  long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
                         unsigned long pte_index, unsigned long avpn,
                         unsigned long *hpret)
@@ -408,19 +482,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
         if (v & HPTE_V_VALID) {
                 hpte[0] &= ~HPTE_V_VALID;
                 rb = compute_tlbie_rb(v, hpte[1], pte_index);
-               if (global_invalidates(kvm, flags)) {
-                       while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
-                               cpu_relax();
-                       asm volatile("ptesync" : : : "memory");
-                       asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
-                                    : : "r" (rb), "r" (kvm->arch.lpid));
-                       asm volatile("ptesync" : : : "memory");
-                       kvm->arch.tlbie_lock = 0;
-               } else {
-                       asm volatile("ptesync" : : : "memory");
-                       asm volatile("tlbiel %0" : : "r" (rb));
-                       asm volatile("ptesync" : : : "memory");
-               }
+               do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
                 /* Read PTE low word after tlbie to get final R/C values */
                 remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]);
         }
@@ -448,12 +510,11 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
         unsigned long *hp, *hptes[4], tlbrb[4];
         long int i, j, k, n, found, indexes[4];
         unsigned long flags, req, pte_index, rcbits;
-       long int local = 0;
+       int global;
         long int ret = H_SUCCESS;
         struct revmap_entry *rev, *revs[4];
  
-       if (atomic_read(&kvm->online_vcpus) == 1)
-               local = 1;
+       global = global_invalidates(kvm, 0);
         for (i = 0; i < 4 && ret == H_SUCCESS; ) {
                 n = 0;
                 for (; i < 4; ++i) {
@@ -529,22 +590,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
                         break;
  
                 /* Now that we've collected a batch, do the tlbies */
-               if (!local) {
-                       while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
-                               cpu_relax();
-                       asm volatile("ptesync" : : : "memory");
-                       for (k = 0; k < n; ++k)
-                               asm volatile(PPC_TLBIE(%1,%0) : :
-                                            "r" (tlbrb[k]),
-                                            "r" (kvm->arch.lpid));
-                       asm volatile("eieio; tlbsync; ptesync" : : : "memory");
-                       kvm->arch.tlbie_lock = 0;
-               } else {
-                       asm volatile("ptesync" : : : "memory");
-                       for (k = 0; k < n; ++k)
-                               asm volatile("tlbiel %0" : : "r" (tlbrb[k]));
-                       asm volatile("ptesync" : : : "memory");
-               }
+               do_tlbies(kvm, tlbrb, n, global, true);
  
                 /* Read PTE low words after tlbie to get final R/C values */
                 for (k = 0; k < n; ++k) {
@@ -603,19 +649,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
         if (v & HPTE_V_VALID) {
                 rb = compute_tlbie_rb(v, r, pte_index);
                 hpte[0] = v & ~HPTE_V_VALID;
-               if (global_invalidates(kvm, flags)) {
-                       while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
-                               cpu_relax();
-                       asm volatile("ptesync" : : : "memory");
-                       asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
-                                    : : "r" (rb), "r" (kvm->arch.lpid));
-                       asm volatile("ptesync" : : : "memory");
-                       kvm->arch.tlbie_lock = 0;
-               } else {
-                       asm volatile("ptesync" : : : "memory");
-                       asm volatile("tlbiel %0" : : "r" (rb));
-                       asm volatile("ptesync" : : : "memory");
-               }
+               do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
                 /*
                  * If the host has this page as readonly but the guest
                  * wants to make it read/write, reduce the permissions.
@@ -686,13 +720,7 @@ void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep,
  
         hptep[0] &= ~HPTE_V_VALID;
         rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index);
-       while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
-               cpu_relax();
-       asm volatile("ptesync" : : : "memory");
-       asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
-                    : : "r" (rb), "r" (kvm->arch.lpid));
-       asm volatile("ptesync" : : : "memory");
-       kvm->arch.tlbie_lock = 0;
+       do_tlbies(kvm, &rb, 1, 1, true);
  }
  EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte);
  
@@ -706,12 +734,7 @@ void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep,
         rbyte = (hptep[1] & ~HPTE_R_R) >> 8;
         /* modify only the second-last byte, which contains the ref bit */
         *((char *)hptep + 14) = rbyte;
-       while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
-               cpu_relax();
-       asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
-                    : : "r" (rb), "r" (kvm->arch.lpid));
-       asm volatile("ptesync" : : : "memory");
-       kvm->arch.tlbie_lock = 0;
+       do_tlbies(kvm, &rb, 1, 1, false);
  }
  EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte);
  
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S

index b02f91e4c70dc5341bc450fa68f22b102ccd462e..60dce5bfab3fe25e7df25603719f712b0a1c88a6 100644 (file)
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1381,7 +1381,7 @@ hcall_try_real_mode:
         cmpldi  r3,hcall_real_table_end - hcall_real_table
         bge     guest_exit_cont
         LOAD_REG_ADDR(r4, hcall_real_table)
-       lwzx    r3,r3,r4
+       lwax    r3,r3,r4
         cmpwi   r3,0
         beq     guest_exit_cont
         add     r3,r3,r4
diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S

index 48cbbf862958f8e3eea8b82932ec03cf231595b0..17cfae5497a3384d756fb2ed63b775758e6efb19 100644 (file)
--- a/arch/powerpc/kvm/book3s_interrupts.S
+++ b/arch/powerpc/kvm/book3s_interrupts.S
@@ -92,6 +92,11 @@ kvm_start_lightweight:
         PPC_LL  r3, VCPU_HFLAGS(r4)
         rldicl  r3, r3, 0, 63           /* r3 &= 1 */
         stb     r3, HSTATE_RESTORE_HID5(r13)
+
+       /* Load up guest SPRG3 value, since it's user readable */
+       ld      r3, VCPU_SHARED(r4)
+       ld      r3, VCPU_SHARED_SPRG3(r3)
+       mtspr   SPRN_SPRG3, r3
  #endif /* CONFIG_PPC_BOOK3S_64 */
  
         PPC_LL  r4, VCPU_SHADOW_MSR(r4) /* get shadow_msr */
@@ -123,6 +128,15 @@ kvmppc_handler_highmem:
         /* R7 = vcpu */
         PPC_LL  r7, GPR4(r1)
  
+#ifdef CONFIG_PPC_BOOK3S_64
+       /*
+        * Reload kernel SPRG3 value.
+        * No need to save guest value as usermode can't modify SPRG3.
+        */
+       ld      r3, PACA_SPRG3(r13)
+       mtspr   SPRN_SPRG3, r3
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
         PPC_STL r14, VCPU_GPR(R14)(r7)
         PPC_STL r15, VCPU_GPR(R15)(r7)
         PPC_STL r16, VCPU_GPR(R16)(r7)
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c

index c6e13d9a9e15b0be5f2b75a1151c7eea4b93b8e5..27db1e66595987a99e2f387819345af30baad739 100644 (file)
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -468,7 +468,8 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
                  * both the traditional FP registers and the added VSX
                  * registers into thread.fpr[].
                  */
-               giveup_fpu(current);
+               if (current->thread.regs->msr & MSR_FP)
+                       giveup_fpu(current);
                 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
                         vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];
  
@@ -483,7 +484,8 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
  
  #ifdef CONFIG_ALTIVEC
         if (msr & MSR_VEC) {
-               giveup_altivec(current);
+               if (current->thread.regs->msr & MSR_VEC)
+                       giveup_altivec(current);
                 memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr));
                 vcpu->arch.vscr = t->vscr;
         }
@@ -575,8 +577,6 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
         printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
  #endif
  
-       current->thread.regs->msr |= msr;
-
         if (msr & MSR_FP) {
                 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
                         thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
@@ -598,12 +598,32 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
  #endif
         }
  
+       current->thread.regs->msr |= msr;
         vcpu->arch.guest_owned_ext |= msr;
         kvmppc_recalc_shadow_msr(vcpu);
  
         return RESUME_GUEST;
  }
  
+/*
+ * Kernel code using FP or VMX could have flushed guest state to
+ * the thread_struct; if so, get it back now.
+ *
+ * A facility counts as "lost" when the vcpu still records it in
+ * guest_owned_ext but the matching bit is no longer set in the current
+ * thread's saved MSR.  Each lost facility is reloaded and its MSR bit
+ * is set again; nothing is done when no facility was lost.
+ */
+static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu)
+{
+       unsigned long lost_ext;
+
+       lost_ext = vcpu->arch.guest_owned_ext & ~current->thread.regs->msr;
+       if (!lost_ext)
+               return;
+
+       if (lost_ext & MSR_FP)
+               kvmppc_load_up_fpu();
+#ifdef CONFIG_ALTIVEC
+       /*
+        * Guarded like the other AltiVec paths in this file so that
+        * kernels built without CONFIG_ALTIVEC do not reference
+        * kvmppc_load_up_altivec().
+        */
+       if (lost_ext & MSR_VEC)
+               kvmppc_load_up_altivec();
+#endif
+       current->thread.regs->msr |= lost_ext;
+}
+
  int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                         unsigned int exit_nr)
  {
@@ -772,7 +792,7 @@ program_interrupt:
         }
         case BOOK3S_INTERRUPT_SYSCALL:
                 if (vcpu->arch.papr_enabled &&
-                   (kvmppc_get_last_inst(vcpu) == 0x44000022) &&
+                   (kvmppc_get_last_sc(vcpu) == 0x44000022) &&
                     !(vcpu->arch.shared->msr & MSR_PR)) {
                         /* SC 1 papr hypercalls */
                         ulong cmd = kvmppc_get_gpr(vcpu, 3);
@@ -890,8 +910,9 @@ program_interrupt:
                         local_irq_enable();
                         r = s;
                 } else {
-                       kvmppc_lazy_ee_enable();
+                       kvmppc_fix_ee_before_entry();
                 }
+               kvmppc_handle_lost_ext(vcpu);
         }
  
         trace_kvm_book3s_reenter(r, vcpu);
@@ -1162,7 +1183,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
         if (vcpu->arch.shared->msr & MSR_FP)
                 kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
  
-       kvmppc_lazy_ee_enable();
+       kvmppc_fix_ee_before_entry();
  
         ret = __kvmppc_vcpu_run(kvm_run, vcpu);
  
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c

index 94c1dd46b83d54e98a96c6af3b2e682072f027e6..a3a5cb8ee7eac2baa68cfb809e6a4eebce5457a9 100644 (file)
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -19,6 +19,7 @@
  #include <asm/hvcall.h>
  #include <asm/xics.h>
  #include <asm/debug.h>
+#include <asm/time.h>
  
  #include <linux/debugfs.h>
  #include <linux/seq_file.h>
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c

index dcc94f016007f485e769f189cce610c77734f56f..17722d82f1d1f500bd5579f544cf29b18e6cbd80 100644 (file)
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -674,8 +674,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                 goto out;
         }
  
-       kvm_guest_enter();
-
  #ifdef CONFIG_PPC_FPU
         /* Save userspace FPU state in stack */
         enable_kernel_fp();
@@ -698,7 +696,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
         kvmppc_load_guest_fp(vcpu);
  #endif
  
-       kvmppc_lazy_ee_enable();
+       kvmppc_fix_ee_before_entry();
  
         ret = __kvmppc_vcpu_run(kvm_run, vcpu);
  
@@ -1168,7 +1166,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                         local_irq_enable();
                         r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
                 } else {
-                       kvmppc_lazy_ee_enable();
+                       kvmppc_fix_ee_before_entry();
                 }
         }
  
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c

index 6316ee336e888e22636f557d1623c54b30d7a207..f55e14cd1762192e2950689d5ee41e2e35698f2b 100644 (file)
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -117,8 +117,6 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
                         kvm_guest_exit();
                         continue;
                 }
-
-               trace_hardirqs_on();
  #endif
  
                 kvm_guest_enter();
@@ -420,6 +418,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
         return kvmppc_core_create_memslot(slot, npages);
  }
  
+/*
+ * Arch hook taking the VM; this implementation has an empty body and
+ * does no work.  Presumably called by generic KVM code after the
+ * memslots change -- confirm against the caller.
+ */
+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+}
+
  int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                    struct kvm_memory_slot *memslot,
                                    struct kvm_userspace_memory_region *mem,
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h

index 3238d4004e8436aa1a7d1e5eb3d811fe065d0dcb..e87ecaa2c569860f0c9353fc3ed398a0eba034ad 100644 (file)
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -274,6 +274,14 @@ struct kvm_arch{
         int css_support;
  };
  
+#define KVM_HVA_ERR_BAD                (-1UL)
+#define KVM_HVA_ERR_RO_BAD     (-2UL)
+
+/*
+ * Report whether @addr is an error value rather than a usable host
+ * virtual address; the decision is delegated entirely to IS_ERR_VALUE().
+ */
+static inline bool kvm_is_error_hva(unsigned long addr)
+{
+       return IS_ERR_VALUE(addr);
+}
+
  extern int sie64a(struct kvm_s390_sie_block *, u64 *);
  extern char sie_exit;
  #endif
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h

index 6340178748bf470bcb724eaa1e7feb066e261884..ff132ac64ddd0609f16c22021732c00355318bb3 100644 (file)
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -12,8 +12,6 @@ typedef struct {
         unsigned long asce_bits;
         unsigned long asce_limit;
         unsigned long vdso_base;
-       /* Cloned contexts will be created with extended page tables. */
-       unsigned int alloc_pgste:1;
         /* The mmu context has extended page tables. */
         unsigned int has_pgste:1;
  } mm_context_t;
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h

index 7b7fce4e846941832282adb57e760e49c506ac0a..9f973d8de90ea91fbde12f11cff70c8470bf8a81 100644 (file)
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -21,24 +21,7 @@ static inline int init_new_context(struct task_struct *tsk,
  #ifdef CONFIG_64BIT
         mm->context.asce_bits |= _ASCE_TYPE_REGION3;
  #endif
-       if (current->mm && current->mm->context.alloc_pgste) {
-               /*
-                * alloc_pgste indicates, that any NEW context will be created
-                * with extended page tables. The old context is unchanged. The
-                * page table allocation and the page table operations will
-                * look at has_pgste to distinguish normal and extended page
-                * tables. The only way to create extended page tables is to
-                * set alloc_pgste and then create a new context (e.g. dup_mm).
-                * The page table allocation is called after init_new_context
-                * and if has_pgste is set, it will create extended page
-                * tables.
-                */
-               mm->context.has_pgste = 1;
-               mm->context.alloc_pgste = 1;
-       } else {
-               mm->context.has_pgste = 0;
-               mm->context.alloc_pgste = 0;
-       }
+       mm->context.has_pgste = 0;
         mm->context.asce_limit = STACK_TOP_MAX;
         crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
         return 0;
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h

index 9f215b40109e1c4d9df5bb0aa6da36e5be3213ba..9b60a36c348d5422dc325463bcb26efeee64161d 100644 (file)
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1442,6 +1442,17 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
  }
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLB_PAGE */
  
+/*
+ * Flush the pmd entry at @pmdp for @address, deferring the flush when
+ * the attach count allows it.
+ *
+ * The low 16 bits of mm->context.attach_count are compared against 1
+ * when @mm is current->active_mm and against 0 otherwise.  If the count
+ * is larger, __pmd_idte() is called right away; if not, only
+ * mm->context.flush_mm is set.
+ */
+static inline void pmdp_flush_lazy(struct mm_struct *mm,
+                                  unsigned long address, pmd_t *pmdp)
+{
+       int self = (mm == current->active_mm);
+       int attached = atomic_read(&mm->context.attach_count) & 0xffff;
+
+       if (attached <= self) {
+               mm->context.flush_mm = 1;
+               return;
+       }
+       __pmd_idte(address, pmdp);
+}
+
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  
  #define __HAVE_ARCH_PGTABLE_DEPOSIT
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h

index b0e6435b2f02195e60303a9c46f0c199220c5225..0eb37505cab11c71f083ed508f02c98a72127e95 100644 (file)
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -43,6 +43,7 @@ extern void execve_tail(void);
  #ifndef CONFIG_64BIT
  
  #define TASK_SIZE              (1UL << 31)
+#define TASK_MAX_SIZE          (1UL << 31)
  #define TASK_UNMAPPED_BASE     (1UL << 30)
  
  #else /* CONFIG_64BIT */
@@ -51,6 +52,7 @@ extern void execve_tail(void);
  #define TASK_UNMAPPED_BASE     (test_thread_flag(TIF_31BIT) ? \
                                         (1UL << 30) : (1UL << 41))
  #define TASK_SIZE              TASK_SIZE_OF(current)
+#define TASK_MAX_SIZE          (1UL << 53)
  
  #endif /* CONFIG_64BIT */
  
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c

index 3074475c8ae062dbfa7c285b7217fbf2f2c87241..3a74d8af0d69dd2e3bac77a427a6bfd64723bae9 100644 (file)
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -119,12 +119,21 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
          * The layout is as follows:
          * - gpr 2 contains the subchannel id (passed as addr)
          * - gpr 3 contains the virtqueue index (passed as datamatch)
+        * - gpr 4 contains the index on the bus (optionally)
          */
-       ret = kvm_io_bus_write(vcpu->kvm, KVM_VIRTIO_CCW_NOTIFY_BUS,
-                               vcpu->run->s.regs.gprs[2],
-                               8, &vcpu->run->s.regs.gprs[3]);
+       ret = kvm_io_bus_write_cookie(vcpu->kvm, KVM_VIRTIO_CCW_NOTIFY_BUS,
+                                     vcpu->run->s.regs.gprs[2],
+                                     8, &vcpu->run->s.regs.gprs[3],
+                                     vcpu->run->s.regs.gprs[4]);
         srcu_read_unlock(&vcpu->kvm->srcu, idx);
-       /* kvm_io_bus_write returns -EOPNOTSUPP if it found no match. */
+
+       /*
+        * Return cookie in gpr 2, but don't overwrite the register if the
+        * diagnose will be handled by userspace.
+        */
+       if (ret != -EOPNOTSUPP)
+               vcpu->run->s.regs.gprs[2] = ret;
+       /* kvm_io_bus_write_cookie returns -EOPNOTSUPP if it found no match. */
         return ret < 0 ? ret : 0;
  }
  
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c

index 34c1c9a90be288d9080d93373660d020bedf2a90..776dafe918db30b8c3f4823b8bf11c461bed6f5f 100644 (file)
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -28,6 +28,7 @@
  #include <asm/pgtable.h>
  #include <asm/nmi.h>
  #include <asm/switch_to.h>
+#include <asm/facility.h>
  #include <asm/sclp.h>
  #include "kvm-s390.h"
  #include "gaccess.h"
@@ -84,9 +85,15 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
         { NULL }
  };
  
-static unsigned long long *facilities;
+unsigned long *vfacilities;
  static struct gmap_notifier gmap_notifier;
  
+/*
+ * Test availability of vfacility @nr: returns the result of
+ * __test_facility() evaluated against the global vfacilities list.
+ */
+static inline int test_vfacility(unsigned long nr)
+{
+       return __test_facility(nr, (void *) vfacilities);
+}
+
  /* Section: not file related */
  int kvm_arch_hardware_enable(void *garbage)
  {
@@ -387,7 +394,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
         vcpu->arch.sie_block->ecb   = 6;
         vcpu->arch.sie_block->ecb2  = 8;
         vcpu->arch.sie_block->eca   = 0xC1002001U;
-       vcpu->arch.sie_block->fac   = (int) (long) facilities;
+       vcpu->arch.sie_block->fac   = (int) (long) vfacilities;
         hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
         tasklet_init(&vcpu->arch.tasklet, kvm_s390_tasklet,
                      (unsigned long) vcpu);
@@ -1063,6 +1070,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
         return 0;
  }
  
+/*
+ * Arch hook taking the VM; this implementation has an empty body and
+ * does no work.  Presumably called by generic KVM code after the
+ * memslots change -- confirm against the caller.
+ */
+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+}
+
  /* Section: memory related */
  int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                    struct kvm_memory_slot *memslot,
@@ -1129,20 +1140,20 @@ static int __init kvm_s390_init(void)
          * to hold the maximum amount of facilities. On the other hand, we
          * only set facilities that are known to work in KVM.
          */
-       facilities = (unsigned long long *) get_zeroed_page(GFP_KERNEL|GFP_DMA);
-       if (!facilities) {
+       vfacilities = (unsigned long *) get_zeroed_page(GFP_KERNEL|GFP_DMA);
+       if (!vfacilities) {
                 kvm_exit();
                 return -ENOMEM;
         }
-       memcpy(facilities, S390_lowcore.stfle_fac_list, 16);
-       facilities[0] &= 0xff82fff3f47c0000ULL;
-       facilities[1] &= 0x001c000000000000ULL;
+       memcpy(vfacilities, S390_lowcore.stfle_fac_list, 16);
+       vfacilities[0] &= 0xff82fff3f47c0000UL;
+       vfacilities[1] &= 0x001c000000000000UL;
         return 0;
  }
  
  static void __exit kvm_s390_exit(void)
  {
-       free_page((unsigned long) facilities);
+       free_page((unsigned long) vfacilities);
         kvm_exit();
  }
  
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h

index 028ca9fd2158f4ab015e40236d70b3acbf53c6e5..dc99f1ca42678768e5e0241cbeaa20de4e0aae87 100644 (file)
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -24,6 +24,9 @@
  
  typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
  
+/* declare vfacilities extern */
+extern unsigned long *vfacilities;
+
  /* negativ values are error codes, positive values for internal conditions */
  #define SIE_INTERCEPT_RERUNVCPU                (1<<0)
  #define SIE_INTERCEPT_UCONTROL         (1<<1)
@@ -112,6 +115,13 @@ static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu)
         return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
  }
  
+/*
+ * Set the condition code in the guest program status word.
+ *
+ * The two-bit field at bit position 44 of gpsw.mask is cleared and then
+ * loaded with @cc.  @cc is masked to two bits first so that a stray
+ * larger value cannot spill into neighbouring PSW mask bits.
+ */
+static inline void kvm_s390_set_psw_cc(struct kvm_vcpu *vcpu, unsigned long cc)
+{
+       vcpu->arch.sie_block->gpsw.mask &= ~(3UL << 44);
+       vcpu->arch.sie_block->gpsw.mask |= (cc & 3UL) << 44;
+}
+
  int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
  enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
  void kvm_s390_tasklet(unsigned long parm);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c

index 4cdc54e63ebcb366786347de18437f327ee8e2b2..59200ee275e568ae99d593484b5575be320b79b8 100644 (file)
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -164,8 +164,7 @@ static int handle_tpi(struct kvm_vcpu *vcpu)
         kfree(inti);
  no_interrupt:
         /* Set condition code and we're done. */
-       vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
-       vcpu->arch.sie_block->gpsw.mask |= (cc & 3ul) << 44;
+       kvm_s390_set_psw_cc(vcpu, cc);
         return 0;
  }
  
@@ -220,15 +219,13 @@ static int handle_io_inst(struct kvm_vcpu *vcpu)
                  * Set condition code 3 to stop the guest from issueing channel
                  * I/O instructions.
                  */
-               vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
-               vcpu->arch.sie_block->gpsw.mask |= (3 & 3ul) << 44;
+               kvm_s390_set_psw_cc(vcpu, 3);
                 return 0;
         }
  }
  
  static int handle_stfl(struct kvm_vcpu *vcpu)
  {
-       unsigned int facility_list;
         int rc;
  
         vcpu->stat.instruction_stfl++;
@@ -236,15 +233,13 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
         if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
  
-       /* only pass the facility bits, which we can handle */
-       facility_list = S390_lowcore.stfl_fac_list & 0xff82fff3;
-
         rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
-                          &facility_list, sizeof(facility_list));
+                          vfacilities, 4);
         if (rc)
                 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-       VCPU_EVENT(vcpu, 5, "store facility list value %x", facility_list);
-       trace_kvm_s390_handle_stfl(vcpu, facility_list);
+       VCPU_EVENT(vcpu, 5, "store facility list value %x",
+                  *(unsigned int *) vfacilities);
+       trace_kvm_s390_handle_stfl(vcpu, *(unsigned int *) vfacilities);
         return 0;
  }
  
@@ -387,7 +382,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
                 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
  
         if (fc > 3) {
-               vcpu->arch.sie_block->gpsw.mask |= 3ul << 44;     /* cc 3 */
+               kvm_s390_set_psw_cc(vcpu, 3);
                 return 0;
         }
  
@@ -397,7 +392,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
  
         if (fc == 0) {
                 vcpu->run->s.regs.gprs[0] = 3 << 28;
-               vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);  /* cc 0 */
+               kvm_s390_set_psw_cc(vcpu, 0);
                 return 0;
         }
  
@@ -431,12 +426,11 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
         }
         trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
         free_page(mem);
-       vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
+       kvm_s390_set_psw_cc(vcpu, 0);
         vcpu->run->s.regs.gprs[0] = 0;
         return 0;
  out_no_data:
-       /* condition code 3 */
-       vcpu->arch.sie_block->gpsw.mask |= 3ul << 44;
+       kvm_s390_set_psw_cc(vcpu, 3);
  out_exception:
         free_page(mem);
         return rc;
@@ -494,12 +488,12 @@ static int handle_epsw(struct kvm_vcpu *vcpu)
         kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
  
         /* This basically extracts the mask half of the psw. */
-       vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000;
+       vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000UL;
         vcpu->run->s.regs.gprs[reg1] |= vcpu->arch.sie_block->gpsw.mask >> 32;
         if (reg2) {
-               vcpu->run->s.regs.gprs[reg2] &= 0xffffffff00000000;
+               vcpu->run->s.regs.gprs[reg2] &= 0xffffffff00000000UL;
                 vcpu->run->s.regs.gprs[reg2] |=
-                       vcpu->arch.sie_block->gpsw.mask & 0x00000000ffffffff;
+                       vcpu->arch.sie_block->gpsw.mask & 0x00000000ffffffffUL;
         }
         return 0;
  }
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c

index 6d16132d08501bb40fd4598d4dae951601bd37b5..bf7c0dc64a76111d307b7b48585cc40ecb6741aa 100644 (file)
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -335,7 +335,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
  
         if ((from | to | len) & (PMD_SIZE - 1))
                 return -EINVAL;
-       if (len == 0 || from + len > PGDIR_SIZE ||
+       if (len == 0 || from + len > TASK_MAX_SIZE ||
             from + len < from || to + len < to)
                 return -EINVAL;
  
@@ -732,6 +732,11 @@ void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long addr, pte_t *pte)
         spin_unlock(&gmap_notifier_lock);
  }
  
+/*
+ * Tell whether @page backs a page table that carries pgstes.  The test
+ * is _mapcount == 0, the value page_table_alloc_pgste() stores in the
+ * page it sets up.
+ */
+static inline int page_table_with_pgste(struct page *page)
+{
+       return atomic_read(&page->_mapcount) == 0;
+}
+
  static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
                                                     unsigned long vmaddr)
  {
@@ -751,7 +756,7 @@ static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
         mp->vmaddr = vmaddr & PMD_MASK;
         INIT_LIST_HEAD(&mp->mapper);
         page->index = (unsigned long) mp;
-       atomic_set(&page->_mapcount, 3);
+       atomic_set(&page->_mapcount, 0);
         table = (unsigned long *) page_to_phys(page);
         clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
         clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT,
@@ -818,6 +823,11 @@ EXPORT_SYMBOL(set_guest_storage_key);
  
  #else /* CONFIG_PGSTE */
  
+/* !CONFIG_PGSTE variant: always reports that @page has no pgstes. */
+static inline int page_table_with_pgste(struct page *page)
+{
+       return 0;
+}
+
  static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
                                                     unsigned long vmaddr)
  {
@@ -894,12 +904,12 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
         struct page *page;
         unsigned int bit, mask;
  
-       if (mm_has_pgste(mm)) {
+       page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+       if (page_table_with_pgste(page)) {
                 gmap_disconnect_pgtable(mm, table);
                 return page_table_free_pgste(table);
         }
         /* Free 1K/2K page table fragment of a 4K page */
-       page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
         bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
         spin_lock_bh(&mm->context.list_lock);
         if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
@@ -937,14 +947,14 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
         unsigned int bit, mask;
  
         mm = tlb->mm;
-       if (mm_has_pgste(mm)) {
+       page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+       if (page_table_with_pgste(page)) {
                 gmap_disconnect_pgtable(mm, table);
                 table = (unsigned long *) (__pa(table) | FRAG_MASK);
                 tlb_remove_table(tlb, table);
                 return;
         }
         bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
-       page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
         spin_lock_bh(&mm->context.list_lock);
         if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
                 list_del(&page->lru);
@@ -1030,36 +1040,120 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
  }
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-void thp_split_vma(struct vm_area_struct *vma)
+static inline void thp_split_vma(struct vm_area_struct *vma)
  {
         unsigned long addr;
-       struct page *page;
  
-       for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
-               page = follow_page(vma, addr, FOLL_SPLIT);
-       }
+       for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
+               follow_page(vma, addr, FOLL_SPLIT);
  }
  
-void thp_split_mm(struct mm_struct *mm)
+static inline void thp_split_mm(struct mm_struct *mm)
  {
-       struct vm_area_struct *vma = mm->mmap;
+       struct vm_area_struct *vma;
  
-       while (vma != NULL) {
+       for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
                 thp_split_vma(vma);
                 vma->vm_flags &= ~VM_HUGEPAGE;
                 vma->vm_flags |= VM_NOHUGEPAGE;
-               vma = vma->vm_next;
         }
+       mm->def_flags |= VM_NOHUGEPAGE;
+}
+#else
+static inline void thp_split_mm(struct mm_struct *mm)
+{
  }
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  
+/*
+ * Walk the pmd entries covering [addr, end) below @pud and replace every
+ * page table that does not yet carry pgstes with a freshly allocated one
+ * that does.  The old table is handed to page_table_free_rcu() via @tlb.
+ *
+ * If allocating a replacement fails, mm->context.has_pgste is cleared and
+ * the walk continues with the next entry.  Returns the address at which
+ * the walk stopped.
+ */
+static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
+                               struct mm_struct *mm, pud_t *pud,
+                               unsigned long addr, unsigned long end)
+{
+       unsigned long next, *table, *new;
+       struct page *page;
+       pmd_t *pmd;
+
+       pmd = pmd_offset(pud, addr);
+       do {
+               next = pmd_addr_end(addr, end);
+again:
+               if (pmd_none_or_clear_bad(pmd))
+                       continue;
+               table = (unsigned long *) pmd_deref(*pmd);
+               page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+               /* Nothing to do if this table already has pgstes */
+               if (page_table_with_pgste(page))
+                       continue;
+               /* Allocate new page table with pgstes */
+               new = page_table_alloc_pgste(mm, addr);
+               if (!new) {
+                       /*
+                        * Allocation failed: clear has_pgste (presumably so
+                        * the caller can notice -- confirm there) and go on.
+                        */
+                       mm->context.has_pgste = 0;
+                       continue;
+               }
+               spin_lock(&mm->page_table_lock);
+               /* Re-check under the lock that the pmd still points at table */
+               if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
+                       /* Nuke pmd entry pointing to the "short" page table */
+                       pmdp_flush_lazy(mm, addr, pmd);
+                       pmd_clear(pmd);
+                       /* Copy ptes from old table to new table */
+                       memcpy(new, table, PAGE_SIZE/2);
+                       clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
+                       /* Establish new table */
+                       pmd_populate(mm, pmd, (pte_t *) new);
+                       /* Free old table with rcu, there might be a walker! */
+                       page_table_free_rcu(tlb, table);
+                       new = NULL;
+               }
+               spin_unlock(&mm->page_table_lock);
+               /* The pmd changed meanwhile: drop the unused table and retry */
+               if (new) {
+                       page_table_free_pgste(new);
+                       goto again;
+               }
+       } while (pmd++, addr = next, addr != end);
+
+       return addr;
+}
+
+/*
+ * Walk the pud entries covering [addr, end) below @pgd and run
+ * page_table_realloc_pmd() on every entry that
+ * pud_none_or_clear_bad() does not reject.  Returns the address at
+ * which the walk stopped.
+ */
+static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
+                                  struct mm_struct *mm, pgd_t *pgd,
+                                  unsigned long addr, unsigned long end)
+{
+       pud_t *pud = pud_offset(pgd, addr);
+       unsigned long stop;
+
+       for (;;) {
+               stop = pud_addr_end(addr, end);
+               if (!pud_none_or_clear_bad(pud))
+                       stop = page_table_realloc_pmd(tlb, mm, pud, addr, stop);
+               pud++;
+               addr = stop;
+               if (addr == end)
+                       break;
+       }
+
+       return addr;
+}
+
+/*
+ * Top level of the page table reallocation walk: visit the pgd entries
+ * of @mm covering [addr, end) and run page_table_realloc_pud() on every
+ * entry that pgd_none_or_clear_bad() does not reject.
+ */
+static void page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
+                              unsigned long addr, unsigned long end)
+{
+       pgd_t *pgd = pgd_offset(mm, addr);
+       unsigned long stop;
+
+       for (;;) {
+               stop = pgd_addr_end(addr, end);
+               if (!pgd_none_or_clear_bad(pgd))
+                       stop = page_table_realloc_pud(tlb, mm, pgd, addr, stop);
+               pgd++;
+               addr = stop;
+               if (addr == end)
+                       break;
+       }
+}
+
  /*
   * switch on pgstes for its userspace process (for kvm)
   */
  int s390_enable_sie(void)
  {
         struct task_struct *tsk = current;
-       struct mm_struct *mm, *old_mm;
+       struct mm_struct *mm = tsk->mm;
+       struct mmu_gather tlb;
  
         /* Do we have switched amode? If no, we cannot do sie */
         if (s390_user_mode == HOME_SPACE_MODE)
@@ -1069,57 +1163,16 @@ int s390_enable_sie(void)
         if (mm_has_pgste(tsk->mm))
                 return 0;
  
-       /* lets check if we are allowed to replace the mm */
-       task_lock(tsk);
-       if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
-#ifdef CONFIG_AIO
-           !hlist_empty(&tsk->mm->ioctx_list) ||
-#endif
-           tsk->mm != tsk->active_mm) {
-               task_unlock(tsk);
-               return -EINVAL;
-       }
-       task_unlock(tsk);
-
-       /* we copy the mm and let dup_mm create the page tables with_pgstes */
-       tsk->mm->context.alloc_pgste = 1;
-       /* make sure that both mms have a correct rss state */
-       sync_mm_rss(tsk->mm);
-       mm = dup_mm(tsk);
-       tsk->mm->context.alloc_pgste = 0;
-       if (!mm)
-               return -ENOMEM;
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       down_write(&mm->mmap_sem);
         /* split thp mappings and disable thp for future mappings */
         thp_split_mm(mm);
-       mm->def_flags |= VM_NOHUGEPAGE;
-#endif
-
-       /* Now lets check again if something happened */
-       task_lock(tsk);
-       if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
-#ifdef CONFIG_AIO
-           !hlist_empty(&tsk->mm->ioctx_list) ||
-#endif
-           tsk->mm != tsk->active_mm) {
-               mmput(mm);
-               task_unlock(tsk);
-               return -EINVAL;
-       }
-
-       /* ok, we are alone. No ptrace, no threads, etc. */
-       old_mm = tsk->mm;
-       tsk->mm = tsk->active_mm = mm;
-       preempt_disable();
-       update_mm(mm, tsk);
-       atomic_inc(&mm->context.attach_count);
-       atomic_dec(&old_mm->context.attach_count);
-       cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
-       preempt_enable();
-       task_unlock(tsk);
-       mmput(old_mm);
-       return 0;
+       /* Reallocate the page tables with pgstes */
+       mm->context.has_pgste = 1;
+       tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
+       page_table_realloc(&tlb, mm, 0, TASK_SIZE);
+       tlb_finish_mmu(&tlb, 0, TASK_SIZE);
+       up_write(&mm->mmap_sem);
+       return mm->context.has_pgste ? 0 : -ENOMEM;
  }
  EXPORT_SYMBOL_GPL(s390_enable_sie);
  
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h

index f87f7fcefa0acdc4856aa576078a34c2d4002e1e..c76ff74a98f2ed5ffcd72b4dcefbcb0c9203c0cd 100644 (file)
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -286,6 +286,7 @@ struct kvm_mmu {
         u64 *pae_root;
         u64 *lm_root;
         u64 rsvd_bits_mask[2][4];
+       u64 bad_mt_xwr;
  
         /*
          * Bitmap: bit set = last pte in walk
@@ -323,6 +324,7 @@ struct kvm_pmu {
         u64 global_ovf_ctrl;
         u64 counter_bitmask[2];
         u64 global_ctrl_mask;
+       u64 reserved_bits;
         u8 version;
         struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
         struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
@@ -511,6 +513,14 @@ struct kvm_vcpu_arch {
          * instruction.
          */
         bool write_fault_to_shadow_pgtable;
+
+       /* set at EPT violation at this point */
+       unsigned long exit_qualification;
+
+       /* pv related host specific info */
+       struct {
+               bool pv_unhalted;
+       } pv;
  };
  
  struct kvm_lpage_info {
@@ -802,8 +812,8 @@ extern u32  kvm_min_guest_tsc_khz;
  extern u32  kvm_max_guest_tsc_khz;
  
  enum emulation_result {
-       EMULATE_DONE,       /* no further processing */
-       EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
+       EMULATE_DONE,         /* no further processing */
+       EMULATE_USER_EXIT,    /* kvm_run ready for userspace exit */
         EMULATE_FAIL,         /* can't emulate this instruction */
  };
  
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h

index 109a9dd5d454f197414e2449617cf9cb73ab1220..be8269b00e2a1f720fe69755645948cf771e769e 100644 (file)
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -93,7 +93,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
  
  struct pvclock_vsyscall_time_info {
         struct pvclock_vcpu_time_info pvti;
-       u32 migrate_count;
  } __attribute__((__aligned__(SMP_CACHE_BYTES)));
  
  #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h

index f3e01a2cbaa1b965f0adf89aa1c00b7a3619df51..966502d4682eeaf0c61b34b1b186e48cfdedf4fe 100644 (file)
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -387,6 +387,7 @@ enum vmcs_field {
  #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR         0
  #define VMX_EPT_EXTENT_CONTEXT                 1
  #define VMX_EPT_EXTENT_GLOBAL                  2
+#define VMX_EPT_EXTENT_SHIFT                   24
  
  #define VMX_EPT_EXECUTE_ONLY_BIT               (1ull)
  #define VMX_EPT_PAGE_WALK_4_BIT                        (1ull << 6)
@@ -394,6 +395,7 @@ enum vmcs_field {
  #define VMX_EPTP_WB_BIT                                (1ull << 14)
  #define VMX_EPT_2MB_PAGE_BIT                   (1ull << 16)
  #define VMX_EPT_1GB_PAGE_BIT                   (1ull << 17)
+#define VMX_EPT_INVEPT_BIT                     (1ull << 20)
  #define VMX_EPT_AD_BIT                             (1ull << 21)
  #define VMX_EPT_EXTENT_CONTEXT_BIT             (1ull << 25)
  #define VMX_EPT_EXTENT_GLOBAL_BIT              (1ull << 26)
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h

index d651082c7cf720a805f653b162343f6b9a984329..0e79420376eb93d51224e8ff6d41ecdf55280224 100644 (file)
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,7 @@
  #define EXIT_REASON_EOI_INDUCED         45
  #define EXIT_REASON_EPT_VIOLATION       48
  #define EXIT_REASON_EPT_MISCONFIG       49
+#define EXIT_REASON_INVEPT              50
  #define EXIT_REASON_PREEMPTION_TIMER    52
  #define EXIT_REASON_WBINVD              54
  #define EXIT_REASON_XSETBV              55
@@ -106,12 +107,13 @@
         { EXIT_REASON_APIC_ACCESS,           "APIC_ACCESS" }, \
         { EXIT_REASON_EPT_VIOLATION,         "EPT_VIOLATION" }, \
         { EXIT_REASON_EPT_MISCONFIG,         "EPT_MISCONFIG" }, \
+       { EXIT_REASON_INVEPT,                "INVEPT" }, \
+       { EXIT_REASON_PREEMPTION_TIMER,      "PREEMPTION_TIMER" }, \
         { EXIT_REASON_WBINVD,                "WBINVD" }, \
         { EXIT_REASON_APIC_WRITE,            "APIC_WRITE" }, \
         { EXIT_REASON_EOI_INDUCED,           "EOI_INDUCED" }, \
         { EXIT_REASON_INVALID_STATE,         "INVALID_STATE" }, \
         { EXIT_REASON_INVD,                  "INVD" }, \
-       { EXIT_REASON_INVPCID,               "INVPCID" }, \
-       { EXIT_REASON_PREEMPTION_TIMER,      "PREEMPTION_TIMER" }
+       { EXIT_REASON_INVPCID,               "INVPCID" }
  
  #endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c

index 2cb9470ea85bac67b42c1fb6decdd26c35230041..a16bae3f83b37ab189dbe2a443f1461a075a6ae7 100644 (file)
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -128,46 +128,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
         set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
  }
  
-static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
-
-static struct pvclock_vsyscall_time_info *
-pvclock_get_vsyscall_user_time_info(int cpu)
-{
-       if (!pvclock_vdso_info) {
-               BUG();
-               return NULL;
-       }
-
-       return &pvclock_vdso_info[cpu];
-}
-
-struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
-{
-       return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
-}
-
  #ifdef CONFIG_X86_64
-static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
-                               void *v)
-{
-       struct task_migration_notifier *mn = v;
-       struct pvclock_vsyscall_time_info *pvti;
-
-       pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
-
-       /* this is NULL when pvclock vsyscall is not initialized */
-       if (unlikely(pvti == NULL))
-               return NOTIFY_DONE;
-
-       pvti->migrate_count++;
-
-       return NOTIFY_DONE;
-}
-
-static struct notifier_block pvclock_migrate = {
-       .notifier_call = pvclock_task_migrate,
-};
-
  /*
   * Initialize the generic pvclock vsyscall state.  This will allocate
   * a/some page(s) for the per-vcpu pvclock information, set up a
@@ -181,17 +142,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
  
         WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
  
-       pvclock_vdso_info = i;
-
         for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
                 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
                              __pa(i) + (idx*PAGE_SIZE),
                              PAGE_KERNEL_VVAR);
         }
  
-
-       register_task_migration_notifier(&pvclock_migrate);
-
         return 0;
  }
  #endif
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c

index a20ecb5b6cbf3543490ab6a74a969aa45a1862c5..b110fe6c03d43908146d05ad689937d3bd991bb9 100644 (file)
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -413,7 +413,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                              (1 << KVM_FEATURE_CLOCKSOURCE2) |
                              (1 << KVM_FEATURE_ASYNC_PF) |
                              (1 << KVM_FEATURE_PV_EOI) |
-                            (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+                            (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
+                            (1 << KVM_FEATURE_PV_UNHALT);
  
                 if (sched_info_on())
                         entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c

index afc11245827cf2f39b56d14d50dacd6ea9aeb374..5439117d5c4cccfa00d28dd64fb5aa8fd488261e 100644 (file)
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -79,16 +79,6 @@ static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
         *((u32 *) (apic->regs + reg_off)) = val;
  }
  
-static inline int apic_test_and_set_vector(int vec, void *bitmap)
-{
-       return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline int apic_test_and_clear_vector(int vec, void *bitmap)
-{
-       return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
  static inline int apic_test_vector(int vec, void *bitmap)
  {
         return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -331,10 +321,10 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
  }
  EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
  
-static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
+static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
  {
         apic->irr_pending = true;
-       return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
+       apic_set_vector(vec, apic->regs + APIC_IRR);
  }
  
  static inline int apic_search_irr(struct kvm_lapic *apic)
@@ -681,32 +671,28 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                 if (unlikely(!apic_enabled(apic)))
                         break;
  
+               result = 1;
+
                 if (dest_map)
                         __set_bit(vcpu->vcpu_id, dest_map);
  
-               if (kvm_x86_ops->deliver_posted_interrupt) {
-                       result = 1;
+               if (kvm_x86_ops->deliver_posted_interrupt)
                         kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
-               } else {
-                       result = !apic_test_and_set_irr(vector, apic);
-
-                       if (!result) {
-                               if (trig_mode)
-                                       apic_debug("level trig mode repeatedly "
-                                               "for vector %d", vector);
-                               goto out;
-                       }
+               else {
+                       apic_set_irr(vector, apic);
  
                         kvm_make_request(KVM_REQ_EVENT, vcpu);
                         kvm_vcpu_kick(vcpu);
                 }
-out:
                 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
-                               trig_mode, vector, !result);
+                                         trig_mode, vector, false);
                 break;
  
         case APIC_DM_REMRD:
-               apic_debug("Ignoring delivery mode 3\n");
+               result = 1;
+               vcpu->arch.pv.pv_unhalted = 1;
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
+               kvm_vcpu_kick(vcpu);
                 break;
  
         case APIC_DM_SMI:
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c

index 9e9285ae9b946ac1fbcb3b1dfd02159107c0f221..6e2d2c8f230bea3202896156be9a8196785241aa 100644 (file)
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -132,8 +132,8 @@ module_param(dbg, bool, 0644);
         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
                                             * PT32_LEVEL_BITS))) - 1))
  
-#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
-                       | PT64_NX_MASK)
+#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
+                       | shadow_x_mask | shadow_nx_mask)
  
  #define ACC_EXEC_MASK    1
  #define ACC_WRITE_MASK   PT_WRITABLE_MASK
@@ -331,11 +331,6 @@ static int is_large_pte(u64 pte)
         return pte & PT_PAGE_SIZE_MASK;
  }
  
-static int is_dirty_gpte(unsigned long pte)
-{
-       return pte & PT_DIRTY_MASK;
-}
-
  static int is_rmap_spte(u64 pte)
  {
         return is_shadow_present_pte(pte);
@@ -2052,12 +2047,18 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
         return __shadow_walk_next(iterator, *iterator->sptep);
  }
  
-static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
+static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed)
  {
         u64 spte;
  
+       BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK ||
+                       VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
+
         spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
-              shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+              shadow_user_mask | shadow_x_mask;
+
+       if (accessed)
+               spte |= shadow_accessed_mask;
  
         mmu_spte_set(sptep, spte);
  }
@@ -2574,14 +2575,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
         mmu_free_roots(vcpu);
  }
  
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
-       int bit7;
-
-       bit7 = (gpte >> 7) & 1;
-       return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
-}
-
  static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
                                      bool no_dirty_log)
  {
@@ -2594,26 +2587,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
         return gfn_to_pfn_memslot_atomic(slot, gfn);
  }
  
-static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
-                                 struct kvm_mmu_page *sp, u64 *spte,
-                                 u64 gpte)
-{
-       if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
-               goto no_present;
-
-       if (!is_present_gpte(gpte))
-               goto no_present;
-
-       if (!(gpte & PT_ACCESSED_MASK))
-               goto no_present;
-
-       return false;
-
-no_present:
-       drop_spte(vcpu->kvm, spte);
-       return true;
-}
-
  static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
                                     struct kvm_mmu_page *sp,
                                     u64 *start, u64 *end)
@@ -2710,7 +2683,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
                                               iterator.level - 1,
                                               1, ACC_ALL, iterator.sptep);
  
-                       link_shadow_page(iterator.sptep, sp);
+                       link_shadow_page(iterator.sptep, sp, true);
                 }
         }
         return emulate;
@@ -2808,7 +2781,7 @@ exit:
         return ret;
  }
  
-static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
+static bool page_fault_can_be_fast(u32 error_code)
  {
         /*
          * Do not fix the mmio spte with invalid generation number which
@@ -2861,7 +2834,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
         bool ret = false;
         u64 spte = 0ull;
  
-       if (!page_fault_can_be_fast(vcpu, error_code))
+       if (!page_fault_can_be_fast(error_code))
                 return false;
  
         walk_shadow_page_lockless_begin(vcpu);
@@ -3209,6 +3182,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
         mmu_sync_roots(vcpu);
         spin_unlock(&vcpu->kvm->mmu_lock);
  }
+EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
  
  static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
                                   u32 access, struct x86_exception *exception)
@@ -3478,6 +3452,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
         ++vcpu->stat.tlb_flush;
         kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
  }
+EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
  
  static void paging_new_cr3(struct kvm_vcpu *vcpu)
  {
@@ -3501,18 +3476,6 @@ static void paging_free(struct kvm_vcpu *vcpu)
         nonpaging_free(vcpu);
  }
  
-static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
-{
-       unsigned mask;
-
-       BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
-
-       mask = (unsigned)~ACC_WRITE_MASK;
-       /* Allow write access to dirty gptes */
-       mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
-       *access &= mask;
-}
-
  static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
                            unsigned access, int *nr_present)
  {
@@ -3530,16 +3493,6 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
         return false;
  }
  
-static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
-{
-       unsigned access;
-
-       access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
-       access &= ~(gpte >> PT64_NX_SHIFT);
-
-       return access;
-}
-
  static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
  {
         unsigned index;
@@ -3549,6 +3502,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
         return mmu->last_pte_bitmap & (1 << index);
  }
  
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
  #define PTTYPE 64
  #include "paging_tmpl.h"
  #undef PTTYPE
@@ -3563,6 +3521,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
         int maxphyaddr = cpuid_maxphyaddr(vcpu);
         u64 exb_bit_rsvd = 0;
  
+       context->bad_mt_xwr = 0;
+
         if (!context->nx)
                 exb_bit_rsvd = rsvd_bits(63, 63);
         switch (context->root_level) {
@@ -3618,7 +3578,40 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
         }
  }
  
-static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
+               struct kvm_mmu *context, bool execonly)
+{
+       int maxphyaddr = cpuid_maxphyaddr(vcpu);
+       int pte;
+
+       context->rsvd_bits_mask[0][3] =
+               rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
+       context->rsvd_bits_mask[0][2] =
+               rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+       context->rsvd_bits_mask[0][1] =
+               rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+       context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
+
+       /* large page */
+       context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
+       context->rsvd_bits_mask[1][2] =
+               rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
+       context->rsvd_bits_mask[1][1] =
+               rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
+       context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+
+       for (pte = 0; pte < 64; pte++) {
+               int rwx_bits = pte & 7;
+               int mt = pte >> 3;
+               if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
+                               rwx_bits == 0x2 || rwx_bits == 0x6 ||
+                               (rwx_bits == 0x4 && !execonly))
+                       context->bad_mt_xwr |= (1ull << pte);
+       }
+}
+
+static void update_permission_bitmask(struct kvm_vcpu *vcpu,
+               struct kvm_mmu *mmu, bool ept)
  {
         unsigned bit, byte, pfec;
         u8 map;
@@ -3636,12 +3629,16 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu
                         w = bit & ACC_WRITE_MASK;
                         u = bit & ACC_USER_MASK;
  
-                       /* Not really needed: !nx will cause pte.nx to fault */
-                       x |= !mmu->nx;
-                       /* Allow supervisor writes if !cr0.wp */
-                       w |= !is_write_protection(vcpu) && !uf;
-                       /* Disallow supervisor fetches of user code if cr4.smep */
-                       x &= !(smep && u && !uf);
+                       if (!ept) {
+                               /* Not really needed: !nx will cause pte.nx to fault */
+                               x |= !mmu->nx;
+                               /* Allow supervisor writes if !cr0.wp */
+                               w |= !is_write_protection(vcpu) && !uf;
+                               /* Disallow supervisor fetches of user code if cr4.smep */
+                               x &= !(smep && u && !uf);
+                       } else
+                               /* Not really needed: no U/S accesses on ept  */
+                               u = 1;
  
                         fault = (ff && !x) || (uf && !u) || (wf && !w);
                         map |= fault << bit;
@@ -3676,7 +3673,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
         context->root_level = level;
  
         reset_rsvds_bits_mask(vcpu, context);
-       update_permission_bitmask(vcpu, context);
+       update_permission_bitmask(vcpu, context, false);
         update_last_pte_bitmap(vcpu, context);
  
         ASSERT(is_pae(vcpu));
@@ -3706,7 +3703,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
         context->root_level = PT32_ROOT_LEVEL;
  
         reset_rsvds_bits_mask(vcpu, context);
-       update_permission_bitmask(vcpu, context);
+       update_permission_bitmask(vcpu, context, false);
         update_last_pte_bitmap(vcpu, context);
  
         context->new_cr3 = paging_new_cr3;
@@ -3768,7 +3765,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
                 context->gva_to_gpa = paging32_gva_to_gpa;
         }
  
-       update_permission_bitmask(vcpu, context);
+       update_permission_bitmask(vcpu, context, false);
         update_last_pte_bitmap(vcpu, context);
  
         return 0;
@@ -3800,6 +3797,33 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
  }
  EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
  
+int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+               bool execonly)
+{
+       ASSERT(vcpu);
+       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+       context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+
+       context->nx = true;
+       context->new_cr3 = paging_new_cr3;
+       context->page_fault = ept_page_fault;
+       context->gva_to_gpa = ept_gva_to_gpa;
+       context->sync_page = ept_sync_page;
+       context->invlpg = ept_invlpg;
+       context->update_pte = ept_update_pte;
+       context->free = paging_free;
+       context->root_level = context->shadow_root_level;
+       context->root_hpa = INVALID_PAGE;
+       context->direct_map = false;
+
+       update_permission_bitmask(vcpu, context, true);
+       reset_rsvds_bits_mask_ept(vcpu, context, execonly);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
+
  static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
  {
         int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
@@ -3847,7 +3871,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
                 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
         }
  
-       update_permission_bitmask(vcpu, g_context);
+       update_permission_bitmask(vcpu, g_context, false);
         update_last_pte_bitmap(vcpu, g_context);
  
         return 0;
@@ -3923,8 +3947,8 @@ static bool need_remote_flush(u64 old, u64 new)
                 return true;
         if ((old ^ new) & PT64_BASE_ADDR_MASK)
                 return true;
-       old ^= PT64_NX_MASK;
-       new ^= PT64_NX_MASK;
+       old ^= shadow_nx_mask;
+       new ^= shadow_nx_mask;
         return (old & ~new & PT64_PERM_MASK) != 0;
  }
  
@@ -4182,7 +4206,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
         switch (er) {
         case EMULATE_DONE:
                 return 1;
-       case EMULATE_DO_MMIO:
+       case EMULATE_USER_EXIT:
                 ++vcpu->stat.mmio_exits;
                 /* fall through */
         case EMULATE_FAIL:
@@ -4390,11 +4414,8 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
         /*
          * The very rare case: if the generation-number is round,
          * zap all shadow pages.
-        *
-        * The max value is MMIO_MAX_GEN - 1 since it is not called
-        * when mark memslot invalid.
          */
-       if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) {
+       if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) {
                 printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
                 kvm_mmu_invalidate_zap_all_pages(kvm);
         }
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h

index 5b59c573aba7a749ad52ed696b3402e5fd275106..77e044a0f5f70f36222510cf8012ecb9607da65e 100644 (file)
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -71,6 +71,8 @@ enum {
  
  int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
  int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+               bool execonly);
  
  static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
  {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h

index 7769699d48a80caac0e1d8402280ab1e15b99bed..04333015917984a6b65f3fd0ab4ecbad2f5b1f2a 100644 (file)
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -23,6 +23,13 @@
   * so the code in this file is compiled twice, once per pte size.
   */
  
+/*
+ * This is used to catch uses of the PT_GUEST_(DIRTY|ACCESSED)_SHIFT macros
+ * that were not optimized away for the EPT paging type, which has no A/D bits.
+ */
+extern u64 __pure __using_nonexistent_pte_bit(void)
+              __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT");
+
  #if PTTYPE == 64
         #define pt_element_t u64
         #define guest_walker guest_walker64
@@ -32,6 +39,10 @@
         #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
         #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
         #define PT_LEVEL_BITS PT64_LEVEL_BITS
+       #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
+       #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
+       #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+       #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
         #ifdef CONFIG_X86_64
         #define PT_MAX_FULL_LEVELS 4
         #define CMPXCHG cmpxchg
@@ -49,7 +60,26 @@
         #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
         #define PT_LEVEL_BITS PT32_LEVEL_BITS
         #define PT_MAX_FULL_LEVELS 2
+       #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
+       #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
+       #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+       #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
         #define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+       #define pt_element_t u64
+       #define guest_walker guest_walkerEPT
+       #define FNAME(name) ept_##name
+       #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+       #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+       #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+       #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+       #define PT_LEVEL_BITS PT64_LEVEL_BITS
+       #define PT_GUEST_ACCESSED_MASK 0
+       #define PT_GUEST_DIRTY_MASK 0
+       #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
+       #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()
+       #define CMPXCHG cmpxchg64
+       #define PT_MAX_FULL_LEVELS 4
  #else
         #error Invalid PTTYPE value
  #endif
@@ -80,6 +110,40 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
         return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
  }
  
+static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
+{
+       unsigned mask;
+
+       /* dirty bit is not supported, so no need to track it */
+       if (!PT_GUEST_DIRTY_MASK)
+               return;
+
+       BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
+
+       mask = (unsigned)~ACC_WRITE_MASK;
+       /* Allow write access to dirty gptes */
+       mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
+               PT_WRITABLE_MASK;
+       *access &= mask;
+}
+
+static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+       int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f;
+
+       return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) |
+               ((mmu->bad_mt_xwr & (1ull << low6)) != 0);
+}
+
+static inline int FNAME(is_present_gpte)(unsigned long pte)
+{
+#if PTTYPE != PTTYPE_EPT
+       return is_present_gpte(pte);
+#else
+       return pte & 7;
+#endif
+}
+
  static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                                pt_element_t __user *ptep_user, unsigned index,
                                pt_element_t orig_pte, pt_element_t new_pte)
@@ -103,6 +167,42 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
         return (ret != orig_pte);
  }
  
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+                                 struct kvm_mmu_page *sp, u64 *spte,
+                                 u64 gpte)
+{
+       if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+               goto no_present;
+
+       if (!FNAME(is_present_gpte)(gpte))
+               goto no_present;
+
+       /* without accessed-bit support, prefetch non-accessed gptes too */
+       if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK))
+               goto no_present;
+
+       return false;
+
+no_present:
+       drop_spte(vcpu->kvm, spte);
+       return true;
+}
+
+static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+       unsigned access;
+#if PTTYPE == PTTYPE_EPT
+       access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
+               ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
+               ACC_USER_MASK;
+#else
+       access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+       access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
+
+       return access;
+}
+
  static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
                                              struct kvm_mmu *mmu,
                                              struct guest_walker *walker,
@@ -114,18 +214,23 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
         gfn_t table_gfn;
         int ret;
  
+       /* dirty/accessed bits are not supported, so no need to update them */
+       if (!PT_GUEST_DIRTY_MASK)
+               return 0;
+
         for (level = walker->max_level; level >= walker->level; --level) {
                 pte = orig_pte = walker->ptes[level - 1];
                 table_gfn = walker->table_gfn[level - 1];
                 ptep_user = walker->ptep_user[level - 1];
                 index = offset_in_page(ptep_user) / sizeof(pt_element_t);
-               if (!(pte & PT_ACCESSED_MASK)) {
+               if (!(pte & PT_GUEST_ACCESSED_MASK)) {
                         trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
-                       pte |= PT_ACCESSED_MASK;
+                       pte |= PT_GUEST_ACCESSED_MASK;
                 }
-               if (level == walker->level && write_fault && !is_dirty_gpte(pte)) {
+               if (level == walker->level && write_fault &&
+                               !(pte & PT_GUEST_DIRTY_MASK)) {
                         trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
-                       pte |= PT_DIRTY_MASK;
+                       pte |= PT_GUEST_DIRTY_MASK;
                 }
                 if (pte == orig_pte)
                         continue;
@@ -170,7 +275,7 @@ retry_walk:
         if (walker->level == PT32E_ROOT_LEVEL) {
                 pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
                 trace_kvm_mmu_paging_element(pte, walker->level);
-               if (!is_present_gpte(pte))
+               if (!FNAME(is_present_gpte)(pte))
                         goto error;
                 --walker->level;
         }
@@ -179,7 +284,7 @@ retry_walk:
         ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
                (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
  
-       accessed_dirty = PT_ACCESSED_MASK;
+       accessed_dirty = PT_GUEST_ACCESSED_MASK;
         pt_access = pte_access = ACC_ALL;
         ++walker->level;
  
@@ -215,17 +320,17 @@ retry_walk:
  
                 trace_kvm_mmu_paging_element(pte, walker->level);
  
-               if (unlikely(!is_present_gpte(pte)))
+               if (unlikely(!FNAME(is_present_gpte)(pte)))
                         goto error;
  
-               if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
-                                             walker->level))) {
+               if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte,
+                                                    walker->level))) {
                         errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
                         goto error;
                 }
  
                 accessed_dirty &= pte;
-               pte_access = pt_access & gpte_access(vcpu, pte);
+               pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
  
                 walker->ptes[walker->level - 1] = pte;
         } while (!is_last_gpte(mmu, walker->level, pte));
@@ -248,13 +353,15 @@ retry_walk:
         walker->gfn = real_gpa >> PAGE_SHIFT;
  
         if (!write_fault)
-               protect_clean_gpte(&pte_access, pte);
+               FNAME(protect_clean_gpte)(&pte_access, pte);
         else
                 /*
-                * On a write fault, fold the dirty bit into accessed_dirty by
-                * shifting it one place right.
+                * On a write fault, fold the dirty bit into accessed_dirty.
+                * For modes without A/D bits support accessed_dirty will be
+                * always clear.
                  */
-               accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
+               accessed_dirty &= pte >>
+                       (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
  
         if (unlikely(!accessed_dirty)) {
                 ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
@@ -279,6 +386,25 @@ error:
         walker->fault.vector = PF_VECTOR;
         walker->fault.error_code_valid = true;
         walker->fault.error_code = errcode;
+
+#if PTTYPE == PTTYPE_EPT
+       /*
+        * Use PFERR_RSVD_MASK in error_code to tell whether an EPT
+        * misconfiguration needs to be injected. The detection is
+        * done by is_rsvd_bits_set() above.
+        *
+        * We set up the value of exit_qualification to inject:
+        * [2:0] - Derived from [2:0] of real exit_qualification at EPT violation
+        * [5:3] - Calculated by the page walk of the guest EPT page tables
+        * [8:7] - Derived from [8:7] of real exit_qualification
+        *
+        * The other bits are set to 0.
+        */
+       if (!(errcode & PFERR_RSVD_MASK)) {
+               vcpu->arch.exit_qualification &= 0x187;
+               vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3;
+       }
+#endif
         walker->fault.address = addr;
         walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
  
@@ -293,6 +419,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
                                         access);
  }
  
+#if PTTYPE != PTTYPE_EPT
  static int FNAME(walk_addr_nested)(struct guest_walker *walker,
                                    struct kvm_vcpu *vcpu, gva_t addr,
                                    u32 access)
@@ -300,6 +427,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
         return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
                                         addr, access);
  }
+#endif
  
  static bool
  FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -309,14 +437,14 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
         gfn_t gfn;
         pfn_t pfn;
  
-       if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
+       if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
                 return false;
  
         pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
  
         gfn = gpte_to_gfn(gpte);
-       pte_access = sp->role.access & gpte_access(vcpu, gpte);
-       protect_clean_gpte(&pte_access, gpte);
+       pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+       FNAME(protect_clean_gpte)(&pte_access, gpte);
         pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
                         no_dirty_log && (pte_access & ACC_WRITE_MASK));
         if (is_error_pfn(pfn))
@@ -446,7 +574,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                         goto out_gpte_changed;
  
                 if (sp)
-                       link_shadow_page(it.sptep, sp);
+                       link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
         }
  
         for (;
@@ -466,7 +594,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
  
                 sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
                                       true, direct_access, it.sptep);
-               link_shadow_page(it.sptep, sp);
+               link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
         }
  
         clear_sp_write_flooding_count(it.sptep);
@@ -727,6 +855,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
         return gpa;
  }
  
+#if PTTYPE != PTTYPE_EPT
  static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
                                       u32 access,
                                       struct x86_exception *exception)
@@ -745,6 +874,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
  
         return gpa;
  }
+#endif
  
  /*
   * Using the cached information from sp->gfns is safe because:
@@ -785,15 +915,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                                           sizeof(pt_element_t)))
                         return -EINVAL;
  
-               if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) {
+               if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
                         vcpu->kvm->tlbs_dirty++;
                         continue;
                 }
  
                 gfn = gpte_to_gfn(gpte);
                 pte_access = sp->role.access;
-               pte_access &= gpte_access(vcpu, gpte);
-               protect_clean_gpte(&pte_access, gpte);
+               pte_access &= FNAME(gpte_access)(vcpu, gpte);
+               FNAME(protect_clean_gpte)(&pte_access, gpte);
  
                 if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access,
                       &nr_present))
@@ -830,3 +960,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
  #undef gpte_to_gfn
  #undef gpte_to_gfn_lvl
  #undef CMPXCHG
+#undef PT_GUEST_ACCESSED_MASK
+#undef PT_GUEST_DIRTY_MASK
+#undef PT_GUEST_DIRTY_SHIFT
+#undef PT_GUEST_ACCESSED_SHIFT
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c

index c53e797e7369ad4a1086899ee65eb1e71d048d84..5c4f63151b4d90a405808bda13daa911c5b2a131 100644 (file)
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -160,7 +160,7 @@ static void stop_counter(struct kvm_pmc *pmc)
  
  static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
                 unsigned config, bool exclude_user, bool exclude_kernel,
-               bool intr)
+               bool intr, bool in_tx, bool in_tx_cp)
  {
         struct perf_event *event;
         struct perf_event_attr attr = {
@@ -173,6 +173,10 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
                 .exclude_kernel = exclude_kernel,
                 .config = config,
         };
+       if (in_tx)
+               attr.config |= HSW_IN_TX;
+       if (in_tx_cp)
+               attr.config |= HSW_IN_TX_CHECKPOINTED;
  
         attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
  
@@ -226,7 +230,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
  
         if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
                                 ARCH_PERFMON_EVENTSEL_INV |
-                               ARCH_PERFMON_EVENTSEL_CMASK))) {
+                               ARCH_PERFMON_EVENTSEL_CMASK |
+                               HSW_IN_TX |
+                               HSW_IN_TX_CHECKPOINTED))) {
                 config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
                                 unit_mask);
                 if (config != PERF_COUNT_HW_MAX)
@@ -239,7 +245,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
         reprogram_counter(pmc, type, config,
                         !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
                         !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
-                       eventsel & ARCH_PERFMON_EVENTSEL_INT);
+                       eventsel & ARCH_PERFMON_EVENTSEL_INT,
+                       (eventsel & HSW_IN_TX),
+                       (eventsel & HSW_IN_TX_CHECKPOINTED));
  }
  
  static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
@@ -256,7 +264,7 @@ static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
                         arch_events[fixed_pmc_events[idx]].event_type,
                         !(en & 0x2), /* exclude user */
                         !(en & 0x1), /* exclude kernel */
-                       pmi);
+                       pmi, false, false);
  }
  
  static inline u8 fixed_en_pmi(u64 ctrl, int idx)
@@ -408,7 +416,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
                         if (data == pmc->eventsel)
                                 return 0;
-                       if (!(data & 0xffffffff00200000ull)) {
+                       if (!(data & pmu->reserved_bits)) {
                                 reprogram_gp_counter(pmc, data);
                                 return 0;
                         }
@@ -450,6 +458,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
         pmu->counter_bitmask[KVM_PMC_GP] = 0;
         pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
         pmu->version = 0;
+       pmu->reserved_bits = 0xffffffff00200000ull;
  
         entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
         if (!entry)
@@ -478,6 +487,12 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
         pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
                 (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
         pmu->global_ctrl_mask = ~pmu->global_ctrl;
+
+       entry = kvm_find_cpuid_entry(vcpu, 7, 0);
+       if (entry &&
+           (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
+           (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
+               pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
  }
  
  void kvm_pmu_init(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c

index 064d0be67ecc23734aa465541138d9b5be295277..1f1da43ff2a2ca66a137c434cf738dbf7a03e704 100644 (file)
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -373,6 +373,7 @@ struct nested_vmx {
          * we must keep them pinned while L2 runs.
          */
         struct page *apic_access_page;
+       u64 msr_ia32_feature_control;
  };
  
  #define POSTED_INTR_ON  0
@@ -711,10 +712,10 @@ static void nested_release_page_clean(struct page *page)
         kvm_release_page_clean(page);
  }
  
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
  static u64 construct_eptp(unsigned long root_hpa);
  static void kvm_cpu_vmxon(u64 addr);
  static void kvm_cpu_vmxoff(void);
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
  static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
  static void vmx_set_segment(struct kvm_vcpu *vcpu,
                             struct kvm_segment *var, int seg);
@@ -1039,12 +1040,16 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
                 (vmcs12->secondary_vm_exec_control & bit);
  }
  
-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
-       struct kvm_vcpu *vcpu)
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
  {
         return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
  }
  
+static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
+{
+       return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
+}
+
  static inline bool is_exception(u32 intr_info)
  {
         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2155,6 +2160,7 @@ static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
  static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
  static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
  static u32 nested_vmx_misc_low, nested_vmx_misc_high;
+static u32 nested_vmx_ept_caps;
  static __init void nested_vmx_setup_ctls_msrs(void)
  {
         /*
@@ -2190,14 +2196,17 @@ static __init void nested_vmx_setup_ctls_msrs(void)
          * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
          * 17 must be 1.
          */
+       rdmsr(MSR_IA32_VMX_EXIT_CTLS,
+               nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
         nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
         /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
+       nested_vmx_exit_ctls_high &=
  #ifdef CONFIG_X86_64
-       nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
-#else
-       nested_vmx_exit_ctls_high = 0;
+               VM_EXIT_HOST_ADDR_SPACE_SIZE |
  #endif
-       nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+               VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+       nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+                                     VM_EXIT_LOAD_IA32_EFER);
  
         /* entry controls */
         rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2205,8 +2214,12 @@ static __init void nested_vmx_setup_ctls_msrs(void)
         /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */
         nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
         nested_vmx_entry_ctls_high &=
-               VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
-       nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+#ifdef CONFIG_X86_64
+               VM_ENTRY_IA32E_MODE |
+#endif
+               VM_ENTRY_LOAD_IA32_PAT;
+       nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
+                                      VM_ENTRY_LOAD_IA32_EFER);
  
         /* cpu-based controls */
         rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2241,6 +2254,22 @@ static __init void nested_vmx_setup_ctls_msrs(void)
                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                 SECONDARY_EXEC_WBINVD_EXITING;
  
+       if (enable_ept) {
+               /* nested EPT: emulate EPT also to L1 */
+               nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
+               nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
+                        VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
+               nested_vmx_ept_caps &= vmx_capability.ept;
+               /*
+                * Since invept is completely emulated we support both global
+                * and context invalidation independent of what host cpu
+                * supports
+                */
+               nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
+                       VMX_EPT_EXTENT_CONTEXT_BIT;
+       } else
+               nested_vmx_ept_caps = 0;
+
         /* miscellaneous data */
         rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
         nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
@@ -2282,8 +2311,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
  
         switch (msr_index) {
         case MSR_IA32_FEATURE_CONTROL:
-               *pdata = 0;
-               break;
+               if (nested_vmx_allowed(vcpu)) {
+                       *pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control;
+                       break;
+               }
+               return 0;
         case MSR_IA32_VMX_BASIC:
                 /*
                  * This MSR reports some information about VMX support. We
@@ -2346,8 +2378,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
                                         nested_vmx_secondary_ctls_high);
                 break;
         case MSR_IA32_VMX_EPT_VPID_CAP:
-               /* Currently, no nested ept or nested vpid */
-               *pdata = 0;
+               /* Currently, no nested vpid support */
+               *pdata = nested_vmx_ept_caps;
                 break;
         default:
                 return 0;
@@ -2356,14 +2388,24 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
         return 1;
  }
  
-static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
  {
+       u32 msr_index = msr_info->index;
+       u64 data = msr_info->data;
+       bool host_initialized = msr_info->host_initiated;
+
         if (!nested_vmx_allowed(vcpu))
                 return 0;
  
-       if (msr_index == MSR_IA32_FEATURE_CONTROL)
-               /* TODO: the right thing. */
+       if (msr_index == MSR_IA32_FEATURE_CONTROL) {
+               if (!host_initialized &&
+                               to_vmx(vcpu)->nested.msr_ia32_feature_control
+                               & FEATURE_CONTROL_LOCKED)
+                       return 0;
+               to_vmx(vcpu)->nested.msr_ia32_feature_control = data;
                 return 1;
+       }
+
         /*
          * No need to treat VMX capability MSRs specially: If we don't handle
          * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
@@ -2494,7 +2536,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                         return 1;
                 /* Otherwise falls through */
         default:
-               if (vmx_set_vmx_msr(vcpu, msr_index, data))
+               if (vmx_set_vmx_msr(vcpu, msr_info))
                         break;
                 msr = find_msr_entry(vmx, msr_index);
                 if (msr) {
@@ -5302,9 +5344,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
  
         /* It is a write fault? */
         error_code = exit_qualification & (1U << 1);
+       /* It is a fetch fault? */
+       error_code |= (exit_qualification & (1U << 2)) << 2;
         /* ept page table is present? */
         error_code |= (exit_qualification >> 3) & 0x1;
  
+       vcpu->arch.exit_qualification = exit_qualification;
+
         return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
  }
  
@@ -5438,7 +5484,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
  
                 err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
  
-               if (err == EMULATE_DO_MMIO) {
+               if (err == EMULATE_USER_EXIT) {
+                       ++vcpu->stat.mmio_exits;
                         ret = 0;
                         goto out;
                 }
@@ -5567,8 +5614,47 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
                 free_loaded_vmcs(&vmx->vmcs01);
  }
  
+/*
+ * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
+ * set the success or error code of an emulated VMX instruction, as specified
+ * by Vol 2B, VMX Instruction Reference, "Conventions".
+ */
+static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
+{
+       vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
+                       & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+                           X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
+}
+
+static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
+{
+       vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+                       & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+                           X86_EFLAGS_SF | X86_EFLAGS_OF))
+                       | X86_EFLAGS_CF);
+}
+
  static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
-                                u32 vm_instruction_error);
+                                       u32 vm_instruction_error)
+{
+       if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
+               /*
+                * failValid writes the error number to the current VMCS, which
+                * can't be done if there isn't a current VMCS.
+                */
+               nested_vmx_failInvalid(vcpu);
+               return;
+       }
+       vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+                       & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+                           X86_EFLAGS_SF | X86_EFLAGS_OF))
+                       | X86_EFLAGS_ZF);
+       get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
+       /*
+        * We don't need to force a shadow sync because
+        * VM_INSTRUCTION_ERROR is not shadowed
+        */
+}
  
  /*
   * Emulate the VMXON instruction.
@@ -5583,6 +5669,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
         struct kvm_segment cs;
         struct vcpu_vmx *vmx = to_vmx(vcpu);
         struct vmcs *shadow_vmcs;
+       const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
+               | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
  
         /* The Intel VMX Instruction Reference lists a bunch of bits that
          * are prerequisite to running VMXON, most notably cr4.VMXE must be
@@ -5611,6 +5699,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
                 skip_emulated_instruction(vcpu);
                 return 1;
         }
+
+       if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
+                       != VMXON_NEEDED_FEATURES) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
         if (enable_shadow_vmcs) {
                 shadow_vmcs = alloc_vmcs();
                 if (!shadow_vmcs)
@@ -5628,6 +5723,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
         vmx->nested.vmxon = true;
  
         skip_emulated_instruction(vcpu);
+       nested_vmx_succeed(vcpu);
         return 1;
  }
  
@@ -5712,6 +5808,7 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
                 return 1;
         free_nested(to_vmx(vcpu));
         skip_emulated_instruction(vcpu);
+       nested_vmx_succeed(vcpu);
         return 1;
  }
  
@@ -5768,48 +5865,6 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
         return 0;
  }
  
-/*
- * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
- * set the success or error code of an emulated VMX instruction, as specified
- * by Vol 2B, VMX Instruction Reference, "Conventions".
- */
-static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
-{
-       vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
-                       & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
-                           X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
-}
-
-static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
-{
-       vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
-                       & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
-                           X86_EFLAGS_SF | X86_EFLAGS_OF))
-                       | X86_EFLAGS_CF);
-}
-
-static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
-                                       u32 vm_instruction_error)
-{
-       if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
-               /*
-                * failValid writes the error number to the current VMCS, which
-                * can't be done there isn't a current VMCS.
-                */
-               nested_vmx_failInvalid(vcpu);
-               return;
-       }
-       vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
-                       & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
-                           X86_EFLAGS_SF | X86_EFLAGS_OF))
-                       | X86_EFLAGS_ZF);
-       get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
-       /*
-        * We don't need to force a shadow sync because
-        * VM_INSTRUCTION_ERROR is not shadowed
-        */
-}
-
  /* Emulate the VMCLEAR instruction */
  static int handle_vmclear(struct kvm_vcpu *vcpu)
  {
@@ -5972,8 +6027,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
         unsigned long field;
         u64 field_value;
         struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
-       unsigned long *fields = (unsigned long *)shadow_read_write_fields;
-       int num_fields = max_shadow_read_write_fields;
+       const unsigned long *fields = shadow_read_write_fields;
+       const int num_fields = max_shadow_read_write_fields;
  
         vmcs_load(shadow_vmcs);
  
@@ -6002,12 +6057,11 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
  
  static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
  {
-       unsigned long *fields[] = {
-               (unsigned long *)shadow_read_write_fields,
-               (unsigned long *)shadow_read_only_fields
+       const unsigned long *fields[] = {
+               shadow_read_write_fields,
+               shadow_read_only_fields
         };
-       int num_lists =  ARRAY_SIZE(fields);
-       int max_fields[] = {
+       const int max_fields[] = {
                 max_shadow_read_write_fields,
                 max_shadow_read_only_fields
         };
@@ -6018,7 +6072,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
  
         vmcs_load(shadow_vmcs);
  
-       for (q = 0; q < num_lists; q++) {
+       for (q = 0; q < ARRAY_SIZE(fields); q++) {
                 for (i = 0; i < max_fields[q]; i++) {
                         field = fields[q][i];
                         vmcs12_read_any(&vmx->vcpu, field, &field_value);
@@ -6248,6 +6302,74 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
         return 1;
  }
  
+/* Emulate INVEPT: check the type, read the operand, sync/flush; returns 1. */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+       u32 vmx_instruction_info, types;
+       unsigned long type;     /* guest register value, not yet validated */
+       gva_t gva;
+       struct x86_exception e;
+       struct {
+               u64 eptp, gpa;
+       } operand;              /* in-memory INVEPT operand read from the guest */
+       u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK;
+
+       if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
+           !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
+       if (!nested_vmx_check_permission(vcpu))
+               return 1;
+
+       if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
+       vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+       type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+       types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
+
+       if (type >= 32 || !(types & (1U << type))) { /* bound the shift count */
+               nested_vmx_failValid(vcpu,
+                               VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+               return 1;
+       }
+
+       /* According to the Intel VMX instruction reference, the memory
+        * operand is read even if it isn't needed (e.g., for type==global)
+        */
+       if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+                       vmx_instruction_info, &gva))
+               return 1;
+       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+                               sizeof(operand), &e)) {
+               kvm_inject_page_fault(vcpu, &e);
+               return 1;
+       }
+
+       switch (type) {
+       case VMX_EPT_EXTENT_CONTEXT:
+               if ((operand.eptp & eptp_mask) !=
+                               (nested_ept_get_cr3(vcpu) & eptp_mask))
+                       break;  /* different EPTP: nothing to flush */
+       case VMX_EPT_EXTENT_GLOBAL:     /* matching context falls through here */
+               kvm_mmu_sync_roots(vcpu);
+               kvm_mmu_flush_tlb(vcpu);
+               nested_vmx_succeed(vcpu);
+               break;
+       default:                /* presumably unreachable after the type check */
+               BUG_ON(1);
+               break;
+       }
+
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
+
  /*
   * The exit handlers return 1 if the exit was handled fully and guest execution
   * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -6292,6 +6414,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
         [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
         [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_invalid_op,
         [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
+       [EXIT_REASON_INVEPT]                  = handle_invept,
  };
  
  static const int kvm_vmx_max_exit_handlers =
@@ -6518,6 +6641,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
         case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
         case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
         case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+       case EXIT_REASON_INVEPT:
                 /*
                  * VMX instructions trap unconditionally. This allows L1 to
                  * emulate them for its L2 guest, i.e., allows 3-level nesting!
@@ -6550,7 +6674,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                 return nested_cpu_has2(vmcs12,
                         SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
         case EXIT_REASON_EPT_VIOLATION:
+               /*
+                * L0 always deals with the EPT violation. If nested EPT is
+                * used, and the nested mmu code discovers that the address is
+                * missing in the guest EPT table (EPT12), the EPT violation
+                * will be injected with nested_ept_inject_page_fault()
+                */
+               return 0;
         case EXIT_REASON_EPT_MISCONFIG:
+               /*
+                * L2 never uses L1's EPT directly, but rather L0's own EPT
+                * table (shadow on EPT) or a merged EPT table that L0 built
+                * (EPT on EPT). So any problem with the structure of the
+                * table is L0's fault.
+                */
                 return 0;
         case EXIT_REASON_PREEMPTION_TIMER:
                 return vmcs12->pin_based_vm_exec_control &
@@ -6638,7 +6775,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
  
         if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
             !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
-                                       get_vmcs12(vcpu), vcpu)))) {
+                                       get_vmcs12(vcpu))))) {
                 if (vmx_interrupt_allowed(vcpu)) {
                         vmx->soft_vnmi_blocked = 0;
                 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -7326,6 +7463,48 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
                 entry->ecx |= bit(X86_FEATURE_VMX);
  }
  
+/*
+ * Report a fault raised while walking EPT12 to L1.  A nested VM exit is
+ * performed first; vmcs12 is then fetched and filled with the exit reason,
+ * the exit qualification saved in vcpu->arch and the faulting address.
+ * A fault whose error code has PFERR_RSVD_MASK set is reported as an EPT
+ * misconfiguration, any other fault as an EPT violation.
+ */
+static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
+               struct x86_exception *fault)
+{
+       struct vmcs12 *vmcs12;
+
+       nested_vmx_vmexit(vcpu);
+       vmcs12 = get_vmcs12(vcpu);
+
+       vmcs12->vm_exit_reason = (fault->error_code & PFERR_RSVD_MASK) ?
+               EXIT_REASON_EPT_MISCONFIG : EXIT_REASON_EPT_VIOLATION;
+       vmcs12->exit_qualification = vcpu->arch.exit_qualification;
+       vmcs12->guest_physical_address = fault->address;
+}
+
+/* Callbacks for nested_ept_init_mmu_context: */
+
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
+{
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+       /* The page table being shadowed is EPT12: vmcs12's EPT pointer. */
+       return vmcs12->ept_pointer;
+}
+
+/*
+ * Initialise vcpu->arch.mmu as a shadow-EPT MMU, install the nested-EPT
+ * callbacks on it and make nested_mmu the walk MMU.
+ *
+ * Returns the result of kvm_init_shadow_ept_mmu().  The callbacks and
+ * walk_mmu are set regardless of that result.
+ */
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+       struct kvm_mmu *mmu = &vcpu->arch.mmu;
+       int r;
+
+       r = kvm_init_shadow_ept_mmu(vcpu, mmu,
+                       nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
+
+       mmu->set_cr3 = vmx_set_cr3;
+       mmu->get_cr3 = nested_ept_get_cr3;
+       mmu->inject_page_fault = nested_ept_inject_page_fault;
+
+       vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
+
+       return r;
+}
+
+/*
+ * Point walk_mmu back at arch.mmu; nested_ept_init_mmu_context() had
+ * pointed it at arch.nested_mmu.
+ */
+static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+}
+
  /*
   * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
   * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -7388,7 +7567,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                 vmcs12->guest_interruptibility_info);
         vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
         kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
-       vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
+       vmx_set_rflags(vcpu, vmcs12->guest_rflags);
         vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
                 vmcs12->guest_pending_dbg_exceptions);
         vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
@@ -7508,15 +7687,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
         vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
         vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
  
-       /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
-       vmcs_write32(VM_EXIT_CONTROLS,
-               vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
-       vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
+       /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
+        * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
+        * bits are further modified by vmx_set_efer() below.
+        */
+       vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+       /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
+        * emulated by vmx_set_efer(), below.
+        */
+       vmcs_write32(VM_ENTRY_CONTROLS,
+               (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
+                       ~VM_ENTRY_IA32E_MODE) |
                 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
  
-       if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
+       if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) {
                 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
-       else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+               vcpu->arch.pat = vmcs12->guest_ia32_pat;
+       } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
                 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
  
  
@@ -7538,6 +7726,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                 vmx_flush_tlb(vcpu);
         }
  
+       if (nested_cpu_has_ept(vmcs12)) {
+               kvm_mmu_unload(vcpu);
+               nested_ept_init_mmu_context(vcpu);
+       }
+
         if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
                 vcpu->arch.efer = vmcs12->guest_ia32_efer;
         else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
@@ -7565,6 +7758,16 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
         kvm_set_cr3(vcpu, vmcs12->guest_cr3);
         kvm_mmu_reset_context(vcpu);
  
+       /*
+        * L1 may access the L2's PDPTR, so save them to construct vmcs12
+        */
+       if (enable_ept) {
+               vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+               vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+               vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+               vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+       }
+
         kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
         kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
  }
@@ -7887,6 +8090,22 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
         vmcs12->guest_pending_dbg_exceptions =
                 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
  
+       /*
+        * In some cases (usually, nested EPT), L2 is allowed to change its
+        * own CR3 without exiting. If it has changed it, we must keep it.
+        * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
+        * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
+        *
+        * Additionally, restore L2's PDPTR to vmcs12.
+        */
+       if (enable_ept) {
+               vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
+               vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+               vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+               vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+               vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+       }
+
         vmcs12->vm_entry_controls =
                 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
                 (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
@@ -7948,6 +8167,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
  static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
                                    struct vmcs12 *vmcs12)
  {
+       struct kvm_segment seg;
+
         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
                 vcpu->arch.efer = vmcs12->host_ia32_efer;
         else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
@@ -7982,7 +8203,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
         vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
         kvm_set_cr4(vcpu, vmcs12->host_cr4);
  
-       /* shadow page tables on either EPT or shadow page tables */
+       if (nested_cpu_has_ept(vmcs12))
+               nested_ept_uninit_mmu_context(vcpu);
+
         kvm_set_cr3(vcpu, vmcs12->host_cr3);
         kvm_mmu_reset_context(vcpu);
  
@@ -8001,23 +8224,61 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
         vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
         vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
         vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
-       vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base);
-       vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base);
-       vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
-       vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
-       vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
-       vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
-       vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
-       vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
-       vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
-       vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);
-
-       if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
+
+       if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
                 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
+               vcpu->arch.pat = vmcs12->host_ia32_pat;
+       }
         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
                 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
                         vmcs12->host_ia32_perf_global_ctrl);
  
+       /* Set L1 segment info according to Intel SDM
+           27.5.2 Loading Host Segment and Descriptor-Table Registers */
+       seg = (struct kvm_segment) {
+               .base = 0,
+               .limit = 0xFFFFFFFF,
+               .selector = vmcs12->host_cs_selector,
+               .type = 11,
+               .present = 1,
+               .s = 1,
+               .g = 1
+       };
+       if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
+               seg.l = 1;
+       else
+               seg.db = 1;
+       vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
+       seg = (struct kvm_segment) {
+               .base = 0,
+               .limit = 0xFFFFFFFF,
+               .type = 3,
+               .present = 1,
+               .s = 1,
+               .db = 1,
+               .g = 1
+       };
+       seg.selector = vmcs12->host_ds_selector;
+       vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
+       seg.selector = vmcs12->host_es_selector;
+       vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
+       seg.selector = vmcs12->host_ss_selector;
+       vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
+       seg.selector = vmcs12->host_fs_selector;
+       seg.base = vmcs12->host_fs_base;
+       vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
+       seg.selector = vmcs12->host_gs_selector;
+       seg.base = vmcs12->host_gs_base;
+       vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
+       seg = (struct kvm_segment) {
+               .base = vmcs12->host_tr_base,
+               .limit = 0x67,
+               .selector = vmcs12->host_tr_selector,
+               .type = 11,
+               .present = 1
+       };
+       vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
+
         kvm_set_dr(vcpu, 7, 0x400);
         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
  }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index d21bce5053155535f51d1507f7f38306023eb994..e5ca72a5cdb6da13617033ad8c0c65c4391d9e2f 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -682,17 +682,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
                  */
         }
  
-       /*
-        * Does the new cr3 value map to physical memory? (Note, we
-        * catch an invalid cr3 even in real-mode, because it would
-        * cause trouble later on when we turn on paging anyway.)
-        *
-        * A real CPU would silently accept an invalid cr3 and would
-        * attempt to use it - with largely undefined (and often hard
-        * to debug) behavior on the guest side.
-        */
-       if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
-               return 1;
         vcpu->arch.cr3 = cr3;
         __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
         vcpu->arch.mmu.new_cr3(vcpu);
@@ -850,7 +839,8 @@ static u32 msrs_to_save[] = {
  #ifdef CONFIG_X86_64
         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
  #endif
-       MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+       MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+       MSR_IA32_FEATURE_CONTROL
  };
  
  static unsigned num_msrs_to_save;
@@ -1457,6 +1447,29 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
  #endif
  }
  
+/*
+ * Refresh the VM-wide pvclock gtod copy and request a clock update on
+ * every vcpu of @kvm.
+ *
+ * All work happens under pvclock_gtod_sync_lock.
+ * kvm_make_mclock_inprogress_request() is issued before the copy is
+ * updated, and KVM_REQ_MCLOCK_INPROGRESS is cleared on every vcpu before
+ * the lock is dropped.  The body is compiled only with CONFIG_X86_64;
+ * otherwise the function is empty.
+ */
+static void kvm_gen_update_masterclock(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+       int i;
+       struct kvm_vcpu *vcpu;
+       struct kvm_arch *ka = &kvm->arch;
+
+       spin_lock(&ka->pvclock_gtod_sync_lock);
+       kvm_make_mclock_inprogress_request(kvm);
+       /* no guest entries from this point */
+       pvclock_update_vm_gtod_copy(kvm);
+
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+
+       /* guest entries allowed */
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
+
+       spin_unlock(&ka->pvclock_gtod_sync_lock);
+#endif
+}
+
  static int kvm_guest_time_update(struct kvm_vcpu *v)
  {
         unsigned long flags, this_tsc_khz;
@@ -3806,6 +3819,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
                 delta = user_ns.clock - now_ns;
                 local_irq_enable();
                 kvm->arch.kvmclock_offset = delta;
+               kvm_gen_update_masterclock(kvm);
                 break;
         }
         case KVM_GET_CLOCK: {
@@ -4955,6 +4969,97 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
  static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
  static int complete_emulated_pio(struct kvm_vcpu *vcpu);
  
+/*
+ * Match @addr against the four breakpoint addresses in @db, as configured
+ * by @dr7.
+ *
+ * Breakpoint i matches when at least one of its two enable bits
+ * (dr7 bits 2i and 2i+1) is set, its 4-bit field at dr7 bits
+ * 16+4i..19+4i equals @type, and db[i] equals @addr.
+ *
+ * Returns a mask with bit i set for every matching breakpoint, or 0 if
+ * none matched.
+ */
+static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
+                               unsigned long *db)
+{
+       u32 hits = 0;
+       int i;
+
+       for (i = 0; i < 4; i++) {
+               u32 enable = (dr7 >> (2 * i)) & 3;
+               u32 rwlen = (dr7 >> (16 + 4 * i)) & 15;
+
+               if (enable && rwlen == type && db[i] == addr)
+                       hits |= (1 << i);
+       }
+
+       return hits;
+}
+
+/*
+ * Deliver a single-step trap if TF is set in the flags reported by
+ * kvm_x86_ops->get_rflags().
+ *
+ * With KVM_GUESTDBG_SINGLESTEP set in vcpu->guest_debug, the trap goes to
+ * userspace: kvm_run->debug is filled in and *r becomes EMULATE_USER_EXIT.
+ * Otherwise TF is cleared in the emulation context's eflags, DR6 gets
+ * DR6_BS and a #DB is queued for the guest; *r is left untouched.
+ */
+static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r)
+{
+       struct kvm_run *run = vcpu->run;
+       unsigned long rflags;
+
+       /*
+        * Look at the "raw" flags to learn whether TF was passed to the
+        * processor; the new flags value has not been saved yet.
+        *
+        * This also holds for a TF set by the guest itself, because "the
+        * processor will not generate this exception after the instruction
+        * that sets the TF flag".
+        */
+       rflags = kvm_x86_ops->get_rflags(vcpu);
+       if (likely(!(rflags & X86_EFLAGS_TF)))
+               return;
+
+       if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+               run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1;
+               run->debug.arch.pc = vcpu->arch.singlestep_rip;
+               run->debug.arch.exception = DB_VECTOR;
+               run->exit_reason = KVM_EXIT_DEBUG;
+               *r = EMULATE_USER_EXIT;
+               return;
+       }
+
+       vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
+       /*
+        * "Certain debug exceptions may clear bit 0-3.  The
+        * remaining contents of the DR6 register are never
+        * cleared by the processor".
+        */
+       vcpu->arch.dr6 &= ~15;
+       vcpu->arch.dr6 |= DR6_BS;
+       kvm_queue_exception(vcpu, DB_VECTOR);
+}
+
+/*
+ * Check the emulation context's eip against hardware breakpoints of
+ * type 0 (instruction execution in the x86 DR7 encoding).
+ *
+ * Two sets of breakpoints are checked, in this order:
+ *  1. guest_debug_dr7/eff_db, only when KVM_GUESTDBG_USE_HW_BP is set in
+ *     vcpu->guest_debug.  A hit fills kvm_run->debug, sets the exit reason
+ *     to KVM_EXIT_DEBUG and stores EMULATE_USER_EXIT in *r.
+ *  2. The vcpu's own dr7/db.  A hit replaces the low four bits of
+ *     vcpu->arch.dr6 with the match mask, queues a #DB and stores
+ *     EMULATE_DONE in *r.
+ *
+ * Returns true if a breakpoint matched (and *r was written), false
+ * otherwise; *r is not touched when nothing matches.
+ */
+static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
+{
+       struct kvm_run *kvm_run = vcpu->run;
+       unsigned long eip = vcpu->arch.emulate_ctxt.eip;
+       u32 dr6 = 0;
+
+       if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
+           (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
+               dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+                                          vcpu->arch.guest_debug_dr7,
+                                          vcpu->arch.eff_db);
+
+               if (dr6 != 0) {
+                       kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
+                       /* Reported pc is rip plus the CS segment base. */
+                       kvm_run->debug.arch.pc = kvm_rip_read(vcpu) +
+                               get_segment_base(vcpu, VCPU_SREG_CS);
+
+                       kvm_run->debug.arch.exception = DB_VECTOR;
+                       kvm_run->exit_reason = KVM_EXIT_DEBUG;
+                       *r = EMULATE_USER_EXIT;
+                       return true;
+               }
+       }
+
+       if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) {
+               dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+                                          vcpu->arch.dr7,
+                                          vcpu->arch.db);
+
+               if (dr6 != 0) {
+                       /* Replace only the low four (per-breakpoint) bits. */
+                       vcpu->arch.dr6 &= ~15;
+                       vcpu->arch.dr6 |= dr6;
+                       kvm_queue_exception(vcpu, DB_VECTOR);
+                       *r = EMULATE_DONE;
+                       return true;
+               }
+       }
+
+       return false;
+}
+
  int x86_emulate_instruction(struct kvm_vcpu *vcpu,
                             unsigned long cr2,
                             int emulation_type,
@@ -4975,6 +5080,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
  
         if (!(emulation_type & EMULTYPE_NO_DECODE)) {
                 init_emulate_ctxt(vcpu);
+
+               /*
+                * We will reenter on the same instruction since
+                * we do not set complete_userspace_io.  This does not
+                * handle watchpoints yet, those would be handled in
+                * the emulate_ops.
+                */
+               if (kvm_vcpu_check_breakpoint(vcpu, &r))
+                       return r;
+
                 ctxt->interruptibility = 0;
                 ctxt->have_exception = false;
                 ctxt->perm_ok = false;
@@ -5031,17 +5146,18 @@ restart:
                 inject_emulated_exception(vcpu);
                 r = EMULATE_DONE;
         } else if (vcpu->arch.pio.count) {
-               if (!vcpu->arch.pio.in)
+               if (!vcpu->arch.pio.in) {
+                       /* FIXME: return into emulator if single-stepping.  */
                         vcpu->arch.pio.count = 0;
-               else {
+               } else {
                         writeback = false;
                         vcpu->arch.complete_userspace_io = complete_emulated_pio;
                 }
-               r = EMULATE_DO_MMIO;
+               r = EMULATE_USER_EXIT;
         } else if (vcpu->mmio_needed) {
                 if (!vcpu->mmio_is_write)
                         writeback = false;
-               r = EMULATE_DO_MMIO;
+               r = EMULATE_USER_EXIT;
                 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
         } else if (r == EMULATION_RESTART)
                 goto restart;
@@ -5050,10 +5166,12 @@ restart:
  
         if (writeback) {
                 toggle_interruptibility(vcpu, ctxt->interruptibility);
-               kvm_set_rflags(vcpu, ctxt->eflags);
                 kvm_make_request(KVM_REQ_EVENT, vcpu);
                 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
                 kvm_rip_write(vcpu, ctxt->eip);
+               if (r == EMULATE_DONE)
+                       kvm_vcpu_check_singlestep(vcpu, &r);
+               kvm_set_rflags(vcpu, ctxt->eflags);
         } else
                 vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
  
@@ -5347,7 +5465,7 @@ static struct notifier_block pvclock_gtod_notifier = {
  int kvm_arch_init(void *opaque)
  {
         int r;
-       struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
+       struct kvm_x86_ops *ops = opaque;
  
         if (kvm_x86_ops) {
                 printk(KERN_ERR "kvm: already loaded the other module\n");
@@ -5495,6 +5613,23 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
         return 1;
  }
  
+/*
+ * kvm_pv_kick_cpu_op:  Kick a vcpu.
+ *
+ * @kvm    - VM the request is delivered in.
+ * @flags  - first hypercall argument; not used by this function.
+ * @apicid - apicid of vcpu to be kicked.
+ *
+ * Delivers an APIC_DM_REMRD message to @apicid, with shorthand and
+ * dest_mode both 0.
+ */
+static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
+{
+       /*
+        * Use an initializer so that every field not named here is zeroed;
+        * the struct is handed to the APIC delivery code as a whole and
+        * must not carry uninitialized stack contents.
+        */
+       struct kvm_lapic_irq lapic_irq = {
+               .shorthand = 0,
+               .dest_mode = 0,
+               .dest_id = apicid,
+               .delivery_mode = APIC_DM_REMRD,
+       };
+
+       kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
+}
+
  int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
  {
         unsigned long nr, a0, a1, a2, a3, ret;
@@ -5528,6 +5663,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
         case KVM_HC_VAPIC_POLL_IRQ:
                 ret = 0;
                 break;
+       case KVM_HC_KICK_CPU:
+               kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
+               ret = 0;
+               break;
         default:
                 ret = -KVM_ENOSYS;
                 break;
@@ -5689,29 +5828,6 @@ static void process_nmi(struct kvm_vcpu *vcpu)
         kvm_make_request(KVM_REQ_EVENT, vcpu);
  }
  
-static void kvm_gen_update_masterclock(struct kvm *kvm)
-{
-#ifdef CONFIG_X86_64
-       int i;
-       struct kvm_vcpu *vcpu;
-       struct kvm_arch *ka = &kvm->arch;
-
-       spin_lock(&ka->pvclock_gtod_sync_lock);
-       kvm_make_mclock_inprogress_request(kvm);
-       /* no guest entries from this point */
-       pvclock_update_vm_gtod_copy(kvm);
-
-       kvm_for_each_vcpu(i, vcpu, kvm)
-               set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
-
-       /* guest entries allowed */
-       kvm_for_each_vcpu(i, vcpu, kvm)
-               clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
-
-       spin_unlock(&ka->pvclock_gtod_sync_lock);
-#endif
-}
-
  static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
  {
         u64 eoi_exit_bitmap[4];
@@ -5950,6 +6066,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
                                 kvm_apic_accept_events(vcpu);
                                 switch(vcpu->arch.mp_state) {
                                 case KVM_MP_STATE_HALTED:
+                                       vcpu->arch.pv.pv_unhalted = false;
                                         vcpu->arch.mp_state =
                                                 KVM_MP_STATE_RUNNABLE;
                                 case KVM_MP_STATE_RUNNABLE:
@@ -6061,6 +6178,8 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
  
         if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
                 vcpu->mmio_needed = 0;
+
+               /* FIXME: return into emulator if single-stepping.  */
                 if (vcpu->mmio_is_write)
                         return 1;
                 vcpu->mmio_read_completed = 1;
@@ -6249,7 +6368,12 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                     struct kvm_mp_state *mp_state)
  {
         kvm_apic_accept_events(vcpu);
-       mp_state->mp_state = vcpu->arch.mp_state;
+       if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
+                                       vcpu->arch.pv.pv_unhalted)
+               mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
+       else
+               mp_state->mp_state = vcpu->arch.mp_state;
+
         return 0;
  }
  
@@ -6770,6 +6894,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
         BUG_ON(vcpu->kvm == NULL);
         kvm = vcpu->kvm;
  
+       vcpu->arch.pv.pv_unhalted = false;
         vcpu->arch.emulate_ctxt.ops = &emulate_ops;
         if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -7019,6 +7144,15 @@ out_free:
         return -ENOMEM;
  }
  
+/*
+ * Invalidate all mmio sptes of @kvm.
+ * NOTE(review): presumably invoked by generic KVM code after a memslot
+ * update -- the caller is not visible here; confirm.
+ */
+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+       /*
+        * memslots->generation has been incremented.
+        * mmio generation may have reached its maximum value.
+        */
+       kvm_mmu_invalidate_mmio_sptes(kvm);
+}
+
  int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                 struct kvm_memory_slot *memslot,
                                 struct kvm_userspace_memory_region *mem,
@@ -7079,11 +7213,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
          */
         if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
                 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
-       /*
-        * If memory slot is created, or moved, we need to clear all
-        * mmio sptes.
-        */
-       kvm_mmu_invalidate_mmio_sptes(kvm);
  }
  
  void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -7103,6 +7232,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
                 !vcpu->arch.apf.halted)
                 || !list_empty_careful(&vcpu->async_pf.done)
                 || kvm_apic_has_events(vcpu)
+               || vcpu->arch.pv.pv_unhalted
                 || atomic_read(&vcpu->arch.nmi_queued) ||
                 (kvm_arch_interrupt_allowed(vcpu) &&
                  kvm_cpu_has_interrupt(vcpu));
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c

index c74436e687bf8984b722efa1a657e1a6a8b47f3b..72074d5284009a35265580a13eaa99e6f11588b4 100644 (file)
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -85,15 +85,18 @@ static notrace cycle_t vread_pvclock(int *mode)
         cycle_t ret;
         u64 last;
         u32 version;
-       u32 migrate_count;
         u8 flags;
         unsigned cpu, cpu1;
  
  
         /*
-        * When looping to get a consistent (time-info, tsc) pair, we
-        * also need to deal with the possibility we can switch vcpus,
-        * so make sure we always re-fetch time-info for the current vcpu.
+        * Note: hypervisor must guarantee that:
+        * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
+        * 2. that per-CPU pvclock time info is updated if the
+        *    underlying CPU changes.
+        * 3. that version is increased whenever underlying CPU
+        *    changes.
+        *
          */
         do {
                 cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -104,8 +107,6 @@ static notrace cycle_t vread_pvclock(int *mode)
  
                 pvti = get_pvti(cpu);
  
-               migrate_count = pvti->migrate_count;
-
                 version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
  
                 /*
@@ -117,8 +118,7 @@ static notrace cycle_t vread_pvclock(int *mode)
                 cpu1 = __getcpu() & VGETCPU_CPU_MASK;
         } while (unlikely(cpu != cpu1 ||
                           (pvti->pvti.version & 1) ||
-                         pvti->pvti.version != version ||
-                         pvti->migrate_count != migrate_count));
+                         pvti->pvti.version != version));
  
         if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
                 *mode = VCLOCK_NONE;
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig

index 5daa2599ed48d3fad44223dc904d0cfdc2cbad09..e373671652b0914c10f3237ada35998be9de7fcf 100644 (file)
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -200,11 +200,9 @@ config DMA_SHARED_BUFFER
           APIs extension; the file's descriptor can then be passed on to other
           driver.
  
-config CMA
-       bool "Contiguous Memory Allocator"
-       depends on HAVE_DMA_CONTIGUOUS && HAVE_MEMBLOCK
-       select MIGRATION
-       select MEMORY_ISOLATION
+config DMA_CMA
+       bool "DMA Contiguous Memory Allocator"
+       depends on HAVE_DMA_CONTIGUOUS && CMA
         help
           This enables the Contiguous Memory Allocator which allows drivers
           to allocate big physically-contiguous blocks of memory for use with
@@ -213,17 +211,7 @@ config CMA
           For more information see <include/linux/dma-contiguous.h>.
           If unsure, say "n".
  
-if CMA
-
-config CMA_DEBUG
-       bool "CMA debug messages (DEVELOPMENT)"
-       depends on DEBUG_KERNEL
-       help
-         Turns on debug messages in CMA.  This produces KERN_DEBUG
-         messages for every CMA call as well as various messages while
-         processing calls such as dma_alloc_from_contiguous().
-         This option does not affect warning and error messages.
-
+if  DMA_CMA
  comment "Default contiguous memory area size:"
  
  config CMA_SIZE_MBYTES
diff --git a/drivers/base/Makefile b/drivers/base/Makefile

index 48029aa477d94b1c6b01fe90b367a9e77e2e5dfe..94e8a80e87f87e545638f257ead04c56f73f088b 100644 (file)
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -6,7 +6,7 @@ obj-y                   := core.o bus.o dd.o syscore.o \
                            attribute_container.o transport_class.o \
                            topology.o
  obj-$(CONFIG_DEVTMPFS) += devtmpfs.o
-obj-$(CONFIG_CMA) += dma-contiguous.o
+obj-$(CONFIG_DMA_CMA) += dma-contiguous.o
  obj-y                  += power/
  obj-$(CONFIG_HAS_DMA)  += dma-mapping.o
  obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h

index 343744e4809c17f6c93269c1070edfd43c7575ee..7e2d15837b02ca36b9ae6b9beb4bb368f2fa02bc 100644 (file)
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -26,7 +26,7 @@
  #include <linux/types.h>
  #include <linux/irqchip/arm-gic.h>
  
-#define VGIC_NR_IRQS           128
+#define VGIC_NR_IRQS           256
  #define VGIC_NR_SGIS           16
  #define VGIC_NR_PPIS           16
  #define VGIC_NR_PRIVATE_IRQS   (VGIC_NR_SGIS + VGIC_NR_PPIS)
diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h

index 01b5c84be8284fef2f721e26c1b3fb3d95a72bbf..00141d3325fe2dbe0f0c649f3580d9ec473b7f0c 100644 (file)
--- a/include/linux/dma-contiguous.h
+++ b/include/linux/dma-contiguous.h
@@ -57,7 +57,7 @@ struct cma;
  struct page;
  struct device;
  
-#ifdef CONFIG_CMA
+#ifdef CONFIG_DMA_CMA
  
  /*
   * There is always at least global CMA area and a few optional device
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h

index a63d83ebd151918b84fa6bb6ddc594e4dcd0c907..ca645a01d37a79767baccfd12a8871b79af7b624 100644 (file)
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -85,6 +85,12 @@ static inline bool is_noslot_pfn(pfn_t pfn)
         return pfn == KVM_PFN_NOSLOT;
  }
  
+/*
+ * Architectures with a KVM_HVA_ERR_BAD other than PAGE_OFFSET (e.g. s390)
+ * provide their own defines and kvm_is_error_hva().
+ */
+#ifndef KVM_HVA_ERR_BAD
+
  #define KVM_HVA_ERR_BAD                (PAGE_OFFSET)
  #define KVM_HVA_ERR_RO_BAD     (PAGE_OFFSET + PAGE_SIZE)
  
@@ -93,6 +99,8 @@ static inline bool kvm_is_error_hva(unsigned long addr)
         return addr >= PAGE_OFFSET;
  }
  
+#endif
+
  #define KVM_ERR_PTR_BAD_PAGE   (ERR_PTR(-ENOENT))
  
  static inline bool is_error_page(struct page *page)
@@ -160,8 +168,12 @@ enum kvm_bus {
  
  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                      int len, const void *val);
+int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+                           int len, const void *val, long cookie);
  int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len,
                     void *val);
+int kvm_io_bus_read_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+                          int len, void *val, long cookie);
  int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                             int len, struct kvm_io_device *dev);
  int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
@@ -499,6 +511,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
  void kvm_arch_free_memslot(struct kvm_memory_slot *free,
                            struct kvm_memory_slot *dont);
  int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages);
+void kvm_arch_memslots_updated(struct kvm *kvm);
  int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                 struct kvm_memory_slot *memslot,
                                 struct kvm_userspace_memory_region *mem,
diff --git a/include/linux/sched.h b/include/linux/sched.h

index f79ced7194355ef01f282d38759689e0b7ee3059..ce1e1c0aaa337fab9dda6de7f8f6d7529ca8a270 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -107,14 +107,6 @@ extern unsigned long this_cpu_load(void);
  extern void calc_global_load(unsigned long ticks);
  extern void update_cpu_load_nohz(void);
  
-/* Notifier for when a task gets migrated to a new CPU */
-struct task_migration_notifier {
-       struct task_struct *task;
-       int from_cpu;
-       int to_cpu;
-};
-extern void register_task_migration_notifier(struct notifier_block *n);
-
  extern unsigned long get_parent_ip(unsigned long addr);
  
  extern void dump_cpu_task(int cpu);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h

index acccd08be6c7563f6c2c316f6c2530563a7d3cb5..99c25338ede88c755698fe892983665decb005bc 100644 (file)
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -667,6 +667,7 @@ struct kvm_ppc_smmu_info {
  #define KVM_CAP_PPC_RTAS 91
  #define KVM_CAP_IRQ_XICS 92
  #define KVM_CAP_ARM_EL1_32BIT 93
+#define KVM_CAP_SPAPR_MULTITCE 94
  
  #ifdef KVM_CAP_IRQ_ROUTING
  
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 725aa067ad63c4d1a08f7e16975e625ee19c8c09..5ac63c9a995a3570e0ad73a20b28c23cb972a963 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -978,13 +978,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
                 rq->skip_clock_update = 1;
  }
  
-static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
-
-void register_task_migration_notifier(struct notifier_block *n)
-{
-       atomic_notifier_chain_register(&task_migration_notifier, n);
-}
-
  #ifdef CONFIG_SMP
  void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  {
@@ -1015,18 +1008,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
         trace_sched_migrate_task(p, new_cpu);
  
         if (task_cpu(p) != new_cpu) {
-               struct task_migration_notifier tmn;
-
                 if (p->sched_class->migrate_task_rq)
                         p->sched_class->migrate_task_rq(p, new_cpu);
                 p->se.nr_migrations++;
                 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
-
-               tmn.task = p;
-               tmn.from_cpu = task_cpu(p);
-               tmn.to_cpu = new_cpu;
-
-               atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
         }
  
         __set_task_cpu(p, new_cpu);
diff --git a/mm/Kconfig b/mm/Kconfig

index 8028dcc6615c6bbd533adf060ea3d9db36b9a48c..6cdd27043303e7473925bbc04778a43481dc2205 100644 (file)
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -478,6 +478,30 @@ config FRONTSWAP
  
           If unsure, say Y to enable frontswap.
  
+config CMA
+       bool "Contiguous Memory Allocator"
+       depends on HAVE_MEMBLOCK
+       select MIGRATION
+       select MEMORY_ISOLATION
+       help
+         This enables the Contiguous Memory Allocator which allows other
+         subsystems to allocate big physically-contiguous blocks of memory.
+         CMA reserves a region of memory and allows only movable pages to
+         be allocated from it. This way, the kernel can use the memory for
+         pagecache and when a subsystem requests for contiguous area, the
+         allocated pages are migrated away to serve the contiguous request.
+
+         If unsure, say "n".
+
+config CMA_DEBUG
+       bool "CMA debug messages (DEVELOPMENT)"
+       depends on DEBUG_KERNEL && CMA
+       help
+         Turns on debug messages in CMA.  This produces KERN_DEBUG
+         messages for every CMA call as well as various messages while
+         processing calls such as dma_alloc_from_contiguous().
+         This option does not affect warning and error messages.
+
  config ZBUD
         tristate
         default n
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c

index 17c5ac7d10ed173753dd96ae4a7b25aa3106998b..685fc72fc751fae3f898c9de4d194365521ea5a0 100644 (file)
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -149,7 +149,7 @@ static u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset)
  {
         offset >>= 2;
         BUG_ON(offset > (VGIC_NR_IRQS / 4));
-       if (offset < 4)
+       if (offset < 8)
                 return x->percpu[cpuid] + offset;
         else
                 return x->shared + offset - 8;
@@ -432,19 +432,13 @@ static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu,
  static u32 vgic_get_target_reg(struct kvm *kvm, int irq)
  {
         struct vgic_dist *dist = &kvm->arch.vgic;
-       struct kvm_vcpu *vcpu;
-       int i, c;
-       unsigned long *bmap;
+       int i;
         u32 val = 0;
  
         irq -= VGIC_NR_PRIVATE_IRQS;
  
-       kvm_for_each_vcpu(c, vcpu, kvm) {
-               bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]);
-               for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++)
-                       if (test_bit(irq + i, bmap))
-                               val |= 1 << (c + i * 8);
-       }
+       for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++)
+               val |= 1 << (dist->irq_spi_cpu[irq + i] + i * 8);
  
         return val;
  }
@@ -547,8 +541,12 @@ static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu,
                                 struct kvm_exit_mmio *mmio, phys_addr_t offset)
  {
         u32 val;
-       u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
-                                      vcpu->vcpu_id, offset >> 1);
+       u32 *reg;
+
+       offset >>= 1;
+       reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
+                                 vcpu->vcpu_id, offset);
+
         if (offset & 2)
                 val = *reg >> 16;
         else
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c

index 1580dd4ace4eac20b37043c2f5c882349204ed4a..bf040c4e02b332b7dd2126ae6ed2013261ecfdb4 100644 (file)
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -102,28 +102,8 @@ static bool largepages_enabled = true;
  
  bool kvm_is_mmio_pfn(pfn_t pfn)
  {
-       if (pfn_valid(pfn)) {
-               int reserved;
-               struct page *tail = pfn_to_page(pfn);
-               struct page *head = compound_trans_head(tail);
-               reserved = PageReserved(head);
-               if (head != tail) {
-                       /*
-                        * "head" is not a dangling pointer
-                        * (compound_trans_head takes care of that)
-                        * but the hugepage may have been splitted
-                        * from under us (and we may not hold a
-                        * reference count on the head page so it can
-                        * be reused before we run PageReferenced), so
-                        * we've to check PageTail before returning
-                        * what we just read.
-                        */
-                       smp_rmb();
-                       if (PageTail(tail))
-                               return reserved;
-               }
-               return PageReserved(tail);
-       }
+       if (pfn_valid(pfn))
+               return PageReserved(pfn_to_page(pfn));
  
         return true;
  }
@@ -731,7 +711,10 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
         update_memslots(slots, new, kvm->memslots->generation);
         rcu_assign_pointer(kvm->memslots, slots);
         synchronize_srcu_expedited(&kvm->srcu);
-       return old_memslots; 
+
+       kvm_arch_memslots_updated(kvm);
+
+       return old_memslots;
  }
  
  /*
@@ -1893,7 +1876,7 @@ static struct file_operations kvm_vcpu_fops = {
   */
  static int create_vcpu_fd(struct kvm_vcpu *vcpu)
  {
-       return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
+       return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
  }
  
  /*
@@ -2302,7 +2285,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
                 return ret;
         }
  
-       ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR);
+       ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
         if (ret < 0) {
                 ops->destroy(dev);
                 return ret;
@@ -2586,7 +2569,7 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
                 return r;
         }
  #endif
-       r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
+       r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR | O_CLOEXEC);
         if (r < 0)
                 kvm_put_kvm(kvm);
  
@@ -2812,11 +2795,9 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
         kfree(bus);
  }
  
-static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
+static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
+                                 const struct kvm_io_range *r2)
  {
-       const struct kvm_io_range *r1 = p1;
-       const struct kvm_io_range *r2 = p2;
-
         if (r1->addr < r2->addr)
                 return -1;
         if (r1->addr + r1->len > r2->addr + r2->len)
@@ -2824,6 +2805,11 @@ static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
         return 0;
  }
  
+static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
+{
+       return kvm_io_bus_cmp(p1, p2);
+}
+
  static int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
                           gpa_t addr, int len)
  {
@@ -2857,17 +2843,54 @@ static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
  
         off = range - bus->range;
  
-       while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0)
+       while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
                 off--;
  
         return off;
  }
  
+static int __kvm_io_bus_write(struct kvm_io_bus *bus,
+                             struct kvm_io_range *range, const void *val)
+{
+       int idx;
+
+       idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
+       if (idx < 0)
+               return -EOPNOTSUPP;
+
+       while (idx < bus->dev_count &&
+               kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
+               if (!kvm_iodevice_write(bus->range[idx].dev, range->addr,
+                                       range->len, val))
+                       return idx;
+               idx++;
+       }
+
+       return -EOPNOTSUPP;
+}
+
  /* kvm_io_bus_write - called under kvm->slots_lock */
  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                      int len, const void *val)
  {
-       int idx;
+       struct kvm_io_bus *bus;
+       struct kvm_io_range range;
+       int r;
+
+       range = (struct kvm_io_range) {
+               .addr = addr,
+               .len = len,
+       };
+
+       bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+       r = __kvm_io_bus_write(bus, &range, val);
+       return r < 0 ? r : 0;
+}
+
+/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
+int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+                           int len, const void *val, long cookie)
+{
         struct kvm_io_bus *bus;
         struct kvm_io_range range;
  
@@ -2877,14 +2900,35 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
         };
  
         bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-       idx = kvm_io_bus_get_first_dev(bus, addr, len);
+
+       /* First try the device referenced by cookie. */
+       if ((cookie >= 0) && (cookie < bus->dev_count) &&
+           (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
+               if (!kvm_iodevice_write(bus->range[cookie].dev, addr, len,
+                                       val))
+                       return cookie;
+
+       /*
+        * cookie contained garbage; fall back to search and return the
+        * correct cookie value.
+        */
+       return __kvm_io_bus_write(bus, &range, val);
+}
+
+static int __kvm_io_bus_read(struct kvm_io_bus *bus, struct kvm_io_range *range,
+                            void *val)
+{
+       int idx;
+
+       idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
         if (idx < 0)
                 return -EOPNOTSUPP;
  
         while (idx < bus->dev_count &&
-               kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
-               if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val))
-                       return 0;
+               kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
+               if (!kvm_iodevice_read(bus->range[idx].dev, range->addr,
+                                      range->len, val))
+                       return idx;
                 idx++;
         }
  
@@ -2895,9 +2939,9 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
  int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                     int len, void *val)
  {
-       int idx;
         struct kvm_io_bus *bus;
         struct kvm_io_range range;
+       int r;
  
         range = (struct kvm_io_range) {
                 .addr = addr,
@@ -2905,18 +2949,36 @@ int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
         };
  
         bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-       idx = kvm_io_bus_get_first_dev(bus, addr, len);
-       if (idx < 0)
-               return -EOPNOTSUPP;
+       r = __kvm_io_bus_read(bus, &range, val);
+       return r < 0 ? r : 0;
+}
  
-       while (idx < bus->dev_count &&
-               kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
-               if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val))
-                       return 0;
-               idx++;
-       }
+/* kvm_io_bus_read_cookie - called under kvm->slots_lock */
+int kvm_io_bus_read_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+                          int len, void *val, long cookie)
+{
+       struct kvm_io_bus *bus;
+       struct kvm_io_range range;
  
-       return -EOPNOTSUPP;
+       range = (struct kvm_io_range) {
+               .addr = addr,
+               .len = len,
+       };
+
+       bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+
+       /* First try the device referenced by cookie. */
+       if ((cookie >= 0) && (cookie < bus->dev_count) &&
+           (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
+               if (!kvm_iodevice_read(bus->range[cookie].dev, addr, len,
+                                      val))
+                       return cookie;
+
+       /*
+        * cookie contained garbage; fall back to search and return the
+        * correct cookie value.
+        */
+       return __kvm_io_bus_read(bus, &range, val);
  }
  
  /* Caller must hold slots_lock. */
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 5 Sep 2013 01:15:06 +0000 (18:15 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 5 Sep 2013 01:15:06 +0000 (18:15 -0700)
Documentation/virtual/kvm/cpuid.txt		patch | blob | blame | history
Documentation/virtual/kvm/hypercalls.txt		patch | blob | blame | history
arch/arm/configs/keystone_defconfig		patch | blob | blame | history
arch/arm/configs/omap2plus_defconfig		patch | blob | blame | history
arch/arm/configs/tegra_defconfig		patch | blob | blame | history
arch/arm/include/asm/dma-contiguous.h		patch | blob | blame | history
arch/arm/include/asm/kvm_mmu.h		patch | blob | blame | history
arch/arm/kvm/arm.c		patch | blob | blame | history
arch/arm/kvm/interrupts.S		patch | blob | blame | history
arch/arm/kvm/reset.c		patch | blob | blame | history
arch/arm/kvm/trace.h		patch | blob | blame | history
arch/arm/mm/dma-mapping.c		patch | blob | blame | history
arch/ia64/kvm/kvm-ia64.c		patch | blob | blame | history
arch/mips/kvm/kvm_locore.S		patch | blob | blame | history
arch/mips/kvm/kvm_mips.c		patch | blob | blame | history
arch/powerpc/include/asm/kvm_book3s.h		patch | blob | blame | history
arch/powerpc/include/asm/kvm_book3s_64.h		patch | blob | blame | history
arch/powerpc/include/asm/kvm_host.h		patch | blob | blame | history
arch/powerpc/include/asm/kvm_ppc.h		patch | blob | blame | history
arch/powerpc/kernel/asm-offsets.c		patch | blob | blame | history
arch/powerpc/kernel/setup_64.c		patch | blob | blame | history
arch/powerpc/kvm/Kconfig		patch | blob | blame | history
arch/powerpc/kvm/Makefile		patch | blob | blame | history
arch/powerpc/kvm/book3s_64_mmu.c		patch | blob | blame | history
arch/powerpc/kvm/book3s_64_mmu_hv.c		patch | blob | blame | history
arch/powerpc/kvm/book3s_64_vio.c		patch | blob | blame | history
arch/powerpc/kvm/book3s_emulate.c		patch | blob | blame | history
arch/powerpc/kvm/book3s_hv.c		patch | blob | blame | history
arch/powerpc/kvm/book3s_hv_builtin.c		patch | blob | blame | history
arch/powerpc/kvm/book3s_hv_cma.c	[new file with mode: 0644]	patch | blob
arch/powerpc/kvm/book3s_hv_cma.h	[new file with mode: 0644]	patch | blob
arch/powerpc/kvm/book3s_hv_rm_mmu.c		patch | blob | blame | history
arch/powerpc/kvm/book3s_hv_rmhandlers.S		patch | blob | blame | history
arch/powerpc/kvm/book3s_interrupts.S		patch | blob | blame | history
arch/powerpc/kvm/book3s_pr.c		patch | blob | blame | history
arch/powerpc/kvm/book3s_xics.c		patch | blob | blame | history
arch/powerpc/kvm/booke.c		patch | blob | blame | history
arch/powerpc/kvm/powerpc.c		patch | blob | blame | history
arch/s390/include/asm/kvm_host.h		patch | blob | blame | history
arch/s390/include/asm/mmu.h		patch | blob | blame | history
arch/s390/include/asm/mmu_context.h		patch | blob | blame | history
arch/s390/include/asm/pgtable.h		patch | blob | blame | history
arch/s390/include/asm/processor.h		patch | blob | blame | history
arch/s390/kvm/diag.c		patch | blob | blame | history
arch/s390/kvm/kvm-s390.c		patch | blob | blame | history
arch/s390/kvm/kvm-s390.h		patch | blob | blame | history
arch/s390/kvm/priv.c		patch | blob | blame | history
arch/s390/mm/pgtable.c		patch | blob | blame | history
arch/x86/include/asm/kvm_host.h		patch | blob | blame | history
arch/x86/include/asm/pvclock.h		patch | blob | blame | history
arch/x86/include/asm/vmx.h		patch | blob | blame | history
arch/x86/include/uapi/asm/vmx.h		patch | blob | blame | history
arch/x86/kernel/pvclock.c		patch | blob | blame | history
arch/x86/kvm/cpuid.c		patch | blob | blame | history
arch/x86/kvm/lapic.c		patch | blob | blame | history
arch/x86/kvm/mmu.c		patch | blob | blame | history
arch/x86/kvm/mmu.h		patch | blob | blame | history
arch/x86/kvm/paging_tmpl.h		patch | blob | blame | history
arch/x86/kvm/pmu.c		patch | blob | blame | history
arch/x86/kvm/vmx.c		patch | blob | blame | history
arch/x86/kvm/x86.c		patch | blob | blame | history
arch/x86/vdso/vclock_gettime.c		patch | blob | blame | history
drivers/base/Kconfig		patch | blob | blame | history
drivers/base/Makefile		patch | blob | blame | history
include/kvm/arm_vgic.h		patch | blob | blame | history
include/linux/dma-contiguous.h		patch | blob | blame | history
include/linux/kvm_host.h		patch | blob | blame | history
include/linux/sched.h		patch | blob | blame | history
include/uapi/linux/kvm.h		patch | blob | blame | history
kernel/sched/core.c		patch | blob | blame | history
mm/Kconfig		patch | blob | blame | history
virt/kvm/arm/vgic.c		patch | blob | blame | history
virt/kvm/kvm_main.c		patch | blob | blame | history