KVM: Disable SMAP for guests in EPT realmode and EPT unpaging mode
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3927528347510bb137e8420f4e26b07d09055483..1f68c5831924d15dd741032cde2fafc46aae50ab 100644
@@ -31,6 +31,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/slab.h>
 #include <linux/tboot.h>
+#include <linux/hrtimer.h>
 #include "kvm_cache_regs.h"
 #include "x86.h"
 
@@ -42,6 +43,7 @@
 #include <asm/i387.h>
 #include <asm/xcr.h>
 #include <asm/perf_event.h>
+#include <asm/debugreg.h>
 #include <asm/kexec.h>
 
 #include "trace.h"
@@ -110,6 +112,8 @@ module_param(nested, bool, S_IRUGO);
 
 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
 
+#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
+
 /*
  * These 2 parameters are used to config the controls for Pause-Loop Exiting:
  * ple_gap:    upper bound on the amount of time between two successive
@@ -202,6 +206,7 @@ struct __packed vmcs12 {
        u64 guest_pdptr1;
        u64 guest_pdptr2;
        u64 guest_pdptr3;
+       u64 guest_bndcfgs;
        u64 host_ia32_pat;
        u64 host_ia32_efer;
        u64 host_ia32_perf_global_ctrl;
@@ -374,6 +379,9 @@ struct nested_vmx {
         */
        struct page *apic_access_page;
        u64 msr_ia32_feature_control;
+
+       struct hrtimer preemption_timer;
+       bool preemption_timer_expired;
 };
 
 #define POSTED_INTR_ON  0
@@ -441,6 +449,7 @@ struct vcpu_vmx {
 #endif
                int           gs_ldt_reload_needed;
                int           fs_reload_needed;
+               u64           msr_host_bndcfgs;
        } host_state;
        struct {
                int vm86_active;
@@ -533,6 +542,7 @@ static const unsigned long shadow_read_write_fields[] = {
        GUEST_CS_LIMIT,
        GUEST_CS_BASE,
        GUEST_ES_BASE,
+       GUEST_BNDCFGS,
        CR0_GUEST_HOST_MASK,
        CR0_READ_SHADOW,
        CR4_READ_SHADOW,
@@ -588,6 +598,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
        FIELD64(GUEST_PDPTR1, guest_pdptr1),
        FIELD64(GUEST_PDPTR2, guest_pdptr2),
        FIELD64(GUEST_PDPTR3, guest_pdptr3),
+       FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
        FIELD64(HOST_IA32_PAT, host_ia32_pat),
        FIELD64(HOST_IA32_EFER, host_ia32_efer),
        FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
@@ -718,6 +729,7 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
+static bool vmx_mpx_supported(void);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg);
@@ -728,6 +740,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var);
 static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
+static bool vmx_mpx_supported(void);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -1047,6 +1060,12 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
        return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
+{
+       return vmcs12->pin_based_vm_exec_control &
+               PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
 static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
 {
        return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
@@ -1710,6 +1729,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
        if (is_long_mode(&vmx->vcpu))
                wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 #endif
+       if (boot_cpu_has(X86_FEATURE_MPX))
+               rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
        for (i = 0; i < vmx->save_nmsrs; ++i)
                kvm_set_shared_msr(vmx->guest_msrs[i].index,
                                   vmx->guest_msrs[i].data,
@@ -1747,6 +1768,8 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 #ifdef CONFIG_X86_64
        wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 #endif
+       if (vmx->host_state.msr_host_bndcfgs)
+               wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
        /*
         * If the FPU is not active (through the host task or
         * the guest vcpu), then restore the cr0.TS bit.
@@ -2248,9 +2271,9 @@ static __init void nested_vmx_setup_ctls_msrs(void)
         */
        nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
        nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
-               PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS |
+               PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
+       nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
                PIN_BASED_VMX_PREEMPTION_TIMER;
-       nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 
        /*
         * Exit controls
@@ -2265,15 +2288,12 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #ifdef CONFIG_X86_64
                VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
-               VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
+               VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+       nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+               VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
                VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
-       if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) ||
-           !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
-               nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
-               nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
-       }
-       nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
-               VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER);
+       if (vmx_mpx_supported())
+               nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
 
        /* entry controls */
        rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2287,6 +2307,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
                VM_ENTRY_LOAD_IA32_PAT;
        nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
                                       VM_ENTRY_LOAD_IA32_EFER);
+       if (vmx_mpx_supported())
+               nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
 
        /* cpu-based controls */
        rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2342,9 +2364,9 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 
        /* miscellaneous data */
        rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
-       nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
-               VMX_MISC_SAVE_EFER_LMA;
-       nested_vmx_misc_low |= VMX_MISC_ACTIVITY_HLT;
+       nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
+       nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
+               VMX_MISC_ACTIVITY_HLT;
        nested_vmx_misc_high = 0;
 }
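
Bits 4:0 of IA32_VMX_MISC advertise the preemption-timer rate: the timer
counts down once every 2^rate TSC cycles. Masking out the hardware rate
(VMX_MISC_PREEMPTION_TIMER_RATE_MASK) and advertising the fixed emulated
rate of 5 means L1 sees one tick per 32 TSC cycles regardless of what the
hardware timer does, since the timer is now emulated with an hrtimer. A
minimal decode sketch; the 0x65 sample value is an assumption, not from
the patch:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* assumed: SAVE_EFER_LMA (bit 5) | ACTIVITY_HLT (bit 6) | rate 5 */
		uint32_t misc_low = 0x65;
		uint32_t rate = misc_low & 0x1f;	/* bits 4:0, per the SDM */

		printf("one timer tick per %u TSC cycles\n", 1u << rate);
		return 0;	/* prints 32 */
	}
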
 
@@ -2479,6 +2501,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
        case MSR_IA32_SYSENTER_ESP:
                data = vmcs_readl(GUEST_SYSENTER_ESP);
                break;
+       case MSR_IA32_BNDCFGS:
+               if (!vmx_mpx_supported())
+                       return 1;
+               data = vmcs_read64(GUEST_BNDCFGS);
+               break;
        case MSR_IA32_FEATURE_CONTROL:
                if (!nested_vmx_allowed(vcpu))
                        return 1;
@@ -2547,6 +2574,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_SYSENTER_ESP:
                vmcs_writel(GUEST_SYSENTER_ESP, data);
                break;
+       case MSR_IA32_BNDCFGS:
+               if (!vmx_mpx_supported())
+                       return 1;
+               vmcs_write64(GUEST_BNDCFGS, data);
+               break;
        case MSR_IA32_TSC:
                kvm_write_tsc(vcpu, msr_info);
                break;
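
With the two handlers above, MSR_IA32_BNDCFGS becomes reachable through the
regular KVM_GET_MSRS/KVM_SET_MSRS ioctls, and the "return 1" paths surface
to userspace as the MSR being skipped. A hedged userspace sketch; the vcpu
fd is assumed to come from KVM_CREATE_VCPU, and error handling is trimmed:

	#include <linux/kvm.h>
	#include <stdint.h>
	#include <string.h>
	#include <sys/ioctl.h>

	#define MSR_IA32_BNDCFGS 0x00000d90

	/* Read the guest's BNDCFGS; returns 0 on success. */
	static int read_guest_bndcfgs(int vcpu_fd, uint64_t *val)
	{
		struct {
			struct kvm_msrs hdr;
			struct kvm_msr_entry entry;
		} req;

		memset(&req, 0, sizeof(req));
		req.hdr.nmsrs = 1;
		req.entry.index = MSR_IA32_BNDCFGS;

		/* KVM_GET_MSRS returns the number of MSRs actually read;
		 * anything but 1 here means vmx_get_msr() refused. */
		if (ioctl(vcpu_fd, KVM_GET_MSRS, &req) != 1)
			return -1;
		*val = req.entry.data;
		return 0;
	}
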
@@ -2832,12 +2864,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                      vmx_capability.ept, vmx_capability.vpid);
        }
 
-       min = 0;
+       min = VM_EXIT_SAVE_DEBUG_CONTROLS;
 #ifdef CONFIG_X86_64
        min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
 #endif
        opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
-               VM_EXIT_ACK_INTR_ON_EXIT;
+               VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
                                &_vmexit_control) < 0)
                return -EIO;
@@ -2853,8 +2885,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
                _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
 
-       min = 0;
-       opt = VM_ENTRY_LOAD_IA32_PAT;
+       min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
+       opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
                                &_vmentry_control) < 0)
                return -EIO;
@@ -3452,13 +3484,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                        hw_cr4 &= ~X86_CR4_PAE;
                        hw_cr4 |= X86_CR4_PSE;
                        /*
-                        * SMEP is disabled if CPU is in non-paging mode in
-                        * hardware. However KVM always uses paging mode to
+                        * SMEP/SMAP is disabled if CPU is in non-paging mode
+                        * in hardware. However KVM always uses paging mode to
                         * emulate guest non-paging mode with TDP.
-                        * To emulate this behavior, SMEP needs to be manually
-                        * disabled when guest switches to non-paging mode.
+                        * To emulate this behavior, SMEP/SMAP needs to be
+                        * manually disabled when guest switches to non-paging
+                        * mode.
                         */
-                       hw_cr4 &= ~X86_CR4_SMEP;
+                       hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
                } else if (!(cr4 & X86_CR4_PAE)) {
                        hw_cr4 &= ~X86_CR4_PAE;
                }
@@ -4223,6 +4256,10 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 {
        u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
+
+       if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
+               exec_control &= ~CPU_BASED_MOV_DR_EXITING;
+
        if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
                exec_control &= ~CPU_BASED_TPR_SHADOW;
 #ifdef CONFIG_X86_64
@@ -4496,39 +4533,28 @@ static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
                PIN_BASED_NMI_EXITING;
 }
 
-static int enable_irq_window(struct kvm_vcpu *vcpu)
+static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
        u32 cpu_based_vm_exec_control;
 
-       if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
-               /*
-                * We get here if vmx_interrupt_allowed() said we can't
-                * inject to L1 now because L2 must run. The caller will have
-                * to make L2 exit right after entry, so we can inject to L1
-                * more promptly.
-                */
-               return -EBUSY;
-
        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-       return 0;
 }
 
-static int enable_nmi_window(struct kvm_vcpu *vcpu)
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
 {
        u32 cpu_based_vm_exec_control;
 
-       if (!cpu_has_virtual_nmis())
-               return enable_irq_window(vcpu);
-
-       if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI)
-               return enable_irq_window(vcpu);
+       if (!cpu_has_virtual_nmis() ||
+           vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+               enable_irq_window(vcpu);
+               return;
+       }
 
        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-       return 0;
 }
 
 static void vmx_inject_irq(struct kvm_vcpu *vcpu)
@@ -4620,22 +4646,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 {
-       if (is_guest_mode(vcpu)) {
-               if (to_vmx(vcpu)->nested.nested_run_pending)
-                       return 0;
-               if (nested_exit_on_nmi(vcpu)) {
-                       nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
-                                         NMI_VECTOR | INTR_TYPE_NMI_INTR |
-                                         INTR_INFO_VALID_MASK, 0);
-                       /*
-                        * The NMI-triggered VM exit counts as injection:
-                        * clear this one and block further NMIs.
-                        */
-                       vcpu->arch.nmi_pending = 0;
-                       vmx_set_nmi_mask(vcpu, true);
-                       return 0;
-               }
-       }
+       if (to_vmx(vcpu)->nested.nested_run_pending)
+               return 0;
 
        if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
                return 0;
@@ -4647,19 +4659,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-       if (is_guest_mode(vcpu)) {
-               if (to_vmx(vcpu)->nested.nested_run_pending)
-                       return 0;
-               if (nested_exit_on_intr(vcpu)) {
-                       nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
-                                         0, 0);
-                       /*
-                        * fall through to normal code, but now in L1, not L2
-                        */
-               }
-       }
-
-       return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+       return (!to_vmx(vcpu)->nested.nested_run_pending &&
+               vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
                !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
                        (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
 }
@@ -5102,6 +5103,22 @@ static int handle_dr(struct kvm_vcpu *vcpu)
                }
        }
 
+       if (vcpu->guest_debug == 0) {
+               u32 cpu_based_vm_exec_control;
+
+               cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+               cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
+               vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+
+               /*
+                * No more DR vmexits; force a reload of the debug registers
+                * and reenter on this instruction.  The next vmexit will
+                * retrieve the full state of the debug registers.
+                */
+               vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+               return 1;
+       }
+
        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
        dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
        reg = DEBUG_REG_ACCESS_REG(exit_qualification);
@@ -5128,6 +5145,24 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
 {
 }
 
+static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+       u32 cpu_based_vm_exec_control;
+
+       get_debugreg(vcpu->arch.db[0], 0);
+       get_debugreg(vcpu->arch.db[1], 1);
+       get_debugreg(vcpu->arch.db[2], 2);
+       get_debugreg(vcpu->arch.db[3], 3);
+       get_debugreg(vcpu->arch.dr6, 6);
+       vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
+
+       vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
+
+       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+       cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
+       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
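
Together with the handle_dr() change above, this forms a lazy intercept:
the first MOV DR exit disables further MOV-DR exiting so the guest can
access debug registers at native speed, and the next vmexit of any kind is
expected to call this hook (via the new .sync_dirty_debug_regs op) to
snapshot the hardware values and re-arm the intercept. A toy model of the
two-state cycle, under that assumption about the call site in x86.c:

	#include <stdbool.h>
	#include <stdio.h>

	/* Toy model of the lazy MOV-DR intercept; illustration only. */
	struct vcpu_model {
		bool mov_dr_exiting;	/* CPU_BASED_MOV_DR_EXITING */
		bool wont_exit;		/* KVM_DEBUGREG_WONT_EXIT */
	};

	static void model_handle_dr(struct vcpu_model *v)
	{
		/* first MOV DR exit: stop intercepting further accesses */
		v->mov_dr_exiting = false;
		v->wont_exit = true;
	}

	static void model_sync_dirty_debug_regs(struct vcpu_model *v)
	{
		/* next vmexit: DRs are snapshotted, intercept re-armed */
		v->wont_exit = false;
		v->mov_dr_exiting = true;
	}

	int main(void)
	{
		struct vcpu_model v = { .mov_dr_exiting = true };

		model_handle_dr(&v);		 /* guest touched a DR */
		model_sync_dirty_debug_regs(&v); /* any later vmexit */
		printf("intercept re-armed: %d\n", v.mov_dr_exiting);
		return 0;
	}
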
+
 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
 {
        vmcs_writel(GUEST_DR7, val);
@@ -5727,6 +5762,18 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
         */
 }
 
+static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
+{
+       struct vcpu_vmx *vmx =
+               container_of(timer, struct vcpu_vmx, nested.preemption_timer);
+
+       vmx->nested.preemption_timer_expired = true;
+       kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
+       kvm_vcpu_kick(&vmx->vcpu);
+
+       return HRTIMER_NORESTART;
+}
+
 /*
  * Emulate the VMXON instruction.
  * Currently, we just remember that VMX is active, and do not save or even
@@ -5791,6 +5838,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
        INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
        vmx->nested.vmcs02_num = 0;
 
+       hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
+                    HRTIMER_MODE_REL);
+       vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
+
        vmx->nested.vmxon = true;
 
        skip_emulated_instruction(vcpu);
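
The callback installed here (vmx_preemption_timer_fn, above) runs in
hard-irq context, so it only records the expiry, raises KVM_REQ_EVENT, and
kicks the vcpu; the actual L2->L1 vmexit is injected later from vcpu
context by vmx_check_nested_events() (added below). The init-here/arm-later
one-shot pattern, reduced to a kernel-style sketch; my_ctx and friends are
illustrative names, not from the patch:

	#include <linux/hrtimer.h>
	#include <linux/ktime.h>
	#include <linux/types.h>

	struct my_ctx {
		struct hrtimer timer;
		bool expired;
	};

	static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
	{
		struct my_ctx *ctx = container_of(t, struct my_ctx, timer);

		ctx->expired = true;	/* record only; defer real work */
		/* ...wake the consumer (KVM: kvm_make_request + kick)... */
		return HRTIMER_NORESTART;	/* one-shot; do not re-arm */
	}

	static void my_timer_arm(struct my_ctx *ctx, u64 ns)
	{
		hrtimer_init(&ctx->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		ctx->timer.function = my_timer_fn;
		hrtimer_start(&ctx->timer, ns_to_ktime(ns), HRTIMER_MODE_REL);
	}
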
@@ -6767,9 +6818,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                 * table is L0's fault.
                 */
                return 0;
-       case EXIT_REASON_PREEMPTION_TIMER:
-               return vmcs12->pin_based_vm_exec_control &
-                       PIN_BASED_VMX_PREEMPTION_TIMER;
        case EXIT_REASON_WBINVD:
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
        case EXIT_REASON_XSETBV:
@@ -6785,27 +6833,6 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
        *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
 }
 
-static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
-{
-       u64 delta_tsc_l1;
-       u32 preempt_val_l1, preempt_val_l2, preempt_scale;
-
-       if (!(get_vmcs12(vcpu)->pin_based_vm_exec_control &
-                       PIN_BASED_VMX_PREEMPTION_TIMER))
-               return;
-       preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
-                       MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
-       preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
-       delta_tsc_l1 = vmx_read_l1_tsc(vcpu, native_read_tsc())
-               - vcpu->arch.last_guest_tsc;
-       preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
-       if (preempt_val_l2 <= preempt_val_l1)
-               preempt_val_l2 = 0;
-       else
-               preempt_val_l2 -= preempt_val_l1;
-       vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
-}
-
 /*
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
@@ -7052,6 +7079,12 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
                local_irq_enable();
 }
 
+static bool vmx_mpx_supported(void)
+{
+       return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
+               (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
+}
+
 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 {
        u32 exit_intr_info;
@@ -7218,8 +7251,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        atomic_switch_perf_msrs(vmx);
        debugctlmsr = get_debugctlmsr();
 
-       if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending)
-               nested_adjust_preemption_timer(vcpu);
        vmx->__launched = vmx->loaded_vmcs->launched;
        asm(
                /* Store host registers */
@@ -7616,6 +7647,28 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
                kvm_inject_page_fault(vcpu, fault);
 }
 
+static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
+{
+       u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (vcpu->arch.virtual_tsc_khz == 0)
+               return;
+
+       /* Make sure short timeouts reliably trigger an immediate vmexit.
+        * hrtimer_start does not guarantee this. */
+       if (preemption_timeout <= 1) {
+               vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
+               return;
+       }
+
+       preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+       preemption_timeout *= 1000000;
+       do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
+       hrtimer_start(&vmx->nested.preemption_timer,
+                     ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
+}
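
The unit conversion above goes timer ticks -> TSC cycles (the << 5 matches
the advertised emulated rate) -> nanoseconds (* 1000000 / virtual_tsc_khz,
since virtual_tsc_khz is in kHz). Worked numbers under assumed inputs:

	#include <stdint.h>
	#include <stdio.h>

	#define RATE 5	/* VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE */

	int main(void)
	{
		/* assumed: timer value 1000, 2 GHz guest TSC */
		uint64_t ticks = 1000, tsc_khz = 2000000;
		uint64_t ns = (ticks << RATE) * 1000000 / tsc_khz;

		printf("%llu ns\n", (unsigned long long)ns);
		return 0;	/* prints 16000: 32000 cycles at 2 GHz */
	}
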
+
 /*
  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -7629,7 +7682,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 exec_control;
-       u32 exit_control;
 
        vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
        vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -7687,13 +7739,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
        vmcs_write64(VMCS_LINK_POINTER, -1ull);
 
-       vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
-               (vmcs_config.pin_based_exec_ctrl |
-                vmcs12->pin_based_vm_exec_control));
+       exec_control = vmcs12->pin_based_vm_exec_control;
+       exec_control |= vmcs_config.pin_based_exec_ctrl;
+       exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+       vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
 
-       if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
-               vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
-                            vmcs12->vmx_preemption_timer_value);
+       vmx->nested.preemption_timer_expired = false;
+       if (nested_cpu_has_preemption_timer(vmcs12))
+               vmx_start_preemption_timer(vcpu);
 
        /*
         * Whether page-faults are trapped is determined by a combination of
@@ -7721,7 +7774,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                enable_ept ? vmcs12->page_fault_error_code_match : 0);
 
        if (cpu_has_secondary_exec_ctrls()) {
-               u32 exec_control = vmx_secondary_exec_control(vmx);
+               exec_control = vmx_secondary_exec_control(vmx);
                if (!vmx->rdtscp_enabled)
                        exec_control &= ~SECONDARY_EXEC_RDTSCP;
                /* Take the following fields only from vmcs12 */
@@ -7808,10 +7861,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
         * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
         * bits are further modified by vmx_set_efer() below.
         */
-       exit_control = vmcs_config.vmexit_ctrl;
-       if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
-               exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
-       vm_exit_controls_init(vmx, exit_control);
+       vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
 
        /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
         * emulated by vmx_set_efer(), below.
@@ -7830,6 +7880,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
        set_cr4_guest_host_mask(vmx);
 
+       if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
+               vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+
        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
                vmcs_write64(TSC_OFFSET,
                        vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
@@ -8155,6 +8208,58 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
        }
 }
 
+static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
+           vmx->nested.preemption_timer_expired) {
+               if (vmx->nested.nested_run_pending)
+                       return -EBUSY;
+               nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
+               return 0;
+       }
+
+       if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
+               if (vmx->nested.nested_run_pending ||
+                   vcpu->arch.interrupt.pending)
+                       return -EBUSY;
+               nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+                                 NMI_VECTOR | INTR_TYPE_NMI_INTR |
+                                 INTR_INFO_VALID_MASK, 0);
+               /*
+                * The NMI-triggered VM exit counts as injection:
+                * clear this one and block further NMIs.
+                */
+               vcpu->arch.nmi_pending = 0;
+               vmx_set_nmi_mask(vcpu, true);
+               return 0;
+       }
+
+       if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
+           nested_exit_on_intr(vcpu)) {
+               if (vmx->nested.nested_run_pending)
+                       return -EBUSY;
+               nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+       }
+
+       return 0;
+}
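
The ordering above encodes event priority for nested guests: an expired
preemption timer beats a pending NMI, which beats an external interrupt,
and -EBUSY asks the caller (assumed to be inject_pending_event() in x86.c
via the new .check_nested_events op) to retry once the in-flight L2 entry
completes. A simplified toy of the decision; the real code injects L2->L1
vmexits and also defers an NMI while an interrupt injection is pending:

	#include <stdbool.h>
	#include <stdio.h>

	enum nested_event { NONE, PREEMPT_TIMER, NMI, EXT_INTR };

	static enum nested_event pick_event(bool run_pending, bool timer_expired,
					    bool nmi_pending, bool intr_pending)
	{
		if (run_pending)
			return NONE;		/* -EBUSY in the real code */
		if (timer_expired)
			return PREEMPT_TIMER;
		if (nmi_pending)
			return NMI;
		if (intr_pending)
			return EXT_INTR;
		return NONE;
	}

	int main(void)
	{
		/* an expired timer wins over a simultaneously pending NMI */
		printf("%d\n", pick_event(false, true, true, false));
		return 0;	/* prints 1 (PREEMPT_TIMER) */
	}
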
+
+static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
+{
+       ktime_t remaining =
+               hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
+       u64 value;
+
+       if (ktime_to_ns(remaining) <= 0)
+               return 0;
+
+       value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
+       do_div(value, 1000000);
+       return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+}
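
This is the exact inverse of the scaling in vmx_start_preemption_timer():
remaining hrtimer nanoseconds back to TSC cycles, then >> 5 back to timer
ticks, so an L1 that set VM_EXIT_SAVE_VMX_PREEMPTION_TIMER reads a
consistent countdown. Round-trip check with the same assumed numbers:

	#include <stdint.h>
	#include <stdio.h>

	#define RATE 5

	int main(void)
	{
		uint64_t tsc_khz = 2000000;	/* assumed 2 GHz guest TSC */
		uint64_t ns = 16000;		/* remaining hrtimer time */
		uint64_t ticks = (ns * tsc_khz / 1000000) >> RATE;

		printf("%llu ticks\n", (unsigned long long)ticks);
		return 0;	/* prints 1000, matching the value armed above */
	}
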
+
 /*
  * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
  * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
@@ -8225,10 +8330,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        else
                vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
 
-       if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) &&
-           (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
-               vmcs12->vmx_preemption_timer_value =
-                       vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
+       if (nested_cpu_has_preemption_timer(vmcs12)) {
+               if (vmcs12->vm_exit_controls &
+                   VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
+                       vmcs12->vmx_preemption_timer_value =
+                               vmx_get_preemption_timer_value(vcpu);
+               hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
+       }
 
        /*
         * In some cases (usually, nested EPT), L2 is allowed to change its
@@ -8260,6 +8368,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
        vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
        vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+       if (vmx_mpx_supported())
+               vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
 
        /* update exit information fields: */
 
@@ -8369,6 +8479,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
        vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
        vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
 
+       /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
+       if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
+               vmcs_write64(GUEST_BNDCFGS, 0);
+
        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
                vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
                vcpu->arch.pat = vmcs12->host_ia32_pat;
@@ -8495,6 +8609,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
                nested_vmx_succeed(vcpu);
        if (enable_shadow_vmcs)
                vmx->nested.sync_shadow_vmcs = true;
+
+       /* in case we halted in L2 */
+       vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 }
 
 /*
@@ -8573,6 +8690,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .get_dr6 = vmx_get_dr6,
        .set_dr6 = vmx_set_dr6,
        .set_dr7 = vmx_set_dr7,
+       .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
        .cache_reg = vmx_cache_reg,
        .get_rflags = vmx_get_rflags,
        .set_rflags = vmx_set_rflags,
@@ -8634,6 +8752,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
        .check_intercept = vmx_check_intercept,
        .handle_external_intr = vmx_handle_external_intr,
+       .mpx_supported = vmx_mpx_supported,
+
+       .check_nested_events = vmx_check_nested_events,
 };
 
 static int __init vmx_init(void)
@@ -8721,6 +8842,8 @@ static int __init vmx_init(void)
        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+       vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
+
        memcpy(vmx_msr_bitmap_legacy_x2apic,
                        vmx_msr_bitmap_legacy, PAGE_SIZE);
        memcpy(vmx_msr_bitmap_longmode_x2apic,