summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/hyperv/hv_init.c45
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h (renamed from arch/x86/include/uapi/asm/hyperv.h)299
-rw-r--r--arch/x86/include/asm/kvm_host.h54
-rw-r--r--arch/x86/include/asm/kvm_para.h6
-rw-r--r--arch/x86/include/asm/mshyperv.h94
-rw-r--r--arch/x86/include/asm/msr-index.h14
-rw-r--r--arch/x86/include/asm/processor.h10
-rw-r--r--arch/x86/include/asm/svm.h3
-rw-r--r--arch/x86/include/uapi/asm/kvm.h19
-rw-r--r--arch/x86/include/uapi/asm/kvm_para.h9
-rw-r--r--arch/x86/kernel/cpu/common.c3
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c22
-rw-r--r--arch/x86/kernel/kvm.c18
-rw-r--r--arch/x86/kernel/process_64.c14
-rw-r--r--arch/x86/kvm/cpuid.c7
-rw-r--r--arch/x86/kvm/emulate.c27
-rw-r--r--arch/x86/kvm/hyperv.c192
-rw-r--r--arch/x86/kvm/hyperv.h4
-rw-r--r--arch/x86/kvm/irq.c26
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h7
-rw-r--r--arch/x86/kvm/lapic.c10
-rw-r--r--arch/x86/kvm/lapic.h2
-rw-r--r--arch/x86/kvm/mmu.c2
-rw-r--r--arch/x86/kvm/paging_tmpl.h11
-rw-r--r--arch/x86/kvm/pmu.c37
-rw-r--r--arch/x86/kvm/pmu.h6
-rw-r--r--arch/x86/kvm/pmu_amd.c142
-rw-r--r--arch/x86/kvm/svm.c312
-rw-r--r--arch/x86/kvm/vmx.c1091
-rw-r--r--arch/x86/kvm/vmx_evmcs.h324
-rw-r--r--arch/x86/kvm/x86.c374
-rw-r--r--arch/x86/kvm/x86.h86
32 files changed, 2588 insertions, 682 deletions
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 2edc49e7409b..cfecc2272f2d 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -21,7 +21,7 @@
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/hypervisor.h>
-#include <asm/hyperv.h>
+#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
#include <linux/version.h>
#include <linux/vmalloc.h>
@@ -88,11 +88,15 @@ EXPORT_SYMBOL_GPL(hyperv_cs);
u32 *hv_vp_index;
EXPORT_SYMBOL_GPL(hv_vp_index);
+struct hv_vp_assist_page **hv_vp_assist_page;
+EXPORT_SYMBOL_GPL(hv_vp_assist_page);
+
u32 hv_max_vp_index;
static int hv_cpu_init(unsigned int cpu)
{
u64 msr_vp_index;
+ struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()];
hv_get_vp_index(msr_vp_index);
@@ -101,6 +105,22 @@ static int hv_cpu_init(unsigned int cpu)
if (msr_vp_index > hv_max_vp_index)
hv_max_vp_index = msr_vp_index;
+ if (!hv_vp_assist_page)
+ return 0;
+
+ if (!*hvp)
+ *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL);
+
+ if (*hvp) {
+ u64 val;
+
+ val = vmalloc_to_pfn(*hvp);
+ val = (val << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) |
+ HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
+
+ wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, val);
+ }
+
return 0;
}
@@ -198,6 +218,9 @@ static int hv_cpu_die(unsigned int cpu)
struct hv_reenlightenment_control re_ctrl;
unsigned int new_cpu;
+ if (hv_vp_assist_page && hv_vp_assist_page[cpu])
+ wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0);
+
if (hv_reenlightenment_cb == NULL)
return 0;
@@ -224,6 +247,7 @@ void hyperv_init(void)
{
u64 guest_id, required_msrs;
union hv_x64_msr_hypercall_contents hypercall_msr;
+ int cpuhp;
if (x86_hyper_type != X86_HYPER_MS_HYPERV)
return;
@@ -241,9 +265,17 @@ void hyperv_init(void)
if (!hv_vp_index)
return;
- if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online",
- hv_cpu_init, hv_cpu_die) < 0)
+ hv_vp_assist_page = kcalloc(num_possible_cpus(),
+ sizeof(*hv_vp_assist_page), GFP_KERNEL);
+ if (!hv_vp_assist_page) {
+ ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
goto free_vp_index;
+ }
+
+ cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online",
+ hv_cpu_init, hv_cpu_die);
+ if (cpuhp < 0)
+ goto free_vp_assist_page;
/*
* Setup the hypercall page and enable hypercalls.
@@ -256,7 +288,7 @@ void hyperv_init(void)
hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX);
if (hv_hypercall_pg == NULL) {
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
- goto free_vp_index;
+ goto remove_cpuhp_state;
}
rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
@@ -304,6 +336,11 @@ register_msr_cs:
return;
+remove_cpuhp_state:
+ cpuhp_remove_state(cpuhp);
+free_vp_assist_page:
+ kfree(hv_vp_assist_page);
+ hv_vp_assist_page = NULL;
free_vp_index:
kfree(hv_vp_index);
hv_vp_index = NULL;
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/asm/hyperv-tlfs.h
index 6c0c3a3b631c..416cb0e0c496 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -1,6 +1,13 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_X86_HYPERV_H
-#define _ASM_X86_HYPERV_H
+
+/*
+ * This file contains definitions from Hyper-V Hypervisor Top-Level Functional
+ * Specification (TLFS):
+ * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs
+ */
+
+#ifndef _ASM_X86_HYPERV_TLFS_H
+#define _ASM_X86_HYPERV_TLFS_H
#include <linux/types.h>
@@ -14,6 +21,7 @@
#define HYPERV_CPUID_FEATURES 0x40000003
#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
+#define HYPERV_CPUID_NESTED_FEATURES 0x4000000A
#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000
#define HYPERV_CPUID_MIN 0x40000005
@@ -159,6 +167,9 @@
/* Recommend using the newer ExProcessorMasks interface */
#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11)
+/* Recommend using enlightened VMCS */
+#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED (1 << 14)
+
/*
* Crash notification flag.
*/
@@ -192,7 +203,7 @@
#define HV_X64_MSR_EOI 0x40000070
#define HV_X64_MSR_ICR 0x40000071
#define HV_X64_MSR_TPR 0x40000072
-#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073
+#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073
/* Define synthetic interrupt controller model specific registers. */
#define HV_X64_MSR_SCONTROL 0x40000080
@@ -240,6 +251,55 @@
#define HV_X64_MSR_CRASH_PARAMS \
(1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0))
+/*
+ * Declare the MSR used to setup pages used to communicate with the hypervisor.
+ */
+union hv_x64_msr_hypercall_contents {
+ u64 as_uint64;
+ struct {
+ u64 enable:1;
+ u64 reserved:11;
+ u64 guest_physical_address:52;
+ };
+};
+
+/*
+ * TSC page layout.
+ */
+struct ms_hyperv_tsc_page {
+ volatile u32 tsc_sequence;
+ u32 reserved1;
+ volatile u64 tsc_scale;
+ volatile s64 tsc_offset;
+ u64 reserved2[509];
+};
+
+/*
+ * The guest OS needs to register the guest ID with the hypervisor.
+ * The guest ID is a 64 bit entity and the structure of this ID is
+ * specified in the Hyper-V specification:
+ *
+ * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx
+ *
+ * While the current guideline does not specify how Linux guest ID(s)
+ * need to be generated, our plan is to publish the guidelines for
+ * Linux and other guest operating systems that currently are hosted
+ * on Hyper-V. The implementation here conforms to this yet
+ * unpublished guidelines.
+ *
+ *
+ * Bit(s)
+ * 63 - Indicates if the OS is Open Source or not; 1 is Open Source
+ * 62:56 - Os Type; Linux is 0x100
+ * 55:48 - Distro specific identification
+ * 47:16 - Linux kernel version number
+ * 15:0 - Distro specific identification
+ *
+ *
+ */
+
+#define HV_LINUX_VENDOR_ID 0x8100
+
/* TSC emulation after migration */
#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106
@@ -278,10 +338,13 @@ struct hv_tsc_emulation_status {
#define HVCALL_POST_MESSAGE 0x005c
#define HVCALL_SIGNAL_EVENT 0x005d
-#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001
-#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12
-#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \
- (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
+#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001
+#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
+
+/* Hyper-V Enlightened VMCS version mask in nested features CPUID */
+#define HV_X64_ENLIGHTENED_VMCS_VERSION 0xff
#define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001
#define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12
@@ -301,12 +364,22 @@ enum HV_GENERIC_SET_FORMAT {
HV_GENERIC_SET_ALL,
};
+#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0)
+#define HV_HYPERCALL_FAST_BIT BIT(16)
+#define HV_HYPERCALL_VARHEAD_OFFSET 17
+#define HV_HYPERCALL_REP_COMP_OFFSET 32
+#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32)
+#define HV_HYPERCALL_REP_START_OFFSET 48
+#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48)
+
/* hypercall status code */
#define HV_STATUS_SUCCESS 0
#define HV_STATUS_INVALID_HYPERCALL_CODE 2
#define HV_STATUS_INVALID_HYPERCALL_INPUT 3
#define HV_STATUS_INVALID_ALIGNMENT 4
+#define HV_STATUS_INVALID_PARAMETER 5
#define HV_STATUS_INSUFFICIENT_MEMORY 11
+#define HV_STATUS_INVALID_PORT_ID 17
#define HV_STATUS_INVALID_CONNECTION_ID 18
#define HV_STATUS_INSUFFICIENT_BUFFERS 19
@@ -321,6 +394,8 @@ typedef struct _HV_REFERENCE_TSC_PAGE {
#define HV_SYNIC_SINT_COUNT (16)
/* Define the expected SynIC version. */
#define HV_SYNIC_VERSION_1 (0x1)
+/* Valid SynIC vectors are 16-255. */
+#define HV_SYNIC_FIRST_VALID_VECTOR (16)
#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0)
#define HV_SYNIC_SIMP_ENABLE (1ULL << 0)
@@ -415,6 +490,216 @@ struct hv_timer_message_payload {
__u64 delivery_time; /* When the message was delivered */
};
+/* Define virtual processor assist page structure. */
+struct hv_vp_assist_page {
+ __u32 apic_assist;
+ __u32 reserved;
+ __u64 vtl_control[2];
+ __u64 nested_enlightenments_control[2];
+ __u32 enlighten_vmentry;
+ __u64 current_nested_vmcs;
+};
+
+struct hv_enlightened_vmcs {
+ u32 revision_id;
+ u32 abort;
+
+ u16 host_es_selector;
+ u16 host_cs_selector;
+ u16 host_ss_selector;
+ u16 host_ds_selector;
+ u16 host_fs_selector;
+ u16 host_gs_selector;
+ u16 host_tr_selector;
+
+ u64 host_ia32_pat;
+ u64 host_ia32_efer;
+
+ u64 host_cr0;
+ u64 host_cr3;
+ u64 host_cr4;
+
+ u64 host_ia32_sysenter_esp;
+ u64 host_ia32_sysenter_eip;
+ u64 host_rip;
+ u32 host_ia32_sysenter_cs;
+
+ u32 pin_based_vm_exec_control;
+ u32 vm_exit_controls;
+ u32 secondary_vm_exec_control;
+
+ u64 io_bitmap_a;
+ u64 io_bitmap_b;
+ u64 msr_bitmap;
+
+ u16 guest_es_selector;
+ u16 guest_cs_selector;
+ u16 guest_ss_selector;
+ u16 guest_ds_selector;
+ u16 guest_fs_selector;
+ u16 guest_gs_selector;
+ u16 guest_ldtr_selector;
+ u16 guest_tr_selector;
+
+ u32 guest_es_limit;
+ u32 guest_cs_limit;
+ u32 guest_ss_limit;
+ u32 guest_ds_limit;
+ u32 guest_fs_limit;
+ u32 guest_gs_limit;
+ u32 guest_ldtr_limit;
+ u32 guest_tr_limit;
+ u32 guest_gdtr_limit;
+ u32 guest_idtr_limit;
+
+ u32 guest_es_ar_bytes;
+ u32 guest_cs_ar_bytes;
+ u32 guest_ss_ar_bytes;
+ u32 guest_ds_ar_bytes;
+ u32 guest_fs_ar_bytes;
+ u32 guest_gs_ar_bytes;
+ u32 guest_ldtr_ar_bytes;
+ u32 guest_tr_ar_bytes;
+
+ u64 guest_es_base;
+ u64 guest_cs_base;
+ u64 guest_ss_base;
+ u64 guest_ds_base;
+ u64 guest_fs_base;
+ u64 guest_gs_base;
+ u64 guest_ldtr_base;
+ u64 guest_tr_base;
+ u64 guest_gdtr_base;
+ u64 guest_idtr_base;
+
+ u64 padding64_1[3];
+
+ u64 vm_exit_msr_store_addr;
+ u64 vm_exit_msr_load_addr;
+ u64 vm_entry_msr_load_addr;
+
+ u64 cr3_target_value0;
+ u64 cr3_target_value1;
+ u64 cr3_target_value2;
+ u64 cr3_target_value3;
+
+ u32 page_fault_error_code_mask;
+ u32 page_fault_error_code_match;
+
+ u32 cr3_target_count;
+ u32 vm_exit_msr_store_count;
+ u32 vm_exit_msr_load_count;
+ u32 vm_entry_msr_load_count;
+
+ u64 tsc_offset;
+ u64 virtual_apic_page_addr;
+ u64 vmcs_link_pointer;
+
+ u64 guest_ia32_debugctl;
+ u64 guest_ia32_pat;
+ u64 guest_ia32_efer;
+
+ u64 guest_pdptr0;
+ u64 guest_pdptr1;
+ u64 guest_pdptr2;
+ u64 guest_pdptr3;
+
+ u64 guest_pending_dbg_exceptions;
+ u64 guest_sysenter_esp;
+ u64 guest_sysenter_eip;
+
+ u32 guest_activity_state;
+ u32 guest_sysenter_cs;
+
+ u64 cr0_guest_host_mask;
+ u64 cr4_guest_host_mask;
+ u64 cr0_read_shadow;
+ u64 cr4_read_shadow;
+ u64 guest_cr0;
+ u64 guest_cr3;
+ u64 guest_cr4;
+ u64 guest_dr7;
+
+ u64 host_fs_base;
+ u64 host_gs_base;
+ u64 host_tr_base;
+ u64 host_gdtr_base;
+ u64 host_idtr_base;
+ u64 host_rsp;
+
+ u64 ept_pointer;
+
+ u16 virtual_processor_id;
+ u16 padding16[3];
+
+ u64 padding64_2[5];
+ u64 guest_physical_address;
+
+ u32 vm_instruction_error;
+ u32 vm_exit_reason;
+ u32 vm_exit_intr_info;
+ u32 vm_exit_intr_error_code;
+ u32 idt_vectoring_info_field;
+ u32 idt_vectoring_error_code;
+ u32 vm_exit_instruction_len;
+ u32 vmx_instruction_info;
+
+ u64 exit_qualification;
+ u64 exit_io_instruction_ecx;
+ u64 exit_io_instruction_esi;
+ u64 exit_io_instruction_edi;
+ u64 exit_io_instruction_eip;
+
+ u64 guest_linear_address;
+ u64 guest_rsp;
+ u64 guest_rflags;
+
+ u32 guest_interruptibility_info;
+ u32 cpu_based_vm_exec_control;
+ u32 exception_bitmap;
+ u32 vm_entry_controls;
+ u32 vm_entry_intr_info_field;
+ u32 vm_entry_exception_error_code;
+ u32 vm_entry_instruction_len;
+ u32 tpr_threshold;
+
+ u64 guest_rip;
+
+ u32 hv_clean_fields;
+ u32 hv_padding_32;
+ u32 hv_synthetic_controls;
+ u32 hv_enlightenments_control;
+ u32 hv_vp_id;
+
+ u64 hv_vm_id;
+ u64 partition_assist_page;
+ u64 padding64_4[4];
+ u64 guest_bndcfgs;
+ u64 padding64_5[7];
+ u64 xss_exit_bitmap;
+ u64 padding64_6[7];
+};
+
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP BIT(1)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2 BIT(2)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1 BIT(3)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC BIT(4)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT BIT(5)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY BIT(6)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN BIT(7)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR BIT(8)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT BIT(9)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC BIT(10)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1 BIT(11)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2 BIT(12)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER BIT(13)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1 BIT(14)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL BIT(15)
+
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF
+
#define HV_STIMER_ENABLE (1ULL << 0)
#define HV_STIMER_PERIODIC (1ULL << 1)
#define HV_STIMER_LAZY (1ULL << 2)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b605a5b6a30c..949c977bc4c9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -34,6 +34,7 @@
#include <asm/msr-index.h>
#include <asm/asm.h>
#include <asm/kvm_page_track.h>
+#include <asm/hyperv-tlfs.h>
#define KVM_MAX_VCPUS 288
#define KVM_SOFT_MAX_VCPUS 240
@@ -73,6 +74,7 @@
#define KVM_REQ_HV_RESET KVM_ARCH_REQ(20)
#define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21)
#define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22)
+#define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -498,6 +500,7 @@ struct kvm_vcpu_arch {
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
bool apicv_active;
+ bool load_eoi_exitmap_pending;
DECLARE_BITMAP(ioapic_handled_vectors, 256);
unsigned long apic_attention;
int32_t apic_arb_prio;
@@ -571,7 +574,7 @@ struct kvm_vcpu_arch {
} exception;
struct kvm_queued_interrupt {
- bool pending;
+ bool injected;
bool soft;
u8 nr;
} interrupt;
@@ -754,6 +757,12 @@ struct kvm_hv {
u64 hv_crash_ctl;
HV_REFERENCE_TSC_PAGE tsc_ref;
+
+ struct idr conn_to_evt;
+
+ u64 hv_reenlightenment_control;
+ u64 hv_tsc_emulation_control;
+ u64 hv_tsc_emulation_status;
};
enum kvm_irqchip_mode {
@@ -762,15 +771,6 @@ enum kvm_irqchip_mode {
KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */
};
-struct kvm_sev_info {
- bool active; /* SEV enabled guest */
- unsigned int asid; /* ASID used for this guest */
- unsigned int handle; /* SEV firmware handle */
- int fd; /* SEV device fd */
- unsigned long pages_locked; /* Number of pages locked */
- struct list_head regions_list; /* List of registered regions */
-};
-
struct kvm_arch {
unsigned int n_used_mmu_pages;
unsigned int n_requested_mmu_pages;
@@ -800,13 +800,13 @@ struct kvm_arch {
struct mutex apic_map_lock;
struct kvm_apic_map *apic_map;
- unsigned int tss_addr;
bool apic_access_page_done;
gpa_t wall_clock;
- bool ept_identity_pagetable_done;
- gpa_t ept_identity_map_addr;
+ bool mwait_in_guest;
+ bool hlt_in_guest;
+ bool pause_in_guest;
unsigned long irq_sources_bitmap;
s64 kvmclock_offset;
@@ -849,17 +849,8 @@ struct kvm_arch {
bool disabled_lapic_found;
- /* Struct members for AVIC */
- u32 avic_vm_id;
- u32 ldr_mode;
- struct page *avic_logical_id_table_page;
- struct page *avic_physical_id_table_page;
- struct hlist_node hnode;
-
bool x2apic_format;
bool x2apic_broadcast_quirk_disabled;
-
- struct kvm_sev_info sev_info;
};
struct kvm_vm_stat {
@@ -936,6 +927,8 @@ struct kvm_x86_ops {
bool (*cpu_has_high_real_mode_segbase)(void);
void (*cpuid_update)(struct kvm_vcpu *vcpu);
+ struct kvm *(*vm_alloc)(void);
+ void (*vm_free)(struct kvm *);
int (*vm_init)(struct kvm *kvm);
void (*vm_destroy)(struct kvm *kvm);
@@ -1007,6 +1000,7 @@ struct kvm_x86_ops {
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
+ int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
int (*get_tdp_level)(struct kvm_vcpu *vcpu);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
int (*get_lpage_level)(void);
@@ -1109,6 +1103,17 @@ struct kvm_arch_async_pf {
extern struct kvm_x86_ops *kvm_x86_ops;
+#define __KVM_HAVE_ARCH_VM_ALLOC
+static inline struct kvm *kvm_arch_alloc_vm(void)
+{
+ return kvm_x86_ops->vm_alloc();
+}
+
+static inline void kvm_arch_free_vm(struct kvm *kvm)
+{
+ return kvm_x86_ops->vm_free(kvm);
+}
+
int kvm_mmu_module_init(void);
void kvm_mmu_module_exit(void);
@@ -1187,6 +1192,8 @@ enum emulation_result {
#define EMULTYPE_SKIP (1 << 2)
#define EMULTYPE_RETRY (1 << 3)
#define EMULTYPE_NO_REEXECUTE (1 << 4)
+#define EMULTYPE_NO_UD_ON_FAIL (1 << 5)
+#define EMULTYPE_VMWARE (1 << 6)
int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
int emulation_type, void *insn, int insn_len);
@@ -1204,8 +1211,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
struct x86_emulate_ctxt;
-int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
-int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port);
+int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
int kvm_vcpu_halt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 7b407dda2bd7..3aea2658323a 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -88,6 +88,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
#ifdef CONFIG_KVM_GUEST
bool kvm_para_available(void);
unsigned int kvm_arch_para_features(void);
+unsigned int kvm_arch_para_hints(void);
void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
void kvm_async_pf_task_wake(u32 token);
u32 kvm_read_and_reset_pf_reason(void);
@@ -115,6 +116,11 @@ static inline unsigned int kvm_arch_para_features(void)
return 0;
}
+static inline unsigned int kvm_arch_para_hints(void)
+{
+ return 0;
+}
+
static inline u32 kvm_read_and_reset_pf_reason(void)
{
return 0;
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index e73c4d0c06ad..b90e79610cf7 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -6,90 +6,23 @@
#include <linux/atomic.h>
#include <linux/nmi.h>
#include <asm/io.h>
-#include <asm/hyperv.h>
+#include <asm/hyperv-tlfs.h>
#include <asm/nospec-branch.h>
-/*
- * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
- * is set by CPUID(HVCPUID_VERSION_FEATURES).
- */
-enum hv_cpuid_function {
- HVCPUID_VERSION_FEATURES = 0x00000001,
- HVCPUID_VENDOR_MAXFUNCTION = 0x40000000,
- HVCPUID_INTERFACE = 0x40000001,
-
- /*
- * The remaining functions depend on the value of
- * HVCPUID_INTERFACE
- */
- HVCPUID_VERSION = 0x40000002,
- HVCPUID_FEATURES = 0x40000003,
- HVCPUID_ENLIGHTENMENT_INFO = 0x40000004,
- HVCPUID_IMPLEMENTATION_LIMITS = 0x40000005,
-};
-
struct ms_hyperv_info {
u32 features;
u32 misc_features;
u32 hints;
+ u32 nested_features;
u32 max_vp_index;
u32 max_lp_index;
};
extern struct ms_hyperv_info ms_hyperv;
-/*
- * Declare the MSR used to setup pages used to communicate with the hypervisor.
- */
-union hv_x64_msr_hypercall_contents {
- u64 as_uint64;
- struct {
- u64 enable:1;
- u64 reserved:11;
- u64 guest_physical_address:52;
- };
-};
/*
- * TSC page layout.
- */
-
-struct ms_hyperv_tsc_page {
- volatile u32 tsc_sequence;
- u32 reserved1;
- volatile u64 tsc_scale;
- volatile s64 tsc_offset;
- u64 reserved2[509];
-};
-
-/*
- * The guest OS needs to register the guest ID with the hypervisor.
- * The guest ID is a 64 bit entity and the structure of this ID is
- * specified in the Hyper-V specification:
- *
- * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx
- *
- * While the current guideline does not specify how Linux guest ID(s)
- * need to be generated, our plan is to publish the guidelines for
- * Linux and other guest operating systems that currently are hosted
- * on Hyper-V. The implementation here conforms to this yet
- * unpublished guidelines.
- *
- *
- * Bit(s)
- * 63 - Indicates if the OS is Open Source or not; 1 is Open Source
- * 62:56 - Os Type; Linux is 0x100
- * 55:48 - Distro specific identification
- * 47:16 - Linux kernel version number
- * 15:0 - Distro specific identification
- *
- *
- */
-
-#define HV_LINUX_VENDOR_ID 0x8100
-
-/*
- * Generate the guest ID based on the guideline described above.
+ * Generate the guest ID.
*/
static inline __u64 generate_guest_id(__u64 d_info1, __u64 kernel_version,
@@ -228,14 +161,6 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
return hv_status;
}
-#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0)
-#define HV_HYPERCALL_FAST_BIT BIT(16)
-#define HV_HYPERCALL_VARHEAD_OFFSET 17
-#define HV_HYPERCALL_REP_COMP_OFFSET 32
-#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32)
-#define HV_HYPERCALL_REP_START_OFFSET 48
-#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48)
-
/* Fast hypercall with 8 bytes of input and no output */
static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
{
@@ -307,6 +232,15 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
*/
extern u32 *hv_vp_index;
extern u32 hv_max_vp_index;
+extern struct hv_vp_assist_page **hv_vp_assist_page;
+
+static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
+{
+ if (!hv_vp_assist_page)
+ return NULL;
+
+ return hv_vp_assist_page[cpu];
+}
/**
* hv_cpu_number_to_vp_number() - Map CPU to VP.
@@ -343,6 +277,10 @@ static inline void hyperv_setup_mmu_ops(void) {}
static inline void set_hv_tscchange_cb(void (*cb)(void)) {}
static inline void clear_hv_tscchange_cb(void) {}
static inline void hyperv_stop_tsc_emulation(void) {};
+static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
+{
+ return NULL;
+}
#endif /* CONFIG_HYPERV */
#ifdef CONFIG_HYPERV_TSCPAGE
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index c9084dedfcfa..53d5b1b9255e 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -353,7 +353,21 @@
/* Fam 15h MSRs */
#define MSR_F15H_PERF_CTL 0xc0010200
+#define MSR_F15H_PERF_CTL0 MSR_F15H_PERF_CTL
+#define MSR_F15H_PERF_CTL1 (MSR_F15H_PERF_CTL + 2)
+#define MSR_F15H_PERF_CTL2 (MSR_F15H_PERF_CTL + 4)
+#define MSR_F15H_PERF_CTL3 (MSR_F15H_PERF_CTL + 6)
+#define MSR_F15H_PERF_CTL4 (MSR_F15H_PERF_CTL + 8)
+#define MSR_F15H_PERF_CTL5 (MSR_F15H_PERF_CTL + 10)
+
#define MSR_F15H_PERF_CTR 0xc0010201
+#define MSR_F15H_PERF_CTR0 MSR_F15H_PERF_CTR
+#define MSR_F15H_PERF_CTR1 (MSR_F15H_PERF_CTR + 2)
+#define MSR_F15H_PERF_CTR2 (MSR_F15H_PERF_CTR + 4)
+#define MSR_F15H_PERF_CTR3 (MSR_F15H_PERF_CTR + 6)
+#define MSR_F15H_PERF_CTR4 (MSR_F15H_PERF_CTR + 8)
+#define MSR_F15H_PERF_CTR5 (MSR_F15H_PERF_CTR + 10)
+
#define MSR_F15H_NB_PERF_CTL 0xc0010240
#define MSR_F15H_NB_PERF_CTR 0xc0010241
#define MSR_F15H_PTSC 0xc0010280
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b0ccd4847a58..4fa4206029e3 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -407,9 +407,19 @@ union irq_stack_union {
DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible;
DECLARE_INIT_PER_CPU(irq_stack_union);
+static inline unsigned long cpu_kernelmode_gs_base(int cpu)
+{
+ return (unsigned long)per_cpu(irq_stack_union.gs_base, cpu);
+}
+
DECLARE_PER_CPU(char *, irq_stack_ptr);
DECLARE_PER_CPU(unsigned int, irq_count);
extern asmlinkage void ignore_sysret(void);
+
+#if IS_ENABLED(CONFIG_KVM)
+/* Save actual FS/GS selectors and bases to current->thread */
+void save_fsgs_for_kvm(void);
+#endif
#else /* X86_64 */
#ifdef CONFIG_CC_STACKPROTECTOR
/*
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 0487ac054870..93b462e48067 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -60,7 +60,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u32 intercept_dr;
u32 intercept_exceptions;
u64 intercept;
- u8 reserved_1[42];
+ u8 reserved_1[40];
+ u16 pause_filter_thresh;
u16 pause_filter_count;
u64 iopm_base_pa;
u64 msrpm_base_pa;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index f3a960488eae..c535c2fdea13 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -354,8 +354,25 @@ struct kvm_xcrs {
__u64 padding[16];
};
-/* definition of registers in kvm_run */
+#define KVM_SYNC_X86_REGS (1UL << 0)
+#define KVM_SYNC_X86_SREGS (1UL << 1)
+#define KVM_SYNC_X86_EVENTS (1UL << 2)
+
+#define KVM_SYNC_X86_VALID_FIELDS \
+ (KVM_SYNC_X86_REGS| \
+ KVM_SYNC_X86_SREGS| \
+ KVM_SYNC_X86_EVENTS)
+
+/* kvm_sync_regs struct included by kvm_run struct */
struct kvm_sync_regs {
+ /* Members of this structure are potentially malicious.
+ * Care must be taken by code reading, esp. interpreting,
+ * data fields from them inside KVM to prevent TOCTOU and
+ * double-fetch types of vulnerabilities.
+ */
+ struct kvm_regs regs;
+ struct kvm_sregs sregs;
+ struct kvm_vcpu_events events;
};
#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 6cfa9c8cb7d6..4c851ebb3ceb 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -3,15 +3,16 @@
#define _UAPI_ASM_X86_KVM_PARA_H
#include <linux/types.h>
-#include <asm/hyperv.h>
/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
* should be used to determine that a VM is running under KVM.
*/
#define KVM_CPUID_SIGNATURE 0x40000000
-/* This CPUID returns a feature bitmap in eax. Before enabling a particular
- * paravirtualization, the appropriate feature bit should be checked.
+/* This CPUID returns two feature bitmaps in eax, edx. Before enabling
+ * a particular paravirtualization, the appropriate feature bit should
+ * be checked in eax. The performance hint feature bit should be checked
+ * in edx.
*/
#define KVM_CPUID_FEATURES 0x40000001
#define KVM_FEATURE_CLOCKSOURCE 0
@@ -28,6 +29,8 @@
#define KVM_FEATURE_PV_TLB_FLUSH 9
#define KVM_FEATURE_ASYNC_PF_VMEXIT 10
+#define KVM_HINTS_DEDICATED 0
+
/* The last 8 bits are used to indicate how to interpret the flags field
* in pvclock structure. If no bits are set, all flags are ignored.
*/
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 348cf4821240..4702fbd98f92 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -487,7 +487,7 @@ void load_percpu_segment(int cpu)
loadsegment(fs, __KERNEL_PERCPU);
#else
__loadsegment_simple(gs, 0);
- wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
+ wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
#endif
load_stack_canary_segment();
}
@@ -1398,6 +1398,7 @@ __setup("clearcpuid=", setup_clearcpuid);
#ifdef CONFIG_X86_64
DEFINE_PER_CPU_FIRST(union irq_stack_union,
irq_stack_union) __aligned(PAGE_SIZE) __visible;
+EXPORT_PER_CPU_SYMBOL_GPL(irq_stack_union);
/*
* The following percpu variables are hot. Align current_task to
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 4488cf0dd499..031082c96db8 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -22,7 +22,7 @@
#include <linux/kexec.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
-#include <asm/hyperv.h>
+#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
#include <asm/desc.h>
#include <asm/irq_regs.h>
@@ -216,8 +216,8 @@ static void __init ms_hyperv_init_platform(void)
pr_info("Hyper-V: features 0x%x, hints 0x%x\n",
ms_hyperv.features, ms_hyperv.hints);
- ms_hyperv.max_vp_index = cpuid_eax(HVCPUID_IMPLEMENTATION_LIMITS);
- ms_hyperv.max_lp_index = cpuid_ebx(HVCPUID_IMPLEMENTATION_LIMITS);
+ ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
+ ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n",
ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
@@ -225,11 +225,12 @@ static void __init ms_hyperv_init_platform(void)
/*
* Extract host information.
*/
- if (cpuid_eax(HVCPUID_VENDOR_MAXFUNCTION) >= HVCPUID_VERSION) {
- hv_host_info_eax = cpuid_eax(HVCPUID_VERSION);
- hv_host_info_ebx = cpuid_ebx(HVCPUID_VERSION);
- hv_host_info_ecx = cpuid_ecx(HVCPUID_VERSION);
- hv_host_info_edx = cpuid_edx(HVCPUID_VERSION);
+ if (cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS) >=
+ HYPERV_CPUID_VERSION) {
+ hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION);
+ hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION);
+ hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION);
+ hv_host_info_edx = cpuid_edx(HYPERV_CPUID_VERSION);
pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n",
hv_host_info_eax, hv_host_info_ebx >> 16,
@@ -243,6 +244,11 @@ static void __init ms_hyperv_init_platform(void)
x86_platform.calibrate_cpu = hv_get_tsc_khz;
}
+ if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) {
+ ms_hyperv.nested_features =
+ cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
+ }
+
#ifdef CONFIG_X86_LOCAL_APIC
if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index fae86e36e399..7867417cfaff 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -454,6 +454,13 @@ static void __init sev_map_percpu_data(void)
}
#ifdef CONFIG_SMP
+static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
+{
+ native_smp_prepare_cpus(max_cpus);
+ if (kvm_para_has_hint(KVM_HINTS_DEDICATED))
+ static_branch_disable(&virt_spin_lock_key);
+}
+
static void __init kvm_smp_prepare_boot_cpu(void)
{
/*
@@ -546,6 +553,7 @@ static void __init kvm_guest_init(void)
}
if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
+ !kvm_para_has_hint(KVM_HINTS_DEDICATED) &&
kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
@@ -556,6 +564,7 @@ static void __init kvm_guest_init(void)
kvm_setup_vsyscall_timeinfo();
#ifdef CONFIG_SMP
+ smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
kvm_cpu_online, kvm_cpu_down_prepare) < 0)
@@ -605,6 +614,11 @@ unsigned int kvm_arch_para_features(void)
return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
}
+unsigned int kvm_arch_para_hints(void)
+{
+ return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES);
+}
+
static uint32_t __init kvm_detect(void)
{
return kvm_cpuid_base();
@@ -635,6 +649,7 @@ static __init int kvm_setup_pv_tlb_flush(void)
int cpu;
if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
+ !kvm_para_has_hint(KVM_HINTS_DEDICATED) &&
kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
for_each_possible_cpu(cpu) {
zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu),
@@ -730,6 +745,9 @@ void __init kvm_spinlock_init(void)
if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
return;
+ if (kvm_para_has_hint(KVM_HINTS_DEDICATED))
+ return;
+
__pv_init_lock_hash();
pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9eb448c7859d..4b100fe0f508 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -205,6 +205,20 @@ static __always_inline void save_fsgs(struct task_struct *task)
save_base_legacy(task, task->thread.gsindex, GS);
}
+#if IS_ENABLED(CONFIG_KVM)
+/*
+ * While a process is running,current->thread.fsbase and current->thread.gsbase
+ * may not match the corresponding CPU registers (see save_base_legacy()). KVM
+ * wants an efficient way to save and restore FSBASE and GSBASE.
+ * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE.
+ */
+void save_fsgs_for_kvm(void)
+{
+ save_fsgs(current);
+}
+EXPORT_SYMBOL_GPL(save_fsgs_for_kvm);
+#endif
+
static __always_inline void loadseg(enum which_selector which,
unsigned short sel)
{
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index b671fc2d0422..82055b90a8b3 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -135,6 +135,11 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
return -EINVAL;
}
+ best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);
+ if (kvm_hlt_in_guest(vcpu->kvm) && best &&
+ (best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
+ best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
+
/* Update physical-address width */
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
kvm_mmu_reset_context(vcpu);
@@ -370,7 +375,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
- F(TOPOEXT);
+ F(TOPOEXT) | F(PERFCTR_CORE);
/* cpuid 0x80000008.ebx */
const u32 kvm_cpuid_8000_0008_ebx_x86_features =
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d91eaeb01034..b3705ae52824 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -30,6 +30,7 @@
#include "x86.h"
#include "tss.h"
#include "mmu.h"
+#include "pmu.h"
/*
* Operand types
@@ -2887,6 +2888,9 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
return ctxt->ops->cpl(ctxt) > iopl;
}
+#define VMWARE_PORT_VMPORT (0x5658)
+#define VMWARE_PORT_VMRPC (0x5659)
+
static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
u16 port, u16 len)
{
@@ -2898,6 +2902,14 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
unsigned mask = (1 << len) - 1;
unsigned long base;
+ /*
+ * VMware allows access to these ports even if denied
+ * by TSS I/O permission bitmap. Mimic behavior.
+ */
+ if (enable_vmware_backdoor &&
+ ((port == VMWARE_PORT_VMPORT) || (port == VMWARE_PORT_VMRPC)))
+ return true;
+
ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR);
if (!tr_seg.p)
return false;
@@ -4282,6 +4294,13 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
u64 rcx = reg_read(ctxt, VCPU_REGS_RCX);
+ /*
+ * VMware allows access to these Pseduo-PMCs even when read via RDPMC
+ * in Ring3 when CR4.PCE=0.
+ */
+ if (enable_vmware_backdoor && is_vmware_backdoor_pmc(rcx))
+ return X86EMUL_CONTINUE;
+
if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
ctxt->ops->check_pmc(ctxt, rcx))
return emulate_gp(ctxt, 0);
@@ -4498,6 +4517,10 @@ static const struct gprefix pfx_0f_2b = {
ID(0, &instr_dual_0f_2b), ID(0, &instr_dual_0f_2b), N, N,
};
+static const struct gprefix pfx_0f_10_0f_11 = {
+ I(Unaligned, em_mov), I(Unaligned, em_mov), N, N,
+};
+
static const struct gprefix pfx_0f_28_0f_29 = {
I(Aligned, em_mov), I(Aligned, em_mov), N, N,
};
@@ -4709,7 +4732,9 @@ static const struct opcode twobyte_table[256] = {
DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
/* 0x10 - 0x1F */
- N, N, N, N, N, N, N, N,
+ GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_10_0f_11),
+ GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_10_0f_11),
+ N, N, N, N, N, N,
D(ImplicitOps | ModRM | SrcMem | NoAccess),
N, N, N, N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess),
/* 0x20 - 0x2F */
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index dc97f2544b6f..98618e397342 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -29,6 +29,7 @@
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <linux/sched/cputime.h>
+#include <linux/eventfd.h>
#include <asm/apicdef.h>
#include <trace/events/kvm.h>
@@ -74,13 +75,38 @@ static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
return false;
}
+static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
+ int vector)
+{
+ if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
+ return;
+
+ if (synic_has_vector_connected(synic, vector))
+ __set_bit(vector, synic->vec_bitmap);
+ else
+ __clear_bit(vector, synic->vec_bitmap);
+
+ if (synic_has_vector_auto_eoi(synic, vector))
+ __set_bit(vector, synic->auto_eoi_bitmap);
+ else
+ __clear_bit(vector, synic->auto_eoi_bitmap);
+}
+
static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
u64 data, bool host)
{
- int vector;
+ int vector, old_vector;
+ bool masked;
vector = data & HV_SYNIC_SINT_VECTOR_MASK;
- if (vector < 16 && !host)
+ masked = data & HV_SYNIC_SINT_MASKED;
+
+ /*
+ * Valid vectors are 16-255, however, nested Hyper-V attempts to write
+ * default '0x10000' value on boot and this should not #GP. We need to
+ * allow zero-initing the register from host as well.
+ */
+ if (vector < HV_SYNIC_FIRST_VALID_VECTOR && !host && !masked)
return 1;
/*
* Guest may configure multiple SINTs to use the same vector, so
@@ -88,18 +114,13 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
* bitmap of vectors with auto-eoi behavior. The bitmaps are
* updated here, and atomically queried on fast paths.
*/
+ old_vector = synic_read_sint(synic, sint) & HV_SYNIC_SINT_VECTOR_MASK;
atomic64_set(&synic->sint[sint], data);
- if (synic_has_vector_connected(synic, vector))
- __set_bit(vector, synic->vec_bitmap);
- else
- __clear_bit(vector, synic->vec_bitmap);
+ synic_update_vector(synic, old_vector);
- if (synic_has_vector_auto_eoi(synic, vector))
- __set_bit(vector, synic->auto_eoi_bitmap);
- else
- __clear_bit(vector, synic->auto_eoi_bitmap);
+ synic_update_vector(synic, vector);
/* Load SynIC vectors into EOI exit bitmap */
kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic));
@@ -736,6 +757,9 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
case HV_X64_MSR_CRASH_CTL:
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
case HV_X64_MSR_RESET:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
r = true;
break;
}
@@ -981,6 +1005,15 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
kvm_make_request(KVM_REQ_HV_RESET, vcpu);
}
break;
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ hv->hv_reenlightenment_control = data;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ hv->hv_tsc_emulation_control = data;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ hv->hv_tsc_emulation_status = data;
+ break;
default:
vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
@@ -1009,17 +1042,17 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
return 1;
hv->vp_index = (u32)data;
break;
- case HV_X64_MSR_APIC_ASSIST_PAGE: {
+ case HV_X64_MSR_VP_ASSIST_PAGE: {
u64 gfn;
unsigned long addr;
- if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
+ if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) {
hv->hv_vapic = data;
if (kvm_lapic_enable_pv_eoi(vcpu, 0))
return 1;
break;
}
- gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
+ gfn = data >> HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT;
addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
if (kvm_is_error_hva(addr))
return 1;
@@ -1105,6 +1138,15 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case HV_X64_MSR_RESET:
data = 0;
break;
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ data = hv->hv_reenlightenment_control;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ data = hv->hv_tsc_emulation_control;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ data = hv->hv_tsc_emulation_status;
+ break;
default:
vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
return 1;
@@ -1129,7 +1171,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
case HV_X64_MSR_TPR:
return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
- case HV_X64_MSR_APIC_ASSIST_PAGE:
+ case HV_X64_MSR_VP_ASSIST_PAGE:
data = hv->hv_vapic;
break;
case HV_X64_MSR_VP_RUNTIME:
@@ -1226,10 +1268,47 @@ static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
return 1;
}
+static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
+{
+ struct eventfd_ctx *eventfd;
+
+ if (unlikely(!fast)) {
+ int ret;
+ gpa_t gpa = param;
+
+ if ((gpa & (__alignof__(param) - 1)) ||
+ offset_in_page(gpa) + sizeof(param) > PAGE_SIZE)
+ return HV_STATUS_INVALID_ALIGNMENT;
+
+ ret = kvm_vcpu_read_guest(vcpu, gpa, &param, sizeof(param));
+ if (ret < 0)
+ return HV_STATUS_INVALID_ALIGNMENT;
+ }
+
+ /*
+ * Per spec, bits 32-47 contain the extra "flag number". However, we
+ * have no use for it, and in all known usecases it is zero, so just
+ * report lookup failure if it isn't.
+ */
+ if (param & 0xffff00000000ULL)
+ return HV_STATUS_INVALID_PORT_ID;
+ /* remaining bits are reserved-zero */
+ if (param & ~KVM_HYPERV_CONN_ID_MASK)
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ /* conn_to_evt is protected by vcpu->kvm->srcu */
+ eventfd = idr_find(&vcpu->kvm->arch.hyperv.conn_to_evt, param);
+ if (!eventfd)
+ return HV_STATUS_INVALID_PORT_ID;
+
+ eventfd_signal(eventfd, 1);
+ return HV_STATUS_SUCCESS;
+}
+
int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
{
- u64 param, ingpa, outgpa, ret;
- uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
+ u64 param, ingpa, outgpa, ret = HV_STATUS_SUCCESS;
+ uint16_t code, rep_idx, rep_cnt;
bool fast, longmode;
/*
@@ -1268,7 +1347,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
/* Hypercall continuation is not supported yet */
if (rep_cnt || rep_idx) {
- res = HV_STATUS_INVALID_HYPERCALL_CODE;
+ ret = HV_STATUS_INVALID_HYPERCALL_CODE;
goto set_result;
}
@@ -1276,11 +1355,15 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
case HVCALL_NOTIFY_LONG_SPIN_WAIT:
kvm_vcpu_on_spin(vcpu, true);
break;
- case HVCALL_POST_MESSAGE:
case HVCALL_SIGNAL_EVENT:
+ ret = kvm_hvcall_signal_event(vcpu, fast, ingpa);
+ if (ret != HV_STATUS_INVALID_PORT_ID)
+ break;
+ /* maybe userspace knows this conn_id: fall through */
+ case HVCALL_POST_MESSAGE:
/* don't bother userspace if it has no way to handle it */
if (!vcpu_to_synic(vcpu)->active) {
- res = HV_STATUS_INVALID_HYPERCALL_CODE;
+ ret = HV_STATUS_INVALID_HYPERCALL_CODE;
break;
}
vcpu->run->exit_reason = KVM_EXIT_HYPERV;
@@ -1292,12 +1375,79 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
kvm_hv_hypercall_complete_userspace;
return 0;
default:
- res = HV_STATUS_INVALID_HYPERCALL_CODE;
+ ret = HV_STATUS_INVALID_HYPERCALL_CODE;
break;
}
set_result:
- ret = res | (((u64)rep_done & 0xfff) << 32);
kvm_hv_hypercall_set_result(vcpu, ret);
return 1;
}
+
+void kvm_hv_init_vm(struct kvm *kvm)
+{
+ mutex_init(&kvm->arch.hyperv.hv_lock);
+ idr_init(&kvm->arch.hyperv.conn_to_evt);
+}
+
+void kvm_hv_destroy_vm(struct kvm *kvm)
+{
+ struct eventfd_ctx *eventfd;
+ int i;
+
+ idr_for_each_entry(&kvm->arch.hyperv.conn_to_evt, eventfd, i)
+ eventfd_ctx_put(eventfd);
+ idr_destroy(&kvm->arch.hyperv.conn_to_evt);
+}
+
+static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
+{
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct eventfd_ctx *eventfd;
+ int ret;
+
+ eventfd = eventfd_ctx_fdget(fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ mutex_lock(&hv->hv_lock);
+ ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1,
+ GFP_KERNEL);
+ mutex_unlock(&hv->hv_lock);
+
+ if (ret >= 0)
+ return 0;
+
+ if (ret == -ENOSPC)
+ ret = -EEXIST;
+ eventfd_ctx_put(eventfd);
+ return ret;
+}
+
+static int kvm_hv_eventfd_deassign(struct kvm *kvm, u32 conn_id)
+{
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct eventfd_ctx *eventfd;
+
+ mutex_lock(&hv->hv_lock);
+ eventfd = idr_remove(&hv->conn_to_evt, conn_id);
+ mutex_unlock(&hv->hv_lock);
+
+ if (!eventfd)
+ return -ENOENT;
+
+ synchronize_srcu(&kvm->srcu);
+ eventfd_ctx_put(eventfd);
+ return 0;
+}
+
+int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args)
+{
+ if ((args->flags & ~KVM_HYPERV_EVENTFD_DEASSIGN) ||
+ (args->conn_id & ~KVM_HYPERV_CONN_ID_MASK))
+ return -EINVAL;
+
+ if (args->flags == KVM_HYPERV_EVENTFD_DEASSIGN)
+ return kvm_hv_eventfd_deassign(kvm, args->conn_id);
+ return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd);
+}
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index e637631a9574..837465d69c6d 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -88,4 +88,8 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
void kvm_hv_setup_tsc_page(struct kvm *kvm,
struct pvclock_vcpu_time_info *hv_clock);
+void kvm_hv_init_vm(struct kvm *kvm);
+void kvm_hv_destroy_vm(struct kvm *kvm);
+int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
+
#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index f171051eecf3..faa264822cee 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -73,8 +73,19 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v)
*/
int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
{
+ /*
+ * FIXME: interrupt.injected represents an interrupt that it's
+ * side-effects have already been applied (e.g. bit from IRR
+ * already moved to ISR). Therefore, it is incorrect to rely
+ * on interrupt.injected to know if there is a pending
+ * interrupt in the user-mode LAPIC.
+ * This leads to nVMX/nSVM not be able to distinguish
+ * if it should exit from L2 to L1 on EXTERNAL_INTERRUPT on
+ * pending interrupt or should re-inject an injected
+ * interrupt.
+ */
if (!lapic_in_kernel(v))
- return v->arch.interrupt.pending;
+ return v->arch.interrupt.injected;
if (kvm_cpu_has_extint(v))
return 1;
@@ -91,8 +102,19 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
*/
int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
{
+ /*
+ * FIXME: interrupt.injected represents an interrupt that it's
+ * side-effects have already been applied (e.g. bit from IRR
+ * already moved to ISR). Therefore, it is incorrect to rely
+ * on interrupt.injected to know if there is a pending
+ * interrupt in the user-mode LAPIC.
+ * This leads to nVMX/nSVM not be able to distinguish
+ * if it should exit from L2 to L1 on EXTERNAL_INTERRUPT on
+ * pending interrupt or should re-inject an injected
+ * interrupt.
+ */
if (!lapic_in_kernel(v))
- return v->arch.interrupt.pending;
+ return v->arch.interrupt.injected;
if (kvm_cpu_has_extint(v))
return 1;
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index f500293dad8d..9619dcc2b325 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -41,7 +41,7 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
if (!test_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_avail))
- kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
+ kvm_x86_ops->cache_reg(vcpu, (enum kvm_reg)VCPU_EXREG_PDPTR);
return vcpu->arch.walk_mmu->pdptrs[index];
}
@@ -93,6 +93,11 @@ static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
{
vcpu->arch.hflags &= ~HF_GUEST_MASK;
+
+ if (vcpu->arch.load_eoi_exitmap_pending) {
+ vcpu->arch.load_eoi_exitmap_pending = false;
+ kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
+ }
}
static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 391dda8d43b7..70dcb5548022 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -321,8 +321,16 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
if (!lapic_in_kernel(vcpu))
return;
+ /*
+ * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation)
+ * which doesn't have EOI register; Some buggy OSes (e.g. Windows with
+ * Hyper-V role) disable EOI broadcast in lapic not checking for IOAPIC
+ * version first and level-triggered interrupts never get EOIed in
+ * IOAPIC.
+ */
feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
- if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))))
+ if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))) &&
+ !ioapic_in_kernel(vcpu->kvm))
v |= APIC_LVR_DIRECTED_EOI;
kvm_lapic_set_reg(apic, APIC_LVR, v);
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 56c36014f7b7..edce055e9fd7 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -109,7 +109,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
+ return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
}
int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 763bb3bade63..8494dbae41b9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3031,7 +3031,7 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
return RET_PF_RETRY;
}
- return RET_PF_EMULATE;
+ return -EFAULT;
}
static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 5abae72266b7..6288e9d7068e 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -452,14 +452,21 @@ error:
* done by is_rsvd_bits_set() above.
*
* We set up the value of exit_qualification to inject:
- * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation
+ * [2:0] - Derive from the access bits. The exit_qualification might be
+ * out of date if it is serving an EPT misconfiguration.
* [5:3] - Calculated by the page walk of the guest EPT page tables
* [7:8] - Derived from [7:8] of real exit_qualification
*
* The other bits are set to 0.
*/
if (!(errcode & PFERR_RSVD_MASK)) {
- vcpu->arch.exit_qualification &= 0x187;
+ vcpu->arch.exit_qualification &= 0x180;
+ if (write_fault)
+ vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
+ if (user_fault)
+ vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
+ if (fetch_fault)
+ vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3;
}
#endif
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 026db42a86c3..58ead7db71a3 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -244,12 +244,49 @@ int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx);
}
+bool is_vmware_backdoor_pmc(u32 pmc_idx)
+{
+ switch (pmc_idx) {
+ case VMWARE_BACKDOOR_PMC_HOST_TSC:
+ case VMWARE_BACKDOOR_PMC_REAL_TIME:
+ case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
+ return true;
+ }
+ return false;
+}
+
+static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
+{
+ u64 ctr_val;
+
+ switch (idx) {
+ case VMWARE_BACKDOOR_PMC_HOST_TSC:
+ ctr_val = rdtsc();
+ break;
+ case VMWARE_BACKDOOR_PMC_REAL_TIME:
+ ctr_val = ktime_get_boot_ns();
+ break;
+ case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
+ ctr_val = ktime_get_boot_ns() +
+ vcpu->kvm->arch.kvmclock_offset;
+ break;
+ default:
+ return 1;
+ }
+
+ *data = ctr_val;
+ return 0;
+}
+
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
bool fast_mode = idx & (1u << 31);
struct kvm_pmc *pmc;
u64 ctr_val;
+ if (is_vmware_backdoor_pmc(idx))
+ return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
+
pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx);
if (!pmc)
return 1;
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index a9a62b9a73e2..ba8898e1a854 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -9,6 +9,10 @@
/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
+#define VMWARE_BACKDOOR_PMC_HOST_TSC 0x10000
+#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001
+#define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002
+
struct kvm_event_hw_type_mapping {
u8 eventsel;
u8 unit_mask;
@@ -114,6 +118,8 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu);
void kvm_pmu_init(struct kvm_vcpu *vcpu);
void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
+bool is_vmware_backdoor_pmc(u32 pmc_idx);
+
extern struct kvm_pmu_ops intel_pmu_ops;
extern struct kvm_pmu_ops amd_pmu_ops;
#endif /* __KVM_X86_PMU_H */
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
index cd944435dfbd..1495a735b38e 100644
--- a/arch/x86/kvm/pmu_amd.c
+++ b/arch/x86/kvm/pmu_amd.c
@@ -19,6 +19,21 @@
#include "lapic.h"
#include "pmu.h"
+enum pmu_type {
+ PMU_TYPE_COUNTER = 0,
+ PMU_TYPE_EVNTSEL,
+};
+
+enum index {
+ INDEX_ZERO = 0,
+ INDEX_ONE,
+ INDEX_TWO,
+ INDEX_THREE,
+ INDEX_FOUR,
+ INDEX_FIVE,
+ INDEX_ERROR,
+};
+
/* duplicated from amd_perfmon_event_map, K7 and above should work. */
static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
[0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
@@ -31,6 +46,88 @@ static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
[7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
};
+static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type)
+{
+ struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
+ if (type == PMU_TYPE_COUNTER)
+ return MSR_F15H_PERF_CTR;
+ else
+ return MSR_F15H_PERF_CTL;
+ } else {
+ if (type == PMU_TYPE_COUNTER)
+ return MSR_K7_PERFCTR0;
+ else
+ return MSR_K7_EVNTSEL0;
+ }
+}
+
+static enum index msr_to_index(u32 msr)
+{
+ switch (msr) {
+ case MSR_F15H_PERF_CTL0:
+ case MSR_F15H_PERF_CTR0:
+ case MSR_K7_EVNTSEL0:
+ case MSR_K7_PERFCTR0:
+ return INDEX_ZERO;
+ case MSR_F15H_PERF_CTL1:
+ case MSR_F15H_PERF_CTR1:
+ case MSR_K7_EVNTSEL1:
+ case MSR_K7_PERFCTR1:
+ return INDEX_ONE;
+ case MSR_F15H_PERF_CTL2:
+ case MSR_F15H_PERF_CTR2:
+ case MSR_K7_EVNTSEL2:
+ case MSR_K7_PERFCTR2:
+ return INDEX_TWO;
+ case MSR_F15H_PERF_CTL3:
+ case MSR_F15H_PERF_CTR3:
+ case MSR_K7_EVNTSEL3:
+ case MSR_K7_PERFCTR3:
+ return INDEX_THREE;
+ case MSR_F15H_PERF_CTL4:
+ case MSR_F15H_PERF_CTR4:
+ return INDEX_FOUR;
+ case MSR_F15H_PERF_CTL5:
+ case MSR_F15H_PERF_CTR5:
+ return INDEX_FIVE;
+ default:
+ return INDEX_ERROR;
+ }
+}
+
+static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
+ enum pmu_type type)
+{
+ switch (msr) {
+ case MSR_F15H_PERF_CTL0:
+ case MSR_F15H_PERF_CTL1:
+ case MSR_F15H_PERF_CTL2:
+ case MSR_F15H_PERF_CTL3:
+ case MSR_F15H_PERF_CTL4:
+ case MSR_F15H_PERF_CTL5:
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ if (type != PMU_TYPE_EVNTSEL)
+ return NULL;
+ break;
+ case MSR_F15H_PERF_CTR0:
+ case MSR_F15H_PERF_CTR1:
+ case MSR_F15H_PERF_CTR2:
+ case MSR_F15H_PERF_CTR3:
+ case MSR_F15H_PERF_CTR4:
+ case MSR_F15H_PERF_CTR5:
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ if (type != PMU_TYPE_COUNTER)
+ return NULL;
+ break;
+ default:
+ return NULL;
+ }
+
+ return &pmu->gp_counters[msr_to_index(msr)];
+}
+
static unsigned amd_find_arch_event(struct kvm_pmu *pmu,
u8 event_select,
u8 unit_mask)
@@ -64,7 +161,18 @@ static bool amd_pmc_is_enabled(struct kvm_pmc *pmc)
static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
{
- return get_gp_pmc(pmu, MSR_K7_EVNTSEL0 + pmc_idx, MSR_K7_EVNTSEL0);
+ unsigned int base = get_msr_base(pmu, PMU_TYPE_COUNTER);
+ struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
+ /*
+ * The idx is contiguous. The MSRs are not. The counter MSRs
+ * are interleaved with the event select MSRs.
+ */
+ pmc_idx *= 2;
+ }
+
+ return get_gp_pmc_amd(pmu, base + pmc_idx, PMU_TYPE_COUNTER);
}
/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
@@ -96,8 +204,8 @@ static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
int ret = false;
- ret = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0) ||
- get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0);
+ ret = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER) ||
+ get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
return ret;
}
@@ -107,14 +215,14 @@ static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
- /* MSR_K7_PERFCTRn */
- pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0);
+ /* MSR_PERFCTRn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
if (pmc) {
*data = pmc_read_counter(pmc);
return 0;
}
- /* MSR_K7_EVNTSELn */
- pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0);
+ /* MSR_EVNTSELn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
if (pmc) {
*data = pmc->eventsel;
return 0;
@@ -130,14 +238,14 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
u32 msr = msr_info->index;
u64 data = msr_info->data;
- /* MSR_K7_PERFCTRn */
- pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0);
+ /* MSR_PERFCTRn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
if (pmc) {
pmc->counter += data - pmc_read_counter(pmc);
return 0;
}
- /* MSR_K7_EVNTSELn */
- pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0);
+ /* MSR_EVNTSELn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
if (pmc) {
if (data == pmc->eventsel)
return 0;
@@ -154,7 +262,11 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
+ pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
+ else
+ pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
+
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
pmu->reserved_bits = 0xffffffff00200000ull;
/* not applicable to AMD; but clean them to prevent any fall out */
@@ -169,7 +281,9 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
int i;
- for (i = 0; i < AMD64_NUM_COUNTERS ; i++) {
+ BUILD_BUG_ON(AMD64_NUM_COUNTERS_CORE > INTEL_PMC_MAX_GENERIC);
+
+ for (i = 0; i < AMD64_NUM_COUNTERS_CORE ; i++) {
pmu->gp_counters[i].type = KVM_PMC_GP;
pmu->gp_counters[i].vcpu = vcpu;
pmu->gp_counters[i].idx = i;
@@ -181,7 +295,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
int i;
- for (i = 0; i < AMD64_NUM_COUNTERS; i++) {
+ for (i = 0; i < AMD64_NUM_COUNTERS_CORE; i++) {
struct kvm_pmc *pmc = &pmu->gp_counters[i];
pmc_stop_counter(pmc);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9d2043f94e29..b58787daf9f8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -131,6 +131,28 @@ static const u32 host_save_user_msrs[] = {
#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
+struct kvm_sev_info {
+ bool active; /* SEV enabled guest */
+ unsigned int asid; /* ASID used for this guest */
+ unsigned int handle; /* SEV firmware handle */
+ int fd; /* SEV device fd */
+ unsigned long pages_locked; /* Number of pages locked */
+ struct list_head regions_list; /* List of registered regions */
+};
+
+struct kvm_svm {
+ struct kvm kvm;
+
+ /* Struct members for AVIC */
+ u32 avic_vm_id;
+ u32 ldr_mode;
+ struct page *avic_logical_id_table_page;
+ struct page *avic_physical_id_table_page;
+ struct hlist_node hnode;
+
+ struct kvm_sev_info sev_info;
+};
+
struct kvm_vcpu;
struct nested_state {
@@ -276,6 +298,54 @@ static bool npt_enabled = true;
static bool npt_enabled;
#endif
+/*
+ * These 2 parameters are used to config the controls for Pause-Loop Exiting:
+ * pause_filter_count: On processors that support Pause filtering(indicated
+ * by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
+ * count value. On VMRUN this value is loaded into an internal counter.
+ * Each time a pause instruction is executed, this counter is decremented
+ * until it reaches zero at which time a #VMEXIT is generated if pause
+ * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
+ * Intercept Filtering for more details.
+ * This also indicate if ple logic enabled.
+ *
+ * pause_filter_thresh: In addition, some processor families support advanced
+ * pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on
+ * the amount of time a guest is allowed to execute in a pause loop.
+ * In this mode, a 16-bit pause filter threshold field is added in the
+ * VMCB. The threshold value is a cycle count that is used to reset the
+ * pause counter. As with simple pause filtering, VMRUN loads the pause
+ * count value from VMCB into an internal counter. Then, on each pause
+ * instruction the hardware checks the elapsed number of cycles since
+ * the most recent pause instruction against the pause filter threshold.
+ * If the elapsed cycle count is greater than the pause filter threshold,
+ * then the internal pause count is reloaded from the VMCB and execution
+ * continues. If the elapsed cycle count is less than the pause filter
+ * threshold, then the internal pause count is decremented. If the count
+ * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
+ * triggered. If advanced pause filtering is supported and pause filter
+ * threshold field is set to zero, the filter will operate in the simpler,
+ * count only mode.
+ */
+
+static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
+module_param(pause_filter_thresh, ushort, 0444);
+
+static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
+module_param(pause_filter_count, ushort, 0444);
+
+/* Default doubles per-vcpu window every exit. */
+static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
+module_param(pause_filter_count_grow, ushort, 0444);
+
+/* Default resets per-vcpu window every exit to pause_filter_count. */
+static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
+module_param(pause_filter_count_shrink, ushort, 0444);
+
+/* Default is to compute the maximum so we can never overflow. */
+static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
+module_param(pause_filter_count_max, ushort, 0444);
+
/* allow nested paging (virtualized MMU) for all guests */
static int npt = true;
module_param(npt, int, S_IRUGO);
@@ -352,6 +422,12 @@ struct enc_region {
unsigned long size;
};
+
+static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
+{
+ return container_of(kvm, struct kvm_svm, kvm);
+}
+
static inline bool svm_sev_enabled(void)
{
return max_sev_asid;
@@ -359,14 +435,14 @@ static inline bool svm_sev_enabled(void)
static inline bool sev_guest(struct kvm *kvm)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
return sev->active;
}
static inline int sev_get_asid(struct kvm *kvm)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
return sev->asid;
}
@@ -1083,7 +1159,7 @@ static void disable_nmi_singlestep(struct vcpu_svm *svm)
}
/* Note:
- * This hash table is used to map VM_ID to a struct kvm_arch,
+ * This hash table is used to map VM_ID to a struct kvm_svm,
* when handling AMD IOMMU GALOG notification to schedule in
* a particular vCPU.
*/
@@ -1100,7 +1176,7 @@ static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
static int avic_ga_log_notifier(u32 ga_tag)
{
unsigned long flags;
- struct kvm_arch *ka = NULL;
+ struct kvm_svm *kvm_svm;
struct kvm_vcpu *vcpu = NULL;
u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
@@ -1108,13 +1184,10 @@ static int avic_ga_log_notifier(u32 ga_tag)
pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
- hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) {
- struct kvm *kvm = container_of(ka, struct kvm, arch);
- struct kvm_arch *vm_data = &kvm->arch;
-
- if (vm_data->avic_vm_id != vm_id)
+ hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
+ if (kvm_svm->avic_vm_id != vm_id)
continue;
- vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+ vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
break;
}
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
@@ -1172,6 +1245,42 @@ err:
return rc;
}
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ control->pause_filter_count = __grow_ple_window(old,
+ pause_filter_count,
+ pause_filter_count_grow,
+ pause_filter_count_max);
+
+ if (control->pause_filter_count != old)
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ trace_kvm_ple_window_grow(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ control->pause_filter_count =
+ __shrink_ple_window(old,
+ pause_filter_count,
+ pause_filter_count_shrink,
+ pause_filter_count);
+ if (control->pause_filter_count != old)
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ trace_kvm_ple_window_shrink(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+}
+
static __init int svm_hardware_setup(void)
{
int cpu;
@@ -1202,6 +1311,14 @@ static __init int svm_hardware_setup(void)
kvm_tsc_scaling_ratio_frac_bits = 32;
}
+ /* Check for pause filtering support */
+ if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
+ pause_filter_count = 0;
+ pause_filter_thresh = 0;
+ } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
+ pause_filter_thresh = 0;
+ }
+
if (nested) {
printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
@@ -1328,10 +1445,10 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
static void avic_init_vmcb(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb;
- struct kvm_arch *vm_data = &svm->vcpu.kvm->arch;
+ struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
- phys_addr_t lpa = __sme_set(page_to_phys(vm_data->avic_logical_id_table_page));
- phys_addr_t ppa = __sme_set(page_to_phys(vm_data->avic_physical_id_table_page));
+ phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
+ phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
@@ -1363,6 +1480,14 @@ static void init_vmcb(struct vcpu_svm *svm)
set_exception_intercept(svm, MC_VECTOR);
set_exception_intercept(svm, AC_VECTOR);
set_exception_intercept(svm, DB_VECTOR);
+ /*
+ * Guest access to VMware backdoor ports could legitimately
+ * trigger #GP because of TSS I/O permission bitmap.
+ * We intercept those #GP and allow access to them anyway
+ * as VMware does.
+ */
+ if (enable_vmware_backdoor)
+ set_exception_intercept(svm, GP_VECTOR);
set_intercept(svm, INTERCEPT_INTR);
set_intercept(svm, INTERCEPT_NMI);
@@ -1371,7 +1496,6 @@ static void init_vmcb(struct vcpu_svm *svm)
set_intercept(svm, INTERCEPT_RDPMC);
set_intercept(svm, INTERCEPT_CPUID);
set_intercept(svm, INTERCEPT_INVD);
- set_intercept(svm, INTERCEPT_HLT);
set_intercept(svm, INTERCEPT_INVLPG);
set_intercept(svm, INTERCEPT_INVLPGA);
set_intercept(svm, INTERCEPT_IOIO_PROT);
@@ -1389,11 +1513,14 @@ static void init_vmcb(struct vcpu_svm *svm)
set_intercept(svm, INTERCEPT_XSETBV);
set_intercept(svm, INTERCEPT_RSM);
- if (!kvm_mwait_in_guest()) {
+ if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
set_intercept(svm, INTERCEPT_MONITOR);
set_intercept(svm, INTERCEPT_MWAIT);
}
+ if (!kvm_hlt_in_guest(svm->vcpu.kvm))
+ set_intercept(svm, INTERCEPT_HLT);
+
control->iopm_base_pa = __sme_set(iopm_base);
control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
control->int_ctl = V_INTR_MASKING_MASK;
@@ -1449,9 +1576,13 @@ static void init_vmcb(struct vcpu_svm *svm)
svm->nested.vmcb = 0;
svm->vcpu.arch.hflags = 0;
- if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
- control->pause_filter_count = 3000;
+ if (pause_filter_count) {
+ control->pause_filter_count = pause_filter_count;
+ if (pause_filter_thresh)
+ control->pause_filter_thresh = pause_filter_thresh;
set_intercept(svm, INTERCEPT_PAUSE);
+ } else {
+ clr_intercept(svm, INTERCEPT_PAUSE);
}
if (kvm_vcpu_apicv_active(&svm->vcpu))
@@ -1488,12 +1619,12 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
unsigned int index)
{
u64 *avic_physical_id_table;
- struct kvm_arch *vm_data = &vcpu->kvm->arch;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
return NULL;
- avic_physical_id_table = page_address(vm_data->avic_physical_id_table_page);
+ avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
return &avic_physical_id_table[index];
}
@@ -1576,7 +1707,7 @@ static void __sev_asid_free(int asid)
static void sev_asid_free(struct kvm *kvm)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
__sev_asid_free(sev->asid);
}
@@ -1616,7 +1747,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
unsigned long ulen, unsigned long *n,
int write)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
unsigned long npages, npinned, size;
unsigned long locked, lock_limit;
struct page **pages;
@@ -1667,7 +1798,7 @@ err:
static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
unsigned long npages)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
release_pages(pages, npages);
kvfree(pages);
@@ -1705,9 +1836,20 @@ static void __unregister_enc_region_locked(struct kvm *kvm,
kfree(region);
}
+static struct kvm *svm_vm_alloc(void)
+{
+ struct kvm_svm *kvm_svm = kzalloc(sizeof(struct kvm_svm), GFP_KERNEL);
+ return &kvm_svm->kvm;
+}
+
+static void svm_vm_free(struct kvm *kvm)
+{
+ kfree(to_kvm_svm(kvm));
+}
+
static void sev_vm_destroy(struct kvm *kvm)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct list_head *head = &sev->regions_list;
struct list_head *pos, *q;
@@ -1736,18 +1878,18 @@ static void sev_vm_destroy(struct kvm *kvm)
static void avic_vm_destroy(struct kvm *kvm)
{
unsigned long flags;
- struct kvm_arch *vm_data = &kvm->arch;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
if (!avic)
return;
- if (vm_data->avic_logical_id_table_page)
- __free_page(vm_data->avic_logical_id_table_page);
- if (vm_data->avic_physical_id_table_page)
- __free_page(vm_data->avic_physical_id_table_page);
+ if (kvm_svm->avic_logical_id_table_page)
+ __free_page(kvm_svm->avic_logical_id_table_page);
+ if (kvm_svm->avic_physical_id_table_page)
+ __free_page(kvm_svm->avic_physical_id_table_page);
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
- hash_del(&vm_data->hnode);
+ hash_del(&kvm_svm->hnode);
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
}
@@ -1761,10 +1903,10 @@ static int avic_vm_init(struct kvm *kvm)
{
unsigned long flags;
int err = -ENOMEM;
- struct kvm_arch *vm_data = &kvm->arch;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+ struct kvm_svm *k2;
struct page *p_page;
struct page *l_page;
- struct kvm_arch *ka;
u32 vm_id;
if (!avic)
@@ -1775,7 +1917,7 @@ static int avic_vm_init(struct kvm *kvm)
if (!p_page)
goto free_avic;
- vm_data->avic_physical_id_table_page = p_page;
+ kvm_svm->avic_physical_id_table_page = p_page;
clear_page(page_address(p_page));
/* Allocating logical APIC ID table (4KB) */
@@ -1783,7 +1925,7 @@ static int avic_vm_init(struct kvm *kvm)
if (!l_page)
goto free_avic;
- vm_data->avic_logical_id_table_page = l_page;
+ kvm_svm->avic_logical_id_table_page = l_page;
clear_page(page_address(l_page));
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
@@ -1795,15 +1937,13 @@ static int avic_vm_init(struct kvm *kvm)
}
/* Is it still in use? Only possible if wrapped at least once */
if (next_vm_id_wrapped) {
- hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) {
- struct kvm *k2 = container_of(ka, struct kvm, arch);
- struct kvm_arch *vd2 = &k2->arch;
- if (vd2->avic_vm_id == vm_id)
+ hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
+ if (k2->avic_vm_id == vm_id)
goto again;
}
}
- vm_data->avic_vm_id = vm_id;
- hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id);
+ kvm_svm->avic_vm_id = vm_id;
+ hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
return 0;
@@ -2535,14 +2675,7 @@ static int bp_interception(struct vcpu_svm *svm)
static int ud_interception(struct vcpu_svm *svm)
{
- int er;
-
- er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
- if (er == EMULATE_USER_EXIT)
- return 0;
- if (er != EMULATE_DONE)
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
+ return handle_ud(&svm->vcpu);
}
static int ac_interception(struct vcpu_svm *svm)
@@ -2551,6 +2684,23 @@ static int ac_interception(struct vcpu_svm *svm)
return 1;
}
+static int gp_interception(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u32 error_code = svm->vmcb->control.exit_info_1;
+ int er;
+
+ WARN_ON_ONCE(!enable_vmware_backdoor);
+
+ er = emulate_instruction(vcpu,
+ EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
+ if (er == EMULATE_USER_EXIT)
+ return 0;
+ else if (er != EMULATE_DONE)
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+ return 1;
+}
+
static bool is_erratum_383(void)
{
int err, i;
@@ -2639,7 +2789,7 @@ static int io_interception(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
- int size, in, string, ret;
+ int size, in, string;
unsigned port;
++svm->vcpu.stat.io_exits;
@@ -2651,16 +2801,8 @@ static int io_interception(struct vcpu_svm *svm)
port = io_info >> 16;
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
svm->next_rip = svm->vmcb->control.exit_info_2;
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
- /*
- * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
- * KVM_EXIT_DEBUG here.
- */
- if (in)
- return kvm_fast_pio_in(vcpu, size, port) && ret;
- else
- return kvm_fast_pio_out(vcpu, size, port) && ret;
+ return kvm_fast_pio(&svm->vcpu, size, port, in);
}
static int nmi_interception(struct vcpu_svm *svm)
@@ -4233,6 +4375,9 @@ static int pause_interception(struct vcpu_svm *svm)
struct kvm_vcpu *vcpu = &svm->vcpu;
bool in_kernel = (svm_get_cpl(vcpu) == 0);
+ if (pause_filter_thresh)
+ grow_ple_window(vcpu);
+
kvm_vcpu_on_spin(vcpu, in_kernel);
return 1;
}
@@ -4323,7 +4468,7 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
- struct kvm_arch *vm_data = &vcpu->kvm->arch;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
int index;
u32 *logical_apic_id_table;
int dlid = GET_APIC_LOGICAL_ID(ldr);
@@ -4345,7 +4490,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
index = (cluster << 2) + apic;
}
- logical_apic_id_table = (u32 *) page_address(vm_data->avic_logical_id_table_page);
+ logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
return &logical_apic_id_table[index];
}
@@ -4425,7 +4570,7 @@ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
static int avic_handle_dfr_update(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct kvm_arch *vm_data = &vcpu->kvm->arch;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
u32 mod = (dfr >> 28) & 0xf;
@@ -4434,11 +4579,11 @@ static int avic_handle_dfr_update(struct kvm_vcpu *vcpu)
* If this changes, we need to flush the AVIC logical
* APID id table.
*/
- if (vm_data->ldr_mode == mod)
+ if (kvm_svm->ldr_mode == mod)
return 0;
- clear_page(page_address(vm_data->avic_logical_id_table_page));
- vm_data->ldr_mode = mod;
+ clear_page(page_address(kvm_svm->avic_logical_id_table_page));
+ kvm_svm->ldr_mode = mod;
if (svm->ldr_reg)
avic_handle_ldr_update(vcpu);
@@ -4558,6 +4703,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
[SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
[SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
+ [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception,
[SVM_EXIT_INTR] = intr_interception,
[SVM_EXIT_NMI] = nmi_interception,
[SVM_EXIT_SMI] = nop_on_interception,
@@ -4606,6 +4752,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
+ pr_err("%-20s%d\n", "pause filter threshold:",
+ control->pause_filter_thresh);
pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
@@ -5073,7 +5221,7 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
/* Try to enable guest_mode in IRTE */
pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
AVIC_HPA_MASK);
- pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
+ pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
svm->vcpu.vcpu_id);
pi.is_guest_mode = true;
pi.vcpu_data = &vcpu_info;
@@ -5237,6 +5385,11 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
return 0;
}
+static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+{
+ return 0;
+}
+
static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -5538,14 +5691,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
- kvm_before_handle_nmi(&svm->vcpu);
+ kvm_before_interrupt(&svm->vcpu);
stgi();
/* Any pending NMI will happen here */
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
- kvm_after_handle_nmi(&svm->vcpu);
+ kvm_after_interrupt(&svm->vcpu);
sync_cr8_to_lapic(vcpu);
@@ -5921,6 +6074,8 @@ static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
+ if (pause_filter_thresh)
+ shrink_ple_window(vcpu);
}
static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
@@ -6037,7 +6192,7 @@ static int sev_asid_new(void)
static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
int asid, ret;
ret = -EBUSY;
@@ -6102,14 +6257,14 @@ static int __sev_issue_cmd(int fd, int id, void *data, int *error)
static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
return __sev_issue_cmd(sev->fd, id, data, error);
}
static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_launch_start *start;
struct kvm_sev_launch_start params;
void *dh_blob, *session_blob;
@@ -6207,7 +6362,7 @@ static int get_num_contig_pages(int idx, struct page **inpages,
static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
unsigned long vaddr, vaddr_end, next_vaddr, npages, size;
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_launch_update_data params;
struct sev_data_launch_update_data *data;
struct page **inpages;
@@ -6283,7 +6438,7 @@ e_free:
static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
void __user *measure = (void __user *)(uintptr_t)argp->data;
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_launch_measure *data;
struct kvm_sev_launch_measure params;
void __user *p = NULL;
@@ -6351,7 +6506,7 @@ e_free:
static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_launch_finish *data;
int ret;
@@ -6371,7 +6526,7 @@ static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_guest_status params;
struct sev_data_guest_status *data;
int ret;
@@ -6403,7 +6558,7 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
unsigned long dst, int size,
int *error, bool enc)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_dbg *data;
int ret;
@@ -6635,7 +6790,7 @@ err:
static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_launch_secret *data;
struct kvm_sev_launch_secret params;
struct page **pages;
@@ -6759,7 +6914,7 @@ out:
static int svm_register_enc_region(struct kvm *kvm,
struct kvm_enc_region *range)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct enc_region *region;
int ret = 0;
@@ -6801,7 +6956,7 @@ e_free:
static struct enc_region *
find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
{
- struct kvm_sev_info *sev = &kvm->arch.sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct list_head *head = &sev->regions_list;
struct enc_region *i;
@@ -6859,6 +7014,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.vcpu_free = svm_free_vcpu,
.vcpu_reset = svm_vcpu_reset,
+ .vm_alloc = svm_vm_alloc,
+ .vm_free = svm_vm_free,
.vm_init = avic_vm_init,
.vm_destroy = svm_vm_destroy,
@@ -6925,6 +7082,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.apicv_post_state_restore = avic_post_state_restore,
.set_tss_addr = svm_set_tss_addr,
+ .set_identity_map_addr = svm_set_identity_map_addr,
.get_tdp_level = get_npt_level,
.get_mt_mask = svm_get_mt_mask,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 92496b9b5f2b..aafcc9881e88 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -52,9 +52,11 @@
#include <asm/irq_remapping.h>
#include <asm/mmu_context.h>
#include <asm/nospec-branch.h>
+#include <asm/mshyperv.h>
#include "trace.h"
#include "pmu.h"
+#include "vmx_evmcs.h"
#define __ex(x) __kvm_handle_fault_on_reboot(x)
#define __ex_clear(x, reg) \
@@ -130,13 +132,15 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif
#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
-#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
-#define KVM_VM_CR0_ALWAYS_ON \
- (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
+#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
+#define KVM_VM_CR0_ALWAYS_ON \
+ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
+ X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
#define KVM_CR4_GUEST_OWNED_BITS \
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
| X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
+#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
@@ -165,34 +169,33 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
* Time is measured based on a counter that runs at the same rate as the TSC,
* refer SDM volume 3b section 21.6.13 & 22.1.3.
*/
-#define KVM_VMX_DEFAULT_PLE_GAP 128
-#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
-#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW 2
-#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0
-#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX \
- INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW
+static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
-static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
-module_param(ple_gap, int, S_IRUGO);
-
-static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
-module_param(ple_window, int, S_IRUGO);
+static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
+module_param(ple_window, uint, 0444);
/* Default doubles per-vcpu window every exit. */
-static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW;
-module_param(ple_window_grow, int, S_IRUGO);
+static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
+module_param(ple_window_grow, uint, 0444);
/* Default resets per-vcpu window every exit to ple_window. */
-static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK;
-module_param(ple_window_shrink, int, S_IRUGO);
+static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
+module_param(ple_window_shrink, uint, 0444);
/* Default is to compute the maximum so we can never overflow. */
-static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
-static int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
-module_param(ple_window_max, int, S_IRUGO);
+static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
+module_param(ple_window_max, uint, 0444);
extern const ulong vmx_return;
+struct kvm_vmx {
+ struct kvm kvm;
+
+ unsigned int tss_addr;
+ bool ept_identity_pagetable_done;
+ gpa_t ept_identity_map_addr;
+};
+
#define NR_AUTOLOAD_MSRS 8
struct vmcs {
@@ -424,6 +427,35 @@ struct __packed vmcs12 {
*/
#define VMCS12_MAX_FIELD_INDEX 0x17
+struct nested_vmx_msrs {
+ /*
+ * We only store the "true" versions of the VMX capability MSRs. We
+ * generate the "non-true" versions by setting the must-be-1 bits
+ * according to the SDM.
+ */
+ u32 procbased_ctls_low;
+ u32 procbased_ctls_high;
+ u32 secondary_ctls_low;
+ u32 secondary_ctls_high;
+ u32 pinbased_ctls_low;
+ u32 pinbased_ctls_high;
+ u32 exit_ctls_low;
+ u32 exit_ctls_high;
+ u32 entry_ctls_low;
+ u32 entry_ctls_high;
+ u32 misc_low;
+ u32 misc_high;
+ u32 ept_caps;
+ u32 vpid_caps;
+ u64 basic;
+ u64 cr0_fixed0;
+ u64 cr0_fixed1;
+ u64 cr4_fixed0;
+ u64 cr4_fixed1;
+ u64 vmcs_enum;
+ u64 vmfunc_controls;
+};
+
/*
* The nested_vmx structure is part of vcpu_vmx, and holds information we need
* for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -475,32 +507,7 @@ struct nested_vmx {
u16 vpid02;
u16 last_vpid;
- /*
- * We only store the "true" versions of the VMX capability MSRs. We
- * generate the "non-true" versions by setting the must-be-1 bits
- * according to the SDM.
- */
- u32 nested_vmx_procbased_ctls_low;
- u32 nested_vmx_procbased_ctls_high;
- u32 nested_vmx_secondary_ctls_low;
- u32 nested_vmx_secondary_ctls_high;
- u32 nested_vmx_pinbased_ctls_low;
- u32 nested_vmx_pinbased_ctls_high;
- u32 nested_vmx_exit_ctls_low;
- u32 nested_vmx_exit_ctls_high;
- u32 nested_vmx_entry_ctls_low;
- u32 nested_vmx_entry_ctls_high;
- u32 nested_vmx_misc_low;
- u32 nested_vmx_misc_high;
- u32 nested_vmx_ept_caps;
- u32 nested_vmx_vpid_caps;
- u64 nested_vmx_basic;
- u64 nested_vmx_cr0_fixed0;
- u64 nested_vmx_cr0_fixed1;
- u64 nested_vmx_cr4_fixed0;
- u64 nested_vmx_cr4_fixed1;
- u64 nested_vmx_vmcs_enum;
- u64 nested_vmx_vmfunc_controls;
+ struct nested_vmx_msrs msrs;
/* SMM related state */
struct {
@@ -691,6 +698,11 @@ enum segment_cache_field {
SEG_FIELD_NR = 4
};
+static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
+{
+ return container_of(kvm, struct kvm_vmx, kvm);
+}
+
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
{
return container_of(vcpu, struct vcpu_vmx, vcpu);
@@ -953,6 +965,7 @@ static struct vmcs_config {
u32 cpu_based_2nd_exec_ctrl;
u32 vmexit_ctrl;
u32 vmentry_ctrl;
+ struct nested_vmx_msrs nested;
} vmcs_config;
static struct vmx_capability {
@@ -999,6 +1012,169 @@ static const u32 vmx_msr_index[] = {
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};
+DEFINE_STATIC_KEY_FALSE(enable_evmcs);
+
+#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
+
+#define KVM_EVMCS_VERSION 1
+
+#if IS_ENABLED(CONFIG_HYPERV)
+static bool __read_mostly enlightened_vmcs = true;
+module_param(enlightened_vmcs, bool, 0444);
+
+static inline void evmcs_write64(unsigned long field, u64 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u64 *)((char *)current_evmcs + offset) = value;
+
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static inline void evmcs_write32(unsigned long field, u32 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u32 *)((char *)current_evmcs + offset) = value;
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static inline void evmcs_write16(unsigned long field, u16 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u16 *)((char *)current_evmcs + offset) = value;
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static inline u64 evmcs_read64(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u64 *)((char *)current_evmcs + offset);
+}
+
+static inline u32 evmcs_read32(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u32 *)((char *)current_evmcs + offset);
+}
+
+static inline u16 evmcs_read16(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u16 *)((char *)current_evmcs + offset);
+}
+
+static void evmcs_load(u64 phys_addr)
+{
+ struct hv_vp_assist_page *vp_ap =
+ hv_get_vp_assist_page(smp_processor_id());
+
+ vp_ap->current_nested_vmcs = phys_addr;
+ vp_ap->enlighten_vmentry = 1;
+}
+
+static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
+{
+ /*
+ * Enlightened VMCSv1 doesn't support these:
+ *
+ * POSTED_INTR_NV = 0x00000002,
+ * GUEST_INTR_STATUS = 0x00000810,
+ * APIC_ACCESS_ADDR = 0x00002014,
+ * POSTED_INTR_DESC_ADDR = 0x00002016,
+ * EOI_EXIT_BITMAP0 = 0x0000201c,
+ * EOI_EXIT_BITMAP1 = 0x0000201e,
+ * EOI_EXIT_BITMAP2 = 0x00002020,
+ * EOI_EXIT_BITMAP3 = 0x00002022,
+ */
+ vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+ vmcs_conf->cpu_based_2nd_exec_ctrl &=
+ ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
+ vmcs_conf->cpu_based_2nd_exec_ctrl &=
+ ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ vmcs_conf->cpu_based_2nd_exec_ctrl &=
+ ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
+
+ /*
+ * GUEST_PML_INDEX = 0x00000812,
+ * PML_ADDRESS = 0x0000200e,
+ */
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML;
+
+ /* VM_FUNCTION_CONTROL = 0x00002018, */
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
+
+ /*
+ * EPTP_LIST_ADDRESS = 0x00002024,
+ * VMREAD_BITMAP = 0x00002026,
+ * VMWRITE_BITMAP = 0x00002028,
+ */
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS;
+
+ /*
+ * TSC_MULTIPLIER = 0x00002032,
+ */
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING;
+
+ /*
+ * PLE_GAP = 0x00004020,
+ * PLE_WINDOW = 0x00004022,
+ */
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+
+ /*
+ * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
+ */
+ vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+
+ /*
+ * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
+ * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
+ */
+ vmcs_conf->vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ vmcs_conf->vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+
+ /*
+ * Currently unsupported in KVM:
+ * GUEST_IA32_RTIT_CTL = 0x00002814,
+ */
+}
+#else /* !IS_ENABLED(CONFIG_HYPERV) */
+static inline void evmcs_write64(unsigned long field, u64 value) {}
+static inline void evmcs_write32(unsigned long field, u32 value) {}
+static inline void evmcs_write16(unsigned long field, u16 value) {}
+static inline u64 evmcs_read64(unsigned long field) { return 0; }
+static inline u32 evmcs_read32(unsigned long field) { return 0; }
+static inline u16 evmcs_read16(unsigned long field) { return 0; }
+static inline void evmcs_load(u64 phys_addr) {}
+static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
+#endif /* IS_ENABLED(CONFIG_HYPERV) */
+
static inline bool is_exception_n(u32 intr_info, u8 vector)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -1031,6 +1207,11 @@ static inline bool is_invalid_opcode(u32 intr_info)
return is_exception_n(intr_info, UD_VECTOR);
}
+static inline bool is_gp_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, GP_VECTOR);
+}
+
static inline bool is_external_interrupt(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -1320,7 +1501,7 @@ static inline bool report_flexpriority(void)
static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
{
- return vmx_misc_cr3_count(to_vmx(vcpu)->nested.nested_vmx_misc_low);
+ return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low);
}
static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
@@ -1341,6 +1522,16 @@ static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
PIN_BASED_VMX_PREEMPTION_TIMER;
}
+static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING;
+}
+
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
+}
+
static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
{
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
@@ -1479,6 +1670,9 @@ static void vmcs_load(struct vmcs *vmcs)
u64 phys_addr = __pa(vmcs);
u8 error;
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_load(phys_addr);
+
asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
: "cc", "memory");
@@ -1652,18 +1846,24 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
static __always_inline u16 vmcs_read16(unsigned long field)
{
vmcs_check16(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read16(field);
return __vmcs_readl(field);
}
static __always_inline u32 vmcs_read32(unsigned long field)
{
vmcs_check32(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read32(field);
return __vmcs_readl(field);
}
static __always_inline u64 vmcs_read64(unsigned long field)
{
vmcs_check64(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read64(field);
#ifdef CONFIG_X86_64
return __vmcs_readl(field);
#else
@@ -1674,6 +1874,8 @@ static __always_inline u64 vmcs_read64(unsigned long field)
static __always_inline unsigned long vmcs_readl(unsigned long field)
{
vmcs_checkl(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read64(field);
return __vmcs_readl(field);
}
@@ -1697,18 +1899,27 @@ static __always_inline void __vmcs_writel(unsigned long field, unsigned long val
static __always_inline void vmcs_write16(unsigned long field, u16 value)
{
vmcs_check16(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write16(field, value);
+
__vmcs_writel(field, value);
}
static __always_inline void vmcs_write32(unsigned long field, u32 value)
{
vmcs_check32(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write32(field, value);
+
__vmcs_writel(field, value);
}
static __always_inline void vmcs_write64(unsigned long field, u64 value)
{
vmcs_check64(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write64(field, value);
+
__vmcs_writel(field, value);
#ifndef CONFIG_X86_64
asm volatile ("");
@@ -1719,6 +1930,9 @@ static __always_inline void vmcs_write64(unsigned long field, u64 value)
static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
{
vmcs_checkl(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write64(field, value);
+
__vmcs_writel(field, value);
}
@@ -1726,6 +1940,9 @@ static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
{
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
"vmcs_clear_bits does not support 64-bit fields");
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write32(field, evmcs_read32(field) & ~mask);
+
__vmcs_writel(field, __vmcs_readl(field) & ~mask);
}
@@ -1733,6 +1950,9 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
{
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
"vmcs_set_bits does not support 64-bit fields");
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write32(field, evmcs_read32(field) | mask);
+
__vmcs_writel(field, __vmcs_readl(field) | mask);
}
@@ -1864,6 +2084,14 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
(1u << DB_VECTOR) | (1u << AC_VECTOR);
+ /*
+ * Guest access to VMware backdoor ports could legitimately
+ * trigger #GP because of TSS I/O permission bitmap.
+ * We intercept those #GP and allow access to them anyway
+ * as VMware does.
+ */
+ if (enable_vmware_backdoor)
+ eb |= (1u << GP_VECTOR);
if ((vcpu->guest_debug &
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -2129,6 +2357,9 @@ static unsigned long segment_base(u16 selector)
static void vmx_save_host_state(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+#ifdef CONFIG_X86_64
+ int cpu = raw_smp_processor_id();
+#endif
int i;
if (vmx->host_state.loaded)
@@ -2141,7 +2372,15 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
*/
vmx->host_state.ldt_sel = kvm_read_ldt();
vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
+
+#ifdef CONFIG_X86_64
+ save_fsgs_for_kvm();
+ vmx->host_state.fs_sel = current->thread.fsindex;
+ vmx->host_state.gs_sel = current->thread.gsindex;
+#else
savesegment(fs, vmx->host_state.fs_sel);
+ savesegment(gs, vmx->host_state.gs_sel);
+#endif
if (!(vmx->host_state.fs_sel & 7)) {
vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
vmx->host_state.fs_reload_needed = 0;
@@ -2149,7 +2388,6 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
vmcs_write16(HOST_FS_SELECTOR, 0);
vmx->host_state.fs_reload_needed = 1;
}
- savesegment(gs, vmx->host_state.gs_sel);
if (!(vmx->host_state.gs_sel & 7))
vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
else {
@@ -2160,20 +2398,16 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
#ifdef CONFIG_X86_64
savesegment(ds, vmx->host_state.ds_sel);
savesegment(es, vmx->host_state.es_sel);
-#endif
-#ifdef CONFIG_X86_64
- vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
- vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
-#else
- vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
- vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
-#endif
+ vmcs_writel(HOST_FS_BASE, current->thread.fsbase);
+ vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu));
-#ifdef CONFIG_X86_64
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ vmx->msr_host_kernel_gs_base = current->thread.gsbase;
if (is_long_mode(&vmx->vcpu))
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+#else
+ vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
+ vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
#endif
if (boot_cpu_has(X86_FEATURE_MPX))
rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
@@ -2532,6 +2766,19 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
return 0;
}
+static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Ensure that we clear the HLT state in the VMCS. We don't need to
+ * explicitly skip the instruction because if the HLT state is set,
+ * then the instruction is already executing and RIP has already been
+ * advanced.
+ */
+ if (kvm_hlt_in_guest(vcpu->kvm) &&
+ vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+}
+
static void vmx_queue_exception(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2554,6 +2801,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
return;
}
+ WARN_ON_ONCE(vmx->emulation_required);
+
if (kvm_exception_is_soft(nr)) {
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
vmx->vcpu.arch.event_exit_inst_len);
@@ -2562,6 +2811,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
intr_info |= INTR_TYPE_HARD_EXCEPTION;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
+
+ vmx_clear_hlt(vcpu);
}
static bool vmx_rdtscp_supported(void)
@@ -2689,8 +2940,13 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
* bit in the high half is on if the corresponding bit in the control field
* may be on. See also vmx_control_verify().
*/
-static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
+static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
{
+ if (!nested) {
+ memset(msrs, 0, sizeof(*msrs));
+ return;
+ }
+
/*
* Note that as a general rule, the high half of the MSRs (bits in
* the control fields which may be 1) should be initialized by the
@@ -2708,70 +2964,68 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
/* pin-based controls */
rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
- vmx->nested.nested_vmx_pinbased_ctls_low,
- vmx->nested.nested_vmx_pinbased_ctls_high);
- vmx->nested.nested_vmx_pinbased_ctls_low |=
+ msrs->pinbased_ctls_low,
+ msrs->pinbased_ctls_high);
+ msrs->pinbased_ctls_low |=
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
- vmx->nested.nested_vmx_pinbased_ctls_high &=
+ msrs->pinbased_ctls_high &=
PIN_BASED_EXT_INTR_MASK |
PIN_BASED_NMI_EXITING |
- PIN_BASED_VIRTUAL_NMIS;
- vmx->nested.nested_vmx_pinbased_ctls_high |=
+ PIN_BASED_VIRTUAL_NMIS |
+ (apicv ? PIN_BASED_POSTED_INTR : 0);
+ msrs->pinbased_ctls_high |=
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
- if (kvm_vcpu_apicv_active(&vmx->vcpu))
- vmx->nested.nested_vmx_pinbased_ctls_high |=
- PIN_BASED_POSTED_INTR;
/* exit controls */
rdmsr(MSR_IA32_VMX_EXIT_CTLS,
- vmx->nested.nested_vmx_exit_ctls_low,
- vmx->nested.nested_vmx_exit_ctls_high);
- vmx->nested.nested_vmx_exit_ctls_low =
+ msrs->exit_ctls_low,
+ msrs->exit_ctls_high);
+ msrs->exit_ctls_low =
VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
- vmx->nested.nested_vmx_exit_ctls_high &=
+ msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
- vmx->nested.nested_vmx_exit_ctls_high |=
+ msrs->exit_ctls_high |=
VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
if (kvm_mpx_supported())
- vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+ msrs->exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
/* We support free control of debug control saving. */
- vmx->nested.nested_vmx_exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
+ msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
- vmx->nested.nested_vmx_entry_ctls_low,
- vmx->nested.nested_vmx_entry_ctls_high);
- vmx->nested.nested_vmx_entry_ctls_low =
+ msrs->entry_ctls_low,
+ msrs->entry_ctls_high);
+ msrs->entry_ctls_low =
VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
- vmx->nested.nested_vmx_entry_ctls_high &=
+ msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
VM_ENTRY_IA32E_MODE |
#endif
VM_ENTRY_LOAD_IA32_PAT;
- vmx->nested.nested_vmx_entry_ctls_high |=
+ msrs->entry_ctls_high |=
(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
if (kvm_mpx_supported())
- vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
+ msrs->entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
/* We support free control of debug control loading. */
- vmx->nested.nested_vmx_entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
+ msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
/* cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
- vmx->nested.nested_vmx_procbased_ctls_low,
- vmx->nested.nested_vmx_procbased_ctls_high);
- vmx->nested.nested_vmx_procbased_ctls_low =
+ msrs->procbased_ctls_low,
+ msrs->procbased_ctls_high);
+ msrs->procbased_ctls_low =
CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
- vmx->nested.nested_vmx_procbased_ctls_high &=
+ msrs->procbased_ctls_high &=
CPU_BASED_VIRTUAL_INTR_PENDING |
CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
@@ -2791,12 +3045,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
* can use it to avoid exits to L1 - even when L0 runs L2
* without MSR bitmaps.
*/
- vmx->nested.nested_vmx_procbased_ctls_high |=
+ msrs->procbased_ctls_high |=
CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
CPU_BASED_USE_MSR_BITMAPS;
/* We support free control of CR3 access interception. */
- vmx->nested.nested_vmx_procbased_ctls_low &=
+ msrs->procbased_ctls_low &=
~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
/*
@@ -2804,10 +3058,10 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
* depend on CPUID bits, they are added later by vmx_cpuid_update.
*/
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- vmx->nested.nested_vmx_secondary_ctls_low,
- vmx->nested.nested_vmx_secondary_ctls_high);
- vmx->nested.nested_vmx_secondary_ctls_low = 0;
- vmx->nested.nested_vmx_secondary_ctls_high &=
+ msrs->secondary_ctls_low,
+ msrs->secondary_ctls_high);
+ msrs->secondary_ctls_low = 0;
+ msrs->secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_DESC |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
@@ -2817,33 +3071,33 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
if (enable_ept) {
/* nested EPT: emulate EPT also to L1 */
- vmx->nested.nested_vmx_secondary_ctls_high |=
+ msrs->secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_EPT;
- vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
+ msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
if (cpu_has_vmx_ept_execute_only())
- vmx->nested.nested_vmx_ept_caps |=
+ msrs->ept_caps |=
VMX_EPT_EXECUTE_ONLY_BIT;
- vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
- vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
+ msrs->ept_caps &= vmx_capability.ept;
+ msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
VMX_EPT_1GB_PAGE_BIT;
if (enable_ept_ad_bits) {
- vmx->nested.nested_vmx_secondary_ctls_high |=
+ msrs->secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_PML;
- vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
+ msrs->ept_caps |= VMX_EPT_AD_BIT;
}
}
if (cpu_has_vmx_vmfunc()) {
- vmx->nested.nested_vmx_secondary_ctls_high |=
+ msrs->secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_VMFUNC;
/*
* Advertise EPTP switching unconditionally
* since we emulate it
*/
if (enable_ept)
- vmx->nested.nested_vmx_vmfunc_controls =
+ msrs->vmfunc_controls =
VMX_VMFUNC_EPTP_SWITCHING;
}
@@ -2854,25 +3108,25 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
* not failing the single-context invvpid, and it is worse.
*/
if (enable_vpid) {
- vmx->nested.nested_vmx_secondary_ctls_high |=
+ msrs->secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_VPID;
- vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
+ msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
VMX_VPID_EXTENT_SUPPORTED_MASK;
}
if (enable_unrestricted_guest)
- vmx->nested.nested_vmx_secondary_ctls_high |=
+ msrs->secondary_ctls_high |=
SECONDARY_EXEC_UNRESTRICTED_GUEST;
/* miscellaneous data */
rdmsr(MSR_IA32_VMX_MISC,
- vmx->nested.nested_vmx_misc_low,
- vmx->nested.nested_vmx_misc_high);
- vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
- vmx->nested.nested_vmx_misc_low |=
+ msrs->misc_low,
+ msrs->misc_high);
+ msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
+ msrs->misc_low |=
VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
VMX_MISC_ACTIVITY_HLT;
- vmx->nested.nested_vmx_misc_high = 0;
+ msrs->misc_high = 0;
/*
* This MSR reports some information about VMX support. We
@@ -2880,14 +3134,14 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
* guest, and the VMCS structure we give it - not about the
* VMX support of the underlying hardware.
*/
- vmx->nested.nested_vmx_basic =
+ msrs->basic =
VMCS12_REVISION |
VMX_BASIC_TRUE_CTLS |
((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
if (cpu_has_vmx_basic_inout())
- vmx->nested.nested_vmx_basic |= VMX_BASIC_INOUT;
+ msrs->basic |= VMX_BASIC_INOUT;
/*
* These MSRs specify bits which the guest must keep fixed on
@@ -2896,15 +3150,15 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
*/
#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
- vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON;
- vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON;
+ msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
+ msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
/* These MSRs specify bits which the guest must keep fixed off. */
- rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx->nested.nested_vmx_cr0_fixed1);
- rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1);
+ rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
+ rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
/* highest index: VMX_PREEMPTION_TIMER_VALUE */
- vmx->nested.nested_vmx_vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
+ msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
}
/*
@@ -2941,7 +3195,7 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
/* reserved */
BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
- u64 vmx_basic = vmx->nested.nested_vmx_basic;
+ u64 vmx_basic = vmx->nested.msrs.basic;
if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
return -EINVAL;
@@ -2960,7 +3214,7 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
return -EINVAL;
- vmx->nested.nested_vmx_basic = data;
+ vmx->nested.msrs.basic = data;
return 0;
}
@@ -2972,24 +3226,24 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
switch (msr_index) {
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
- lowp = &vmx->nested.nested_vmx_pinbased_ctls_low;
- highp = &vmx->nested.nested_vmx_pinbased_ctls_high;
+ lowp = &vmx->nested.msrs.pinbased_ctls_low;
+ highp = &vmx->nested.msrs.pinbased_ctls_high;
break;
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
- lowp = &vmx->nested.nested_vmx_procbased_ctls_low;
- highp = &vmx->nested.nested_vmx_procbased_ctls_high;
+ lowp = &vmx->nested.msrs.procbased_ctls_low;
+ highp = &vmx->nested.msrs.procbased_ctls_high;
break;
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
- lowp = &vmx->nested.nested_vmx_exit_ctls_low;
- highp = &vmx->nested.nested_vmx_exit_ctls_high;
+ lowp = &vmx->nested.msrs.exit_ctls_low;
+ highp = &vmx->nested.msrs.exit_ctls_high;
break;
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
- lowp = &vmx->nested.nested_vmx_entry_ctls_low;
- highp = &vmx->nested.nested_vmx_entry_ctls_high;
+ lowp = &vmx->nested.msrs.entry_ctls_low;
+ highp = &vmx->nested.msrs.entry_ctls_high;
break;
case MSR_IA32_VMX_PROCBASED_CTLS2:
- lowp = &vmx->nested.nested_vmx_secondary_ctls_low;
- highp = &vmx->nested.nested_vmx_secondary_ctls_high;
+ lowp = &vmx->nested.msrs.secondary_ctls_low;
+ highp = &vmx->nested.msrs.secondary_ctls_high;
break;
default:
BUG();
@@ -3020,13 +3274,13 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
GENMASK_ULL(13, 9) | BIT_ULL(31);
u64 vmx_misc;
- vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low,
- vmx->nested.nested_vmx_misc_high);
+ vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
+ vmx->nested.msrs.misc_high);
if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
return -EINVAL;
- if ((vmx->nested.nested_vmx_pinbased_ctls_high &
+ if ((vmx->nested.msrs.pinbased_ctls_high &
PIN_BASED_VMX_PREEMPTION_TIMER) &&
vmx_misc_preemption_timer_rate(data) !=
vmx_misc_preemption_timer_rate(vmx_misc))
@@ -3041,8 +3295,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
return -EINVAL;
- vmx->nested.nested_vmx_misc_low = data;
- vmx->nested.nested_vmx_misc_high = data >> 32;
+ vmx->nested.msrs.misc_low = data;
+ vmx->nested.msrs.misc_high = data >> 32;
return 0;
}
@@ -3050,15 +3304,15 @@ static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
u64 vmx_ept_vpid_cap;
- vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps,
- vmx->nested.nested_vmx_vpid_caps);
+ vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
+ vmx->nested.msrs.vpid_caps);
/* Every bit is either reserved or a feature bit. */
if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
return -EINVAL;
- vmx->nested.nested_vmx_ept_caps = data;
- vmx->nested.nested_vmx_vpid_caps = data >> 32;
+ vmx->nested.msrs.ept_caps = data;
+ vmx->nested.msrs.vpid_caps = data >> 32;
return 0;
}
@@ -3068,10 +3322,10 @@ static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
switch (msr_index) {
case MSR_IA32_VMX_CR0_FIXED0:
- msr = &vmx->nested.nested_vmx_cr0_fixed0;
+ msr = &vmx->nested.msrs.cr0_fixed0;
break;
case MSR_IA32_VMX_CR4_FIXED0:
- msr = &vmx->nested.nested_vmx_cr4_fixed0;
+ msr = &vmx->nested.msrs.cr4_fixed0;
break;
default:
BUG();
@@ -3135,7 +3389,7 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
case MSR_IA32_VMX_EPT_VPID_CAP:
return vmx_restore_vmx_ept_vpid_cap(vmx, data);
case MSR_IA32_VMX_VMCS_ENUM:
- vmx->nested.nested_vmx_vmcs_enum = data;
+ vmx->nested.msrs.vmcs_enum = data;
return 0;
default:
/*
@@ -3146,77 +3400,75 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
}
/* Returns 0 on success, non-0 otherwise. */
-static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+static int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
switch (msr_index) {
case MSR_IA32_VMX_BASIC:
- *pdata = vmx->nested.nested_vmx_basic;
+ *pdata = msrs->basic;
break;
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
case MSR_IA32_VMX_PINBASED_CTLS:
*pdata = vmx_control_msr(
- vmx->nested.nested_vmx_pinbased_ctls_low,
- vmx->nested.nested_vmx_pinbased_ctls_high);
+ msrs->pinbased_ctls_low,
+ msrs->pinbased_ctls_high);
if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
break;
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
case MSR_IA32_VMX_PROCBASED_CTLS:
*pdata = vmx_control_msr(
- vmx->nested.nested_vmx_procbased_ctls_low,
- vmx->nested.nested_vmx_procbased_ctls_high);
+ msrs->procbased_ctls_low,
+ msrs->procbased_ctls_high);
if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
break;
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
case MSR_IA32_VMX_EXIT_CTLS:
*pdata = vmx_control_msr(
- vmx->nested.nested_vmx_exit_ctls_low,
- vmx->nested.nested_vmx_exit_ctls_high);
+ msrs->exit_ctls_low,
+ msrs->exit_ctls_high);
if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
break;
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
case MSR_IA32_VMX_ENTRY_CTLS:
*pdata = vmx_control_msr(
- vmx->nested.nested_vmx_entry_ctls_low,
- vmx->nested.nested_vmx_entry_ctls_high);
+ msrs->entry_ctls_low,
+ msrs->entry_ctls_high);
if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
break;
case MSR_IA32_VMX_MISC:
*pdata = vmx_control_msr(
- vmx->nested.nested_vmx_misc_low,
- vmx->nested.nested_vmx_misc_high);
+ msrs->misc_low,
+ msrs->misc_high);
break;
case MSR_IA32_VMX_CR0_FIXED0:
- *pdata = vmx->nested.nested_vmx_cr0_fixed0;
+ *pdata = msrs->cr0_fixed0;
break;
case MSR_IA32_VMX_CR0_FIXED1:
- *pdata = vmx->nested.nested_vmx_cr0_fixed1;
+ *pdata = msrs->cr0_fixed1;
break;
case MSR_IA32_VMX_CR4_FIXED0:
- *pdata = vmx->nested.nested_vmx_cr4_fixed0;
+ *pdata = msrs->cr4_fixed0;
break;
case MSR_IA32_VMX_CR4_FIXED1:
- *pdata = vmx->nested.nested_vmx_cr4_fixed1;
+ *pdata = msrs->cr4_fixed1;
break;
case MSR_IA32_VMX_VMCS_ENUM:
- *pdata = vmx->nested.nested_vmx_vmcs_enum;
+ *pdata = msrs->vmcs_enum;
break;
case MSR_IA32_VMX_PROCBASED_CTLS2:
*pdata = vmx_control_msr(
- vmx->nested.nested_vmx_secondary_ctls_low,
- vmx->nested.nested_vmx_secondary_ctls_high);
+ msrs->secondary_ctls_low,
+ msrs->secondary_ctls_high);
break;
case MSR_IA32_VMX_EPT_VPID_CAP:
- *pdata = vmx->nested.nested_vmx_ept_caps |
- ((u64)vmx->nested.nested_vmx_vpid_caps << 32);
+ *pdata = msrs->ept_caps |
+ ((u64)msrs->vpid_caps << 32);
break;
case MSR_IA32_VMX_VMFUNC:
- *pdata = vmx->nested.nested_vmx_vmfunc_controls;
+ *pdata = msrs->vmfunc_controls;
break;
default:
return 1;
@@ -3235,7 +3487,16 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
{
- return 1;
+ switch (msr->index) {
+ case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ if (!nested)
+ return 1;
+ return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
+ default:
+ return 1;
+ }
+
+ return 0;
}
/*
@@ -3309,7 +3570,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested_vmx_allowed(vcpu))
return 1;
- return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data);
+ return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
+ &msr_info->data);
case MSR_IA32_XSS:
if (!vmx_xsaves_supported())
return 1;
@@ -3602,6 +3864,14 @@ static int hardware_enable(void)
if (cr4_read_shadow() & X86_CR4_VMXE)
return -EBUSY;
+ /*
+ * This can happen if we hot-added a CPU but failed to allocate
+ * VP assist page for it.
+ */
+ if (static_branch_unlikely(&enable_evmcs) &&
+ !hv_get_vp_assist_page(cpu))
+ return -EFAULT;
+
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
@@ -3700,6 +3970,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
u32 _vmexit_control = 0;
u32 _vmentry_control = 0;
+ memset(vmcs_conf, 0, sizeof(*vmcs_conf));
min = CPU_BASED_HLT_EXITING |
#ifdef CONFIG_X86_64
CPU_BASED_CR8_LOAD_EXITING |
@@ -3710,13 +3981,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
CPU_BASED_UNCOND_IO_EXITING |
CPU_BASED_MOV_DR_EXITING |
CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_MWAIT_EXITING |
+ CPU_BASED_MONITOR_EXITING |
CPU_BASED_INVLPG_EXITING |
CPU_BASED_RDPMC_EXITING;
- if (!kvm_mwait_in_guest())
- min |= CPU_BASED_MWAIT_EXITING |
- CPU_BASED_MONITOR_EXITING;
-
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -3835,7 +4104,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
vmcs_conf->size = vmx_msr_high & 0x1fff;
vmcs_conf->order = get_order(vmcs_conf->size);
vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
- vmcs_conf->revision_id = vmx_msr_low;
+
+ /* KVM supports Enlightened VMCS v1 only */
+ if (static_branch_unlikely(&enable_evmcs))
+ vmcs_conf->revision_id = KVM_EVMCS_VERSION;
+ else
+ vmcs_conf->revision_id = vmx_msr_low;
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
@@ -3843,6 +4117,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
+ if (static_branch_unlikely(&enable_evmcs))
+ evmcs_sanitize_exec_ctrls(vmcs_conf);
+
cpu_has_load_ia32_efer =
allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
VM_ENTRY_LOAD_IA32_EFER)
@@ -4162,6 +4439,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
{
unsigned long flags;
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
@@ -4177,13 +4455,13 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
* Very old userspace does not call KVM_SET_TSS_ADDR before entering
* vcpu. Warn the user that an update is overdue.
*/
- if (!vcpu->kvm->arch.tss_addr)
+ if (!kvm_vmx->tss_addr)
printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
"called before entering vcpu\n");
vmx_segment_cache_clear(vmx);
- vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr);
+ vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
@@ -4291,7 +4569,7 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
{
- if (enable_ept && is_paging(vcpu))
+ if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
}
@@ -4339,11 +4617,11 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
- u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0;
- u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1;
+ u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
+ if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
SECONDARY_EXEC_UNRESTRICTED_GUEST &&
nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
fixed0 &= ~(X86_CR0_PE | X86_CR0_PG);
@@ -4353,16 +4631,16 @@ static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
- u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0;
- u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1;
+ u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
return fixed_bits_valid(val, fixed0, fixed1);
}
static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
- u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed0;
- u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed1;
+ u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1;
return fixed_bits_valid(val, fixed0, fixed1);
}
@@ -4428,7 +4706,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
}
#endif
- if (enable_ept)
+ if (enable_ept && !enable_unrestricted_guest)
ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
vmcs_writel(CR0_READ_SHADOW, cr0);
@@ -4469,10 +4747,11 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (enable_ept) {
eptp = construct_eptp(vcpu, cr3);
vmcs_write64(EPT_POINTER, eptp);
- if (is_paging(vcpu) || is_guest_mode(vcpu))
+ if (enable_unrestricted_guest || is_paging(vcpu) ||
+ is_guest_mode(vcpu))
guest_cr3 = kvm_read_cr3(vcpu);
else
- guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr;
+ guest_cr3 = to_kvm_vmx(vcpu->kvm)->ept_identity_map_addr;
ept_load_pdptrs(vcpu);
}
@@ -4487,11 +4766,15 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
* is in force while we are in guest mode. Do not let guests control
* this bit, even if host CR4.MCE == 0.
*/
- unsigned long hw_cr4 =
- (cr4_read_shadow() & X86_CR4_MCE) |
- (cr4 & ~X86_CR4_MCE) |
- (to_vmx(vcpu)->rmode.vm86_active ?
- KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
+ unsigned long hw_cr4;
+
+ hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
+ if (enable_unrestricted_guest)
+ hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
+ else if (to_vmx(vcpu)->rmode.vm86_active)
+ hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
+ else
+ hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
if ((cr4 & X86_CR4_UMIP) && !boot_cpu_has(X86_FEATURE_UMIP)) {
vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
@@ -4517,16 +4800,17 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
vcpu->arch.cr4 = cr4;
- if (enable_ept) {
- if (!is_paging(vcpu)) {
- hw_cr4 &= ~X86_CR4_PAE;
- hw_cr4 |= X86_CR4_PSE;
- } else if (!(cr4 & X86_CR4_PAE)) {
- hw_cr4 &= ~X86_CR4_PAE;
+
+ if (!enable_unrestricted_guest) {
+ if (enable_ept) {
+ if (!is_paging(vcpu)) {
+ hw_cr4 &= ~X86_CR4_PAE;
+ hw_cr4 |= X86_CR4_PSE;
+ } else if (!(cr4 & X86_CR4_PAE)) {
+ hw_cr4 &= ~X86_CR4_PAE;
+ }
}
- }
- if (!enable_unrestricted_guest && !is_paging(vcpu))
/*
* SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
* hardware. To emulate this behavior, SMEP/SMAP/PKU needs
@@ -4538,7 +4822,9 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
* If enable_unrestricted_guest, the CPU automatically
* disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
*/
- hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
+ if (!is_paging(vcpu))
+ hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
+ }
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, hw_cr4);
@@ -4906,7 +5192,7 @@ static int init_rmode_tss(struct kvm *kvm)
int idx, r;
idx = srcu_read_lock(&kvm->srcu);
- fn = kvm->arch.tss_addr >> PAGE_SHIFT;
+ fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
if (r < 0)
goto out;
@@ -4932,22 +5218,23 @@ out:
static int init_rmode_identity_map(struct kvm *kvm)
{
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
int i, idx, r = 0;
kvm_pfn_t identity_map_pfn;
u32 tmp;
- /* Protect kvm->arch.ept_identity_pagetable_done. */
+ /* Protect kvm_vmx->ept_identity_pagetable_done. */
mutex_lock(&kvm->slots_lock);
- if (likely(kvm->arch.ept_identity_pagetable_done))
+ if (likely(kvm_vmx->ept_identity_pagetable_done))
goto out2;
- if (!kvm->arch.ept_identity_map_addr)
- kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
- identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
+ if (!kvm_vmx->ept_identity_map_addr)
+ kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
+ identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
- kvm->arch.ept_identity_map_addr, PAGE_SIZE);
+ kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
if (r < 0)
goto out2;
@@ -4964,7 +5251,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
if (r < 0)
goto out;
}
- kvm->arch.ept_identity_pagetable_done = true;
+ kvm_vmx->ept_identity_pagetable_done = true;
out:
srcu_read_unlock(&kvm->srcu, idx);
@@ -5500,6 +5787,11 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
exec_control |= CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_INVLPG_EXITING;
+ if (kvm_mwait_in_guest(vmx->vcpu.kvm))
+ exec_control &= ~(CPU_BASED_MWAIT_EXITING |
+ CPU_BASED_MONITOR_EXITING);
+ if (kvm_hlt_in_guest(vmx->vcpu.kvm))
+ exec_control &= ~CPU_BASED_HLT_EXITING;
return exec_control;
}
@@ -5533,7 +5825,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
}
if (!enable_unrestricted_guest)
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
- if (!ple_gap)
+ if (kvm_pause_in_guest(vmx->vcpu.kvm))
exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
if (!kvm_vcpu_apicv_active(vcpu))
exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
@@ -5565,10 +5857,10 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
if (nested) {
if (xsaves_enabled)
- vmx->nested.nested_vmx_secondary_ctls_high |=
+ vmx->nested.msrs.secondary_ctls_high |=
SECONDARY_EXEC_XSAVES;
else
- vmx->nested.nested_vmx_secondary_ctls_high &=
+ vmx->nested.msrs.secondary_ctls_high &=
~SECONDARY_EXEC_XSAVES;
}
}
@@ -5580,10 +5872,10 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
if (nested) {
if (rdtscp_enabled)
- vmx->nested.nested_vmx_secondary_ctls_high |=
+ vmx->nested.msrs.secondary_ctls_high |=
SECONDARY_EXEC_RDTSCP;
else
- vmx->nested.nested_vmx_secondary_ctls_high &=
+ vmx->nested.msrs.secondary_ctls_high &=
~SECONDARY_EXEC_RDTSCP;
}
}
@@ -5601,10 +5893,10 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
if (nested) {
if (invpcid_enabled)
- vmx->nested.nested_vmx_secondary_ctls_high |=
+ vmx->nested.msrs.secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_INVPCID;
else
- vmx->nested.nested_vmx_secondary_ctls_high &=
+ vmx->nested.msrs.secondary_ctls_high &=
~SECONDARY_EXEC_ENABLE_INVPCID;
}
}
@@ -5616,10 +5908,10 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
if (nested) {
if (rdrand_enabled)
- vmx->nested.nested_vmx_secondary_ctls_high |=
+ vmx->nested.msrs.secondary_ctls_high |=
SECONDARY_EXEC_RDRAND_EXITING;
else
- vmx->nested.nested_vmx_secondary_ctls_high &=
+ vmx->nested.msrs.secondary_ctls_high &=
~SECONDARY_EXEC_RDRAND_EXITING;
}
}
@@ -5631,10 +5923,10 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
if (nested) {
if (rdseed_enabled)
- vmx->nested.nested_vmx_secondary_ctls_high |=
+ vmx->nested.msrs.secondary_ctls_high |=
SECONDARY_EXEC_RDSEED_EXITING;
else
- vmx->nested.nested_vmx_secondary_ctls_high &=
+ vmx->nested.msrs.secondary_ctls_high &=
~SECONDARY_EXEC_RDSEED_EXITING;
}
}
@@ -5696,7 +5988,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
}
- if (ple_gap) {
+ if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
vmcs_write32(PLE_GAP, ple_gap);
vmx->ple_window = ple_window;
vmx->ple_window_dirty = true;
@@ -5861,6 +6153,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
update_exception_bitmap(vcpu);
vpid_sync_context(vmx->vpid);
+ if (init_event)
+ vmx_clear_hlt(vcpu);
}
/*
@@ -5885,8 +6179,7 @@ static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
{
- return get_vmcs12(vcpu)->pin_based_vm_exec_control &
- PIN_BASED_NMI_EXITING;
+ return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
}
static void enable_irq_window(struct kvm_vcpu *vcpu)
@@ -5932,6 +6225,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
} else
intr |= INTR_TYPE_EXT_INTR;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
+
+ vmx_clear_hlt(vcpu);
}
static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -5962,6 +6257,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
+
+ vmx_clear_hlt(vcpu);
}
static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -6024,14 +6321,23 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
int ret;
+ if (enable_unrestricted_guest)
+ return 0;
+
ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
PAGE_SIZE * 3);
if (ret)
return ret;
- kvm->arch.tss_addr = addr;
+ to_kvm_vmx(kvm)->tss_addr = addr;
return init_rmode_tss(kvm);
}
+static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+{
+ to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
+ return 0;
+}
+
static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
{
switch (vec) {
@@ -6134,19 +6440,24 @@ static int handle_exception(struct kvm_vcpu *vcpu)
if (is_nmi(intr_info))
return 1; /* already handled by vmx_vcpu_run() */
- if (is_invalid_opcode(intr_info)) {
- er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
- if (er == EMULATE_USER_EXIT)
- return 0;
- if (er != EMULATE_DONE)
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
+ if (is_invalid_opcode(intr_info))
+ return handle_ud(vcpu);
error_code = 0;
if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
+ WARN_ON_ONCE(!enable_vmware_backdoor);
+ er = emulate_instruction(vcpu,
+ EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
+ if (er == EMULATE_USER_EXIT)
+ return 0;
+ else if (er != EMULATE_DONE)
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+ return 1;
+ }
+
/*
* The #PF with PFEC.RSVD = 1 indicates the guest is accessing
* MMIO, it is better to report an internal error.
@@ -6232,28 +6543,22 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu)
static int handle_io(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
- int size, in, string, ret;
+ int size, in, string;
unsigned port;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
string = (exit_qualification & 16) != 0;
- in = (exit_qualification & 8) != 0;
++vcpu->stat.io_exits;
- if (string || in)
+ if (string)
return emulate_instruction(vcpu, 0) == EMULATE_DONE;
port = exit_qualification >> 16;
size = (exit_qualification & 7) + 1;
+ in = (exit_qualification & 8) != 0;
- ret = kvm_skip_emulated_instruction(vcpu);
-
- /*
- * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
- * KVM_EXIT_DEBUG here.
- */
- return kvm_fast_pio_out(vcpu, size, port) && ret;
+ return kvm_fast_pio(vcpu, size, port, in);
}
static void
@@ -6344,6 +6649,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
err = handle_set_cr0(vcpu, val);
return kvm_complete_insn_gp(vcpu, err);
case 3:
+ WARN_ON_ONCE(enable_unrestricted_guest);
err = kvm_set_cr3(vcpu, val);
return kvm_complete_insn_gp(vcpu, err);
case 4:
@@ -6376,6 +6682,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
case 1: /*mov from cr*/
switch (cr) {
case 3:
+ WARN_ON_ONCE(enable_unrestricted_guest);
val = kvm_read_cr3(vcpu);
kvm_register_write(vcpu, reg, val);
trace_kvm_cr_read(cr, val);
@@ -6769,7 +7076,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
- int ret;
gpa_t gpa;
/*
@@ -6797,17 +7103,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
NULL, 0) == EMULATE_DONE;
}
- ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
- if (ret >= 0)
- return ret;
-
- /* It is the real ept misconfig */
- WARN_ON(1);
-
- vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
- vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
-
- return 0;
+ return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
}
static int handle_nmi_window(struct kvm_vcpu *vcpu)
@@ -6830,6 +7126,13 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
bool intr_window_requested;
unsigned count = 130;
+ /*
+ * We should never reach the point where we are emulating L2
+ * due to invalid guest state as that means we incorrectly
+ * allowed a nested VMEntry with an invalid vmcs12.
+ */
+ WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
+
cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
@@ -6848,12 +7151,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
goto out;
}
- if (err != EMULATE_DONE) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
- return 0;
- }
+ if (err != EMULATE_DONE)
+ goto emulation_error;
+
+ if (vmx->emulation_required && !vmx->rmode.vm86_active &&
+ vcpu->arch.exception.pending)
+ goto emulation_error;
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
@@ -6869,34 +7172,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
out:
return ret;
-}
-
-static int __grow_ple_window(int val)
-{
- if (ple_window_grow < 1)
- return ple_window;
-
- val = min(val, ple_window_actual_max);
-
- if (ple_window_grow < ple_window)
- val *= ple_window_grow;
- else
- val += ple_window_grow;
-
- return val;
-}
-static int __shrink_ple_window(int val, int modifier, int minimum)
-{
- if (modifier < 1)
- return ple_window;
-
- if (modifier < ple_window)
- val /= modifier;
- else
- val -= modifier;
-
- return max(val, minimum);
+emulation_error:
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return 0;
}
static void grow_ple_window(struct kvm_vcpu *vcpu)
@@ -6904,7 +7185,9 @@ static void grow_ple_window(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
int old = vmx->ple_window;
- vmx->ple_window = __grow_ple_window(old);
+ vmx->ple_window = __grow_ple_window(old, ple_window,
+ ple_window_grow,
+ ple_window_max);
if (vmx->ple_window != old)
vmx->ple_window_dirty = true;
@@ -6917,8 +7200,9 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
int old = vmx->ple_window;
- vmx->ple_window = __shrink_ple_window(old,
- ple_window_shrink, ple_window);
+ vmx->ple_window = __shrink_ple_window(old, ple_window,
+ ple_window_shrink,
+ ple_window);
if (vmx->ple_window != old)
vmx->ple_window_dirty = true;
@@ -6927,21 +7211,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
}
/*
- * ple_window_actual_max is computed to be one grow_ple_window() below
- * ple_window_max. (See __grow_ple_window for the reason.)
- * This prevents overflows, because ple_window_max is int.
- * ple_window_max effectively rounded down to a multiple of ple_window_grow in
- * this process.
- * ple_window_max is also prevented from setting vmx->ple_window < ple_window.
- */
-static void update_ple_window_actual_max(void)
-{
- ple_window_actual_max =
- __shrink_ple_window(max(ple_window_max, ple_window),
- ple_window_grow, INT_MIN);
-}
-
-/*
* Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
*/
static void wakeup_handler(void)
@@ -6960,7 +7229,7 @@ static void wakeup_handler(void)
spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
}
-void vmx_enable_tdp(void)
+static void vmx_enable_tdp(void)
{
kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
@@ -7061,8 +7330,6 @@ static __init int hardware_setup(void)
else
kvm_disable_tdp();
- update_ple_window_actual_max();
-
/*
* Only enable PML when hardware supports PML feature, and both EPT
* and EPT A/D bit features are enabled -- PML depends on them to work.
@@ -7094,6 +7361,7 @@ static __init int hardware_setup(void)
init_vmcs_shadow_fields();
kvm_set_posted_intr_wakeup_handler(wakeup_handler);
+ nested_vmx_setup_ctls_msrs(&vmcs_config.nested, enable_apicv);
kvm_mce_cap_supported |= MCG_LMCE_P;
@@ -7122,7 +7390,7 @@ static __exit void hardware_unsetup(void)
*/
static int handle_pause(struct kvm_vcpu *vcpu)
{
- if (ple_gap)
+ if (!kvm_pause_in_guest(vcpu->kvm))
grow_ple_window(vcpu);
/*
@@ -7954,9 +8222,9 @@ static int handle_invept(struct kvm_vcpu *vcpu)
u64 eptp, gpa;
} operand;
- if (!(vmx->nested.nested_vmx_secondary_ctls_high &
+ if (!(vmx->nested.msrs.secondary_ctls_high &
SECONDARY_EXEC_ENABLE_EPT) ||
- !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+ !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -7967,7 +8235,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
- types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
+ types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
if (type >= 32 || !(types & (1 << type))) {
nested_vmx_failValid(vcpu,
@@ -8018,9 +8286,9 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
u64 gla;
} operand;
- if (!(vmx->nested.nested_vmx_secondary_ctls_high &
+ if (!(vmx->nested.msrs.secondary_ctls_high &
SECONDARY_EXEC_ENABLE_VPID) ||
- !(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) {
+ !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -8031,7 +8299,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
- types = (vmx->nested.nested_vmx_vpid_caps &
+ types = (vmx->nested.msrs.vpid_caps &
VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
if (type >= 32 || !(types & (1 << type))) {
@@ -8125,11 +8393,11 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
/* Check for memory type validity */
switch (address & VMX_EPTP_MT_MASK) {
case VMX_EPTP_MT_UC:
- if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_UC_BIT))
+ if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))
return false;
break;
case VMX_EPTP_MT_WB:
- if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_WB_BIT))
+ if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))
return false;
break;
default:
@@ -8146,7 +8414,7 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
/* AD, if set, should be supported */
if (address & VMX_EPTP_AD_ENABLE_BIT) {
- if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPT_AD_BIT))
+ if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))
return false;
}
@@ -8790,7 +9058,8 @@ static void dump_vmcs(void)
pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
vmcs_read64(GUEST_IA32_DEBUGCTL),
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
- if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+ if (cpu_has_load_perf_global_ctrl &&
+ vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
pr_err("PerfGlobCtl = 0x%016llx\n",
vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
@@ -8826,7 +9095,8 @@ static void dump_vmcs(void)
pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
vmcs_read64(HOST_IA32_EFER),
vmcs_read64(HOST_IA32_PAT));
- if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+ if (cpu_has_load_perf_global_ctrl &&
+ vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
pr_err("PerfGlobCtl = 0x%016llx\n",
vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
@@ -9178,9 +9448,9 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
/* We need to handle NMIs before interrupts are enabled */
if (is_nmi(exit_intr_info)) {
- kvm_before_handle_nmi(&vmx->vcpu);
+ kvm_before_interrupt(&vmx->vcpu);
asm("int $2");
- kvm_after_handle_nmi(&vmx->vcpu);
+ kvm_after_interrupt(&vmx->vcpu);
}
}
@@ -9403,7 +9673,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr3, cr4;
+ unsigned long cr3, cr4, evmcs_rsp;
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!enable_vnmi &&
@@ -9469,6 +9739,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
vmx->__launched = vmx->loaded_vmcs->launched;
+
+ evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
+ (unsigned long)&current_evmcs->host_rsp : 0;
+
asm(
/* Store host registers */
"push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -9477,15 +9751,21 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
"cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
"je 1f \n\t"
"mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
+ /* Avoid VMWRITE when Enlightened VMCS is in use */
+ "test %%" _ASM_SI ", %%" _ASM_SI " \n\t"
+ "jz 2f \n\t"
+ "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
+ "jmp 1f \n\t"
+ "2: \n\t"
__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
"1: \n\t"
/* Reload cr2 if changed */
"mov %c[cr2](%0), %%" _ASM_AX " \n\t"
"mov %%cr2, %%" _ASM_DX " \n\t"
"cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
- "je 2f \n\t"
+ "je 3f \n\t"
"mov %%" _ASM_AX", %%cr2 \n\t"
- "2: \n\t"
+ "3: \n\t"
/* Check if vmlaunch of vmresume is needed */
"cmpl $0, %c[launched](%0) \n\t"
/* Load guest registers. Don't clobber flags. */
@@ -9554,7 +9834,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
".global vmx_return \n\t"
"vmx_return: " _ASM_PTR " 2b \n\t"
".popsection"
- : : "c"(vmx), "d"((unsigned long)HOST_RSP),
+ : : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp),
[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
[fail]"i"(offsetof(struct vcpu_vmx, fail)),
[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
@@ -9579,10 +9859,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
[wordsize]"i"(sizeof(ulong))
: "cc", "memory"
#ifdef CONFIG_X86_64
- , "rax", "rbx", "rdi", "rsi"
+ , "rax", "rbx", "rdi"
, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#else
- , "eax", "ebx", "edi", "esi"
+ , "eax", "ebx", "edi"
#endif
);
@@ -9610,6 +9890,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
/* Eliminate branch target predictions from guest mode */
vmexit_fill_RSB();
+ /* All fields are clean at this point */
+ if (static_branch_unlikely(&enable_evmcs))
+ current_evmcs->hv_clean_fields |=
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
if (vmx->host_debugctlmsr)
update_debugctlmsr(vmx->host_debugctlmsr);
@@ -9646,14 +9931,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
__write_pkru(vmx->host_pkru);
}
- /*
- * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
- * we did not inject a still-pending event to L1 now because of
- * nested_run_pending, we need to re-enable this bit.
- */
- if (vmx->nested.nested_run_pending)
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-
vmx->nested.nested_run_pending = 0;
vmx->idt_vectoring_info = 0;
@@ -9670,6 +9947,17 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
}
STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
+static struct kvm *vmx_vm_alloc(void)
+{
+ struct kvm_vmx *kvm_vmx = kzalloc(sizeof(struct kvm_vmx), GFP_KERNEL);
+ return &kvm_vmx->kvm;
+}
+
+static void vmx_vm_free(struct kvm *kvm)
+{
+ kfree(to_kvm_vmx(kvm));
+}
+
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -9777,14 +10065,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
goto free_vmcs;
}
- if (enable_ept) {
+ if (enable_ept && !enable_unrestricted_guest) {
err = init_rmode_identity_map(kvm);
if (err)
goto free_vmcs;
}
if (nested) {
- nested_vmx_setup_ctls_msrs(vmx);
+ nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
+ kvm_vcpu_apicv_active(&vmx->vcpu));
vmx->nested.vpid02 = allocate_vpid();
}
@@ -9817,6 +10106,13 @@ free_vcpu:
return ERR_PTR(err);
}
+static int vmx_vm_init(struct kvm *kvm)
+{
+ if (!ple_gap)
+ kvm->arch.pause_in_guest = true;
+ return 0;
+}
+
static void __init vmx_check_processor_compat(void *rtn)
{
struct vmcs_config vmcs_conf;
@@ -9824,6 +10120,7 @@ static void __init vmx_check_processor_compat(void *rtn)
*(int *)rtn = 0;
if (setup_vmcs_config(&vmcs_conf) < 0)
*(int *)rtn = -EIO;
+ nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, enable_apicv);
if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
smp_processor_id());
@@ -9911,12 +10208,12 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_cpuid_entry2 *entry;
- vmx->nested.nested_vmx_cr0_fixed1 = 0xffffffff;
- vmx->nested.nested_vmx_cr4_fixed1 = X86_CR4_PCE;
+ vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
+ vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
if (entry && (entry->_reg & (_cpuid_mask))) \
- vmx->nested.nested_vmx_cr4_fixed1 |= (_cr4_mask); \
+ vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
} while (0)
entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
@@ -10013,7 +10310,7 @@ static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
kvm_mmu_unload(vcpu);
kvm_init_shadow_ept_mmu(vcpu,
- to_vmx(vcpu)->nested.nested_vmx_ept_caps &
+ to_vmx(vcpu)->nested.msrs.ept_caps &
VMX_EPT_EXECUTE_ONLY_BIT,
nested_ept_ad_enabled(vcpu));
vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
@@ -10952,6 +11249,16 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
/* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
vmx_set_efer(vcpu, vcpu->arch.efer);
+ /*
+ * Guest state is invalid and unrestricted guest is disabled,
+ * which means L1 attempted VMEntry to L2 with invalid state.
+ * Fail the VMEntry.
+ */
+ if (vmx->emulation_required) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+ return 1;
+ }
+
/* Shadow page tables on either EPT or shadow page tables. */
if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
entry_failure_code))
@@ -10965,6 +11272,19 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
return 0;
}
+static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has_nmi_exiting(vmcs12) &&
+ nested_cpu_has_virtual_nmis(vmcs12))
+ return -EINVAL;
+
+ if (!nested_cpu_has_virtual_nmis(vmcs12) &&
+ nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))
+ return -EINVAL;
+
+ return 0;
+}
+
static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -10992,26 +11312,29 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
- vmx->nested.nested_vmx_procbased_ctls_low,
- vmx->nested.nested_vmx_procbased_ctls_high) ||
+ vmx->nested.msrs.procbased_ctls_low,
+ vmx->nested.msrs.procbased_ctls_high) ||
(nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
!vmx_control_verify(vmcs12->secondary_vm_exec_control,
- vmx->nested.nested_vmx_secondary_ctls_low,
- vmx->nested.nested_vmx_secondary_ctls_high)) ||
+ vmx->nested.msrs.secondary_ctls_low,
+ vmx->nested.msrs.secondary_ctls_high)) ||
!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
- vmx->nested.nested_vmx_pinbased_ctls_low,
- vmx->nested.nested_vmx_pinbased_ctls_high) ||
+ vmx->nested.msrs.pinbased_ctls_low,
+ vmx->nested.msrs.pinbased_ctls_high) ||
!vmx_control_verify(vmcs12->vm_exit_controls,
- vmx->nested.nested_vmx_exit_ctls_low,
- vmx->nested.nested_vmx_exit_ctls_high) ||
+ vmx->nested.msrs.exit_ctls_low,
+ vmx->nested.msrs.exit_ctls_high) ||
!vmx_control_verify(vmcs12->vm_entry_controls,
- vmx->nested.nested_vmx_entry_ctls_low,
- vmx->nested.nested_vmx_entry_ctls_high))
+ vmx->nested.msrs.entry_ctls_low,
+ vmx->nested.msrs.entry_ctls_high))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_vmx_check_nmi_controls(vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
if (nested_cpu_has_vmfunc(vmcs12)) {
if (vmcs12->vm_function_control &
- ~vmx->nested.nested_vmx_vmfunc_controls)
+ ~vmx->nested.msrs.vmfunc_controls)
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
if (nested_cpu_has_eptp_switching(vmcs12)) {
@@ -11293,7 +11616,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
} else if (vcpu->arch.nmi_injected) {
vmcs12->idt_vectoring_info_field =
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
- } else if (vcpu->arch.interrupt.pending) {
+ } else if (vcpu->arch.interrupt.injected) {
nr = vcpu->arch.interrupt.nr;
idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
@@ -11941,7 +12264,7 @@ static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
- if (ple_gap)
+ if (!kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
}
@@ -12259,6 +12582,7 @@ static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
vmx->nested.smm.vmxon = vmx->nested.vmxon;
vmx->nested.vmxon = false;
+ vmx_clear_hlt(vcpu);
return 0;
}
@@ -12300,6 +12624,10 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.cpu_has_accelerated_tpr = report_flexpriority,
.cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase,
+ .vm_init = vmx_vm_init,
+ .vm_alloc = vmx_vm_alloc,
+ .vm_free = vmx_vm_free,
+
.vcpu_create = vmx_create_vcpu,
.vcpu_free = vmx_free_vcpu,
.vcpu_reset = vmx_vcpu_reset,
@@ -12367,6 +12695,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
.set_tss_addr = vmx_set_tss_addr,
+ .set_identity_map_addr = vmx_set_identity_map_addr,
.get_tdp_level = get_ept_level,
.get_mt_mask = vmx_get_mt_mask,
@@ -12425,7 +12754,38 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
static int __init vmx_init(void)
{
- int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+ int r;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ /*
+ * Enlightened VMCS usage should be recommended and the host needs
+ * to support eVMCS v1 or above. We can also disable eVMCS support
+ * with module parameter.
+ */
+ if (enlightened_vmcs &&
+ ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
+ (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
+ KVM_EVMCS_VERSION) {
+ int cpu;
+
+ /* Check that we have assist pages on all online CPUs */
+ for_each_online_cpu(cpu) {
+ if (!hv_get_vp_assist_page(cpu)) {
+ enlightened_vmcs = false;
+ break;
+ }
+ }
+
+ if (enlightened_vmcs) {
+ pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
+ static_branch_enable(&enable_evmcs);
+ }
+ } else {
+ enlightened_vmcs = false;
+ }
+#endif
+
+ r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
__alignof__(struct vcpu_vmx), THIS_MODULE);
if (r)
return r;
@@ -12446,6 +12806,29 @@ static void __exit vmx_exit(void)
#endif
kvm_exit();
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (static_branch_unlikely(&enable_evmcs)) {
+ int cpu;
+ struct hv_vp_assist_page *vp_ap;
+ /*
+ * Reset everything to support using non-enlightened VMCS
+ * access later (e.g. when we reload the module with
+ * enlightened_vmcs=0)
+ */
+ for_each_online_cpu(cpu) {
+ vp_ap = hv_get_vp_assist_page(cpu);
+
+ if (!vp_ap)
+ continue;
+
+ vp_ap->current_nested_vmcs = 0;
+ vp_ap->enlighten_vmentry = 0;
+ }
+
+ static_branch_disable(&enable_evmcs);
+ }
+#endif
}
module_init(vmx_init)
diff --git a/arch/x86/kvm/vmx_evmcs.h b/arch/x86/kvm/vmx_evmcs.h
new file mode 100644
index 000000000000..210a884090ad
--- /dev/null
+++ b/arch/x86/kvm/vmx_evmcs.h
@@ -0,0 +1,324 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_EVMCS_H
+#define __KVM_X86_VMX_EVMCS_H
+
+#include <asm/hyperv-tlfs.h>
+
+#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
+#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
+#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
+ {EVMCS1_OFFSET(name), clean_field}
+
+struct evmcs_field {
+ u16 offset;
+ u16 clean_field;
+};
+
+static const struct evmcs_field vmcs_field_to_evmcs_1[] = {
+ /* 64 bit rw */
+ EVMCS1_FIELD(GUEST_RIP, guest_rip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(GUEST_RSP, guest_rsp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR0, host_cr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR3, host_cr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR4, host_cr4,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_RIP, host_rip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP),
+ EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP),
+ EVMCS1_FIELD(MSR_BITMAP, msr_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP),
+ EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(TSC_OFFSET, tsc_offset,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR0, guest_cr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR3, guest_cr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR4, guest_cr4,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_DR7, guest_dr7,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(HOST_FS_BASE, host_fs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_GS_BASE, host_gs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_TR_BASE, host_tr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_RSP, host_rsp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(EPT_POINTER, ept_pointer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
+ EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+
+ /* 64 bit read only */
+ EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ /*
+ * Not defined in KVM:
+ *
+ * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ */
+ EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+
+ /*
+ * No mask defined in the spec as Hyper-V doesn't currently support
+ * these. Future proof by resetting the whole clean field mask on
+ * access.
+ */
+ EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+
+ /* 32 bit rw */
+ EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC),
+ EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN),
+ EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY),
+ EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vm_entry_exception_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+
+ /* 32 bit read only */
+ EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+
+ /* No mask defined in the spec (not used) */
+ EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+
+ /* 16 bit rw */
+ EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
+};
+
+static __always_inline int get_evmcs_offset(unsigned long field,
+ u16 *clean_field)
+{
+ unsigned int index = ROL16(field, 6);
+ const struct evmcs_field *evmcs_field;
+
+ if (unlikely(index >= ARRAY_SIZE(vmcs_field_to_evmcs_1))) {
+ WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n",
+ field);
+ return -ENOENT;
+ }
+
+ evmcs_field = &vmcs_field_to_evmcs_1[index];
+
+ if (clean_field)
+ *clean_field = evmcs_field->clean_field;
+
+ return evmcs_field->offset;
+}
+
+#undef ROL16
+
+#endif /* __KVM_X86_VMX_EVMCS_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 18b5ca7a3197..b2ff74b12ec4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -102,6 +102,8 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
static void enter_smm(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+static void store_regs(struct kvm_vcpu *vcpu);
+static int sync_regs(struct kvm_vcpu *vcpu);
struct kvm_x86_ops *kvm_x86_ops __read_mostly;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -140,6 +142,13 @@ module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
static bool __read_mostly vector_hashing = true;
module_param(vector_hashing, bool, S_IRUGO);
+bool __read_mostly enable_vmware_backdoor = false;
+module_param(enable_vmware_backdoor, bool, S_IRUGO);
+EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
+
+static bool __read_mostly force_emulation_prefix = false;
+module_param(force_emulation_prefix, bool, S_IRUGO);
+
#define KVM_NR_SHARED_MSRS 16
struct kvm_shared_msrs_global {
@@ -1032,7 +1041,11 @@ static u32 emulated_msrs[] = {
HV_X64_MSR_VP_RUNTIME,
HV_X64_MSR_SCONTROL,
HV_X64_MSR_STIMER0_CONFIG,
- HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+ HV_X64_MSR_VP_ASSIST_PAGE,
+ HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
+ HV_X64_MSR_TSC_EMULATION_STATUS,
+
+ MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
MSR_KVM_PV_EOI_EN,
MSR_IA32_TSC_ADJUST,
@@ -1054,6 +1067,25 @@ static unsigned num_emulated_msrs;
* can be used by a hypervisor to validate requested CPU features.
*/
static u32 msr_based_features[] = {
+ MSR_IA32_VMX_BASIC,
+ MSR_IA32_VMX_TRUE_PINBASED_CTLS,
+ MSR_IA32_VMX_PINBASED_CTLS,
+ MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
+ MSR_IA32_VMX_PROCBASED_CTLS,
+ MSR_IA32_VMX_TRUE_EXIT_CTLS,
+ MSR_IA32_VMX_EXIT_CTLS,
+ MSR_IA32_VMX_TRUE_ENTRY_CTLS,
+ MSR_IA32_VMX_ENTRY_CTLS,
+ MSR_IA32_VMX_MISC,
+ MSR_IA32_VMX_CR0_FIXED0,
+ MSR_IA32_VMX_CR0_FIXED1,
+ MSR_IA32_VMX_CR4_FIXED0,
+ MSR_IA32_VMX_CR4_FIXED1,
+ MSR_IA32_VMX_VMCS_ENUM,
+ MSR_IA32_VMX_PROCBASED_CTLS2,
+ MSR_IA32_VMX_EPT_VPID_CAP,
+ MSR_IA32_VMX_VMFUNC,
+
MSR_F10H_DECFG,
MSR_IA32_UCODE_REV,
};
@@ -2432,6 +2464,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
case HV_X64_MSR_CRASH_CTL:
case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
return kvm_hv_set_msr_common(vcpu, msr, data,
msr_info->host_initiated);
case MSR_IA32_BBL_CR_CTL3:
@@ -2558,6 +2593,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_AMD64_DC_CFG:
msr_info->data = 0;
break;
+ case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
@@ -2661,6 +2697,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
case HV_X64_MSR_CRASH_CTL:
case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
return kvm_hv_get_msr_common(vcpu,
msr_info->index, &msr_info->data);
break;
@@ -2777,9 +2816,15 @@ out:
return r;
}
+static inline bool kvm_can_mwait_in_guest(void)
+{
+ return boot_cpu_has(X86_FEATURE_MWAIT) &&
+ !boot_cpu_has_bug(X86_BUG_MONITOR);
+}
+
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
- int r;
+ int r = 0;
switch (ext) {
case KVM_CAP_IRQCHIP:
@@ -2809,6 +2854,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_HYPERV_SYNIC:
case KVM_CAP_HYPERV_SYNIC2:
case KVM_CAP_HYPERV_VP_INDEX:
+ case KVM_CAP_HYPERV_EVENTFD:
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
@@ -2828,11 +2874,16 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_GET_MSR_FEATURES:
r = 1;
break;
+ case KVM_CAP_SYNC_REGS:
+ r = KVM_SYNC_X86_VALID_FIELDS;
+ break;
case KVM_CAP_ADJUST_CLOCK:
r = KVM_CLOCK_TSC_STABLE;
break;
- case KVM_CAP_X86_GUEST_MWAIT:
- r = kvm_mwait_in_guest();
+ case KVM_CAP_X86_DISABLE_EXITS:
+ r |= KVM_X86_DISABLE_EXITS_HTL | KVM_X86_DISABLE_EXITS_PAUSE;
+ if(kvm_can_mwait_in_guest())
+ r |= KVM_X86_DISABLE_EXITS_MWAIT;
break;
case KVM_CAP_X86_SMM:
/* SMBASE is usually relocated above 1M on modern chipsets,
@@ -2873,7 +2924,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_X2APIC_API_VALID_FLAGS;
break;
default:
- r = 0;
break;
}
return r;
@@ -3265,7 +3315,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->exception.error_code = vcpu->arch.exception.error_code;
events->interrupt.injected =
- vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
+ vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
events->interrupt.nr = vcpu->arch.interrupt.nr;
events->interrupt.soft = 0;
events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
@@ -3318,7 +3368,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.exception.has_error_code = events->exception.has_error_code;
vcpu->arch.exception.error_code = events->exception.error_code;
- vcpu->arch.interrupt.pending = events->interrupt.injected;
+ vcpu->arch.interrupt.injected = events->interrupt.injected;
vcpu->arch.interrupt.nr = events->interrupt.nr;
vcpu->arch.interrupt.soft = events->interrupt.soft;
if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
@@ -3917,8 +3967,7 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
u64 ident_addr)
{
- kvm->arch.ept_identity_map_addr = ident_addr;
- return 0;
+ return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr);
}
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
@@ -4178,6 +4227,20 @@ split_irqchip_unlock:
r = 0;
break;
+ case KVM_CAP_X86_DISABLE_EXITS:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
+ break;
+
+ if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
+ kvm_can_mwait_in_guest())
+ kvm->arch.mwait_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_HTL)
+ kvm->arch.hlt_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
+ kvm->arch.pause_in_guest = true;
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -4482,6 +4545,15 @@ set_identity_unlock:
r = kvm_x86_ops->mem_enc_unreg_region(kvm, &region);
break;
}
+ case KVM_HYPERV_EVENTFD: {
+ struct kvm_hyperv_eventfd hvevfd;
+
+ r = -EFAULT;
+ if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
+ goto out;
+ r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
+ break;
+ }
default:
r = -ENOTTY;
}
@@ -4771,6 +4843,30 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
+int handle_ud(struct kvm_vcpu *vcpu)
+{
+ int emul_type = EMULTYPE_TRAP_UD;
+ enum emulation_result er;
+ char sig[5]; /* ud2; .ascii "kvm" */
+ struct x86_exception e;
+
+ if (force_emulation_prefix &&
+ kvm_read_guest_virt(&vcpu->arch.emulate_ctxt,
+ kvm_get_linear_rip(vcpu), sig, sizeof(sig), &e) == 0 &&
+ memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
+ kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
+ emul_type = 0;
+ }
+
+ er = emulate_instruction(vcpu, emul_type);
+ if (er == EMULATE_USER_EXIT)
+ return 0;
+ if (er != EMULATE_DONE)
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(handle_ud);
+
static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
gpa_t gpa, bool write)
{
@@ -5612,27 +5708,27 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
kvm_rip_write(vcpu, ctxt->eip);
kvm_set_rflags(vcpu, ctxt->eflags);
- if (irq == NMI_VECTOR)
- vcpu->arch.nmi_pending = 0;
- else
- vcpu->arch.interrupt.pending = false;
-
return EMULATE_DONE;
}
EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
-static int handle_emulation_failure(struct kvm_vcpu *vcpu)
+static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
{
int r = EMULATE_DONE;
++vcpu->stat.insn_emulation_fail;
trace_kvm_emulate_insn_failed(vcpu);
+
+ if (emulation_type & EMULTYPE_NO_UD_ON_FAIL)
+ return EMULATE_FAIL;
+
if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
r = EMULATE_USER_EXIT;
}
+
kvm_queue_exception(vcpu, UD_VECTOR);
return r;
@@ -5876,6 +5972,37 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
return false;
}
+static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
+{
+ switch (ctxt->opcode_len) {
+ case 1:
+ switch (ctxt->b) {
+ case 0xe4: /* IN */
+ case 0xe5:
+ case 0xec:
+ case 0xed:
+ case 0xe6: /* OUT */
+ case 0xe7:
+ case 0xee:
+ case 0xef:
+ case 0x6c: /* INS */
+ case 0x6d:
+ case 0x6e: /* OUTS */
+ case 0x6f:
+ return true;
+ }
+ break;
+ case 2:
+ switch (ctxt->b) {
+ case 0x33: /* RDPMC */
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
+
int x86_emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2,
int emulation_type,
@@ -5928,10 +6055,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
return EMULATE_DONE;
if (emulation_type & EMULTYPE_SKIP)
return EMULATE_FAIL;
- return handle_emulation_failure(vcpu);
+ return handle_emulation_failure(vcpu, emulation_type);
}
}
+ if ((emulation_type & EMULTYPE_VMWARE) &&
+ !is_vmware_backdoor_opcode(ctxt))
+ return EMULATE_FAIL;
+
if (emulation_type & EMULTYPE_SKIP) {
kvm_rip_write(vcpu, ctxt->_eip);
if (ctxt->eflags & X86_EFLAGS_RF)
@@ -5963,7 +6094,7 @@ restart:
emulation_type))
return EMULATE_DONE;
- return handle_emulation_failure(vcpu);
+ return handle_emulation_failure(vcpu, emulation_type);
}
if (ctxt->have_exception) {
@@ -6016,7 +6147,8 @@ restart:
}
EXPORT_SYMBOL_GPL(x86_emulate_instruction);
-int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
+static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
+ unsigned short port)
{
unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
@@ -6025,7 +6157,6 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
vcpu->arch.pio.count = 0;
return ret;
}
-EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
{
@@ -6049,7 +6180,8 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
return 1;
}
-int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port)
+static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
+ unsigned short port)
{
unsigned long val;
int ret;
@@ -6068,7 +6200,21 @@ int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port)
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_fast_pio_in);
+
+int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
+{
+ int ret = kvm_skip_emulated_instruction(vcpu);
+
+ /*
+ * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
+ * KVM_EXIT_DEBUG here.
+ */
+ if (in)
+ return kvm_fast_pio_in(vcpu, size, port) && ret;
+ else
+ return kvm_fast_pio_out(vcpu, size, port) && ret;
+}
+EXPORT_SYMBOL_GPL(kvm_fast_pio);
static int kvmclock_cpu_down_prep(unsigned int cpu)
{
@@ -6246,7 +6392,8 @@ static void kvm_timer_init(void)
kvmclock_cpu_online, kvmclock_cpu_down_prep);
}
-static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu);
int kvm_is_in_guest(void)
{
@@ -6279,18 +6426,6 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
.get_guest_ip = kvm_get_guest_ip,
};
-void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
-{
- __this_cpu_write(current_vcpu, vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
-
-void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
-{
- __this_cpu_write(current_vcpu, NULL);
-}
-EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
-
static void kvm_set_mmio_spte_mask(void)
{
u64 mask;
@@ -6644,27 +6779,36 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
int r;
/* try to reinject previous events if any */
- if (vcpu->arch.exception.injected) {
- kvm_x86_ops->queue_exception(vcpu);
- return 0;
- }
+ if (vcpu->arch.exception.injected)
+ kvm_x86_ops->queue_exception(vcpu);
/*
- * Exceptions must be injected immediately, or the exception
- * frame will have the address of the NMI or interrupt handler.
+ * Do not inject an NMI or interrupt if there is a pending
+ * exception. Exceptions and interrupts are recognized at
+ * instruction boundaries, i.e. the start of an instruction.
+ * Trap-like exceptions, e.g. #DB, have higher priority than
+ * NMIs and interrupts, i.e. traps are recognized before an
+ * NMI/interrupt that's pending on the same instruction.
+ * Fault-like exceptions, e.g. #GP and #PF, are the lowest
+ * priority, but are only generated (pended) during instruction
+ * execution, i.e. a pending fault-like exception means the
+ * fault occurred on the *previous* instruction and must be
+ * serviced prior to recognizing any new events in order to
+ * fully complete the previous instruction.
*/
- if (!vcpu->arch.exception.pending) {
- if (vcpu->arch.nmi_injected) {
+ else if (!vcpu->arch.exception.pending) {
+ if (vcpu->arch.nmi_injected)
kvm_x86_ops->set_nmi(vcpu);
- return 0;
- }
-
- if (vcpu->arch.interrupt.pending) {
+ else if (vcpu->arch.interrupt.injected)
kvm_x86_ops->set_irq(vcpu);
- return 0;
- }
}
+ /*
+ * Call check_nested_events() even if we reinjected a previous event
+ * in order for caller to determine if it should require immediate-exit
+ * from L2 to L1 due to pending L1 events which require exit
+ * from L2 to L1.
+ */
if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
if (r != 0)
@@ -6677,6 +6821,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code);
+ WARN_ON_ONCE(vcpu->arch.exception.injected);
vcpu->arch.exception.pending = false;
vcpu->arch.exception.injected = true;
@@ -6691,7 +6836,14 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
}
kvm_x86_ops->queue_exception(vcpu);
- } else if (vcpu->arch.smi_pending && !is_smm(vcpu) && kvm_x86_ops->smi_allowed(vcpu)) {
+ }
+
+ /* Don't consider new event if we re-injected an event */
+ if (kvm_event_needs_reinjection(vcpu))
+ return 0;
+
+ if (vcpu->arch.smi_pending && !is_smm(vcpu) &&
+ kvm_x86_ops->smi_allowed(vcpu)) {
vcpu->arch.smi_pending = false;
++vcpu->arch.smi_count;
enter_smm(vcpu);
@@ -6985,8 +7137,6 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm)
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
- u64 eoi_exit_bitmap[4];
-
if (!kvm_apic_hw_enabled(vcpu->arch.apic))
return;
@@ -6999,6 +7149,20 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
kvm_x86_ops->sync_pir_to_irr(vcpu);
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
+
+ if (is_guest_mode(vcpu))
+ vcpu->arch.load_eoi_exitmap_pending = true;
+ else
+ kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
+}
+
+static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
+{
+ u64 eoi_exit_bitmap[4];
+
+ if (!kvm_apic_hw_enabled(vcpu->arch.apic))
+ return;
+
bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
vcpu_to_synic(vcpu)->vec_bitmap, 256);
kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
@@ -7113,6 +7277,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
vcpu_scan_ioapic(vcpu);
+ if (kvm_check_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu))
+ vcpu_load_eoi_exitmap(vcpu);
if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
kvm_vcpu_reload_apic_access_page(vcpu);
if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
@@ -7291,7 +7457,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_put_guest_xcr0(vcpu);
+ kvm_before_interrupt(vcpu);
kvm_x86_ops->handle_external_intr(vcpu);
+ kvm_after_interrupt(vcpu);
++vcpu->stat.exits;
@@ -7500,7 +7668,6 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
return 0;
}
-
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
int r;
@@ -7526,6 +7693,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
goto out;
}
+ if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (vcpu->run->kvm_dirty_regs) {
+ r = sync_regs(vcpu);
+ if (r != 0)
+ goto out;
+ }
+
/* re-sync apic's tpr */
if (!lapic_in_kernel(vcpu)) {
if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
@@ -7550,6 +7728,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
out:
kvm_put_guest_fpu(vcpu);
+ if (vcpu->run->kvm_valid_regs)
+ store_regs(vcpu);
post_kvm_run_save(vcpu);
kvm_sigset_deactivate(vcpu);
@@ -7557,10 +7737,8 @@ out:
return r;
}
-int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
- vcpu_load(vcpu);
-
if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
/*
* We are here if userspace calls get_regs() in the middle of
@@ -7593,15 +7771,18 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
regs->rip = kvm_rip_read(vcpu);
regs->rflags = kvm_get_rflags(vcpu);
+}
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu_load(vcpu);
+ __get_regs(vcpu, regs);
vcpu_put(vcpu);
return 0;
}
-int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
- vcpu_load(vcpu);
-
vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
@@ -7630,7 +7811,12 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
vcpu->arch.exception.pending = false;
kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu_load(vcpu);
+ __set_regs(vcpu, regs);
vcpu_put(vcpu);
return 0;
}
@@ -7645,13 +7831,10 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
}
EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
-int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
- struct kvm_sregs *sregs)
+static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
struct desc_ptr dt;
- vcpu_load(vcpu);
-
kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -7679,10 +7862,16 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
- if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
+ if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
set_bit(vcpu->arch.interrupt.nr,
(unsigned long *)sregs->interrupt_bitmap);
+}
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ vcpu_load(vcpu);
+ __get_sregs(vcpu, sregs);
vcpu_put(vcpu);
return 0;
}
@@ -7754,7 +7943,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
}
EXPORT_SYMBOL_GPL(kvm_task_switch);
-int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
/*
@@ -7777,8 +7966,7 @@ int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
return 0;
}
-int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
- struct kvm_sregs *sregs)
+static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
struct msr_data apic_base_msr;
int mmu_reset_needed = 0;
@@ -7786,8 +7974,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
struct desc_ptr dt;
int ret = -EINVAL;
- vcpu_load(vcpu);
-
if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
(sregs->cr4 & X86_CR4_OSXSAVE))
goto out;
@@ -7866,6 +8052,16 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
ret = 0;
out:
+ return ret;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ int ret;
+
+ vcpu_load(vcpu);
+ ret = __set_sregs(vcpu, sregs);
vcpu_put(vcpu);
return ret;
}
@@ -7992,6 +8188,45 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
return 0;
}
+static void store_regs(struct kvm_vcpu *vcpu)
+{
+ BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES);
+
+ if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
+ __get_regs(vcpu, &vcpu->run->s.regs.regs);
+
+ if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
+ __get_sregs(vcpu, &vcpu->run->s.regs.sregs);
+
+ if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
+ kvm_vcpu_ioctl_x86_get_vcpu_events(
+ vcpu, &vcpu->run->s.regs.events);
+}
+
+static int sync_regs(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)
+ return -EINVAL;
+
+ if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
+ __set_regs(vcpu, &vcpu->run->s.regs.regs);
+ vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
+ }
+ if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
+ if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs))
+ return -EINVAL;
+ vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
+ }
+ if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
+ if (kvm_vcpu_ioctl_x86_set_vcpu_events(
+ vcpu, &vcpu->run->s.regs.events))
+ return -EINVAL;
+ vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
+ }
+
+ return 0;
+}
+
static void fx_init(struct kvm_vcpu *vcpu)
{
fpstate_init(&vcpu->arch.guest_fpu.state);
@@ -8447,7 +8682,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
- mutex_init(&kvm->arch.hyperv.hv_lock);
spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
@@ -8456,6 +8690,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
+ kvm_hv_init_vm(kvm);
kvm_page_track_init(kvm);
kvm_mmu_init_vm(kvm);
@@ -8586,6 +8821,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kvm_mmu_uninit_vm(kvm);
kvm_page_track_cleanup(kvm);
+ kvm_hv_destroy_vm(kvm);
}
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b91215d1fd80..7d35ce672989 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -2,12 +2,48 @@
#ifndef ARCH_X86_KVM_X86_H
#define ARCH_X86_KVM_X86_H
-#include <asm/processor.h>
-#include <asm/mwait.h>
#include <linux/kvm_host.h>
#include <asm/pvclock.h>
#include "kvm_cache_regs.h"
+#define KVM_DEFAULT_PLE_GAP 128
+#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
+#define KVM_DEFAULT_PLE_WINDOW_GROW 2
+#define KVM_DEFAULT_PLE_WINDOW_SHRINK 0
+#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX UINT_MAX
+#define KVM_SVM_DEFAULT_PLE_WINDOW_MAX USHRT_MAX
+#define KVM_SVM_DEFAULT_PLE_WINDOW 3000
+
+static inline unsigned int __grow_ple_window(unsigned int val,
+ unsigned int base, unsigned int modifier, unsigned int max)
+{
+ u64 ret = val;
+
+ if (modifier < 1)
+ return base;
+
+ if (modifier < base)
+ ret *= modifier;
+ else
+ ret += modifier;
+
+ return min(ret, (u64)max);
+}
+
+static inline unsigned int __shrink_ple_window(unsigned int val,
+ unsigned int base, unsigned int modifier, unsigned int min)
+{
+ if (modifier < 1)
+ return base;
+
+ if (modifier < base)
+ val /= modifier;
+ else
+ val -= modifier;
+
+ return max(val, min);
+}
+
#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
@@ -19,19 +55,19 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
bool soft)
{
- vcpu->arch.interrupt.pending = true;
+ vcpu->arch.interrupt.injected = true;
vcpu->arch.interrupt.soft = soft;
vcpu->arch.interrupt.nr = vector;
}
static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
{
- vcpu->arch.interrupt.pending = false;
+ vcpu->arch.interrupt.injected = false;
}
static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.exception.injected || vcpu->arch.interrupt.pending ||
+ return vcpu->arch.exception.injected || vcpu->arch.interrupt.injected ||
vcpu->arch.nmi_injected;
}
@@ -205,8 +241,6 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
return !(kvm->arch.disabled_quirks & quirk);
}
-void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
-void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
@@ -221,6 +255,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception);
+int handle_ud(struct kvm_vcpu *vcpu);
+
void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
@@ -242,6 +278,8 @@ extern unsigned int min_timer_period_us;
extern unsigned int lapic_timer_advance_ns;
+extern bool enable_vmware_backdoor;
+
extern struct static_key kvm_no_apic_vcpu;
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
@@ -264,10 +302,38 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
__rem; \
})
-static inline bool kvm_mwait_in_guest(void)
+#define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0)
+#define KVM_X86_DISABLE_EXITS_HTL (1 << 1)
+#define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2)
+#define KVM_X86_DISABLE_VALID_EXITS (KVM_X86_DISABLE_EXITS_MWAIT | \
+ KVM_X86_DISABLE_EXITS_HTL | \
+ KVM_X86_DISABLE_EXITS_PAUSE)
+
+static inline bool kvm_mwait_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.mwait_in_guest;
+}
+
+static inline bool kvm_hlt_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.hlt_in_guest;
+}
+
+static inline bool kvm_pause_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.pause_in_guest;
+}
+
+DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+
+static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu)
+{
+ __this_cpu_write(current_vcpu, vcpu);
+}
+
+static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
{
- return boot_cpu_has(X86_FEATURE_MWAIT) &&
- !boot_cpu_has_bug(X86_BUG_MONITOR);
+ __this_cpu_write(current_vcpu, NULL);
}
#endif